├── .gitignore ├── LICENSE ├── Makefile ├── README.md ├── ScalaBFS-proj.zip ├── add_init.py ├── build.sbt ├── data_preprocess ├── .~README.md ├── GraphToScalaBFS.cpp ├── Makefile ├── README.md ├── Wiki-Vote.txt ├── rmat_generate.txt └── transfer.cpp ├── docs ├── fig11-compare-naive.jpg └── screenshot.png └── src └── main └── scala ├── Decoupled_Mem.scala ├── Mem_write.scala ├── Test.scala ├── Top.scala ├── bram.scala ├── clocking_wizard.scala ├── configuration.scala ├── crossbar.scala ├── frontier.scala ├── master.scala ├── p1.scala └── p2.scala /.gitignore: -------------------------------------------------------------------------------- 1 | /.idea 2 | /.ipynb_checkpoints 3 | /.metals 4 | /.bloop 5 | /project 6 | /target 7 | /.DS_Store 8 | /*/.DS_Store 9 | /*/*/.DS_Store 10 | *.anno 11 | *.anno.json 12 | *.fir 13 | *.v 14 | *.xml 15 | *.v 16 | *.log 17 | # Jupyter-scala scripts 18 | coursier 19 | almond 20 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. 
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | SBT = sbt 2 | CUR_DIR = $(shell pwd) 3 | OBJDIR = $(CUR_DIR)/obj 4 | PROJECTDIR = $(CUR_DIR)/project 5 | TARGETDIR = $(CUR_DIR)/target 6 | 7 | hdl: unzip top movev update_kernel 8 | 9 | top: 10 | $(SBT) -mem 51200 "runMain HBMGraph.Top" 11 | python add_init.py 12 | 13 | movev: 14 | cp Top.v ScalaBFS-proj/workspace/FinalBFS_32/vivado_rtl_kernel/FinalBFS_32_ex/FinalBFS_32_ex.srcs/sources_1/imports/bfs_u280/ 15 | 16 | creat_kernel: 17 | cd ScalaBFS-proj/workspace/FinalBFS_32/vivado_rtl_kernel/FinalBFS_32_ex/;\ 18 | echo "exit" >> bfs_project.tcl 19 | vivado -mode tcl -source bfs_project.tcl 20 | mv -f FinalBFS_32_ex ../ 21 | cd ../../../ 22 | 23 | unzip: 24 | unzip ScalaBFS-proj.zip 25 | rm -rf ScalaBFS-proj.zip 26 | 27 | update_kernel: 28 | rm -f ScalaBFS-proj/workspace/FinalBFS_32/vivado_rtl_kernel/FinalBFS_32_ex/exports/FinalBFS_32.xo 29 | echo "open_project ScalaBFS-proj/workspace/FinalBFS_32/vivado_rtl_kernel/FinalBFS_32_ex/FinalBFS_32_ex.xpr" > 
update_kernel.tcl 30 | echo "update_compile_order -fileset sources_1" >> update_kernel.tcl 31 | echo "source -notrace ScalaBFS-proj/workspace/FinalBFS_32/vivado_rtl_kernel/FinalBFS_32_ex/imports/package_kernel.tcl" >> update_kernel.tcl 32 | echo "package_project ScalaBFS-proj/workspace/FinalBFS_32/vivado_rtl_kernel/FinalBFS_32_ex/FinalBFS_32 mycompany.com kernel FinalBFS_32" >> update_kernel.tcl 33 | echo "package_xo -xo_path ScalaBFS-proj/workspace/FinalBFS_32/vivado_rtl_kernel/FinalBFS_32_ex/exports/FinalBFS_32.xo -kernel_name FinalBFS_32 -ip_directory ScalaBFS-proj/workspace/FinalBFS_32/vivado_rtl_kernel/FinalBFS_32_ex/FinalBFS_32 -kernel_xml ScalaBFS-proj/workspace/FinalBFS_32/vivado_rtl_kernel/FinalBFS_32_ex/imports/kernel.xml" >> update_kernel.tcl 34 | echo "file mkdir ScalaBFS-proj/workspace/FinalBFS_32/src/vitis_rtl_kernel/FinalBFS_32" >> update_kernel.tcl 35 | echo "exit" >> update_kernel.tcl 36 | vivado -mode tcl -source update_kernel.tcl 37 | 38 | movecpp: 39 | cp -f /space/graph_data/HBMGraph/hostcpp/* ScalaBFS-proj/workspace/FinalBFS_32/src/vitis_rtl_kernel/FinalBFS_32/ 40 | 41 | testall: testp1 testp2 testp3 testp4 testmem testreq testmaster testres move 42 | 43 | testp1: 44 | $(SBT) "runMain HBMGraph.Testp1" 45 | 46 | testp2: 47 | $(SBT) "runMain HBMGraph.Testp2" 48 | 49 | testmem_write: 50 | $(SBT) "runMain HBMGraph.TestMem_write" 51 | 52 | testmemory: 53 | $(SBT) "runMain HBMGraph.TestMemory" 54 | 55 | testv: 56 | $(SBT) "runMain HBMGraph.Testv" 57 | 58 | testw: 59 | $(SBT) "runMain HBMGraph.Testwrite_frontier_and_level" 60 | 61 | testreq: 62 | $(SBT) "runMain HBMGraph.Testreq" 63 | 64 | testmaster: 65 | $(SBT) "runMain HBMGraph.Testmaster" 66 | 67 | testres: 68 | $(SBT) "runMain HBMGraph.Testres" 69 | 70 | test: 71 | $(SBT) "runMain HBMGraph.bfsTester" 72 | 73 | testbram: 74 | $(SBT) "runMain HBMGraph.Testbram" 75 | 76 | testbram2: 77 | $(SBT) "runMain HBMGraph.Testbram2" 78 | 79 | testfrontier: 80 | $(SBT) "runMain HBMGraph.Testfrontier" 81 | 82 
| move0: 83 | @if [ ! -d $(OBJDIR) ]; then mkdir -p $(OBJDIR); fi; 84 | mv *.v $(OBJDIR)/ 85 | mv *.json $(OBJDIR)/ 86 | mv *.fir $(OBJDIR)/ 87 | 88 | clean: 89 | rm -f *.v 90 | rm -f *.json 91 | rm -f *.fir 92 | rm -f .*.swo 93 | rm -f .*.swp 94 | rm -rf $(OBJDIR) 95 | rm -rf $(PROJECTDIR) 96 | rm -rf $(TARGETDIR) 97 | rm -f *.log 98 | clear: 99 | rm -f *.json 100 | rm -f *.fir 101 | rm -f .*.swo 102 | rm -f .*.swp 103 | rm -f $(OBJDIR)/*.json 104 | rm -f $(OBJDIR)/*.fir 105 | rm -f $(OBJDIR)/.*.swo 106 | rm -f $(OBJDIR)/.*.swp 107 | 108 | 109 | 110 | 111 | 112 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ScalaBFS: A Scalable BFS Accelerator on FPGA-HBM Platform 2 | 3 | ScalaBFS is an BFS accelerator built on top of an FPGA configured with HBM (i.e., FPGA-HBM platform) which can scale its performance according to the available memory channels (on a single card). It utlizes multiple processing elements to sufficiently exploit the high bandwidth of HBM to improve efficiency. We implement the prototype system of ScalaBFS on Xilinx Alveo U280 FPGA card (real hardware). Paper: https://arxiv.org/abs/2105.11754 4 | 5 | ## Organization 6 | 7 | The code for ScalaBFS using Chisel language is located in src/ directory. Vitis project is located in ScalaBFS-proj/ directory after deployment. Graph data processing files are provided in preprocess/ directory. 8 | 9 | ## Prerequisites 10 | 11 | ### Hardware 12 | 13 | This project works on [Xilinx U280 Data Center Accelerator card](https://www.xilinx.com/products/boards-and-kits/alveo/u280.html). 
14 | 15 | ### Operation System 16 | 17 | Ubuntu 18.04 LTS 18 | 19 | ### Software 20 | 21 | [Vitis 2019.2](https://www.xilinx.com/support/download/index.html/content/xilinx/en/downloadNav/vitis/2019-2.html) 22 | 23 | [U280 Package File on Vitis 2019.2](https://www.xilinx.com/products/boards-and-kits/alveo/u280.html#gettingStarted) 24 | 25 | Notice: 26 | 27 | 1. After the installation of xdma and update the shell on alveo card manually(under normal circumstances , the command is shown in the process of the installtion of xdma. If not , you can use command "/opt/xilinx/xrt/bin/xbmgmt flash --update"), you should cold reboot your machine. The cold reboot means that you should shutdown your machine , unplug the power , wating for several minutes , plug the power and boot up your machine.You can use command 28 | 29 | ``` 30 | /opt/xilinx/xrt/bin/xbmgmt flash --scan 31 | /opt/xilinx/xrt/bin/xbutil validate 32 | ``` 33 | 34 | to make sure that the runtime enviroment and the alveo card is ready. 35 | 36 | 2. Don't forget to add the xrt and Vitis to your PATH. Typically you can 37 | 38 | ``` 39 | source /opt/xilinx/xrt/setup.sh 40 | source /tools/Xilinx/Vitis/2019.2/settings64.sh 41 | ``` 42 | 43 | You can also add this two commands to your .bashrc file.If in the process of making ScalaBFS you fail and see "make: vivado: Command not found", you very likely ignored this step. 44 | 45 | 3. If you meet "PYOPENCL INSTALL FAILED" in the installtion of xrt , refer to [AR# 73055](https://www.xilinx.com/support/answers/73055.html) 46 | 47 | 4. 
If you meet "XRT Requires opencl header" when you open Vitis , refer to [Vitis prompt “XRT Requires opencl header"](https://forums.xilinx.com/t5/Vitis-Acceleration-SDAccel-SDSoC/Vitis-prompt-XRT-Requires-opencl-header-quot/td-p/1087072) 48 | ### Environment 49 | 50 | To compile chisel code, you need to install: 51 | 52 | - Java 1.0.8 53 | 54 | ``` 55 | sudo apt install openjdk-8-jre-headless 56 | sudo apt-get install java-wrappers 57 | sudo apt-get install default-jdk 58 | ``` 59 | 60 | - sbt 1.4.2 61 | 62 | ``` 63 | echo "deb https://dl.bintray.com/sbt/debian /" | \ 64 | sudo tee -a /etc/apt/sources.list.d/sbt.list 65 | sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 \ 66 | --recv 2EE0EA64E40A89B84B2DF73499E82A75642AC823 67 | sudo apt-get update 68 | sudo apt-get install sbt 69 | ``` 70 | 71 | - Scala 2.11.12 72 | 73 | ``` 74 | sudo apt install scala 75 | ``` 76 | 77 | ## Clone and Build 78 | 79 | ``` 80 | $ git clone https://github.com/lizardll/ScalaBFS.git 81 | $ make 82 | ``` 83 | 84 | ## Quick Start Guide 85 | 86 | ### Preprocess 87 | 88 | Before deploying and running ScalaBFS, we need to make sure that you have specific graph data with divided csc-csr format that ScalaBFS required. For complete graph data preprocess guide, see [Data Preprocess.](https://github.com/lizardll/ScalaBFS/tree/master/data_preprocess) 89 | 90 | We start with a small directed graph named Wiki-Vote for example. First we should make for directed or undirected graph for propose. Then we generate divided graph data with 32 channels and 64 PEs for ScalaBFS. 
91 | 92 | ```bash 93 | cd data_preprocess 94 | make all 95 | ./GraphToScalaBFS Wiki-Vote.txt 32 64 96 | ``` 97 | 98 | 99 | 100 | ### Deploy and play 101 | 102 | #### Open Vitis & Select workspace: 103 | 104 | ``` 105 | ScalaBFS-proj/workspace 106 | ``` 107 | 108 | #### Choose graph data (modify host_example.cpp in vitis) 109 | 110 | For the preprocessed wiki-vote graph data mentioned before, we should first modify the input file name at line 121: 111 | 112 | ``` 113 | string bfs_filename = "YOUR_DIR_HERE/ScalaBFS/data_preprocess/Wiki-Vote_pe_64_ch_"; 114 | ``` 115 | 116 | Then we have to modify the following line 122-127 according to data_preprocess/Wiki-Vote_addr_pe_64_ch_32.log: 117 | 118 | ``` 119 | cl_uint csr_c_addr = 260; 120 | cl_uint csr_r_addr = 0; 121 | cl_uint level_addr = 2958; 122 | cl_uint node_num = 8298; 123 | cl_uint csc_c_addr = 1780; 124 | cl_uint csc_r_addr = 1520; 125 | ``` 126 | 127 | And in order to show correct prerformance value, on line 132 we also need to set the edge count of the dataset (in this case, wiki-vote has 103689 edges): 128 | ``` 129 | result = 103689; 130 | ``` 131 | After that, it's time to build the whole project in vitis. Select the "Hardware" target in the left down corner, and press the hammer button to build it! Genarally it will take 10~15 hours. 
132 | 133 | The running results will be like this: 134 | 135 | 136 | 137 | ## Experiment results 138 | 139 | TABLE 1: Graph datasets 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | 256 |
GraphsVerticesEdgesAvg.Directed
(M)(M)Degree
soc-Pokec (PK)1.6330.6218.75Y
soc-LiveJournal (LJ)4.8568.9914.23Y
com-Orkut (OR)3.07234.3776.28N
hollywood-2009 (HO)1.14113.8999.91N
RMAT18-80.262.057.81N
RMAT18-160.264.0315.39N
RMAT18-320.267.8830.06N
RMAT18-640.2615.2258.07N
RMAT22-164.1965.9715.73N
RMAT22-324.19130.4931.11N
RMAT22-644.19256.6261.18N
RMAT23-168.39132.3815.78N
RMAT23-328.39262.3331.27N
RMAT23-648.39517.3461.67N
257 | 258 | TABLE 2: Performance comparison between GunRock and ScalaBFS (32-PC/64-PE configuration) 259 | 260 | 261 | 262 | 263 | 264 | 265 | 266 | 267 | 268 | 269 | 270 | 271 | 272 | 273 | 274 | 275 | 276 | 277 | 278 | 279 | 280 | 281 | 282 | 283 | 284 | 285 | 286 | 287 | 288 | 289 | 290 | 291 | 292 | 293 | 294 | 295 | 296 | 297 | 298 | 299 | 300 | 301 | 302 | 303 | 304 | 305 |
Gunrock on V100ScalaBFS on U280
DatasetsThroughput
(GTEPS)
Power eff.
(GTEPS/watt)
Throughput
(GTEPS)
Power eff.
(GTEPS/watt)
soc-Pokec (PK)14.90.05016.20.506
soc-LiveJournal (LJ)18.50.06211.20.350
com-Orkut (OR)150.60.50219.10.597
hollywood-2009 (HO)730.24316.40.513
306 | 307 | FIGURE 1: Performances and aggregated bandwidths of ScalaBFS (with 32 HBM PCs and 64 PEs) and baseline case 308 | 309 | 310 | -------------------------------------------------------------------------------- /ScalaBFS-proj.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lizardll/ScalaBFS/0a276ae5d48d01dd1c2b6890836ea5dd684f8462/ScalaBFS-proj.zip -------------------------------------------------------------------------------- /add_init.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | fp = open("Top.v", "r") 3 | findmodule = 0 4 | findmodule2 = 0 5 | findmodule3 = 0 6 | findinit = 0 7 | find4 = 0 8 | find5 = 0 9 | find = 0 10 | modulename = "module RRArbiter" 11 | modulename2= "module sub_crossbar" 12 | # modulename3= "module uram" 13 | 14 | init = "initial begin" 15 | cont = "\tlastGrant = 1'b0;" 16 | f = fp.read() 17 | a = f.split('\n') 18 | i = 0 19 | count = 0 20 | for s in a: 21 | i = i + 1 22 | findmodule = s.find(modulename) 23 | findmodule2 = s.find(modulename2) 24 | # findmodule3 = s.find(modulename3) 25 | # find4 = s.find('reg [31:0] uram_douta;') 26 | # find5 = s.find('reg [31:0] uram_doutb;') 27 | if findmodule >= 0: 28 | print(s) 29 | find = find + 1 30 | if find > 0: 31 | findinit = s.find(init) 32 | if(findinit >= 0): 33 | a.insert(i, cont) 34 | print("write") 35 | find = 0 36 | # if findmodule2 >= 0: 37 | # a[i - 1] = '(* keep_hierarchy = "yes" *) ' + a[i - 1] 38 | # if findmodule3 >= 0: 39 | # a[i - 1] = '(* dont_touch = "true" *) ' + a[i - 1] 40 | # if s.find("reg [31:0] uram_douta;") >= 0: 41 | # a[i - 1] = '(* dont_touch = "true" *) ' + a[i - 1] 42 | # count = count + 1 43 | # if s.find("reg [31:0] uram_doutb;") >= 0: 44 | # a[i - 1] = '(* dont_touch = "true" *) ' + a[i - 1] 45 | # count = count + 1 46 | f = '\n'.join(a) 47 | fp = open("Top.v", "w") 48 | fp.write(f) 49 | fp.close() 50 | print(count) 
-------------------------------------------------------------------------------- /build.sbt: -------------------------------------------------------------------------------- 1 | 2 | scalaVersion := "2.11.12" 3 | 4 | resolvers ++= Seq( 5 | Resolver.sonatypeRepo("snapshots"), 6 | Resolver.sonatypeRepo("releases") 7 | ) 8 | 9 | libraryDependencies += "edu.berkeley.cs" %% "chisel3" % "3.3.2" 10 | libraryDependencies += "edu.berkeley.cs" %% "chisel-iotesters" % "1.4.2" 11 | libraryDependencies += "edu.berkeley.cs" %% "firrtl" % "1.3.2" 12 | -------------------------------------------------------------------------------- /data_preprocess/.~README.md: -------------------------------------------------------------------------------- 1 | # ScalaBFS Graph Data Preprocess Usage 2 | 3 | 1111 4 | 5 | ## Introduction of Input and Output Graph Data 6 | 7 | ## Convert Undirected Graph to Directed Graph [optional] 8 | 9 | ## Generate Divided ScalaBFS Graph Data with Scalable Channels and PEs 10 | 11 | ## -------------------------------------------------------------------------------- /data_preprocess/GraphToScalaBFS.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | using namespace std; 13 | typedef unsigned long long uint64; 14 | typedef long long int64; 15 | typedef unsigned int uint32; 16 | uint32 cpuBFS( 17 | uint32 *graphData, uint32 *graphIndices,uint32 * level_array,uint32 * visited_map,uint32 root); 18 | //unsigned int qc_addr,qn_addr,level_addr,data_depth,q_length; 19 | //unsigned int C_addr,R_addr; 20 | int node_num = 0; 21 | int main(int argc, char *argv[]) { 22 | //args 23 | if(argc != 4){ 24 | cout<<"please use correct argument!"<, less> graph_csc; 93 | map, less> graph_csr; 94 | cout << "Start setup .." 
<< endl; 95 | int fscanfcount = 0; 96 | // read the graph data from file 97 | while(!feof(fp)) { 98 | // read a data tuple 99 | fscanfcount = fscanf(fp, "%u %u", &i, &j); 100 | if(i>=node_num){ 101 | node_num = i; 102 | } 103 | if(j>=node_num){ 104 | node_num = j; 105 | } 106 | graph_csc[j].push_back(i); 107 | graph_csr[i].push_back(j); 108 | } 109 | node_num+=1; 110 | 111 | //csr 112 | uint32 *graphData = (uint32*) malloc(16 * nonZeroCount * sizeof(uint32)); 113 | uint32 *graphIndices = (uint32*) malloc((node_num + 1) * sizeof(uint32)); 114 | uint64 *graphInfo = (uint64*) malloc((node_num + 1) * sizeof(uint64)); 115 | 116 | //csc 117 | uint32 *cscData = (uint32*) malloc(16 * nonZeroCount * sizeof(uint32)); 118 | uint32 *cscIndices = (uint32*) malloc((node_num + 1) * sizeof(uint32)); 119 | uint64 *cscInfo = (uint64*) malloc((node_num + 1) * sizeof(uint64)); 120 | 121 | 122 | 123 | //csr 124 | graphIndices[0] = 0; 125 | map, less>::iterator iter_csr; 126 | int count_csr = 0; 127 | int point_csr = 0; 128 | int pre_count_csr = 0; 129 | int max_edge_csr = 0; 130 | int count_temp = 0; 131 | for (iter_csr = graph_csr.begin(); iter_csr != graph_csr.end(); iter_csr++) 132 | { 133 | if (iter_csr->first > point_csr) 134 | { 135 | for (int p = point_csr; p <= iter_csr->first; p++) 136 | { 137 | graphIndices[p] = pre_count_csr; 138 | } 139 | } 140 | vector b = iter_csr->second; 141 | if(b.size()>=max_edge_csr){ 142 | max_edge_csr = b.size(); 143 | } 144 | for (int i = 0; i < b.size(); i++) 145 | { 146 | graphData[count_csr] = b[i]; 147 | fprintf(flog, "%d %d\n" ,iter_csr->first,b[i]); 148 | count_csr++; 149 | } 150 | // if(count_csr%2 == 1){ 151 | // graphData[count_csr] = 0xffffffff; 152 | // count_csr++; 153 | // } 154 | 155 | if(count_csr%ali_num != 0){ 156 | count_temp = count_csr%ali_num; 157 | for(int count_f = 0;count_f<(ali_num-count_temp);count_f++){ 158 | graphData[count_csr] = 0xffffffff; 159 | count_csr++; 160 | } 161 | } 162 | 163 | graphIndices[iter_csr->first + 
1] = count_csr; 164 | point_csr = iter_csr->first + 1; 165 | pre_count_csr = count_csr; 166 | } 167 | for (unsigned int k = ((--iter_csr)->first) + 1; k < node_num + 1; k++) { 168 | printf("csc not ending!\r"); 169 | graphIndices[k] = count_csr; 170 | } 171 | // converting graphIndices to graphInfo 172 | for (int k = 0; k < node_num; k++) { 173 | // graphInfo = neigh_start_index (32 bit) | neighbours_count (32 bit) 174 | uint32 index = (uint32) (graphIndices[k] / ali_num); // graphIndices/2 175 | uint32 size = (uint32)((graphIndices[k+1] - graphIndices[k])/ali_num); 176 | graphInfo[k] = ((uint64)index << 32) | size; 177 | fprintf(fdebug,"index:%d size:%d i:%d graphIndicesk:%d graphIndicesk+1:%d \n",index,size,k,graphIndices[k],graphIndices[k+1]); 178 | } 179 | cout << "CSR Data generation done .." << endl; 180 | //csc 181 | cscIndices[0] = 0; 182 | map, less>::iterator iter; 183 | int count_csc = 0; 184 | int point = 0; 185 | int pre_count = 0; 186 | int count0 = 0; 187 | int count1 = 0; 188 | int count00 = 0; 189 | int count11 = 0; 190 | int count_1_0 = 0; 191 | int count_1_1 = 0; 192 | int max_edge_csc = 0; 193 | for (iter = graph_csc.begin(); iter != graph_csc.end(); iter++) 194 | { 195 | if (iter->first > point) 196 | { 197 | for (int p = point; p <= iter->first; p++) 198 | { 199 | cscIndices[p] = pre_count; 200 | } 201 | } 202 | vector b = iter->second; 203 | // if(b.size() > 255){ 204 | // printf(">255"); 205 | // } 206 | if(b.size()>=max_edge_csc){ 207 | max_edge_csc = b.size(); 208 | } 209 | if(iter->first !=1){ 210 | if(iter->first%2 == 0){ 211 | count00+= b.size(); 212 | }else{ 213 | count11+= b.size(); 214 | } 215 | } 216 | 217 | for (int i = 0; i < b.size(); i++) 218 | { 219 | cscData[count_csc] = b[i]; 220 | fprintf(flog, "%d %d\n" ,iter->first,b[i]); 221 | if(iter->first !=1){ 222 | if(b[i]%2 == 0){ 223 | count0++; 224 | }else{ 225 | count1++; 226 | } 227 | } 228 | count_csc++; 229 | } 230 | // if(count_csc%2 == 1){ 231 | // cscData[count_csc] = 
0xffffffff; 232 | // count_csc++; 233 | // } 234 | if(count_csc%ali_num != 0){ 235 | count_temp = count_csc%ali_num; 236 | for(int count_f = 0;count_f<(ali_num-count_temp);count_f++){ 237 | cscData[count_csc] = 0xffffffff; 238 | count_csc++; 239 | } 240 | } 241 | cscIndices[iter->first + 1] = count_csc; 242 | point = iter->first + 1; 243 | pre_count = count_csc; 244 | } 245 | for (unsigned int k = ((--iter)->first) + 1; k < node_num + 1; k++) { 246 | printf("csc not ending!\r"); 247 | cscIndices[k] = count_csc; 248 | } 249 | 250 | for (int k = 0; k < node_num; k++) { 251 | // graphInfo = neigh_start_index (32 bit) | neighbours_count (32 bit) 252 | uint32 index = (uint32) (cscIndices[k] / ali_num); // graphIndices/2 253 | uint32 size = (uint32)((cscIndices[k+1] - cscIndices[k])/ali_num); 254 | cscInfo[k] = ((uint64)index << 32) | size; 255 | fprintf(fdebug,"index:%d size:%d i:%d graphIndicesk:%d graphIndicesk+1:%d \n",index,size,k,cscIndices[k],cscIndices[k+1]); 256 | } 257 | 258 | cout << "CSC Data generation done .." 
<< endl; 259 | 260 | //divide 261 | vector fbin(divide_num); 262 | vector > csc_c(divide_num); 263 | vector > csc_r(divide_num); 264 | vector > csr_c(divide_num); 265 | vector > csr_r(divide_num); 266 | vector csr_index_temp(divide_num,0); 267 | vector csc_index_temp(divide_num,0); 268 | uint32 index_csr = 0; 269 | uint32 size_csr = 0; 270 | uint32 index_csc = 0; 271 | uint32 size_csc = 0; 272 | uint32 dest_num = 0; 273 | for(int r = 0;r>32); 277 | size_csr = (uint32)graphInfo[r]; 278 | //push c 279 | for(int csr_push_i = 0;csr_push_i < (size_csr*ali_num);csr_push_i++){ 280 | csr_c[dest_num].push_back(graphData[ali_num * index_csr + csr_push_i]); 281 | } 282 | //push and count new r 283 | csr_r[dest_num].push_back(((uint64)csr_index_temp[dest_num] << 32) | size_csr); 284 | csr_index_temp[dest_num] += size_csr; 285 | //csc 286 | index_csc = (uint32)(cscInfo[r]>>32); 287 | size_csc = (uint32)cscInfo[r]; 288 | //push c 289 | for(int csc_push_i = 0;csc_push_i < (size_csc*ali_num);csc_push_i++){ 290 | csc_c[dest_num].push_back(cscData[ali_num * index_csc + csc_push_i]); 291 | } 292 | //push and count new r 293 | csc_r[dest_num].push_back(((uint64)csc_index_temp[dest_num] << 32) | size_csc); 294 | csc_index_temp[dest_num] += size_csc; 295 | } 296 | 297 | for(int addr_i = 0;addr_i csc_c_addr){ 299 | csc_c_addr = csc_c[addr_i].size(); 300 | 301 | } 302 | if(csr_c[addr_i].size() > csr_c_addr){ 303 | csr_c_addr = csr_c[addr_i].size(); 304 | } 305 | if(csr_r[addr_i].size() > csr_r_addr){ 306 | csr_r_addr = csr_r[addr_i].size(); 307 | } 308 | if(csc_r[addr_i].size() > csc_r_addr){ 309 | csc_r_addr = csc_r[addr_i].size(); 310 | } 311 | } 312 | 313 | level_addr = csr_r_addr + csc_r_addr + csc_c_addr/ali_num + csr_c_addr/ali_num; 314 | csc_c_addr = csr_r_addr + csr_c_addr/ali_num + csc_r_addr; 315 | csc_r_addr = csr_r_addr + csr_c_addr/ali_num; 316 | csr_c_addr = csr_r_addr; 317 | csr_r_addr = 0; 318 | int *zero_num = {0}; 319 | for(unsigned int file_i = 0;file_i, less>::iterator 
iter_run; 385 | int count_run = 0; 386 | int point_run = 0; 387 | int pre_count_run = 0; 388 | for (iter_run = graph_csr.begin(); iter_run != graph_csr.end(); iter_run++) 389 | { 390 | if (iter_run->first > point_run) 391 | { 392 | for (int p = point_run; p <= iter_run->first; p++) 393 | { 394 | graphIndices_run[p] = pre_count_run; 395 | } 396 | } 397 | vector b = iter_run->second; 398 | 399 | for (int i = 0; i < b.size(); i++) 400 | { 401 | graphData_run[count_run] = b[i]; 402 | count_run++; 403 | } 404 | graphIndices_run[iter_run->first + 1] = count_run; 405 | point_run = iter_run->first + 1; 406 | pre_count_run = count_run; 407 | } 408 | for (unsigned int k = ((--iter_run)->first) + 1; k < node_num + 1; k++) { 409 | printf("csc not ending!\r"); 410 | graphIndices_run[k] = count_run; 411 | } 412 | 413 | 414 | 415 | cout << "Enter root node number(0 to N):" << endl; 416 | cin >> root; 417 | visited_map[root / 32] = 1 << (root % 32); 418 | cout << "cpuBFS running ..." << endl; 419 | level = cpuBFS(graphData_run, graphIndices_run,level_array,visited_map,root); 420 | 421 | string AddrFile = FileName+"_addr_pe_"+to_string(pe_num)+"_ch_"+to_string(ch_num)+".log"; 422 | faddr = fopen(AddrFile.c_str(), "w+"); 423 | if (faddr == NULL) { 424 | cout << "Error: can't create faddr file!" << endl; 425 | exit(1); 426 | } 427 | cout << "successfully generated graph data, address log and cpuBFS's result!" 
<< endl; 428 | fprintf(faddr, " cl_uint csr_c_addr = %u;\n" ,csr_c_addr); 429 | fprintf(faddr, " cl_uint csr_r_addr = %u;\n" ,csr_r_addr); 430 | fprintf(faddr, " cl_uint level_addr = %u;\n" ,level_addr); 431 | fprintf(faddr, " cl_uint node_num = %u;\n" ,node_num); 432 | fprintf(faddr, " cl_uint csc_c_addr = %u;\n" ,csc_c_addr); 433 | fprintf(faddr, " cl_uint csc_r_addr = %u;\n" ,csc_r_addr); 434 | fprintf(faddr, "max_edge_csr = %u;\n" ,max_edge_csr); 435 | fprintf(faddr, "max_edge_csc = %u;\n" ,max_edge_csc); 436 | free(graphData); 437 | free(graphIndices); 438 | free(graphInfo); 439 | free(cscData); 440 | free(cscIndices); 441 | free(cscInfo); 442 | free(graphData_run); 443 | free(graphIndices_run); 444 | free(level_array); 445 | free(visited_map); 446 | 447 | fclose(fp); 448 | //fclose(fbin); 449 | fclose(flog); 450 | fclose(faddr); 451 | fclose(fdebug); 452 | return 0; 453 | } 454 | 455 | // Do BFS in CPU and return the number of traversed levels 456 | inline uint32 cpuBFS(uint32 *graphData, uint32 *graphIndices, uint32 * level_array,uint32 * visited_map,uint32 root) { 457 | uint32 level = 1; 458 | int qc_count = 0; 459 | int qn_count = 0; 460 | // declare Next/Current queues 461 | queue Current, Next; 462 | // Add root to next queue and it's level 1 463 | Next.push(root); 464 | level_array[root] = level; 465 | 466 | // Traverse the graph 467 | while (!Next.empty()) { 468 | // pop next level into current level 469 | level ++; 470 | int i = 0; 471 | while (!Next.empty()) { 472 | Current.push(Next.front()); 473 | i++; 474 | Next.pop(); 475 | } 476 | qc_count = 0; 477 | qn_count = 0; 478 | // Traverse current level 479 | while (!Current.empty()) { 480 | uint32 current = Current.front(); 481 | uint32 neigh_count = graphIndices[current + 1] - graphIndices[current]; 482 | uint32 neigh_index = graphIndices[current]; 483 | 484 | qc_count++; 485 | 486 | Current.pop(); 487 | for (uint32 k = 0; k < neigh_count; k++,neigh_index++) { 488 | // if neighbor is not visited, 
visit it and push it to next queue 489 | if ((visited_map[graphData[neigh_index]/32] & (1 << graphData[neigh_index])) == 0) { 490 | Next.push(graphData[neigh_index]); 491 | qn_count++; 492 | level_array[graphData[neigh_index]] = level; 493 | visited_map[graphData[neigh_index]/32] = visited_map[graphData[neigh_index]/32] | (1 << (graphData[neigh_index] % 32)); 494 | }else{ 495 | } 496 | } 497 | 498 | } 499 | 500 | } 501 | //kyle : result 打印最后的level和bitmap,作为比对仿真结果的基准 502 | FILE * result; 503 | result = fopen("result.txt", "w"); 504 | for(int i = 0;i < node_num;i++){ 505 | fprintf(result,"level[%u]%u\n",i,level_array[i]); 506 | } 507 | fclose(result); 508 | return level; 509 | } 510 | -------------------------------------------------------------------------------- /data_preprocess/Makefile: -------------------------------------------------------------------------------- 1 | CC:=g++ 2 | transfer=transfer.cpp 3 | generate=GraphToScalaBFS.cpp 4 | 5 | all : transfer GraphToScalaBFS 6 | 7 | transfer:$(transfer) 8 | ${CC} $(transfer) -o transfer 9 | 10 | GraphToScalaBFS:$(generate) 11 | ${CC} $(generate) -o GraphToScalaBFS 12 | 13 | clean: 14 | rm -f *.o 15 | rm -f transfer 16 | rm -f GraphToScalaBFS -------------------------------------------------------------------------------- /data_preprocess/README.md: -------------------------------------------------------------------------------- 1 | # ScalaBFS Graph Data Preprocess Usage 2 | 3 | 4 | 5 | ### Generate Divided Graph Data with Scalable Channels and PEs 6 | 7 | #### 1) Process Undirected Graph 8 | 9 | i) Download original graph data which have correct data format 10 | 11 | Format: The fist line of graph file contains total vertices and edges of the graph. Then comes graph edges with adjacency list format. 12 | 13 | | M[Vertices] | N[Edges] | 14 | | -------------- | ------------- | 15 | | a1[First node] | b1[Tail node] | 16 | | a2 | b2 | 17 | | ... | ... 
| 18 | | an | bn | 19 | 20 | ii) Generate Divided Graph Data with Scalable Channels and PEs 21 | 22 | Usage: 23 | 24 | ```bash 25 | [executable program] [filename with suffix] [the number of channels] [the number of PEs] 26 | ``` 27 | 28 | Example: 29 | 30 | ```bash 31 | cd data_preprocess 32 | make all 33 | ./GraphToScalaBFS soc-livejournal.txt 32 64 34 | ``` 35 | 36 | #### 2) Process Undirected Graph 37 | 38 | i) Download original graph data which have correct data format 39 | 40 | Format: The fist line of graph file contains total vertices and edges of the graph. Then comes graph edges with adjacency list format. 41 | 42 | | M[Vertices] | N[Edges] | 43 | | -------------- | ------------- | 44 | | a1[First node] | b1[Tail node] | 45 | | a2 | b2 | 46 | | ... | ... | 47 | | an | bn | 48 | 49 | ii) Convert Undirected Graph to Directed Graph 50 | 51 | Usage: 52 | 53 | ```bash 54 | [executable program] [filename without suffix] 55 | ``` 56 | 57 | Example: 58 | 59 | ```bash 60 | cd data_preprocess 61 | make all 62 | ./transfer soc-livejournal.txt 63 | ``` 64 | 65 | iii) Generate Divided Graph Data with Scalable Channels and PEs 66 | 67 | ```bash 68 | ./GraphToScalaBFS soc-livejournal_transfer_to_directed.txt 32 64 69 | ``` 70 | 71 | ### Well-tested Graph Data Set 72 | 73 | | Graphs | Vertices(M) | Edges(M) | Avg Degree | Directed | Download Link | 74 | | ----------------------- | ----------- | -------- | ---------- | -------- | ------------- | 75 | | soc\-Pokec \(PK\) | 1\.63 | 30\.62 | 18\.75 | Y | | 76 | | soc\-LiveJournal \(LJ\) | 4\.85 | 68\.99 | 14\.23 | Y | | 77 | | com\-Orkut \(OR\) | 3\.07 | 234\.37 | 76\.28 | N | | 78 | | hollywood\-2009 \(HO\) | 1\.14 | 113\.89 | 99\.91 | N | | 79 | | RMAT18\-8 | 0\.26 | 2\.05 | 7\.81 | N | | 80 | | RMAT18\-16 | 0\.26 | 4\.03 | 15\.39 | N | | 81 | | RMAT18\-32 | 0\.26 | 7\.88 | 30\.06 | N | | 82 | | RMAT18\-64 | 0\.26 | 15\.22 | 58\.07 | N | | 83 | | RMAT22\-16 | 4\.19 | 65\.97 | 15\.73 | N | | 84 | | RMAT22\-32 | 4\.19 | 
130\.49 | 31\.11 | N | | 85 | | RMAT22\-64 | 4\.19 | 256\.62 | 61\.18 | N | | 86 | | RMAT23\-16 | 8\.39 | 132\.38 | 15\.78 | N | | 87 | | RMAT23\-32 | 8\.39 | 262\.33 | 31\.27 | N | | 88 | | RMAT23\-64 | 8\.39 | 517\.34 | 61\.67 | N | | 89 | 90 | -------------------------------------------------------------------------------- /data_preprocess/rmat_generate.txt: -------------------------------------------------------------------------------- 1 | function ijw = kronecker_generator (SCALE, edgefactor) 2 | %% Generate an edgelist according to the Graph500 parameters. In this 3 | %% sample, the edge list is returned in an array with three rows, 4 | %% where StartVertex is first row, EndVertex is the second row, and 5 | %% Weight is the third row. The vertex labels start at zero. 6 | %% 7 | %% Example, creating a sparse matrix for viewing: 8 | %% ijw = kronecker_generator (10, 16); 9 | %% G = sparse (ijw(1,:)+1, ijw(2,:)+1, ones (1, size (ijw, 2))); 10 | %% spy (G); 11 | %% The spy plot should appear fairly dense. Any locality 12 | %% is removed by the final permutations. 13 | 14 | %% Set number of vertices. 15 | N = 2^SCALE; 16 | 17 | %% Set number of edges. 18 | M = edgefactor * N; 19 | 20 | %% Set initiator probabilities. 21 | [A, B, C] = deal (0.57, 0.19, 0.19); 22 | 23 | %% Create index arrays. 24 | ijw = ones (2, M); 25 | %% Loop over each order of bit. 26 | ab = A + B; 27 | c_norm = C/(1 - (A + B)); 28 | a_norm = A/(A + B); 29 | 30 | for ib = 1:SCALE, 31 | %% Compare with probabilities and set bits of indices. 32 | ii_bit = rand (1, M) > ab; 33 | jj_bit = rand (1, M) > ( c_norm * ii_bit + a_norm * not (ii_bit) ); 34 | ijw(1:2,:) = ijw(1:2,:) + 2^(ib-1) * [ii_bit; jj_bit]; 35 | end 36 | 37 | %% Generate weights 38 | %% ijw(3,:) = unifrnd(0, 1, M); 39 | 40 | %% Permute vertex labels 41 | p = randperm (N); 42 | ijw(1:2,:) = p(ijw(1:2,:)); 43 | 44 | %% Permute the edge list 45 | p = randperm (M); 46 | ijw = ijw(:, p); 47 | 48 | %% Adjust to zero-based labels. 
49 | ijw(1:2,:) = ijw(1:2,:) - 1; 50 | 51 | endfunction 52 | 53 | 54 | ijw = kronecker_generator (scale, edgefactor); 55 | G = sparse (ijw(1,:)+1, ijw(2,:)+1, ones (1, size (ijw, 2))); 56 | %% spy(G) 57 | 58 | [r,c] = find(G) 59 | edge_trans = [r,c].' 60 | 61 | fid = fopen('filename','w'); 62 | fprintf(fid,'%d %d\n',edge_trans); 63 | fclose(fid); 64 | -------------------------------------------------------------------------------- /data_preprocess/transfer.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | //kyle : 将乱序或有序的无向图转换为有序的有向图 13 | using namespace std; 14 | struct edge { 15 | unsigned int x , y; 16 | }; 17 | bool cmp(struct edge a , struct edge b); 18 | int main(int argc, char *argv[]) { 19 | if(argc != 2){ 20 | cout << "args error" << endl; 21 | exit(1); 22 | } 23 | string graghfile; 24 | FILE * fp , * result; 25 | int i , j ,fscanfcount; 26 | struct edge temp; 27 | vector graph; 28 | graph.clear(); 29 | graghfile = argv[1]; 30 | fp = fopen((const char *)(graghfile).c_str(), "r"); 31 | while(!feof(fp)) { 32 | // read a data tuple 33 | fscanfcount = fscanf(fp, "%u %u", &i, &j); 34 | if (fscanfcount == 2){ 35 | if(i == 2) i = 3; 36 | if(j == 2) j = 3; 37 | temp.x = i; 38 | temp.y = j; 39 | graph.push_back(temp); 40 | if(i!=j) 41 | { 42 | temp.x = j; 43 | temp.y = i; 44 | graph.push_back(temp); 45 | } 46 | } 47 | } 48 | result = fopen((const char *)(graghfile.substr(0,graghfile.find_first_of('.'))+"_transfer_to_directed.txt").c_str(),"w"); 49 | sort(graph.begin(),graph.end(),cmp); 50 | vector::iterator it; 51 | for(it = graph.begin();it != graph.end() ; it++){ 52 | fprintf(result,"%u %u\n",it->x,it->y); 53 | } 54 | fclose(fp); 55 | fclose(result); 56 | return 0; 57 | } 58 | bool cmp(struct edge a , struct edge b){ 59 | if(a.x != b.x){ 60 | return a.x < b.x; 61 | } 62 | else 
return a.y < b.y; 63 | } 64 | -------------------------------------------------------------------------------- /docs/fig11-compare-naive.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lizardll/ScalaBFS/0a276ae5d48d01dd1c2b6890836ea5dd684f8462/docs/fig11-compare-naive.jpg -------------------------------------------------------------------------------- /docs/screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lizardll/ScalaBFS/0a276ae5d48d01dd1c2b6890836ea5dd684f8462/docs/screenshot.png -------------------------------------------------------------------------------- /src/main/scala/Decoupled_Mem.scala: -------------------------------------------------------------------------------- 1 | package HBMGraph 2 | import chisel3._ 3 | import chisel3.Driver 4 | import chisel3.util._ 5 | 6 | 7 | class AXIAddress(val addrWidthBits: Int, val idBits: Int) extends Bundle { 8 | // address for the transaction, should be burst aligned if bursts are used 9 | val addr = UInt(addrWidthBits.W) 10 | // // number of data beats -1 in burst: max 255 for incrementing, 15 for wrapping 11 | val len = UInt(8.W) 12 | // transaction ID for multiple outstanding requests 13 | val id = UInt(idBits.W) 14 | // size of data beat in bytes 15 | // set to UInt(log2Up((dataBits/8)-1)) for full-width bursts 16 | // val size = UInt(log2Up((dataBits/8)-1)) 17 | // // burst mode: 0 for fixed, 1 for incrementing, 2 for wrapping 18 | // val burst = UInt(2.W) 19 | // // set to 1 for exclusive access 20 | // val lock = Bool() 21 | // // cachability, set to 0010 or 0011 22 | // val cache = UInt(4.W) 23 | // // generally ignored, set to to all zeroes 24 | // val prot = UInt(3.W) 25 | // // not implemented, set to zeroes 26 | // val qos = UInt(4.W) 27 | } 28 | 29 | class AXIWriteData(val dataWidthBits: Int) extends Bundle { 30 | val data = UInt(dataWidthBits.W) 31 | 
val strb = UInt((dataWidthBits/8).W) 32 | val last = Bool() 33 | } 34 | 35 | class AXIWriteResponse(val idBits: Int) extends Bundle { 36 | val id = UInt(idBits.W) 37 | val resp = UInt(2.W) 38 | } 39 | 40 | class AXIReadData(val dataWidthBits: Int, val idBits: Int) extends Bundle { 41 | // 64 bits data can be divided into 2 32-bits data 42 | val data = UInt((dataWidthBits).W) 43 | val id = UInt(idBits.W) 44 | val last = Bool() 45 | val resp = UInt(2.W) 46 | } 47 | 48 | // Part II: Definitions for the actual AXI interfaces 49 | class AXIMasterIF(val addrWidthBits: Int, val dataWidthBits: Int, val idBits: Int) extends Bundle { 50 | // write address channel 51 | val writeAddr = Decoupled(new AXIAddress(addrWidthBits, idBits)) 52 | // write data channel 53 | val writeData = Decoupled(new AXIWriteData(dataWidthBits)) 54 | // write response channel (for memory consistency) 55 | val writeResp = Flipped(Decoupled(new AXIWriteResponse(idBits))) 56 | 57 | // read address channel 58 | val readAddr = Decoupled(new AXIAddress(addrWidthBits, idBits)) 59 | // read data channel 60 | val readData = Flipped(Decoupled(new AXIReadData(dataWidthBits, idBits))) 61 | } 62 | 63 | // Read neighbour using burst and deal with the situation where neighbour size > 256 64 | class Read_neighbour(implicit val conf : HBMGraphConfiguration) extends Module { 65 | val io = IO(new Bundle { 66 | // input 67 | val readData = Flipped(Decoupled(new AXIReadData(64, conf.memIDBits))) //HBM data out 68 | val offsets = Input(new offsets) // offsets 69 | val push_or_pull_state = Input(Bool()) // 0 for push 70 | 71 | //output 72 | val to_arbiter = Decoupled(new Bundle{ 73 | val index = UInt(conf.Data_width.W) 74 | val burst_len = UInt(8.W) // in 64 bits 75 | val id = UInt(conf.memIDBits.W) // 0->index, 1->neighbour 76 | }) 77 | 78 | // these are for counters 79 | val count_val_n0 = Output(UInt(conf.Data_width.W)) // burst number summation 80 | val queue_ready = Output(Bool()) 81 | val queue_valid = 
Output(Bool()) 82 | 83 | // these are for src_index_queue 84 | val src_q0_deq = Flipped(Decoupled(UInt(conf.Data_width.W))) 85 | val src_q1_enq = Decoupled(UInt(conf.Data_width.W)) 86 | }) 87 | val read_rsp :: loop :: Nil = Enum(2) 88 | val state = RegInit(read_rsp) 89 | val neighbour_count = RegInit(0.U(conf.Data_width.W)) 90 | val temp_index = RegInit(0.U(conf.Data_width.W)) 91 | val queue_readData = Module(new Queue(new AXIReadData(64, conf.memIDBits), conf.Mem_queue_readData_len)) // big queue to ensure no deadlock 92 | val burst_sum = RegInit(0.U(conf.Data_width.W)) 93 | val before_4k_bound_count = Wire(UInt(conf.Data_width.W)) 94 | before_4k_bound_count := DontCare 95 | val C_offset = Mux(io.push_or_pull_state, io.offsets.CSC_C_offset ,io.offsets.CSR_C_offset) 96 | 97 | 98 | burst_sum <> io.count_val_n0 99 | io.to_arbiter.bits <> DontCare 100 | 101 | // queue_readData <> io.readData 102 | queue_readData.io.enq.ready <> io.readData.ready 103 | queue_readData.io.enq.valid := io.readData.valid && io.readData.bits.id === 0.U 104 | queue_readData.io.enq.bits := io.readData.bits 105 | 106 | // queue_readData <> to_arbiter 107 | queue_readData.io.deq.ready := false.B 108 | io.to_arbiter.valid := false.B 109 | 110 | // for counter 111 | io.queue_ready := queue_readData.io.deq.ready 112 | io.queue_valid := queue_readData.io.deq.valid 113 | val unpacked_readData = queue_readData.io.deq.bits.data.asTypeOf( 114 | Vec(2, UInt(conf.Data_width.W)) // 0->size, 1->index 115 | ) 116 | 117 | //for src_index_queue0 <> src_index_queue1 todo 118 | io.src_q1_enq.bits <> io.src_q0_deq.bits 119 | io.src_q0_deq.ready := false.B // send 120 | io.src_q1_enq.valid := false.B // recieve 121 | 122 | switch(state) { 123 | is(read_rsp){ 124 | when(io.to_arbiter.ready && queue_readData.io.deq.valid && io.src_q0_deq.valid && io.src_q1_enq.ready){ 125 | when(unpacked_readData(0) > 256.U){ // split into multiple burst 126 | when(((unpacked_readData(1) + C_offset)*(conf.HBM_Data_width / 
8).asUInt(conf.Data_width.W))/4096.U =/= 127 | ((unpacked_readData(1) + C_offset)*(conf.HBM_Data_width / 8).asUInt(conf.Data_width.W)+(256*(conf.HBM_Data_width / 8)).U-1.U)/4096.U){ 128 | // when burst cross 4k boundary 129 | before_4k_bound_count := (((((unpacked_readData(1) + C_offset)*(conf.HBM_Data_width / 8).asUInt(conf.Data_width.W)) & 0xFFFFF000L.U) + 0x1000.U) - ((unpacked_readData(1) + C_offset)*(conf.HBM_Data_width / 8).asUInt(conf.Data_width.W))) / (conf.HBM_Data_width / 8).U 130 | io.to_arbiter.bits.id := 1.U(conf.memIDBits.W) 131 | io.to_arbiter.bits.index := unpacked_readData(1) + C_offset 132 | io.to_arbiter.bits.burst_len := before_4k_bound_count - 1.U 133 | neighbour_count := unpacked_readData(0) - before_4k_bound_count 134 | burst_sum := burst_sum + before_4k_bound_count 135 | temp_index := unpacked_readData(1) + before_4k_bound_count 136 | io.src_q0_deq.ready := false.B // send 137 | io.src_q1_enq.valid := true.B // recieve 138 | queue_readData.io.deq.ready := false.B // send 139 | io.to_arbiter.valid := true.B // recieve 140 | state := loop 141 | }. 
otherwise{ 142 | io.to_arbiter.bits.index := unpacked_readData(1) + C_offset 143 | io.to_arbiter.bits.burst_len := 255.U(8.W) 144 | io.to_arbiter.bits.id := 1.U(conf.memIDBits.W) 145 | neighbour_count := unpacked_readData(0) - 256.U 146 | burst_sum := burst_sum + 256.U 147 | temp_index := unpacked_readData(1) + 256.U 148 | io.src_q0_deq.ready := false.B // send 149 | io.src_q1_enq.valid := true.B // recieve 150 | queue_readData.io.deq.ready := false.B // send 151 | io.to_arbiter.valid := true.B // recieve 152 | state := loop 153 | } 154 | }.elsewhen(unpacked_readData(0) =/= 0.U){ //0 256 189 | when(io.to_arbiter.ready && io.src_q0_deq.valid && io.src_q1_enq.ready){ 190 | when(neighbour_count > 256.U){ 191 | when(((temp_index + C_offset)*(conf.HBM_Data_width / 8).asUInt(conf.Data_width.W))/4096.U =/= 192 | ((temp_index + C_offset)*(conf.HBM_Data_width / 8).asUInt(conf.Data_width.W)+(256*(conf.HBM_Data_width / 8)).U-1.U)/4096.U){ 193 | // cross 4k boundary 194 | before_4k_bound_count := (((((temp_index + C_offset)*(conf.HBM_Data_width / 8).asUInt(conf.Data_width.W)) & 0xFFFFF000L.U) + 0x1000.U) - ((temp_index + C_offset)*(conf.HBM_Data_width / 8).asUInt(conf.Data_width.W))) / (conf.HBM_Data_width / 8).U 195 | io.to_arbiter.bits.id := 1.U(conf.memIDBits.W) 196 | io.to_arbiter.bits.index := temp_index + C_offset 197 | io.to_arbiter.bits.burst_len := before_4k_bound_count - 1.U 198 | neighbour_count := neighbour_count - before_4k_bound_count 199 | burst_sum := burst_sum + before_4k_bound_count 200 | temp_index := temp_index + before_4k_bound_count 201 | io.src_q0_deq.ready := false.B // send 202 | io.src_q1_enq.valid := true.B // recieve 203 | queue_readData.io.deq.ready := false.B // send 204 | io.to_arbiter.valid := true.B // recieve 205 | }. 
otherwise{ 206 | io.to_arbiter.bits.index := temp_index + C_offset 207 | io.to_arbiter.bits.burst_len := 255.U(8.W) 208 | io.to_arbiter.bits.id := 1.U(conf.memIDBits.W) 209 | temp_index := temp_index + 256.U 210 | burst_sum := burst_sum + 256.U 211 | neighbour_count := neighbour_count - 256.U 212 | io.src_q0_deq.ready := false.B // send 213 | io.src_q1_enq.valid := true.B // recieve 214 | queue_readData.io.deq.ready := false.B // send 215 | io.to_arbiter.valid := true.B // recieve 216 | } 217 | }.otherwise{ // <=256 218 | when(((temp_index + C_offset)*(conf.HBM_Data_width / 8).asUInt(conf.Data_width.W))/4096.U =/= 219 | ((temp_index + C_offset)*(conf.HBM_Data_width / 8).asUInt(conf.Data_width.W)+(neighbour_count*(conf.HBM_Data_width / 8).U)-1.U)/4096.U){ 220 | // cross 4k boundary 221 | before_4k_bound_count := (((((temp_index + C_offset)*(conf.HBM_Data_width / 8).asUInt(conf.Data_width.W)) & 0xFFFFF000L.U) + 0x1000.U) - ((temp_index + C_offset)*(conf.HBM_Data_width / 8).asUInt(conf.Data_width.W))) / (conf.HBM_Data_width / 8).U 222 | io.to_arbiter.bits.id := 1.U(conf.memIDBits.W) 223 | io.to_arbiter.bits.index := temp_index + C_offset 224 | io.to_arbiter.bits.burst_len := before_4k_bound_count - 1.U 225 | neighbour_count := neighbour_count - before_4k_bound_count 226 | burst_sum := burst_sum + before_4k_bound_count 227 | temp_index := temp_index + before_4k_bound_count 228 | io.src_q0_deq.ready := false.B // send 229 | io.src_q1_enq.valid := true.B // recieve 230 | queue_readData.io.deq.ready := false.B // send 231 | io.to_arbiter.valid := true.B // recieve 232 | state := loop 233 | }. 
otherwise{ 234 | io.to_arbiter.bits.index := temp_index + C_offset 235 | io.to_arbiter.bits.burst_len := neighbour_count - 1.U 236 | io.to_arbiter.bits.id := 1.U(conf.memIDBits.W) 237 | burst_sum := burst_sum + neighbour_count 238 | temp_index := 0.U 239 | neighbour_count := 0.U 240 | io.src_q0_deq.ready := true.B // send 241 | io.src_q1_enq.valid := true.B // recieve 242 | queue_readData.io.deq.ready := true.B // send 243 | io.to_arbiter.valid := true.B // recieve 244 | state := read_rsp 245 | } 246 | } 247 | } 248 | } 249 | } 250 | } 251 | 252 | class myArbiterIO(implicit val conf : HBMGraphConfiguration) extends Bundle { 253 | val index = UInt(conf.Data_width.W) // in 64 bits 254 | val burst_len = UInt(8.W) // in 64 bits 255 | val id = UInt(2.W) // 0->index, 1->neighbour 256 | } 257 | 258 | 259 | // Memory logic 260 | // (read modified CSR: R indices followed by neighbour number) 261 | class Memory(val num :Int)(implicit val conf : HBMGraphConfiguration) extends Module { 262 | val io = IO(new Bundle { 263 | // input 264 | val R_array_index = Vec(conf.pipe_num_per_channel, Flipped(Decoupled(UInt(conf.Data_width.W)))) 265 | // input vertex index of the required CSR 266 | val write_vertex_index = Vec(conf.pipe_num_per_channel, Flipped(Decoupled(UInt(conf.Data_width.W)))) 267 | // input vertex index you want to write 268 | val p1_end = Input(Bool()) // input p1_end 269 | val level = Input(UInt(conf.Data_width.W)) // input level (constant in one iter) 270 | val push_or_pull_state = Input(Bool()) // 0 for push 271 | val offsets = Input(new offsets) // offsets 272 | val if_write = Input(Bool()) // if write 273 | val uram_out_a = Vec(conf.pipe_num_per_channel, Input(UInt(conf.Data_width_uram.W))) 274 | val uram_out_b = Vec(conf.pipe_num_per_channel, Input(UInt(conf.Data_width_uram.W))) 275 | //kernel count reg 276 | val kernel_count = Input(UInt(32.W)) 277 | val master_finish = Input(Bool()) 278 | val node_num = Input(UInt(conf.Data_width.W)) 279 | 280 | // output 281 | 
val neighbour_cnt = Output(UInt(conf.Data_width.W)) // output neighbour count of the vertex 282 | val mem_end = Output(Bool()) // output neighbour count of the vertex 283 | val neighbours = Vec(2*conf.pipe_num_per_channel, Decoupled(UInt((conf.Data_width*2).W))) 284 | // output 2*pipe_num_per_channel neighbours with src in local subgraph 285 | val uram_addr_a = Vec(conf.pipe_num_per_channel, Output(UInt(conf.Addr_width_uram.W))) 286 | val uram_addr_b = Vec(conf.pipe_num_per_channel, Output(UInt(conf.Addr_width_uram.W))) 287 | val write_finish = Output(Bool()) 288 | val HBM_interface = new AXIMasterIF(conf.HBM_Addr_width, conf.HBM_Data_width, conf.memIDBits) // HBM interface 289 | }) 290 | dontTouch(io) 291 | 292 | // write module 293 | val write_channel = Module(new Memory_write(num)) 294 | // write_channel.io.write_vertex_index <> io.write_vertex_index 295 | val write_index_arb = Module(new RRArbiter(UInt(conf.Data_width.W), conf.pipe_num_per_channel)) 296 | for(i <- 0 until conf.pipe_num_per_channel){ 297 | // write_vertex_index 298 | val tmp_q = Queue(io.write_vertex_index(i), conf.write_vertex_index_pre_len) 299 | tmp_q <> write_index_arb.io.in(i) 300 | 301 | // uram 302 | io.uram_addr_a <> write_channel.io.uram_addr_a 303 | io.uram_addr_b <> write_channel.io.uram_addr_b 304 | io.uram_out_a <> write_channel.io.uram_out_a 305 | io.uram_out_b <> write_channel.io.uram_out_b 306 | } 307 | write_index_arb.io.out <> write_channel.io.write_vertex_index 308 | write_channel.io.level <> io.level 309 | write_channel.io.offsets <> io.offsets 310 | write_channel.io.if_write <> io.if_write 311 | write_channel.io.HBM_write_interface.writeAddr <> io.HBM_interface.writeAddr 312 | write_channel.io.HBM_write_interface.writeData <> io.HBM_interface.writeData 313 | write_channel.io.HBM_write_interface.writeResp <> io.HBM_interface.writeResp 314 | write_channel.io.kernel_count := io.kernel_count 315 | write_channel.io.master_finish := io.master_finish 316 | 
write_channel.io.write_finish <> io.write_finish 317 | write_channel.io.node_num <> io.node_num 318 | // modules 319 | val arb = Module(new Arbiter(new myArbiterIO, 2)) 320 | val R_array_index_queue = Module(new Queue(UInt(conf.Data_width.W), conf.Mem_R_array_index_queue_len)) 321 | val src_index_queue0 = Module(new Queue(UInt(conf.Data_width.W), conf.src_index_queue_len)) 322 | val src_index_queue1 = Module(new Queue(UInt(conf.Data_width.W), conf.src_index_queue_len)) 323 | val read_neighbour = Module(new Read_neighbour) 324 | 325 | //counters 326 | val neighbours_valid = Wire(Bool()) 327 | val (count_val_i0, counterWrap_i0) = Counter(R_array_index_queue.io.enq.ready && R_array_index_queue.io.enq.valid, 2147483647) 328 | val (count_val_i1, counterWrap_i1) = Counter(read_neighbour.io.queue_ready && read_neighbour.io.queue_valid, 2147483647) 329 | val count_val_n0 = read_neighbour.io.count_val_n0 330 | val (count_val_n1, counterWrap_n1) = Counter(neighbours_valid, 2147483647) //HBM req num 331 | 332 | val count_n_vec = Array.ofDim[UInt](2*conf.pipe_num_per_channel) 333 | val count_w_vec = Array.ofDim[Bool](2*conf.pipe_num_per_channel) 334 | // val (count_neighbour0, counterWrap_nei0) = Counter(io.neighbours(0).ready && io.neighbours(0).valid, 2147483647) 335 | // val (count_neighbour1, counterWrap_nei1) = Counter(io.neighbours(1).ready && io.neighbours(1).valid, 2147483647) 336 | for(n_id <- 0 until 2*conf.pipe_num_per_channel){ 337 | val (tmp0, tmp1) = Counter(io.neighbours(n_id).ready && io.neighbours(n_id).valid, 2147483647) 338 | count_n_vec(n_id) = tmp0 339 | count_w_vec(n_id) = tmp1 340 | } 341 | 342 | //io.neighbours cat 343 | val neighbour = Wire(Vec(2*conf.pipe_num_per_channel, UInt(conf.crossbar_data_width.W))) 344 | val src = Wire(Vec(2*conf.pipe_num_per_channel, UInt(conf.crossbar_data_width.W))) 345 | for(i <- 0 until 2*conf.pipe_num_per_channel){ 346 | io.neighbours(i).bits := Cat(Fill((conf.Data_width - conf.crossbar_data_width) * 2, 
0.U(1.W)),neighbour(i), src(i)) 347 | } 348 | 349 | // neighbour_cnt 350 | io.neighbour_cnt := count_n_vec.reduce(_ + _) 351 | 352 | // mem_end 353 | io.mem_end := RegNext((count_val_i0 === count_val_i1) && (count_val_n0 === count_val_n1) && io.p1_end 354 | && io.HBM_interface.writeAddr.valid === false.B && io.HBM_interface.writeData.valid === false.B) 355 | 356 | // R_array_index <> R_array_index_arb 357 | val R_array_index_arb = Module(new RRArbiter(UInt(conf.Data_width.W), conf.pipe_num_per_channel)) 358 | for(p1_id <- 0 until conf.pipe_num_per_channel){ 359 | io.R_array_index(p1_id) <> R_array_index_arb.io.in(p1_id) 360 | } 361 | 362 | // R_array_index_arb <> R_array_index_queue 363 | R_array_index_arb.io.out <> R_array_index_queue.io.enq 364 | 365 | // R_array_index_queue <> arbiter & src_index_queue0 366 | arb.io.in(1).bits.index := (R_array_index_queue.io.deq.bits/(conf.channel_num).asUInt()) + Mux(io.push_or_pull_state, io.offsets.CSC_R_offset ,io.offsets.CSR_R_offset) 367 | arb.io.in(1).bits.burst_len := 0.U(8.W) 368 | arb.io.in(1).bits.id := 0.U(2.W) 369 | arb.io.in(1).valid := R_array_index_queue.io.deq.valid && src_index_queue0.io.enq.ready 370 | src_index_queue0.io.enq.valid := R_array_index_queue.io.deq.valid && arb.io.in(1).ready 371 | src_index_queue0.io.enq.bits <> R_array_index_queue.io.deq.bits 372 | 373 | R_array_index_queue.io.deq.ready := arb.io.in(1).ready && src_index_queue0.io.enq.ready 374 | 375 | // arbiter <> HBM_interface.readAddr 376 | val to_readAddr_queue = Queue(arb.io.out, conf.to_readAddr_queue_len) 377 | 378 | io.HBM_interface.readAddr.bits.addr <> (to_readAddr_queue.bits.index) * (conf.HBM_Data_width / 8).asUInt(conf.Data_width.W) + conf.HBM_base_addr * num.asUInt() 379 | io.HBM_interface.readAddr.bits.len <> to_readAddr_queue.bits.burst_len // assume <= 255 380 | io.HBM_interface.readAddr.bits.id <> to_readAddr_queue.bits.id // 0->index, 1->neighbour 381 | io.HBM_interface.readAddr.ready <> to_readAddr_queue.ready 382 | 
io.HBM_interface.readAddr.valid <> to_readAddr_queue.valid 383 | 384 | //src_index_queue0 <> src_index_queue1 385 | src_index_queue0.io.deq <> read_neighbour.io.src_q0_deq 386 | src_index_queue1.io.enq <> read_neighbour.io.src_q1_enq 387 | 388 | // HBM_interface.readData <> read_neighbour 389 | io.HBM_interface.readData.bits <> read_neighbour.io.readData.bits 390 | io.HBM_interface.readData.valid <> read_neighbour.io.readData.valid 391 | 392 | // HBM_interface.readData & src_index_queue1 <> neighbours 393 | val unpacked_readData = io.HBM_interface.readData.bits.data.asTypeOf( 394 | Vec(2*conf.pipe_num_per_channel, UInt(conf.Data_width.W)) 395 | ) 396 | for(C_id <- 0 until 2*conf.pipe_num_per_channel){ 397 | neighbour(C_id) := unpacked_readData(C_id) 398 | src(C_id) := src_index_queue1.io.deq.bits 399 | } 400 | src_index_queue1.io.deq.ready := io.HBM_interface.readData.bits.last && io.HBM_interface.readData.valid && (io.HBM_interface.readData.bits.id === 1.U) && io.HBM_interface.readData.ready 401 | 402 | // reduce neighbours_valid 403 | val fifo_ready_vec = Wire(Vec(2*conf.pipe_num_per_channel, Bool())) 404 | for(C_id <- 0 until 2*conf.pipe_num_per_channel){ 405 | fifo_ready_vec(C_id) := io.neighbours(C_id).ready||io.neighbours(C_id).bits===(~(0.U(32.W))) 406 | } 407 | 408 | neighbours_valid := io.HBM_interface.readData.valid && io.HBM_interface.readData.bits.id === 1.U && 409 | src_index_queue1.io.deq.valid && fifo_ready_vec.reduce(_&&_) 410 | for(C_id <- 0 until 2*conf.pipe_num_per_channel){ 411 | io.neighbours(C_id).valid := neighbours_valid && (unpacked_readData(C_id) =/= ~(0.U(32.W))) 412 | } 413 | 414 | // HBM_interface.readData.ready: when one of the consumers ready 415 | io.HBM_interface.readData.ready := (read_neighbour.io.readData.ready && io.HBM_interface.readData.valid && io.HBM_interface.readData.bits.id === 0.U) || neighbours_valid 416 | 417 | // read_neighbour <> arbiter 418 | arb.io.in(0) <> read_neighbour.io.to_arbiter 419 | 420 | // read_neighbour 
// read_neighbour <> offsets
io.offsets <> read_neighbour.io.offsets
io.push_or_pull_state <> read_neighbour.io.push_or_pull_state

}

// -------------------- /src/main/scala/Mem_write.scala --------------------
package HBMGraph
import chisel3._
import chisel3.Driver
import chisel3.util._

// Writes the BFS result back to HBM over the AXI write channels:
//   1) one beat carrying {max_level, kernel_count} at the channel base address,
//   2) then the per-node level array, one byte-strobed beat per node, with the
//      level bytes fetched from the frontier URAMs (io.uram_out_a).
// The address and data channels are driven by two independent FSMs
// (valid_state_addr / valid_state_data) so neither channel stalls the other.
class Memory_write(val num :Int)(implicit val conf : HBMGraphConfiguration) extends Module {
  val io = IO(new Bundle {
    val write_vertex_index = Flipped(Decoupled(UInt(conf.Data_width.W))) // input vertex index you want to write
    val level = Input(UInt(conf.Data_width.W)) // input level (constant in one iter)
    val offsets = Input(new offsets) // CSR CSC offsets
    val if_write = Input(Bool())
    val node_num = Input(UInt(conf.Data_width.W))
    val uram_out_a = Vec(conf.pipe_num_per_channel, Input(UInt(conf.Data_width_uram.W)))
    val uram_out_b = Vec(conf.pipe_num_per_channel, Input(UInt(conf.Data_width_uram.W)))

    // kernel count reg
    val kernel_count = Input(UInt(32.W))
    val master_finish = Input(Bool())
    val HBM_write_interface = new Bundle{
      // write address channel
      val writeAddr = Decoupled(new AXIAddress(conf.HBM_Addr_width, conf.memIDBits))
      // write data channel
      val writeData = Decoupled(new AXIWriteData(conf.HBM_Data_width))
      // write response channel (for memory consistency)
      val writeResp = Flipped(Decoupled(new AXIWriteResponse(conf.memIDBits)))
    }

    // output
    val uram_addr_a = Vec(conf.pipe_num_per_channel, Output(UInt(conf.Addr_width_uram.W)))
    val uram_addr_b = Vec(conf.pipe_num_per_channel, Output(UInt(conf.Addr_width_uram.W)))
    val write_finish = Output(Bool())
  })
  val write_vertex_index_queue = Queue(io.write_vertex_index, conf.write_vertex_index_len)
  val write_addr = io.HBM_write_interface.writeAddr //axi write address channel
  val write_data = io.HBM_write_interface.writeData //axi write data channel
  io.HBM_write_interface.writeResp.ready := true.B // write responses are always accepted, never back-pressured
  // Default values; later conditional assignments override them (Chisel last-connect).
  write_addr.bits.id := 0.U //id is useless
  write_addr.bits.addr := DontCare //wait for write_vertex_index_queue
  write_addr.bits.len := 0.U //dont need burst
  write_data.bits.data := DontCare

  write_data.bits.strb := "hffffffffffffffffffffffffffffffff".U(128.W) //custom data width
  write_data.bits.last := true.B //dont need burst
  io.write_finish := false.B
  io.uram_addr_a <> DontCare
  io.uram_addr_b <> DontCare

  //write_vertex_index_queue's ready is asserted by write address and data channels' ready

  write_vertex_index_queue.ready := write_addr.ready
  // FSM states shared by both channels:
  //   write_count_and_max -> write kernel_count / max_level header beat
  //   write_level_arr     -> stream the level array
  //   write_done          -> finished
  val write_count_and_max :: write_level_arr :: write_done :: Nil = Enum(3)
  val valid_state_addr = RegInit(write_count_and_max) // if false, write kernel_count and max_level; if true, write level array
  val valid_state_data = RegInit(write_count_and_max) // if false, write kernel_count and max_level; if true, write level array
  // Node indices, advanced on each successful AXI handshake of the respective channel.
  // NOTE(review): 17000000 looks like a generous upper bound on node count — confirm against supported graph sizes.
  val (count_val_node_addr, counterWrap_node_addr) = Counter((valid_state_addr===write_level_arr) && write_addr.ready && write_addr.valid, 17000000) //level array index
  val (count_val_node_data, counterWrap_node_data) = Counter((valid_state_data===write_level_arr) && write_data.ready && write_data.valid, 17000000) //level array index

  //uram data fifo
  // URAM read data arrives one cycle after the address was issued; this queue
  // buffers the extracted 8-bit level so the data channel can consume it later.
  val uram_out_q = Module(new Queue(UInt(8.W), 16))
  val write_addr_flag = (valid_state_addr === write_level_arr) && write_addr.ready && uram_out_q.io.count<15.U
  uram_out_q.io.enq.valid := RegNext(write_addr_flag) //next cycle of write addr
  // One 72-bit URAM word packs 9 level bytes (x%9 * 8-bit shift); select the byte for this node.
  uram_out_q.io.enq.bits := io.uram_out_a((count_val_node_addr-1.U) % conf.pipe_num_per_channel.U) >> (((count_val_node_addr-1.U)/conf.pipe_num_per_channel.U)%9.U)*8.U //place at (7:0)
  uram_out_q.io.deq <> DontCare

  //write address channel
  when(io.master_finish & valid_state_addr === write_count_and_max){ // write kernel_count and max_level
    valid_state_addr := write_level_arr
    write_addr.bits.addr := conf.HBM_base_addr * num.asUInt() // per-channel base address
    write_addr.valid := true.B
  }.elsewhen(write_addr_flag){ // write HBM addr
    // Byte address of this node's level entry inside the level array region.
    write_addr.bits.addr := (count_val_node_addr / (conf.HBM_Data_width.U/8.U) + io.offsets.level_offset) * (conf.HBM_Data_width.U/8.U) + conf.HBM_base_addr * num.asUInt()
    // Issue the URAM read for the level byte in parallel with the address beat.
    write_addr.valid := true.B
    io.uram_addr_a(count_val_node_addr % conf.pipe_num_per_channel.U) := count_val_node_addr / conf.pipe_num_per_channel.U / 9.U
  }.otherwise{
    write_addr.valid := false.B
    write_vertex_index_queue.ready := true.B // drain the queue while the address channel is idle
  }

  //write data channel
  when(io.master_finish && (valid_state_data === write_count_and_max) && write_data.ready){
    valid_state_data := write_level_arr
    write_data.bits.data := io.kernel_count | (io.level << 32) // header beat: low 32 bits = cycle count, high = final level
    write_data.valid := true.B
  }.elsewhen(valid_state_data === write_level_arr && write_data.ready && uram_out_q.io.deq.valid){ //write HBM data
    val node_level = uram_out_q.io.deq.bits //level value, 8bits
    // Place the level byte at its lane within the wide HBM word; strb enables only that byte.
    write_data.bits.data := Mux(io.if_write, count_val_node_data << (count_val_node_data % (conf.HBM_Data_width.U/8.U))*8.U, node_level << (count_val_node_data % (conf.HBM_Data_width.U/8.U))*8.U)
    write_data.bits.strb := 1.U << (count_val_node_data % (conf.HBM_Data_width.U/8.U))
    write_data.valid := true.B
    uram_out_q.io.deq.ready := true.B
  }
  .otherwise{
    write_data.valid := false.B
  }

  //if level array write done
  when(count_val_node_addr >= io.node_num+1.U){
    valid_state_addr := write_done
  }
  when(count_val_node_data >= io.node_num+1.U){
    valid_state_data := write_done
    // NOTE(review): RegNext(true.B) is a register whose input is constant true,
    // so this sticks high one cycle after reset — verify the intended timing.
    io.write_finish := RegNext(true.B)
  }

}
// -------------------- /src/main/scala/Test.scala --------------------
package HBMGraph
import chisel3._
import chisel3.Driver
import chisel3.util._
import chisel3.iotesters.PeekPokeTester

// Stand-alone elaboration entry points. Each App instantiates exactly one
// module of the design with a fresh default configuration and runs the Chisel
// driver on it, so individual units can be elaborated and inspected in isolation.

object Testp1 extends App {
  implicit val configuration = HBMGraphConfiguration()
  chisel3.Driver.execute(Array.empty[String], () => new P1(0)(configuration))
}

object Testp2 extends App {
  implicit val configuration = HBMGraphConfiguration()
  chisel3.Driver.execute(Array.empty[String], () => new P2(0)(configuration))
}

object TestMem_write extends App {
  implicit val configuration = HBMGraphConfiguration()
  chisel3.Driver.execute(Array.empty[String], () => new Memory_write(0)(configuration))
}

object TestMemory extends App {
  implicit val configuration = HBMGraphConfiguration()
  chisel3.Driver.execute(Array.empty[String], () => new Memory(0)(configuration))
}

object Testread_visited_map extends App {
  implicit val configuration = HBMGraphConfiguration()
  chisel3.Driver.execute(Array.empty[String], () => new p2_read_visited_map_or_frontier(0)(configuration))
}

object Testwrite_frontier_and_level extends App {
  implicit val configuration = HBMGraphConfiguration()
  chisel3.Driver.execute(Array.empty[String], () => new write_frontier_and_level(0)(configuration))
}

object Testfrontier extends App {
  implicit val configuration = HBMGraphConfiguration()
  chisel3.Driver.execute(Array.empty[String], () => new Frontier(0)(configuration))
}

object Testmaster extends App {
  implicit val configuration = HBMGraphConfiguration()
  chisel3.Driver.execute(Array.empty[String], () => new master()(configuration))
}

// Legacy PeekPokeTester harness, kept for reference:
// class Test(c:Top) extends PeekPokeTester(c){
//   for(t <- 0 until 50){
//
//     println("-----------------------------------------------------------------------------------------------")
//     step(1)
//   }
// }

// object bfsTester {
//   def main(args: Array[String]): Unit = {
//     println("Testing bfs")
//     implicit val configuration = HBMGraphConfiguration()
//     iotesters.Driver.execute(Array[String](), () => new Top()) {
//       c => new Test(c)
//     }
//   }
// }

/*object Testbram extends App{
  implicit val configuration = HBMGraphConfiguration()
  chisel3.Driver.execute(Array[String](), () => new bram_top)
}*/

// -------------------- /src/main/scala/Top.scala --------------------
package HBMGraph
import chisel3._
import chisel3.Driver
import chisel3.util._

// Per-iteration constant base offsets of the CSR/CSC graph structures and the
// level array inside each HBM channel's address space.
class offsets (implicit val conf : HBMGraphConfiguration) extends Bundle{
  val CSR_R_offset = Input(UInt(conf.Data_width.W)) // input R_offset in 64 bits (constant in one iter)
  val CSR_C_offset = Input(UInt(conf.Data_width.W)) // input C_offset in 64 bits (constant in one iter)
  val CSC_R_offset = Input(UInt(conf.Data_width.W)) // input R_offset in 64 bits (constant in one iter)
  val CSC_C_offset = Input(UInt(conf.Data_width.W)) // input C_offset in 64 bits (constant in one iter)
  val level_offset = Input(UInt(conf.Data_width.W)) // input level_offset (constant in one iter)
}

// Thresholds at which the master switches BFS direction (push <-> pull).
class levels (implicit val conf : HBMGraphConfiguration) extends Bundle{
  val push_to_pull_level = Input(UInt(conf.Data_width.W)) // input push_to_pull_level in 64 bits (constant in one iter)
  val pull_to_push_level = Input(UInt(conf.Data_width.W)) // input pull_to_push_level in 64 bits (constant in one iter)
}

// Top-level BFS accelerator: instantiates one pipeline per HBM channel, the
// global master controller, and two pairs of crossbars (mem->p2 traffic and
// p2->frontier "visit" traffic), then wires everything together.
class Top(implicit val conf : HBMGraphConfiguration) extends Module{
  val io = IO(new Bundle{
    val ap_start_pulse = Input(Bool())
    val ap_done = Output(Bool())
    val hbm = Vec(conf.channel_num,new AXIMasterIF(conf.HBM_Addr_width, conf.HBM_Data_width,conf.memIDBits))
    val offsets = Input(new offsets)
    val levels = Input(new levels)
    val node_num = Input(UInt(conf.Data_width.W))
    val if_write = Input(UInt(32.W))
  })
  dontTouch(io)
  //count kernel time
  val kernel_count = RegInit(0.U(32.W))
  kernel_count := kernel_count + 1.U
  // Registered reset used for all submodules (one extra cycle of reset latency).
  val reset_reg = RegNext(reset)
  val clk_wiz = withReset(reset_reg){Module(new clk_wiz_0)}
  clk_wiz.io.clock := clock


  val pipeline_array = new Array[pipeline](conf.channel_num)
  val write_finish_vec = new Array[Bool](conf.channel_num)
  for(i <- 0 until conf.channel_num) {
    pipeline_array(i) = withReset(reset_reg){Module(new pipeline(i))}
  }


  val if_write_state = withReset(reset_reg){RegInit(false.B)}
  val master = withReset(reset_reg){Module(new master)}
  master.io.levels <> io.levels
  val crossbar_array_mem = new Array[crossbar](2)
  val crossbar_array_visit = new Array[crossbar](2)
  // Latch the 32-bit host flag into a 1-bit register.
  when(io.if_write===1.U){
    if_write_state := true.B
  }.otherwise{
    if_write_state := false.B
  }
  for(i <- 0 until 2){
    crossbar_array_mem(i) = withReset(reset_reg){Module(new crossbar(is_double_width=true))}
    crossbar_array_visit(i) = withReset(reset_reg){Module(new crossbar(is_double_width=false))}
  }


  // Per-channel wiring: AXI port, master status/control, and crossbar lanes.
  for(i <- 0 until conf.channel_num) {
    io.hbm(i)<>pipeline_array(i).io.axiport
    pipeline_array(i).io.bram_clock<>clk_wiz.io.clk_bram

    master.io.mem_end(i) := pipeline_array(i).io.mem_end
    master.io.end(i) := pipeline_array(i).io.end
    master.io.p2_count(i) := pipeline_array(i).io.p2_count
    master.io.mem_count(i) := pipeline_array(i).io.mem_count
    master.io.p2_pull_count(i) := pipeline_array(i).io.p2_pull_count
    master.io.frontier_pull_count(i) := pipeline_array(i).io.frontier_pull_count
    master.io.last_iteration_state(i) := pipeline_array(i).io.last_iteration
    //kernel count
    pipeline_array(i).io.kernel_count := kernel_count
    pipeline_array(i).io.master_finish := master.io.global_finish

    pipeline_array(i).io.p2_end := master.io.p2_end
    pipeline_array(i).io.start := master.io.start
    pipeline_array(i).io.frontier_flag := master.io.frontier_flag
    pipeline_array(i).io.level := master.io.current_level
    pipeline_array(i).io.offsets <> io.offsets
    // pipeline_array(i).io.node_num <> (io.node_num + conf.numSubGraphs.U - 1.U - i.U) / conf.numSubGraphs.U
    pipeline_array(i).io.node_num <> io.node_num
    pipeline_array(i).io.push_or_pull := master.io.push_or_pull // pure pull
    pipeline_array(i).io.if_write := if_write_state // pure pull
    write_finish_vec(i) = pipeline_array(i).io.write_finish

    for(j <- 0 until 2){
      for(k <- 0 until conf.pipe_num_per_channel){
        crossbar_array_mem(j).io.in(conf.channel_num * k + i) <> pipeline_array(i).io.mem_out(2 * k + j)
        crossbar_array_mem(j).io.out(conf.channel_num * k + i) <> pipeline_array(i).io.p2_in(2 * k + j)

        // crossbar_array_mem(j).io.in(i) <> pipeline_array(i).io.mem_out(j)
        // crossbar_array_mem(j).io.out(i) <> pipeline_array(i).io.p2_in(j)
        // TODO
        // In push mode p2 writes the local frontier directly; in pull mode the
        // visit traffic is routed through the second crossbar pair instead.
        when(master.io.push_or_pull === 0.U){ // push mode
          crossbar_array_visit(j).io.in(conf.channel_num * k + i) <> DontCare
          crossbar_array_visit(j).io.out(conf.channel_num * k + i) <> DontCare
          crossbar_array_visit(j).io.in(conf.channel_num * k + i).valid := false.B
          pipeline_array(i).io.p2_out(2 * k + j) <> pipeline_array(i).io.frontier_in(2 * k + j)
        }.otherwise { // pull mode
          crossbar_array_visit(j).io.in(conf.channel_num * k + i) <> pipeline_array(i).io.p2_out(2 * k + j)
          crossbar_array_visit(j).io.out(conf.channel_num * k + i) <> pipeline_array(i).io.frontier_in(2 * k + j)
        }
      }
    }
  }

  val start_state = withReset(reset_reg){RegInit(false.B)}
  master.io.global_start := start_state
  val master_finish_count = RegInit(0.U(8.W))
  val global_write_finish = RegNext(write_finish_vec.reduce(_&&_))
  when(global_write_finish){
    master_finish_count := master_finish_count + 1.U
  }

  // Hold ap_done only after write-back has been finished for 200 cycles.
  when(master_finish_count>=200.U){
    io.ap_done := true.B
  }.otherwise{
    io.ap_done := false.B
  }

  when(io.ap_start_pulse){
    kernel_count := 0.U
    start_state := true.B
  }.elsewhen(master.io.global_finish){
    start_state := false.B
  }
}

// One HBM channel's worth of hardware: a shared Memory unit plus
// pipe_num_per_channel copies of the P1 / P2 / Frontier stages, with all
// intra-channel and crossbar-facing connections.
class pipeline(val num: Int)(implicit val conf : HBMGraphConfiguration) extends Module{
  val io = IO(new Bundle{

    val bram_clock = Input(Clock())
    //kernel count reg
    val kernel_count = Input(UInt(32.W))
    val master_finish = Input(Bool())
    //frontier io
    val frontier_flag = Input(UInt(1.W))
    val p2_end = Input(Bool())
    val start = Input(Bool())
    val end = Output(Bool())
    val last_iteration = Output(Bool())
    //p2 io
    val p2_count = Output(UInt(conf.Data_width.W))
    //mem io
    val axiport = new AXIMasterIF(conf.HBM_Addr_width, conf.HBM_Data_width,conf.memIDBits)
    val mem_count =Output(UInt(conf.Data_width.W))
    val frontier_pull_count = Output(UInt(conf.Data_width.W))
    val p2_pull_count = Output(UInt(conf.Data_width.W))
    val mem_end = Output(Bool())
    val level = Input(UInt(conf.Data_width.W))
    val offsets = Input(new offsets)
    val write_finish = Output(Bool())

    //crossbar io
    // TODO
    //mem <> p2
    val p2_in = Vec(2 * conf.pipe_num_per_channel, Flipped(Decoupled(UInt((conf.Data_width*2).W))))
    val mem_out = Vec(2 * conf.pipe_num_per_channel, Decoupled(UInt((conf.Data_width*2).W)))
    // p2 <> frontier
    val frontier_in = Vec(2 * conf.pipe_num_per_channel, Flipped(Decoupled(UInt(conf.Data_width.W))))
    val p2_out = Vec(2 * conf.pipe_num_per_channel, Decoupled(UInt(conf.Data_width.W)))

    // parameter
    val node_num = Input(UInt(conf.Data_width.W))
    val push_or_pull = Input(UInt(1.W)) //input flag mark push or pull state
    val if_write = Input(Bool())


  })

  val memory = Module(new Memory(num))
  val p1 = new Array[P1](conf.pipe_num_per_channel)
  val p2 = new Array[P2](conf.pipe_num_per_channel)
  val frontier = new Array[Frontier](conf.pipe_num_per_channel)

  val frontier_end_vec = Array.ofDim[Bool](conf.pipe_num_per_channel)
  val last_iteration_vec = Array.ofDim[Bool](conf.pipe_num_per_channel)
  val p2_count_vec = Array.ofDim[UInt](conf.pipe_num_per_channel)
  val p1_end_vec = Array.ofDim[Bool](conf.pipe_num_per_channel)
  val frontier_pull_count_vec = Array.ofDim[UInt](conf.pipe_num_per_channel)
  val p2_pull_count_vec = Array.ofDim[UInt](conf.pipe_num_per_channel)

  // Global PE id of pipe i in this channel is i * channel_num + num.
  for(i <- 0 until conf.pipe_num_per_channel){
    p1(i) = Module(new P1(i * conf.channel_num + num))
    p2(i) = Module(new P2(i * conf.channel_num + num))
    frontier(i) = Module(new Frontier(i * conf.channel_num + num))
  }
  //io <> frontier
  for(i <- 0 until conf.pipe_num_per_channel){
    io.frontier_flag <> frontier(i).io.frontier_flag
    io.p2_end <> frontier(i).io.p2_end
    io.start <> frontier(i).io.start
    io.level <> frontier(i).io.level
    frontier_end_vec(i) = frontier(i).io.end
    last_iteration_vec(i) = frontier(i).io.last_iteration
    // io.end <> frontier(i).io.end
    io.bram_clock <> frontier(i).io.bram_clock
    io.push_or_pull <> frontier(i).io.push_or_pull_state
    frontier(i).io.node_num <> (io.node_num + conf.numSubGraphs.U - 1.U - (i.U * conf.channel_num.U + num.U)) / conf.numSubGraphs.U //node num in each PE
    frontier_pull_count_vec(i) = frontier(i).io.frontier_pull_count
  }
  io.frontier_pull_count := RegNext(frontier_pull_count_vec.reduce(_+_))

  io.end := frontier_end_vec.reduce(_&_)

  io.last_iteration := last_iteration_vec.reduce(_&_)
  // io.last_iteration <> frontier.io.last_iteration
  // io.node_num <> frontier.io.node_num


  //io <> p2
  // io.p2_count <> p2.io.p2_count
  for(i <- 0 until conf.pipe_num_per_channel){
    io.bram_clock <> p2(i).io.bram_clock
    io.push_or_pull <> p2(i).io.push_or_pull_state
    io.if_write <> p2(i).io.if_write
    p2_count_vec(i) = p2(i).io.p2_count
    p2_pull_count_vec(i) = p2(i).io.p2_pull_count
  }
  io.p2_count := p2_count_vec.reduce(_+_)
  io.p2_pull_count := RegNext(p2_pull_count_vec.reduce(_+_))


  //io <> p1
  for(i <- 0 until conf.pipe_num_per_channel){
    io.start <> p1(i).io.start
    io.push_or_pull <> p1(i).io.push_or_pull_state
    p1(i).io.node_num <> (io.node_num + conf.numSubGraphs.U - 1.U - (i.U * conf.channel_num.U + num.U)) / conf.numSubGraphs.U //node num in each PE
  }
  // io.node_num <> p1.io.node_num


  //io <> mem
  io.axiport <> memory.io.HBM_interface
  io.mem_count <> memory.io.neighbour_cnt
  io.mem_end <> memory.io.mem_end
  io.level <> memory.io.level
  io.offsets <> memory.io.offsets
  io.push_or_pull <> memory.io.push_or_pull_state
  io.if_write <> memory.io.if_write
  io.write_finish <> memory.io.write_finish
  memory.io.kernel_count := io.kernel_count
  memory.io.master_finish := io.master_finish
  memory.io.node_num := (io.node_num + conf.channel_num.U - 1.U - num.U) / conf.channel_num.U //node num in each memory channel

  //p1 <> frontier
  for(i <- 0 until conf.pipe_num_per_channel){
    p1(i).io.frontier_value <> frontier(i).io.frontier_value
    p1(i).io.frontier_count <> frontier(i).io.frontier_count
  }

  //p1 <> mem
  for(i <- 0 until conf.pipe_num_per_channel){
    p1(i).io.R_array_index <> memory.io.R_array_index(i)
    p1_end_vec(i) = p1(i).io.p1_end
  }
  memory.io.p1_end <> p1_end_vec.reduce(_&_)


  //mem <> p2
  memory.io.neighbours <> io.mem_out
  for(i <- 0 until conf.pipe_num_per_channel){
    io.p2_in(2 * i) <> p2(i).io.neighbours(0)
    io.p2_in(2 * i + 1) <> p2(i).io.neighbours(1)
    memory.io.write_vertex_index(i) <> p2(i).io.write_vertex_index
  }

  // io.p2_in <> p2.io.neighbours
  // memory.io.write_vertex_index <> p2.io.write_vertex_index

  //p2 <> frontier
  for(i <- 0 until conf.pipe_num_per_channel){
    p2(i).io.write_frontier(0) <> io.p2_out(2 * i)
    p2(i).io.write_frontier(1) <> io.p2_out(2 * i + 1)
    frontier(i).io.write_frontier(0) <> io.frontier_in(2 * i)
    frontier(i).io.write_frontier(1) <> io.frontier_in(2 * i + 1)

    p2(i).io.bram_to_frontier <> frontier(i).io.bram_from_p2
  }


  //frontier <> mem
  for(i <- 0 until conf.pipe_num_per_channel){
    frontier(i).io.uram_addr_a <> memory.io.uram_addr_a(i)
    frontier(i).io.uram_addr_b <> memory.io.uram_addr_b(i)
    frontier(i).io.uram_out_a <> memory.io.uram_out_a(i)
    frontier(i).io.uram_out_b <> memory.io.uram_out_b(i)
  }

}

// Command-line elaboration entry point.
// NOTE(review): `override val args` deliberately replaces App's command-line
// args so Verilog emission options are always applied, ignoring the CLI.
object Top extends App{
  implicit val configuration = HBMGraphConfiguration()
  override val args = Array("-o", "Top.v",
    "-X", "verilog",
    "--no-dce",
    "--info-mode=ignore"
  )
  chisel3.Driver.execute(args, () => new Top)
  //chisel3.Driver.execute(Array[String](), () => new Top())
}
// -------------------- /src/main/scala/bram.scala --------------------
package HBMGraph
import chisel3._
import chisel3.Driver
import chisel3.util._

// Port bundle of a true-dual-port BRAM black box (two independent clocked ports).
class bram_IO(implicit val conf : HBMGraphConfiguration) extends Bundle{
  val ena = Input(Bool())
  val addra = Input(UInt(conf.Addr_width.W))
  val clka = Input(Clock())
  val dina = Input(UInt(conf.Data_width_bram.W))
  val wea = Input(Bool())
  val douta = Output(UInt(conf.Data_width_bram.W))
  val enb = Input(Bool())
  val addrb = Input(UInt(conf.Addr_width.W))
  val clkb = Input(Clock())
  val dinb = Input(UInt(conf.Data_width_bram.W))
  val web = Input(Bool())
  val doutb = Output(UInt(conf.Data_width_bram.W))
}

// Port bundle of a dual-port URAM black box; write enables are per-byte
// (Data_width_uram / 8 bits wide) rather than a single bit.
class uram_IO(implicit val conf : HBMGraphConfiguration) extends Bundle{
  val ena = Input(Bool())
  val addra = Input(UInt(conf.Addr_width_uram.W))
  val clka = Input(Clock())
  val dina = Input(UInt(conf.Data_width_uram.W))
  val wea = Input(UInt((conf.Data_width_uram / 8).W))
  val douta = Output(UInt(conf.Data_width_uram.W))
  val enb = Input(Bool())
  val addrb = Input(UInt(conf.Addr_width_uram.W))
  val clkb = Input(Clock())
  val dinb = Input(UInt(conf.Data_width_uram.W))
  val web = Input(UInt((conf.Data_width_uram / 8).W))
  val doutb = Output(UInt(conf.Data_width_uram.W))
}


// num is the channel num
// when bram_num = 0 means visited_map
// when bram_num = 1 or 2 means frontier

class bram(val num : Int, bram_num : Int)(implicit val conf : HBMGraphConfiguration) extends BlackBox{
  val io = IO(new bram_IO)
  // Black-box instance name must match the generated IP: bram_<num>_<bram_num>
  // for the three BRAM roles, uram_<num> otherwise.
  override def desiredName =
    if(bram_num < 3){
      "bram_" + num + "_" + bram_num
    }else{
      "uram_" + num
    }
}
class uram(val num : Int, bram_num : Int)(implicit val conf : HBMGraphConfiguration) extends BlackBox{
  val io = IO(new uram_IO)
  // NOTE(review): identical naming logic copy-pasted from `bram`; a uram
  // instantiated with bram_num < 3 would be named "bram_..." — confirm callers
  // always pass bram_num >= 3 here.
  override def desiredName =
    if(bram_num < 3){
      "bram_" + num + "_" + bram_num
    }else{
      "uram_" + num
    }
}

// bram_controller's IO: raw BRAM port signals plus the control fields that
// select between read-modify-write bit-set mode and clear mode.
class bram_controller_IO(implicit val conf : HBMGraphConfiguration) extends Bundle{
  // bram io
  // val bram = new bram_IO
  val ena = Input(Bool())
  val addra = Input(UInt(conf.Addr_width.W))
  val clka = Input(Clock())
  val dina = Input(UInt(conf.Data_width_bram.W))
  val wea = Input(Bool())
  val douta = Output(UInt(conf.Data_width_bram.W))
  val enb = Input(Bool())
  val addrb = Input(UInt(conf.Addr_width.W))
  val clkb = Input(Clock())
  val dinb = Input(UInt(conf.Data_width_bram.W))
  val web = Input(Bool())
  val doutb = Output(UInt(conf.Data_width_bram.W))

  // wmode 0 : read-or-write 1 : write 2 data for clear
  // nodea and nodeb's value is between 0 and conf.Data_width_bram - 1
  val wmode = Input(UInt(1.W))
  val nodea = Input(UInt((conf.Data_width_bram.U.getWidth - 1).W)) // bit position to set via port a
  val nodeb = Input(UInt((conf.Data_width_bram.U.getWidth - 1).W)) // bit position to set via port b
  // only visited_map need these two signals to show if the node is visited
  // val visited_a = Output(UInt(1.W))
  // val visited_b = Output(UInt(1.W))
}

// Wraps a dual-port BRAM with a 2-cycle read-modify-write sequence for setting
// single bits (wmode 0), and a 2-beat streaming write for clearing (wmode 1).
// The 1-bit `cnt` register alternates read cycle (0) and write cycle (1).
class bram_controller(val num : Int, bram_num : Int)(implicit val conf : HBMGraphConfiguration) extends Module{
  val io = IO(new bram_controller_IO)
  dontTouch(io)
  val ram = Module(new bram(num, bram_num))
  // init signals (defaults; overridden below by last-connect in the when blocks)
  ram.io.ena := io.ena
  ram.io.enb := io.enb
  ram.io.addra := io.addra
  ram.io.addrb := io.addrb
  ram.io.clka := io.clka
  ram.io.clkb := io.clkb
  ram.io.wea := false.B
  ram.io.web := false.B
  ram.io.dina := DontCare
  ram.io.dinb := DontCare
  io.douta := ram.io.douta
  io.doutb := ram.io.doutb

  // val visited_a = RegInit(0.U(1.W))
  // val visited_b = RegInit(0.U(1.W))
  // io.visited_a := visited_a
  // io.visited_b := visited_b
  val cnt = RegInit(0.U(1.W))
  // init cnt
  cnt := 0.U
  when((io.wea || io.web) && io.wmode === 0.U){ //need to write data
    when(cnt === 0.U){
      // cycle 0: issue the read; data appears on douta/doutb next cycle
      cnt := cnt + 1.U
      ram.io.wea := false.B
      ram.io.web := false.B
      // ram.io.addra := io.addra
      // ram.io.addrb := io.addrb
    }
    .otherwise{ //write
      cnt := cnt + 1.U
      // visited_map result
      // visited_a := ram.io.douta(io.nodea)
      // visited_b := ram.io.doutb(io.nodeb)

      // cycle 1: write back the read word with the requested bit(s) set
      when(io.addra === io.addrb && io.wea && io.web){ // just write in port a
        // // we only write when the node is not visited or the node is not in frontier
        // when(ram.io.douta(io.nodea) === 0.U || ram.io.doutb(io.nodeb) === 0.U){
        // both bits target the same word -> merge into a single port-a write
        ram.io.wea := true.B
        ram.io.web := false.B
        ram.io.dina := ram.io.douta | (1.U << io.nodea) | (1.U << io.nodeb)
        // }
      }
      .otherwise{
        // write in port a
        // when(ram.io.douta(io.nodea) === 0.U){
        ram.io.wea := true.B & io.wea
        ram.io.dina := ram.io.douta | (1.U << io.nodea)
        // }
        // write in port b
        // when(ram.io.doutb(io.nodeb) === 0.U){
        ram.io.web := true.B & io.web
        ram.io.dinb := ram.io.doutb | (1.U << io.nodeb)
        // }
      }
    }
  }
  // for clear: write io.dina/io.dinb to addr, then to addr + 1 on the next beat
  .elsewhen((io.wea || io.web) && io.wmode === 1.U){
    when(io.wea){
      when(cnt === 0.U){
        cnt := cnt + 1.U
        ram.io.wea := io.wea
        ram.io.addra := io.addra
        ram.io.dina := io.dina
      }
      .otherwise{
        cnt := cnt + 1.U
        ram.io.wea := io.wea
        ram.io.addra := io.addra + 1.U
        ram.io.dina := io.dina
      }

    }
    when(io.web){
      when(cnt === 0.U){
        cnt := cnt + 1.U
        ram.io.web := io.web
        ram.io.addrb := io.addrb
        ram.io.dinb := io.dinb
      }
      .otherwise{
        cnt := cnt + 1.U
        ram.io.web := io.web
        ram.io.addrb := io.addrb + 1.U
        ram.io.dinb := io.dinb
      }
    }
  }
  .otherwise{
    ram.io.wea := false.B
    ram.io.web := false.B
  }
}


// uram_controller's IO: URAM ports only (no controller-side modes).
class uram_controller_IO(implicit val conf : HBMGraphConfiguration) extends Bundle{

  val clka = Input(Clock())
  val clkb = Input(Clock())

  val addra = Input(UInt(conf.Addr_width_uram.W))
  // val addra1 = Input(UInt(conf.Addr_width_uram.W))
  val dina = Input(UInt(conf.Data_width_uram.W))
  // val dina1 = Input(UInt(conf.Data_width_uram.W))
  val wea = Input(UInt((conf.Data_width_uram / 8).W))
  val douta = Output(UInt(conf.Data_width_uram.W))
  // val wea1 = Input(UInt(conf.Data_width_uram / 8).W))
  val addrb = Input(UInt(conf.Addr_width_uram.W))
  // val addrb1 = Input(UInt(conf.Addr_width_uram.W))
  val dinb = Input(UInt(conf.Data_width_uram.W))
  // val dinb1 = Input(UInt(conf.Data_width_uram.W))
  val web = Input(UInt((conf.Data_width_uram / 8).W))
  // val web1 = Input(UInt(conf.Data_width_uram / 8).W))
  val doutb = Output(UInt(conf.Data_width_uram.W))

}

// Thin pass-through wrapper around the URAM black box: ports are always
// enabled and the IO signals are forwarded directly (the commented-out code is
// a disabled time-multiplexing scheme kept for reference).
class uram_controller(val num : Int, bram_num : Int)(implicit val conf : HBMGraphConfiguration) extends Module{
  val io = IO(new uram_controller_IO)
  dontTouch(io)
  val ram = Module(new uram(num, bram_num))
  // init signals
  ram.io.ena := true.B
  ram.io.enb := true.B
  ram.io.addra := DontCare
  ram.io.addrb := DontCare
  ram.io.clka := io.clka
  ram.io.clkb := io.clkb
  ram.io.wea := false.B
  ram.io.web := false.B
  ram.io.dina := DontCare
  ram.io.dinb := DontCare
  io.douta := ram.io.douta
  io.doutb := ram.io.doutb


  // val cnt = RegInit(0.U(1.W))
  // cnt := cnt + 1.U

  // when(cnt === 0.U){
  ram.io.addra := io.addra
  ram.io.addrb := io.addrb
  ram.io.wea := io.wea
  ram.io.web := io.web
  ram.io.dina := io.dina
  ram.io.dinb := io.dinb
  // }.otherwise{
  //   ram.io.addra := io.addra1
  //   ram.io.addrb := io.addrb1
  //   ram.io.wea := io.wea1
  //   ram.io.web := io.web1
  //   ram.io.dina := io.dina1
  //   ram.io.dinb := io.dinb1
  // }
}
// -------------------- /src/main/scala/clocking_wizard.scala --------------------
package HBMGraph
import chisel3._
import chisel3.Driver
import chisel3.util._

// Black-box wrapper for the Xilinx clocking-wizard IP: derives the BRAM clock
// from the kernel clock.
class clk_wiz_0 extends BlackBox{
  val io = IO(new Bundle{
    // Clock out ports
    val clk_bram = Output(Clock())
    // Clock in ports
    val clock = Input(Clock())
  })
}


// Black-box glue converting an active-low reset into an active-high one.
class rst_n_to_rst extends BlackBox{
  val io = IO(new Bundle{
    val reset = Output(Bool())
    val reset_n = Input(Bool())
  })
}

// -------------------- /src/main/scala/configuration.scala --------------------
package HBMGraph
import chisel3._
import chisel3.util._

// All compile-time parameters of the accelerator in one place.
case class HBMGraphConfiguration()
{
  // Partitioning: PEs (sub-graphs), pipes per HBM channel, channel count.
  val numSubGraphs = 64
  val pipe_num_per_channel = 2
  val channel_num = numSubGraphs/pipe_num_per_channel

  // Crossbar decomposition. The original if/else-if chains returned the same
  // constant for every case >= 64, so they collapse to these expressions.
  val sub_crossbar_size = if (numSubGraphs < 64) numSubGraphs else 4
  val sub_crossbar_size_2 = 2
  val sub_crossbar_number = if (numSubGraphs == 128) 32 else 16
  val sub_crossbar_number_2 = 64

  // URAM geometry
  val Data_width_uram = 72
  val Addr_width_uram = 20
  val Write_width_uram = 9

  // BRAM / datapath geometry
  val Addr_width = 17
  val Data_width = 32
  val Data_width_bram = 64
  // val root_node = 5
  val if_write = true.B

  // Memory (AXI / HBM)
  val memIDBits = 6
  val HBM_Data_width = 64 * pipe_num_per_channel
  val HBM_Addr_width = 64
  val HBM_base_addr = "h10000000".U

  // Queue depths
  // p1
  val q_frontier_to_p1_len = 32
  // p2
  val q_neighbours_len = 32
  val q_p2_to_mem_len = 8 // write level
  val q_visited_map_len = 32
  val q_mem_to_p2_len = 32
  // frontier
  val q_p1_to_frontier_len = 32
  val q_p2_to_frontier_len = 32

  // memory
  val Mem_queue_readData_len = 64 //big to prevent deadlock
  val src_index_queue_len = 64 //big to prevent deadlock
  val Mem_R_array_index_queue_len = 16
  val to_readAddr_queue_len = 16
  val write_vertex_index_pre_len = 8 // write level
  val write_vertex_index_len = 8 // write level


  // crossbar
  val crossbar_in_fifo_len = 16
  val crossbar_main_fifo_len =
    if (numSubGraphs < 64) {
      (64 / sub_crossbar_size)*2+2
    } else {
      (64 / sub_crossbar_number)*2+2
    }
  val crossbar_data_width = 24
  val crossbar_connect_fifo_len = 8

}
: UInt) : UInt = 10 | if(is_double_width){ 11 | n(conf.crossbar_data_width * 2 - 1, conf.crossbar_data_width) 12 | }else{ 13 | n 14 | } 15 | 16 | val cb_datawidth = 17 | if(is_double_width){ 18 | (conf.crossbar_data_width*2).W 19 | }else{ 20 | conf.crossbar_data_width.W 21 | } 22 | 23 | val io = IO(new Bundle { 24 | val in = Vec(conf.numSubGraphs, Flipped(Decoupled(UInt(cb_datawidth)))) 25 | val out = Vec(conf.numSubGraphs, Decoupled(UInt(cb_datawidth))) 26 | }) 27 | if (conf.numSubGraphs < 64){ 28 | val sub_crossbar = Module(new sub_crossbar(is_double_width, conf.numSubGraphs, 1, 0,conf.sub_crossbar_size)) 29 | io.in <> sub_crossbar.io.in 30 | io.out <> sub_crossbar.io.out 31 | } 32 | else if(conf.numSubGraphs == 64){ 33 | val crossbar_array_in = new Array[sub_crossbar](conf.sub_crossbar_number) 34 | val crossbar_array_second = new Array[sub_crossbar](conf.sub_crossbar_number) 35 | val crossbar_array_out = new Array[sub_crossbar](conf.sub_crossbar_number) 36 | for(i <- 0 until conf.sub_crossbar_number){ 37 | crossbar_array_in(i) = Module(new sub_crossbar(is_double_width, conf.sub_crossbar_size, 1, 0, conf.sub_crossbar_size)) 38 | crossbar_array_second(i) = Module(new sub_crossbar(is_double_width, conf.sub_crossbar_number, conf.sub_crossbar_size, i % conf.sub_crossbar_size, conf.sub_crossbar_size)) 39 | crossbar_array_out(i) = Module(new sub_crossbar(is_double_width, conf.numSubGraphs, conf.sub_crossbar_number, i, conf.sub_crossbar_size)) 40 | } 41 | 42 | for(i <- 0 until conf.sub_crossbar_number){ 43 | // crossbar_array_in(i).io.modnum := conf.sub_crossbar_size.asUInt(conf.crossbar_data_width.W) 44 | // crossbar_array_in(i).io.size := 1.U 45 | // crossbar_array_in(i).io.number := 0.U 46 | // crossbar_array_out(i).io.modnum := conf.numSubGraphs.asUInt(conf.crossbar_data_width.W) 47 | // crossbar_array_out(i).io.size := conf.sub_crossbar_size.U 48 | // crossbar_array_out(i).io.number := i.U 49 | for(j <- 0 until conf.sub_crossbar_size){ 50 | 
crossbar_array_in(i).io.in(j) <> io.in(i * conf.sub_crossbar_size + j) 51 | crossbar_array_in(i).io.out(j) <> crossbar_array_second((i / conf.sub_crossbar_size) * conf.sub_crossbar_size + j).io.in(i % conf.sub_crossbar_size) 52 | crossbar_array_second(i).io.out(j) <> crossbar_array_out(i % conf.sub_crossbar_size + j * conf.sub_crossbar_size).io.in(i / conf.sub_crossbar_size) 53 | crossbar_array_out(i).io.out(j) <> io.out(j * conf.sub_crossbar_number + i) 54 | } 55 | } 56 | } 57 | else if(conf.numSubGraphs == 128){ 58 | val crossbar_array_in = new Array[sub_crossbar](conf.sub_crossbar_number) 59 | val crossbar_array_second = new Array[sub_crossbar](conf.sub_crossbar_number) 60 | val crossbar_array_third = new Array[sub_crossbar](conf.sub_crossbar_number) 61 | val crossbar_array_out = new Array[sub_crossbar](conf.sub_crossbar_number_2) 62 | for(i <- 0 until conf.sub_crossbar_number){ 63 | crossbar_array_in(i) = Module(new sub_crossbar(is_double_width, 4, 1, 0, conf.sub_crossbar_size)) 64 | crossbar_array_second(i) = Module(new sub_crossbar(is_double_width, 16, 4, i % 4, conf.sub_crossbar_size)) 65 | crossbar_array_third(i) = Module(new sub_crossbar(is_double_width, 64, 16, i % 16, conf.sub_crossbar_size)) 66 | } 67 | for( i <- 0 until conf.sub_crossbar_number_2){ 68 | crossbar_array_out(i) = Module(new sub_crossbar(is_double_width, conf.numSubGraphs, 64, i, conf.sub_crossbar_size_2)) 69 | } 70 | for(i <- 0 until conf.sub_crossbar_number){ 71 | for(j <- 0 until conf.sub_crossbar_size){ 72 | crossbar_array_in(i).io.in(j) <> io.in(i * conf.sub_crossbar_size + j) 73 | crossbar_array_in(i).io.out(j) <> crossbar_array_second((i / conf.sub_crossbar_size) * conf.sub_crossbar_size + j).io.in(i % conf.sub_crossbar_size) 74 | crossbar_array_second(i).io.out(j) <> crossbar_array_third(i % 4 + j * 4 + 16 * (i / 16)).io.in((i % 16) / conf.sub_crossbar_size) 75 | crossbar_array_third(i).io.out(j) <> crossbar_array_out(i % 16 + j * 16).io.in(i / 16) 76 | } 77 | 78 | } 79 | for(i <- 
0 until conf.sub_crossbar_number_2){ 80 | for(j <- 0 until conf.sub_crossbar_size_2){ 81 | crossbar_array_out(i).io.out(j) <> io.out(j * conf.sub_crossbar_number_2 + i) 82 | } 83 | } 84 | } 85 | 86 | 87 | } 88 | class sub_crossbar(val is_double_width: Boolean, val modnum : Int, val size : Int, val number : Int, val sub_crossbar_size: Int )(implicit val conf : HBMGraphConfiguration) extends Module{ 89 | 90 | def high(n : UInt) : UInt = 91 | if(is_double_width){ 92 | n(conf.crossbar_data_width * 2 - 1, conf.crossbar_data_width) 93 | }else{ 94 | n 95 | } 96 | 97 | val cb_datawidth = 98 | if(is_double_width){ 99 | (conf.crossbar_data_width*2).W 100 | }else{ 101 | conf.crossbar_data_width.W 102 | } 103 | 104 | val io = IO(new Bundle { 105 | val in = Vec(sub_crossbar_size, Flipped(Decoupled(UInt(cb_datawidth)))) 106 | val out = Vec(sub_crossbar_size, Decoupled(UInt(cb_datawidth))) 107 | // val modnum = Input(UInt(conf.crossbar_data_width.W)) 108 | // val size = Input(UInt(conf.crossbar_data_width.W)) 109 | // val number = Input(UInt(conf.crossbar_data_width.W)) 110 | }) 111 | // val modnum = sub_crossbar_size.asUInt(conf.crossbar_data_width.W) 112 | // val modnum = io.modnum 113 | // val size = io.size 114 | // val number = io.number 115 | 116 | // Generate array 117 | val in_queue_vec = Array.ofDim[Queue[UInt]](sub_crossbar_size) 118 | val queue_vec = Array.ofDim[Queue[UInt]](sub_crossbar_size, sub_crossbar_size) 119 | val RRarbiter_vec = Array.ofDim[RRArbiter[UInt]](sub_crossbar_size) 120 | // val fifo_ready_vec = Array.ofDim[Bool](conf.numSubGraphs, conf.numSubGraphs) 121 | 122 | 123 | for(in_idx <- 0 until sub_crossbar_size){ 124 | in_queue_vec(in_idx) = Module(new Queue(UInt(cb_datawidth), conf.crossbar_in_fifo_len)) 125 | in_queue_vec(in_idx).io.enq <> io.in(in_idx) 126 | in_queue_vec(in_idx).io.deq <> DontCare 127 | RRarbiter_vec(in_idx) = Module(new RRArbiter(UInt(cb_datawidth), sub_crossbar_size)) 128 | for(in_idy <- 0 until sub_crossbar_size){ 129 | 
queue_vec(in_idx)(in_idy) = Module(new Queue(UInt(cb_datawidth), conf.crossbar_main_fifo_len)) 130 | queue_vec(in_idx)(in_idy).io.enq <> DontCare 131 | // fifo_ready_vec(in_idx)(in_idy) = queue_vec(in_idx)(in_idy).io.enq.ready 132 | } 133 | 134 | } 135 | 136 | // pre queue logic 137 | for(in_idx <- 0 until sub_crossbar_size){ 138 | when(in_queue_vec(in_idx).io.deq.valid){ 139 | for(in_idy <- 0 until sub_crossbar_size){ 140 | when(high(in_queue_vec(in_idx).io.deq.bits) % modnum.U === (in_idy.asUInt(conf.crossbar_data_width.W) * size.U + number.U)){ //32bits compare 141 | in_queue_vec(in_idx).io.deq <> queue_vec(in_idx)(in_idy).io.enq 142 | } .otherwise { 143 | queue_vec(in_idx)(in_idy).io.enq.valid := false.B 144 | } 145 | } 146 | } .otherwise { 147 | for(in_idy <- 0 until sub_crossbar_size){ 148 | queue_vec(in_idx)(in_idy).io.enq.valid := false.B 149 | } 150 | in_queue_vec(in_idx).io.deq.ready := false.B // fifo_ready_vec(in_idx).reduce(_ && _) 151 | } 152 | } 153 | 154 | //post queue logic 155 | for(out_idy <- 0 until sub_crossbar_size){ 156 | for(out_idx <- 0 until sub_crossbar_size){ 157 | queue_vec(out_idx)(out_idy).io.deq <> RRarbiter_vec(out_idy).io.in(out_idx) 158 | } 159 | } 160 | 161 | // output 162 | for(out_id <- 0 until sub_crossbar_size){ 163 | RRarbiter_vec(out_id).io.out <> io.out(out_id) 164 | } 165 | 166 | } 167 | 168 | // object Custom_function_cb{ 169 | // implicit val conf = HBMGraphConfiguration() 170 | // def high(n : UInt) : UInt = 171 | // n(conf.crossbar_data_width * 2 - 1, conf.crossbar_data_width) 172 | 173 | // def low(n : UInt) : UInt = 174 | // n(conf.crossbar_data_width - 1, 0) 175 | // } -------------------------------------------------------------------------------- /src/main/scala/frontier.scala: -------------------------------------------------------------------------------- 1 | package HBMGraph 2 | import chisel3._ 3 | import chisel3.Driver 4 | import chisel3.util._ 5 | 6 | /* push -> pull mode 7 | * p1 read current_frontier -> 
p1 read visited_map √
 * p2 read visited_map -> p2 read current_frontier √
 * p2 write next_frontier -> crossbar write visited_map + next_frontier √
 * visited_map will be read and write at the same time (write first)
 */


class Frontier_IO(implicit val conf : HBMGraphConfiguration) extends Bundle{
  //Input
  val frontier_count = Flipped(Decoupled(UInt(conf.Data_width.W))) // input count of the required current_frontier
  val write_frontier = Vec(2, Flipped(Decoupled(UInt(conf.Data_width.W))))// input 2 next_frontier you want to write
  val frontier_flag = Input(UInt(1.W)) // input flag mark which frontier to use as current_frontier or next_frontier
  val p2_end = Input(Bool()) // input p2 finish signal
  val bram_clock = Input(Clock()) // input clock with 2x frequency. used by bram.
  val start = Input(Bool()) // input start
  val node_num = Input(UInt(conf.Data_width.W))
  val push_or_pull_state = Input(UInt(1.W)) //input flag mark push or pull state
  val level = Input(UInt(conf.Data_width.W)) // current BFS level; written into the uram for newly visited nodes
  val uram_addr_a = Input(UInt(conf.Addr_width_uram.W))
  val uram_addr_b = Input(UInt(conf.Addr_width_uram.W))

  //Output
  val frontier_value = Decoupled(UInt(conf.Data_width_bram.W)) // output frontier data
  val end = Output(Bool()) // output end signal
  val last_iteration = Output(Bool()) // output. write next frontier in last iteration or not.
  val bram_from_p2 = new bram_controller_IO
  val uram_out_a = Output(UInt(conf.Data_width_uram.W))
  val uram_out_b = Output(UInt(conf.Data_width_uram.W))
  val frontier_pull_count = Output(UInt(conf.Data_width.W)) // pull crossbar count

}

// Frontier module for one pipeline: owns the double-buffered frontier brams,
// the visited_map bram and the level uram. Serves p1's word reads, applies
// p2's/the crossbar's next-frontier writes, and clears the old frontier
// between BFS levels.
class Frontier (val num :Int) (implicit val conf : HBMGraphConfiguration) extends Module{
  val io = IO(new Frontier_IO())
  dontTouch(io)
  // Default outputs; overridden inside the state machine below
  // (Chisel last-connect semantics).
  io.frontier_value.valid := false.B
  io.frontier_value.bits := DontCare
  io.end := true.B

  // Stays high while no next-frontier write arrived this iteration; master
  // uses it to detect the final BFS level.
  val last_iteration_reg = RegInit(false.B)
  io.last_iteration := last_iteration_reg

  // two frontiers, double buffered: io.frontier_flag selects which one is
  // "current" and which is "next"; both run on the 2x bram clock
  val frontier_0 = withClock(io.bram_clock){Module(new bram_controller(num, 1))}
  val frontier_1 = withClock(io.bram_clock){Module(new bram_controller(num, 2))}
  val uram = Module(new uram_controller(num, 3))
  frontier_0.io := DontCare
  frontier_0.io.ena := true.B
  frontier_0.io.enb := true.B
  frontier_0.io.clka := io.bram_clock
  frontier_0.io.clkb := io.bram_clock
  frontier_0.io.wea := false.B
  frontier_0.io.web := false.B
  frontier_0.io.wmode := 0.U
  frontier_1.io := DontCare
  frontier_1.io.ena := true.B
  frontier_1.io.enb := true.B
  frontier_1.io.clka := io.bram_clock
  frontier_1.io.clkb := io.bram_clock
  frontier_1.io.wea := false.B
  frontier_1.io.web := false.B
  frontier_1.io.wmode := 0.U

  uram.io := DontCare
  uram.io.clka := clock //io.bram_clock
  uram.io.clkb := clock //io.bram_clock
  uram.io.wea := 0.U
  uram.io.web := 0.U
  io.uram_out_a := uram.io.douta
  io.uram_out_b := uram.io.doutb
  io.uram_addr_a <> uram.io.addra
  io.uram_addr_b <> uram.io.addrb

  // input Queue
  val q_frontier_count = Queue(io.frontier_count, conf.q_p1_to_frontier_len)
  val q_write_frontier_0 = Queue(io.write_frontier(0), conf.q_p2_to_frontier_len)
  val q_write_frontier_1 = Queue(io.write_frontier(1), conf.q_p2_to_frontier_len)
  q_frontier_count.ready := false.B
  q_write_frontier_0.ready := false.B
  q_write_frontier_1.ready := false.B

  // counter: accepted next-frontier writes on each port; master compares the
  // sum against p2's pull count to know when all writes have landed
  val (count_wf0, _) = Counter(q_write_frontier_0.ready && q_write_frontier_0.valid, 2147483647)
  val (count_wf1, _) = Counter(q_write_frontier_1.ready && q_write_frontier_1.valid, 2147483647)
  io.frontier_pull_count := count_wf0 + count_wf1

  //as designed , at beginning io.p2_end is high. should wait it low and high again.
  val p2_end_flag = RegInit(0.U(1.W))
  when(io.p2_end === false.B){p2_end_flag := 1.U}

  val clear_addr = RegInit(0.U(conf.Addr_width.W))
  val state0 :: state1 :: state2 :: state_write_uram :: Nil = Enum(4)
  val stateReg = RegInit(state2)

  io.bram_from_p2 := DontCare
  //for visited_map
  val visited_map = withClock(io.bram_clock){Module(new bram_controller(num, 0))}
  visited_map.io := DontCare
  // push mode, read and write: p2 drives the visited_map ports directly
  visited_map.io.clka := io.bram_clock
  visited_map.io.clkb := io.bram_clock
  visited_map.io.ena := true.B
  visited_map.io.enb := true.B
  when(io.push_or_pull_state === 0.U){
    visited_map.io.addra := io.bram_from_p2.addra
    visited_map.io.addrb := io.bram_from_p2.addrb

    // visited_map.io.clka := io.bram_from_p2.clka
    // visited_map.io.clkb := io.bram_from_p2.clkb
    visited_map.io.wea := io.bram_from_p2.wea
    visited_map.io.web := io.bram_from_p2.web
    visited_map.io.wmode := io.bram_from_p2.wmode
    visited_map.io.nodea := io.bram_from_p2.nodea
    visited_map.io.nodeb := io.bram_from_p2.nodeb
    io.bram_from_p2.douta := visited_map.io.douta
    io.bram_from_p2.doutb := visited_map.io.doutb
  }
  // pull mode, current_frontier is frontier_0, read only
  when(io.push_or_pull_state === 1.U && io.frontier_flag === 0.U){
    frontier_0.io.addra := io.bram_from_p2.addra
    frontier_0.io.addrb := io.bram_from_p2.addrb
    io.bram_from_p2.douta := frontier_0.io.douta
    io.bram_from_p2.doutb := frontier_0.io.doutb
  }
  // pull mode, current_frontier is frontier_1, read only
  when(io.push_or_pull_state === 1.U && io.frontier_flag === 1.U){
    frontier_1.io.addra := io.bram_from_p2.addra
    frontier_1.io.addrb := io.bram_from_p2.addrb
    io.bram_from_p2.douta := frontier_1.io.douta
    io.bram_from_p2.doutb := frontier_1.io.doutb
  }

  // for pull mode
  val port_a_is_writing_flag = Wire(Bool()) // visited_map is write first in pull mode
  val port_b_is_writing_flag = Wire(Bool()) // visited_map is write first in pull mode
  port_a_is_writing_flag := false.B
  port_b_is_writing_flag := false.B

  // for write level
  // (condensed: a block of commented-out registers belonging to an abandoned
  //  "write level inside frontier" implementation was removed here)

  switch(stateReg){
    is(state0){
      io.end := false.B
      //state 0 read and write
      q_frontier_count.ready := false.B
      q_write_frontier_0.ready := true.B
      q_write_frontier_1.ready := true.B
      frontier_0.io.wea := false.B
      frontier_0.io.web := false.B
      frontier_1.io.wea := false.B
      frontier_1.io.web := false.B
      frontier_0.io.wmode := 0.U
      frontier_1.io.wmode := 0.U

      when(q_write_frontier_0.valid){
        last_iteration_reg := false.B
        //write port a in next frontier
        when(io.push_or_pull_state === 1.U){
          visited_map.io.wea := true.B
          // convert the total number of points to the number inside the pipeline
          visited_map.io.nodea := (Custom_function3.low(q_write_frontier_0.bits) / conf.numSubGraphs.U) % conf.Data_width_bram.U
          visited_map.io.addra := (Custom_function3.low(q_write_frontier_0.bits) / conf.numSubGraphs.U) / conf.Data_width_bram.U
          port_a_is_writing_flag := true.B // port a is write first
        }
        when(io.frontier_flag === 0.U){
          //now next frontier is frontier_1
          frontier_1.io.wea := true.B
          frontier_1.io.nodea := (Custom_function3.low(q_write_frontier_0.bits) / conf.numSubGraphs.U) % conf.Data_width_bram.U
          frontier_1.io.addra := (Custom_function3.low(q_write_frontier_0.bits) / conf.numSubGraphs.U) / conf.Data_width_bram.U
        }
        .otherwise{
          //now next frontier is frontier_0
          frontier_0.io.wea := true.B
          frontier_0.io.nodea := (Custom_function3.low(q_write_frontier_0.bits) / conf.numSubGraphs.U) % conf.Data_width_bram.U
          frontier_0.io.addra := (Custom_function3.low(q_write_frontier_0.bits) / conf.numSubGraphs.U) / conf.Data_width_bram.U
        }
        // record the BFS level of the newly visited node in the uram
        // (byte-lane write select, 8 bits per node)
        uram.io.wea := 1.U << ((Custom_function3.low(q_write_frontier_0.bits) / conf.numSubGraphs.U) % conf.Write_width_uram.U)
        uram.io.dina := io.level << (8.U * ((Custom_function3.low(q_write_frontier_0.bits) / conf.numSubGraphs.U) % conf.Write_width_uram.U))
        uram.io.addra := (Custom_function3.low(q_write_frontier_0.bits) / conf.numSubGraphs.U) / conf.Write_width_uram.U
      }
      when(q_write_frontier_1.valid){
        last_iteration_reg := false.B
        //write port b in next frontier
        when(io.push_or_pull_state === 1.U){
          visited_map.io.web := true.B
          visited_map.io.nodeb := (Custom_function3.low(q_write_frontier_1.bits) / conf.numSubGraphs.U) % conf.Data_width_bram.U
          visited_map.io.addrb := (Custom_function3.low(q_write_frontier_1.bits) / conf.numSubGraphs.U) / conf.Data_width_bram.U
          port_b_is_writing_flag := true.B
        }
        when(io.frontier_flag === 0.U){
          //now next frontier is frontier_1
          frontier_1.io.web := true.B
          frontier_1.io.nodeb := (Custom_function3.low(q_write_frontier_1.bits) / conf.numSubGraphs.U) % conf.Data_width_bram.U
          frontier_1.io.addrb := (Custom_function3.low(q_write_frontier_1.bits) / conf.numSubGraphs.U) / conf.Data_width_bram.U
        }
        .otherwise{
          //now next frontier is frontier_0
          frontier_0.io.web := true.B
          frontier_0.io.nodeb := (Custom_function3.low(q_write_frontier_1.bits) / conf.numSubGraphs.U) % conf.Data_width_bram.U
          frontier_0.io.addrb := (Custom_function3.low(q_write_frontier_1.bits) / conf.numSubGraphs.U) / conf.Data_width_bram.U
        }
        uram.io.web := 1.U << ((Custom_function3.low(q_write_frontier_1.bits) / conf.numSubGraphs.U) % conf.Write_width_uram.U)
        uram.io.dinb := io.level << (8.U * ((Custom_function3.low(q_write_frontier_1.bits) / conf.numSubGraphs.U) % conf.Write_width_uram.U))
        uram.io.addrb := (Custom_function3.low(q_write_frontier_1.bits) / conf.numSubGraphs.U) / conf.Write_width_uram.U
      }
      // visited_map is write first in pull mode
      // when(q_frontier_count.valid && io.frontier_value.ready && (!port_a_is_writing_flag || !port_b_is_writing_flag)){
      when(q_frontier_count.valid && io.frontier_value.ready && (!q_write_frontier_0.valid || !q_write_frontier_1.valid || io.push_or_pull_state === 0.U )){
        // q_frontier_count.ready := true.B
        // read visited_map in pull mode
        when(io.push_or_pull_state === 1.U){
          // when(!q_write_frontier_0.valid && !q_write_frontier_0.valid){ // port a & b are free
          when(!q_write_frontier_0.valid){
            q_frontier_count.ready := true.B
            visited_map.io.addra := q_frontier_count.bits
            io.frontier_value.valid := true.B
            io.frontier_value.bits := visited_map.io.douta
          }
          // (condensed dead code: commented-out .elsewhen branches that read
          //  through whichever port was free, each guarded by an
          //  address-conflict check against the pending write)
        }

        // read from current frontier in push mode
        when(io.push_or_pull_state === 0.U ){
          when(io.frontier_flag === 0.U){
            q_frontier_count.ready := true.B
            frontier_0.io.addra := q_frontier_count.bits
            io.frontier_value.valid := true.B
            io.frontier_value.bits := frontier_0.io.douta
          }
          .otherwise{
            q_frontier_count.ready := true.B
            frontier_1.io.addra := q_frontier_count.bits
            io.frontier_value.valid := true.B
            io.frontier_value.bits := frontier_1.io.douta
          }
        }

      }
      when(io.p2_end && p2_end_flag === 1.U && !q_frontier_count.valid && !q_write_frontier_0.valid && !q_write_frontier_1.valid){
        stateReg := state1
        clear_addr := 0.U
      }
    }
    is(state1){ // clear bits
      io.end := false.B
      q_frontier_count.ready := false.B
      q_write_frontier_0.ready := false.B
      q_write_frontier_1.ready := false.B
      frontier_0.io.wea := false.B
      frontier_0.io.web := false.B
      frontier_1.io.wea := false.B
      frontier_1.io.web := false.B
      frontier_0.io.wmode := 1.U
      frontier_1.io.wmode := 1.U
      // Step by 4 with addrb = addr + 2: presumably each port clears two words
      // per cycle in wmode 1 — TODO confirm against bram_controller.
      clear_addr := clear_addr + 4.U
      when(io.frontier_flag === 0.U){
        // now current frontier is frontier 0 , at next level it will become next frontier , so clear it
        frontier_0.io.wea := true.B
        frontier_0.io.web := true.B
        frontier_0.io.dina := 0.U
        frontier_0.io.dinb := 0.U
        frontier_0.io.addra := clear_addr
        frontier_0.io.addrb := clear_addr + 2.U

        // (condensed dead code: commented-out scan of frontier_1 that fed the
        //  abandoned state_write_uram level-writing path)
        when(clear_addr >= (io.node_num / conf.Data_width_bram.U)){ // >= -> >
          stateReg := state2
          io.end := true.B
        }

      }
      .otherwise{
        // now current frontier is frontier 1 , at next level it will become next frontier , so clear it
        frontier_1.io.wea := true.B
        frontier_1.io.web := true.B
        frontier_1.io.dina := 0.U
        frontier_1.io.dinb := 0.U
        frontier_1.io.addra := clear_addr
        frontier_1.io.addrb := clear_addr + 2.U

        // (condensed dead code: commented-out scan of frontier_0, symmetric to
        //  the branch above)
        when(clear_addr >= (io.node_num / conf.Data_width_bram.U)){
          stateReg := state2
          io.end := true.B
        }
      }
    }
    // (condensed dead code: a fully commented-out is(state_write_uram) block —
    //  an abandoned implementation that walked the set bits of the old
    //  frontier and wrote io.level into the uram for each; superseded by the
    //  per-write uram update in state0)
    is(state2){
      // idle: wait for io.start, then run one BFS level in state0
      io.end := true.B
      q_frontier_count.ready := false.B
      q_write_frontier_0.ready := false.B
      q_write_frontier_1.ready := false.B
      frontier_0.io.wea := false.B
      frontier_0.io.web := false.B
      frontier_1.io.wea := false.B
      frontier_1.io.web := false.B
      frontier_0.io.wmode := 0.U
      frontier_1.io.wmode := 0.U
      when(io.start){
        io.end := false.B
        stateReg := state0
        last_iteration_reg := true.B
      }
    }
  }
}

object Custom_function3{
  implicit val conf = HBMGraphConfiguration()
  // Low half of a crossbar word (the payload / vertex id).
  def low(n : UInt) : UInt =
    n(conf.crossbar_data_width - 1, 0)

}

--------------------------------------------------------------------------------
/src/main/scala/master.scala:
--------------------------------------------------------------------------------
package HBMGraph
import chisel3._
import chisel3.Driver
import chisel3.util._


/* (translated from Chinese)
 * Chisel has no inout ports; each process needs the previous process's finish
 * signal — should it be routed out from master as designed?
 * Add flag_Qc; the flag signals used by p1 and p4 should be opposite.
 * Add current_level handling.
 */



// Global BFS controller: sequences the per-level start pulses, flips the
// current/next frontier buffers, tracks the BFS level, switches between push
// and pull mode at the configured levels, and detects global completion.
class master(implicit val conf : HBMGraphConfiguration) extends Module{
  val io = IO(new Bundle{
    val global_start = Input(Bool())
    val global_finish = Output(Bool())
    val start = Output(Bool()) //send to p1 and frontier
    val frontier_flag = Output(UInt(1.W)) //send to frontier
    val current_level = Output(UInt(32.W)) //send to mem for level write

    val mem_end = Input(Vec(conf.channel_num,Bool())) //mem end
    val p2_end = Output(Bool()) //send to frontier
    val end = Input(Vec(conf.channel_num,Bool())) // end for each step

    val p2_count = Input(Vec(conf.channel_num,UInt(conf.Data_width.W))) // count for neighbour check nodes
    val mem_count = Input(Vec(conf.channel_num,UInt(conf.Data_width.W))) // count for neighbour nodes
    val frontier_pull_count = Input(Vec(conf.channel_num,UInt(conf.Data_width.W))) // count for pull crossbar nodes
    val p2_pull_count = Input(Vec(conf.channel_num,UInt(conf.Data_width.W))) // count for pull crossbar nodes
    val last_iteration_state = Input(Vec(conf.channel_num,Bool())) //show frontier write to assert global_finish

    val levels = Input(new levels)
    val push_or_pull = Output(UInt(1.W))
  })

  // A level is only finished when every channel reports end / mem_end.
  val mem_end_state = Wire(Bool())
  val end_state = Wire(Bool())
  mem_end_state := io.mem_end.reduce(_&_)
  end_state := io.end.reduce(_&_)

  val global_finish_state = RegInit(false.B)
  io.global_finish := global_finish_state

  // Registered totals across all channels; matching totals (issued vs consumed)
  // are part of the p2_end condition below.
  val p2_cnt_total = RegInit(0.U(conf.Data_width.W))
  val mem_cnt_total = RegInit(0.U(conf.Data_width.W))
  p2_cnt_total := io.p2_count.reduce(_+_)
  mem_cnt_total := io.mem_count.reduce(_+_)
  val p2_pull_count_total = RegNext(io.p2_pull_count.reduce(_+_))
  val frontier_pull_count_total = RegNext(io.frontier_pull_count.reduce(_+_))

  val push_or_pull_state = RegInit(0.U(1.W))
  io.push_or_pull := push_or_pull_state
  dontTouch(io)
  val level = RegInit(0.U(32.W))
  val frontier_flag = RegInit(1.U(1.W))
  io.current_level := level
  io.frontier_flag := frontier_flag
  io.start := false.B
  val state0 :: state1 :: Nil = Enum(2)
  val stateReg = RegInit(state0)
  switch(stateReg){
    is(state0){
      // wait for all channels to finish the current level
      io.start := false.B
      when(end_state && io.global_start){
        stateReg := state1
        // swap current/next frontier buffers for the next level
        frontier_flag := frontier_flag + 1.U


        when(level === io.levels.push_to_pull_level){ //change push or pull mode logic
          push_or_pull_state := 1.U
        }.elsewhen(level === io.levels.pull_to_push_level){
          push_or_pull_state := 0.U
        }

        // if no channel wrote a next-frontier bit, BFS is done
        when(io.last_iteration_state.reduce(_&_)){
          global_finish_state := true.B
          stateReg := state0
        } .otherwise{
          level := level + 1.U
        }
      }
    }
    is(state1){
      // NOTE(review): RegNext(true.B) is a register that holds true from the
      // second cycle after reset onward, so io.start is effectively constant
      // true during state1 — confirm this one-cycle start pulse is intended.
      io.start := RegNext(true.B)
      stateReg := state0
    }
  }

  // p2 is done when all memory channels are idle and every count of issued
  // work matches the corresponding count of consumed work.
  when(mem_end_state && mem_cnt_total===p2_cnt_total && p2_pull_count_total===frontier_pull_count_total){
    io.p2_end := true.B
  }.otherwise{
    io.p2_end := false.B

  }

}




--------------------------------------------------------------------------------
/src/main/scala/p1.scala:
--------------------------------------------------------------------------------
package HBMGraph
import chisel3._
import chisel3.Driver
import chisel3.util._

class P1_IO (implicit val conf : HBMGraphConfiguration) extends Bundle{
  //Input
  val start = Input(Bool()) // input start signal
  val
frontier_value = Flipped(Decoupled(UInt(conf.Data_width_bram.W)))   // input frontier data
    val node_num = Input(UInt(conf.Data_width.W))
    val push_or_pull_state = Input(UInt(1.W))   // input flag mark push or pull state

    //Output
    val frontier_count = Decoupled(UInt(conf.Data_width.W))     // output count of the required current_frontier
    val R_array_index = Decoupled(UInt(conf.Data_width.W))      // output vertex index of the required CSR
    val p1_end = Output(Bool())         // output p1 finish signal
}


// P1: wires together the two sub-stages of pipeline stage 1:
//  - p1_read_frontier_or_visited_map: streams word indices of the current frontier
//  - read_R_array_index: turns frontier words into CSR row (vertex) indices
class P1 (val num :Int)(implicit val conf : HBMGraphConfiguration) extends Module{
    val io = IO(new P1_IO())
    dontTouch(io)
    // io := DontCare
    val p1_read_frontier_or_visited_map = Module(new p1_read_frontier_or_visited_map)
    val read_R_array_index = Module(new read_R_array_index(num))

    // io <> p1_read_frontier_or_visited_map
    p1_read_frontier_or_visited_map.io.start := io.start
    io.frontier_count <> p1_read_frontier_or_visited_map.io.frontier_count
    p1_read_frontier_or_visited_map.io.node_num := io.node_num

    // io <> read_R_array_index
    read_R_array_index.io.node_num := io.node_num
    read_R_array_index.io.start := io.start
    read_R_array_index.io.frontier_value <> io.frontier_value
    read_R_array_index.io.push_or_pull_state := io.push_or_pull_state
    io.R_array_index <> read_R_array_index.io.R_array_index
    io.p1_end := read_R_array_index.io.p1_end

}

// read frontier value in push state
// read visited_map value in pull state
// Requests every frontier/visited-map word (by index) from the frontier module.
class p1_read_frontier_or_visited_map (implicit val conf : HBMGraphConfiguration) extends Module{
    val io = IO(new Bundle{
        //Input
        val start = Input(Bool())       // input start signal
        val node_num = Input(UInt(conf.Data_width.W))
        //Output
        val frontier_count = Decoupled(UInt(conf.Data_width.W))     // output count of the required current_frontier
    })
    dontTouch(io)

    // init signals
    io.frontier_count.valid := false.B
    io.frontier_count.bits := DontCare
    val state0 ::state1 :: Nil = Enum(2)
    val stateReg = RegInit(state0)      // mark state of read current_frontier

    // local variables
    val count = RegInit(0.U(32.W))      // count the number of current frontier to require
    val size = ((io.node_num - 1.U) / conf.Data_width_bram.U) + 1.U     // total number of frontier words (ceiling division of node_num by word width)

    // require current_frontier from frontier module
    // not process p1_end signal
    switch(stateReg){
        is(state0){
            io.frontier_count.valid := false.B
            io.frontier_count.bits := DontCare
            when(io.start){
                stateReg := state1
                count := 0.U
            }
        }
        is(state1){
            // require current frontier: issue word indices 0 .. size-1
            io.frontier_count.valid := true.B
            io.frontier_count.bits := count
            when(io.frontier_count.ready){
                count := count + 1.U
                when(count === size - 1.U){
                    stateReg := state0
                }
                .otherwise{
                    stateReg := state1
                }
            }
        }
    }
}

// Receives frontier words and, for each set bit (push) or clear bit (pull),
// emits the global vertex index whose CSR row offsets must be fetched.
class read_R_array_index (val num :Int)(implicit val conf : HBMGraphConfiguration) extends Module{
    val io = IO(new Bundle{
        //Input
        val start = Input(Bool())       // input start signal
        val frontier_value = Flipped(Decoupled(UInt(conf.Data_width_bram.W)))   // input frontier data
        val node_num = Input(UInt(conf.Data_width.W))
        val push_or_pull_state = Input(UInt(1.W))   // input flag mark push or pull state

        //Output
        val R_array_index = Decoupled(UInt(conf.Data_width.W))  // output vertex index of the required CSR
        val p1_end = Output(Bool())     // output p1 finish signal
    })
    dontTouch(io)

    // init signals
    io.R_array_index.valid := false.B
    io.R_array_index.bits := DontCare
    io.p1_end := false.B

    // local variables
    // state0: idle/end; state1: receive a frontier word; state2: emit one
    // vertex index per relevant bit; state3: word exhausted, fetch next or stop
    val state0 ::state1 :: state2 :: state3 :: Nil = Enum(4)
    val stateReg = RegInit(state0)      // mark state of read R array
    val q_frontier_value = Queue(io.frontier_value, conf.q_frontier_to_p1_len)  // use a FIFO queue to receive data
    // dontTouch(q_frontier_value)      // to prevent the optimization of port io_enq_bits and io_deq_bits
    val size = ((io.node_num - 1.U) / conf.Data_width_bram.U) + 1.U     // the total number of current frontier words
    val count_f = RegInit(0.U(32.W))    // count the number of frontier received
    val frontier = RegInit(0.U(conf.Data_width_bram.W))     // store current frontier received (already-emitted bits cleared)
    val node = RegInit(0.U(conf.Data_width.W))      // the node num inside current frontier
    val node_num_in_frontier = RegInit(0.U(32.W))   // the number of node in current frontier(Data_width bits)
    val count_node_in_frontier = RegInit(0.U(32.W)) // count the number of node dealed in current frontier(Data_width bits)
    q_frontier_value.ready := false.B   // equivalent to q_frontier_value.nodeq()

    // receive current_frontier from frontier module and require R array from Memory
    // give p1_end signal
    switch(stateReg){
        is(state0){
            io.p1_end := true.B
            q_frontier_value.ready := false.B
            io.R_array_index.valid := false.B
            io.R_array_index.bits := DontCare
            q_frontier_value.ready := false.B
            when(io.start){
                io.p1_end := false.B
                count_f := 0.U
                stateReg := state1
            }
        }
        is(state1){
            //receive current frontier
            io.p1_end := false.B
            q_frontier_value.ready := true.B
            io.R_array_index.valid := false.B
            when(q_frontier_value.valid){
                stateReg := state2
                count_f := count_f + 1.U
                // differentiate between push mode and pull mode:
                // push scans set bits (frontier members), pull scans clear
                // bits (unvisited nodes, i.e. set bits of the complement)
                node_num_in_frontier := Mux(io.push_or_pull_state === 0.U, PopCount(q_frontier_value.bits), PopCount(~q_frontier_value.bits))   // the number of node need to process in current frontier received
                // push mode
                when(q_frontier_value.bits =/= 0.U && io.push_or_pull_state === 0.U){
                    //exist node inside current frontier
                    // node := Log2(q_frontier_value.bits - (q_frontier_value.bits & (q_frontier_value.bits - 1.U))) + conf.Data_width.U * count_f
                    node := Custom_function.find_node(q_frontier_value.bits, conf.Data_width_bram, count_f)
                    // frontier := q_frontier_value.bits & (q_frontier_value.bits - 1.U)
                    frontier := Custom_function.remove_one(q_frontier_value.bits)
                    count_node_in_frontier := 0.U
                    stateReg := state2
                }
                // pull mode
                .elsewhen((~q_frontier_value.bits) =/= 0.U && io.push_or_pull_state === 1.U){
                    //exist node unvisited
                    node := Custom_function.find_node(~q_frontier_value.bits, conf.Data_width_bram, count_f)
                    // frontier := q_frontier_value.bits & (q_frontier_value.bits - 1.U)
                    frontier := Custom_function.remove_one(~q_frontier_value.bits)
                    count_node_in_frontier := 0.U
                    stateReg := state2
                }
                .otherwise{
                    // word has no relevant bits; go fetch the next word
                    stateReg := state3
                }
            }
        }
        is(state2){
            // send R array
            q_frontier_value.ready := false.B
            when(node > io.node_num - 1.U){
                // all the points have been processed and the round is over
                stateReg := state0
            }
            .elsewhen(count_node_in_frontier === node_num_in_frontier){
                stateReg := state3
            }
            .otherwise{
                io.R_array_index.valid := true.B
                // convert the number of points inside the pipeline to the total number
                io.R_array_index.bits := node * conf.numSubGraphs.U + num.U
                when(io.R_array_index.ready){
                    count_node_in_frontier := count_node_in_frontier + 1.U
                    // frontier := frontier & (frontier - 1.U)
                    frontier := Custom_function.remove_one(frontier)
                    // node := Log2(frontier - (frontier & (frontier - 1.U))) + conf.Data_width_bram.U * (count_f - 1.U)
                    // count_f was already incremented in state1, hence count_f - 1.U here
                    node := Custom_function.find_node(frontier, conf.Data_width_bram, count_f - 1.U)
                    stateReg := state2
                }
            }
        }
        is(state3){
            q_frontier_value.ready := false.B
            io.R_array_index.valid := false.B
            when(count_f === size){
                // all the points have been processed and the round is over
                stateReg := state0
            }
            .otherwise{
                // to receive new current frontier
                stateReg := state1
            }
        }
    }
}

// Bit-manipulation helpers for scanning frontier bitmaps.
object Custom_function{
    // Index (Log2) of the lowest set bit of n; n - (n & (n-1)) isolates that bit.
    def find_one(n : UInt) : UInt =
        Log2(n - (n & (n - 1.U)))

    // Local node number of the lowest set bit: bit index plus the offset of
    // the count-th frontier word.
    def find_node(n : UInt, data_width : Int, count : UInt) : UInt =
        find_one(n) + data_width.U * count

    // Clear the lowest set bit of n.
    def remove_one(n : UInt) : UInt =
        n & (n - 1.U)
}
-------------------------------------------------------------------------------- /src/main/scala/p2.scala: --------------------------------------------------------------------------------
package HBMGraph
import chisel3._
import chisel3.Driver
import chisel3.util._

// IO bundle of pipeline stage P2 (visited-map check / frontier & level write).
class P2_IO (implicit val conf : HBMGraphConfiguration) extends Bundle{
    //Input
    // val start = Input(Bool())        // input p1 finish signal
    val neighbours = Vec(2, Flipped(Decoupled(UInt((conf.Data_width * 2).W ))))     // input 2 neighbours in local subgraph
    val bram_clock = Input(Clock())     // bram clock
    val push_or_pull_state = Input(UInt(1.W))   // input flag mark push or pull state
    val if_write = Input(Bool())

    //Output
    val p2_count = Output(UInt(conf.Data_width.W))
    val p2_pull_count = Output(UInt(conf.Data_width.W))
    val write_vertex_index = Decoupled(UInt(conf.Data_width.W))     // output vertex index you want to write
    // write next_frontier in push mode
    // write next_frontier and visited_map in pull mode
    val write_frontier = Vec(2, Decoupled(UInt(conf.Data_width.W)))     // output 2 next_frontier you want to write
    val bram_to_frontier = Flipped(new bram_controller_IO)
}

// P2: wires together the two sub-stages of pipeline stage 2.
class P2 (val num :Int)(implicit val conf : HBMGraphConfiguration) extends Module{
    val io = IO(new P2_IO)
    dontTouch(io)
    val p2_read_visited_map_or_frontier =
Module(new p2_read_visited_map_or_frontier(num))
    val write_frontier_and_level = Module(new write_frontier_and_level(num))

    // connect between p2 io and p2_read_visited_map_or_frontier
    // p2_read_visited_map_or_frontier.io.bram_to_frontier.start := io.start
    p2_read_visited_map_or_frontier.io.bram_clock := io.bram_clock
    p2_read_visited_map_or_frontier.io.neighbours <> io.neighbours
    p2_read_visited_map_or_frontier.io.bram_to_frontier <> io.bram_to_frontier
    p2_read_visited_map_or_frontier.io.push_or_pull_state := io.push_or_pull_state


    // connect between p2_read_visited_map_or_frontier and write_frontier_and_level
    // (commented-out variant kept from the original: extra queue stages)
    // val queue_0 = Queue(p2_read_visited_map_or_frontier.io.visited_map_or_frontier(0), conf.q_visited_map_len)
    // val queue_1 = Queue(queue_0,conf.q_visited_map_len)
    // val queue_2 = Queue(p2_read_visited_map_or_frontier.io.visited_map_or_frontier(1), conf.q_visited_map_len)
    // val queue_3 = Queue(queue_2,conf.q_visited_map_len)

    // write_frontier_and_level.io.visited_map_or_frontier(0) <> queue_1
    // write_frontier_and_level.io.visited_map_or_frontier(1) <> queue_3
    write_frontier_and_level.io.visited_map_or_frontier(0) <> Queue(p2_read_visited_map_or_frontier.io.visited_map_or_frontier(0), conf.q_visited_map_len)
    write_frontier_and_level.io.visited_map_or_frontier(1) <> Queue(p2_read_visited_map_or_frontier.io.visited_map_or_frontier(1), conf.q_visited_map_len)

    write_frontier_and_level.io.neighbours(0) <> p2_read_visited_map_or_frontier.io.neighbours_out(0)   // the FIFO queue is in module p2_read_visited_map_or_frontier
    write_frontier_and_level.io.neighbours(1) <> p2_read_visited_map_or_frontier.io.neighbours_out(1)

    // connect between p2 io and write_frontier_and_level
    // write_frontier_and_level.io.start := io.start
    io.p2_count := write_frontier_and_level.io.p2_count
    io.p2_pull_count := write_frontier_and_level.io.p2_pull_count
    io.write_vertex_index <> write_frontier_and_level.io.write_vertex_index
    io.write_frontier <> write_frontier_and_level.io.write_frontier
    io.if_write <> write_frontier_and_level.io.if_write
    write_frontier_and_level.io.push_or_pull_state := io.push_or_pull_state
}


// Looks up one bit per incoming neighbour pair in the frontier BRAM
// (visited_map in push mode, frontier in pull mode) and forwards the
// neighbours alongside the lookup result so the downstream writer can pair
// them up again.
class p2_read_visited_map_or_frontier (val num :Int)(implicit val conf : HBMGraphConfiguration) extends Module{
    val io = IO(new Bundle{
        //Input
        // val start = Input(Bool())    // input p1 finish signal
        val neighbours = Vec(2, Flipped(Decoupled(UInt((conf.Data_width * 2).W))))  // input 2 neighbours in local subgraph
        val bram_clock = Input(Clock())     // bram clock
        val push_or_pull_state = Input(UInt(1.W))   // input flag mark push or pull state

        //Output
        val visited_map_or_frontier = Vec(2, Decoupled(UInt(1.W)))  // output 2 visited_map result in push mode or frontier result in pull mode
        val neighbours_out = Vec(2, Decoupled(UInt((conf.Data_width * 2).W)))   // output 2 neighbours in local subgraph
        val bram_to_frontier = Flipped(new bram_controller_IO)
    })
    dontTouch(io)

    // init signals
    io.visited_map_or_frontier(0).valid := false.B
    io.visited_map_or_frontier(1).valid := false.B
    io.visited_map_or_frontier(0).bits := DontCare
    io.visited_map_or_frontier(1).bits := DontCare

    // local variables
    val count0 = RegInit(0.U(conf.Data_width.W))    // count the number of neighbour0 received
    val count1 = RegInit(0.U(conf.Data_width.W))    // count the number of neighbour1 received

    // use a FIFO queue to receive data
    // val q_neighbour0 = Queue(io.neighbours(0), conf.q_mem_to_p2_len)
    // val q_neighbour1 = Queue(io.neighbours(1), conf.q_mem_to_p2_len)
    val q_neighbour0 = Module(new Queue(UInt((conf.Data_width * 2).W), conf.q_mem_to_p2_len))
    val q_neighbour1 = Module(new Queue(UInt((conf.Data_width * 2).W), conf.q_mem_to_p2_len))
    // /* the queue is longer ?? different length will cause data loss
    // ensure that the data passed to module write_frontier_and_level will not be lost
    // */
    val q_neighbours_out0 = Module(new Queue(UInt((conf.Data_width * 2).W), conf.q_neighbours_len))
    val q_neighbours_out1 = Module(new Queue(UInt((conf.Data_width * 2).W), conf.q_neighbours_len))

    // Each incoming neighbour is enqueued into BOTH its lookup queue and its
    // pass-through queue atomically: input is accepted only when both queues
    // have room, keeping the two streams in lockstep.
    io.neighbours(0).ready := q_neighbour0.io.enq.ready & q_neighbours_out0.io.enq.ready
    q_neighbour0.io.enq.valid := io.neighbours(0).valid & q_neighbours_out0.io.enq.ready
    q_neighbour0.io.enq.bits := io.neighbours(0).bits

    io.neighbours(1).ready := q_neighbour1.io.enq.ready & q_neighbours_out1.io.enq.ready
    q_neighbour1.io.enq.valid := io.neighbours(1).valid & q_neighbours_out1.io.enq.ready
    q_neighbour1.io.enq.bits := io.neighbours(1).bits
    // q_neighbour0.ready := false.B
    // q_neighbour1.io.deq.ready := false.B
    q_neighbour0.io.deq.ready := false.B
    q_neighbour1.io.deq.ready := false.B


    q_neighbours_out0.io.enq.valid := io.neighbours(0).valid & q_neighbour0.io.enq.ready
    q_neighbours_out0.io.enq.bits := io.neighbours(0).bits
    io.neighbours_out(0) <> q_neighbours_out0.io.deq

    q_neighbours_out1.io.enq.valid := io.neighbours(1).valid & q_neighbour1.io.enq.ready
    q_neighbours_out1.io.enq.bits := io.neighbours(1).bits
    io.neighbours_out(1) <> q_neighbours_out1.io.deq
    // io.neighbours_out(0) <> Queue(io.neighbours(0), conf.q_neighbours_len)
    // io.neighbours_out(1) <> Queue(io.neighbours(1), conf.q_neighbours_len)

    /* delay one cycle
    if visited_req equals 1, it means that we have read visited map and get the result
    */
    // val visited_req0 = ShiftRegister(Mux(q_neighbour0.io.deq.valid && io.visited_map(0).ready, 1.U(1.W), 0.U(1.W)), 1,0.U(4.W), true.B)
    // val visited_req1 = ShiftRegister(Mux(q_neighbour1.io.deq.valid && io.visited_map(1).ready, 1.U(1.W), 0.U(1.W)), 1,0.U(4.W), true.B)

    // val visited_map = withClock(io.bram_clock){Module(new bram_controller(num, 0))}
    // Default/static BRAM port wiring; the whens below override per-lane fields.
    io.bram_to_frontier := DontCare
    io.bram_to_frontier.ena := true.B
    io.bram_to_frontier.enb := true.B
    io.bram_to_frontier.clka := io.bram_clock
    io.bram_to_frontier.clkb := io.bram_clock
    io.bram_to_frontier.wea := false.B
    io.bram_to_frontier.web := false.B
    io.bram_to_frontier.wmode := 0.U

    // receive neighbours and require visited_map
    // deal neighbour0

    /* To prevent the number of received signals neighbour0 and neighbour1 not match,
    we will only deal when all valid(receive) and ready(send) signals are pulled high.
    Because the data read from bram cannot be paused, only when there is room for the queue
    that stores the result read in bram, the new data will continue to be processed.
    */
    when(q_neighbour0.io.deq.valid && io.visited_map_or_frontier(0).ready){
        q_neighbour0.io.deq.ready := true.B
        // only need to read in push mode
        // NOTE(review): wea is named like a write-enable yet is asserted on
        // this push-mode read path -- confirm its semantics in bram_controller_IO.
        io.bram_to_frontier.wea := true.B && (io.push_or_pull_state === 0.U)
        // convert the total number of points to the number inside the pipeline
        io.bram_to_frontier.nodea := (Custom_function2.high(q_neighbour0.io.deq.bits) / conf.numSubGraphs.U) % conf.Data_width_bram.U
        io.bram_to_frontier.addra := (Custom_function2.high(q_neighbour0.io.deq.bits) / conf.numSubGraphs.U) / conf.Data_width_bram.U
        // TODO (translated): verify that reading dout(node) directly with no extra
        // delay is correct -- it is on the second BRAM cycle; whether it also holds
        // inside the pipeline is still to be checked
        io.visited_map_or_frontier(0).valid := true.B
        io.visited_map_or_frontier(0).bits := io.bram_to_frontier.douta((Custom_function2.high(q_neighbour0.io.deq.bits) / conf.numSubGraphs.U) % conf.Data_width_bram.U)
        count0 := count0 + 1.U
    }

    // deal neighbour1: mirror of lane 0 on BRAM port b
    when(q_neighbour1.io.deq.valid && io.visited_map_or_frontier(1).ready){
        q_neighbour1.io.deq.ready := true.B
        io.bram_to_frontier.web := true.B && (io.push_or_pull_state === 0.U)
        io.bram_to_frontier.nodeb := (Custom_function2.high(q_neighbour1.io.deq.bits) / conf.numSubGraphs.U) % conf.Data_width_bram.U
        io.bram_to_frontier.addrb := (Custom_function2.high(q_neighbour1.io.deq.bits) / conf.numSubGraphs.U) / conf.Data_width_bram.U
        io.visited_map_or_frontier(1).valid := true.B
        io.visited_map_or_frontier(1).bits := io.bram_to_frontier.doutb((Custom_function2.high(q_neighbour1.io.deq.bits) / conf.numSubGraphs.U) % conf.Data_width_bram.U)
        count1 := count1 + 1.U
    }

    // // store the data read from bram into FIFO queue
    // when(visited_req0 === 1.U){
    //     io.visited_map(0).valid := true.B
    //     io.visited_map(0).bits := io.bram_to_frontier.visited_a
    // }

    // when(visited_req1 === 1.U){
    //     io.visited_map(1).valid := true.B
    //     io.visited_map(1).bits := io.bram_to_frontier.visited_b
    // }
}

// Pairs each neighbour with its visited/frontier bit and, when the vertex
// still needs processing, emits it both toward memory (via write_vertex_index)
// and toward the frontier module (via write_frontier).
class write_frontier_and_level (val num :Int)(implicit val conf : HBMGraphConfiguration) extends Module{
    val io = IO(new Bundle{
        //Input
        // val start = Input(Bool())    // input p1 finish signal
        val visited_map_or_frontier = Vec(2, Flipped(Decoupled(UInt(1.W))))     // input 2 visited/frontier bits, one per neighbour lane
        val neighbours = Vec(2, Flipped(Decoupled(UInt((conf.Data_width * 2).W))))
        val push_or_pull_state = Input(UInt(1.W))   // input flag mark push or pull state
        val if_write = Input(Bool())

        //Output
        val write_vertex_index = Decoupled(UInt(conf.Data_width.W))     // output vertex index you want to write to mem (deprecated)
        val write_frontier = Vec(2, Decoupled(UInt(conf.Data_width.W))) // output 2 next_frontier you want to write
        val p2_count = Output(UInt(conf.Data_width.W))
        val p2_pull_count = Output(UInt(conf.Data_width.W))     // to pull crossbar count
    })
    dontTouch(io)

    // init signals
    io.visited_map_or_frontier(0).ready := false.B
    io.visited_map_or_frontier(1).ready := false.B
    io.neighbours(0).ready := false.B
    io.neighbours(1).ready := false.B
    io.write_frontier(0).valid := false.B
    io.write_frontier(1).valid := false.B
    io.write_frontier(0).bits := DontCare
    io.write_frontier(1).bits := DontCare

    // local variables
    val count0 = RegInit(0.U(conf.Data_width.W))    // count the number of neighbour0 received
    val count1 = RegInit(0.U(conf.Data_width.W))    // count the number of neighbour1 received
    // count accepted write_frontier beats per lane (Counter max 2^31 - 1)
    val (count_wf0, _) = Counter(io.write_frontier(0).ready && io.write_frontier(0).valid, 2147483647)
    val (count_wf1, _) = Counter(io.write_frontier(1).ready && io.write_frontier(1).valid, 2147483647)
    io.p2_count := count0 + count1
    io.p2_pull_count := count_wf0 + count_wf1

    // to store the vertex_index need to write
    // val q_vertex_index = Array(2, Module(new Queue(UInt(conf.Data_width.W), 64)))
    val q_vertex_index0 = Module(new Queue(UInt(conf.Data_width.W), conf.q_p2_to_mem_len))
    val q_vertex_index1 = Module(new Queue(UInt(conf.Data_width.W), conf.q_p2_to_mem_len))
    q_vertex_index0.io.enq.valid := false.B
    q_vertex_index1.io.enq.valid := false.B
    q_vertex_index0.io.enq.bits := DontCare
    q_vertex_index1.io.enq.bits := DontCare

    // use RRArbiter to sequence 2 vertex_indexs into 1
    val vertex_index = Module(new RRArbiter(UInt(conf.Data_width.W),2))
    vertex_index.io.in(0) <> q_vertex_index0.io.deq
    vertex_index.io.in(1) <> q_vertex_index1.io.deq
    io.write_vertex_index <> vertex_index.io.out

    // receive visited_map_or_frontier, write next frontier and level
    // deal vec(0): consume a (bit, neighbour) pair only when all four
    // handshake conditions hold, keeping the two input streams aligned
    when(io.visited_map_or_frontier(0).valid && io.neighbours(0).valid
        && io.write_frontier(0).ready && q_vertex_index0.io.enq.ready){
        io.visited_map_or_frontier(0).ready := true.B
        io.neighbours(0).ready := true.B
        // push: bit == 0 means the destination has not been visited yet
        when(io.visited_map_or_frontier(0).bits === 0.U && io.push_or_pull_state === 0.U){  // write unvisited_node
            q_vertex_index0.io.enq.valid := true.B
            // data to memory: not need to convert
            q_vertex_index0.io.enq.bits := Custom_function2.high(io.neighbours(0).bits)
            io.write_frontier(0).valid := true.B
            // data to frontier: convert the total number of points to the number inside the pipeline
            io.write_frontier(0).bits := Custom_function2.high(io.neighbours(0).bits)
        }
        // pull: bit == 1 selects the low half of the packed pair instead
        when(io.visited_map_or_frontier(0).bits === 1.U && io.push_or_pull_state === 1.U){  // write unvisited_node
            q_vertex_index0.io.enq.valid := true.B
            // data to memory: not need to convert
            q_vertex_index0.io.enq.bits := Custom_function2.low(io.neighbours(0).bits)
            io.write_frontier(0).valid := true.B
            // data to frontier: convert the total number of points to the number inside the pipeline
            io.write_frontier(0).bits := Custom_function2.low(io.neighbours(0).bits)
        }
        count0 := count0 + 1.U
    }

    // deal vec(1): same handling as lane 0
    when(io.visited_map_or_frontier(1).valid && io.neighbours(1).valid
        && io.write_frontier(1).ready && q_vertex_index1.io.enq.ready){
        io.visited_map_or_frontier(1).ready := true.B
        io.neighbours(1).ready := true.B
        // push
        when(io.visited_map_or_frontier(1).bits === 0.U && io.push_or_pull_state === 0.U){  // write unvisited_node
            q_vertex_index1.io.enq.valid := true.B
            // data to memory: not need to convert
            q_vertex_index1.io.enq.bits := Custom_function2.high(io.neighbours(1).bits)
            io.write_frontier(1).valid := true.B
            // convert the total number of points to the number inside the pipeline
            io.write_frontier(1).bits := Custom_function2.high(io.neighbours(1).bits)
        }
        // pull
        when(io.visited_map_or_frontier(1).bits === 1.U && io.push_or_pull_state === 1.U){  // write unvisited_node
            q_vertex_index1.io.enq.valid := true.B
            // data to memory: not need to convert
            q_vertex_index1.io.enq.bits := Custom_function2.low(io.neighbours(1).bits)
            io.write_frontier(1).valid := true.B
            // convert the total number of points to the number inside the pipeline
            io.write_frontier(1).bits := Custom_function2.low(io.neighbours(1).bits)
        }
        count1 := count1 + 1.U
    }

    // Suppress memory writes entirely while if_write is deasserted.
    // Chisel last-connect: this overrides the enq.valid assignments above.
    when(io.if_write === false.B){
        q_vertex_index0.io.enq.valid := false.B
        q_vertex_index1.io.enq.valid := false.B
    }
}

// Helpers to split a packed neighbour word into its two halves.
object Custom_function2{
    // NOTE(review): this creates a NEW HBMGraphConfiguration rather than using
    // the enclosing module's implicit conf -- safe only if the configuration
    // is a pure constant; confirm.
    implicit val conf = HBMGraphConfiguration()
    // upper half of the packed pair
    def high(n : UInt) : UInt =
        n(conf.crossbar_data_width * 2 - 1, conf.crossbar_data_width)

    // lower half of the packed pair
    def low(n : UInt) : UInt =
        n(conf.crossbar_data_width - 1, 0)

}

--------------------------------------------------------------------------------