├── .gitignore ├── .gitmodules ├── Cargo.toml ├── LICENSE ├── LICENSES.txt ├── README.md ├── aigpdk ├── aigpdk.lib ├── aigpdk.v └── memlib_yosys.txt ├── build.rs ├── csrc ├── kernel_v1.cu └── kernel_v1_impl.cuh ├── src ├── aig.rs ├── aigpdk.rs ├── bin │ ├── boomerang_test.rs │ ├── cuda_dummy_test.rs │ ├── cuda_test.rs │ ├── cut_map_interactive.rs │ ├── flatten_test.rs │ ├── level_test.rs │ ├── naive_sim.rs │ └── repcut_test.rs ├── flatten.rs ├── lib.rs ├── pe.rs ├── repcut.rs └── staging.rs └── usage.md /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | /Cargo.lock 3 | /compile_commands.json 4 | .cache 5 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "eda-infra-rs"] 2 | path = eda-infra-rs 3 | url = https://github.com/gzz2000/eda-infra-rs.git 4 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "gem" 3 | version = "0.1.0" 4 | edition = "2021" 5 | links = "gem" 6 | 7 | [dependencies] 8 | cachedhash = "0.2.0" 9 | clap = { version = "4.5.15", features = ["derive"] } 10 | clilog = { version = "0.2.5", path = "eda-infra-rs/clilog" } 11 | compact_str = "0.7.1" 12 | indexmap = "2.4.0" 13 | itertools = "0.13.0" 14 | netlistdb = { version = "0.4.6", path = "eda-infra-rs/netlistdb" } 15 | rand = "0.8.5" 16 | rand_chacha = "0.3.1" 17 | rayon = "1.10.0" 18 | serde = { version = "1.0.208", features = ["derive"] } 19 | serde_bare = "0.5.0" 20 | sverilogparse = { version = "0.4.2", path = "eda-infra-rs/sverilogparse" } 21 | tempdir = "0.3.7" 22 | ulib = { version = "0.3.13", path = "eda-infra-rs/ulib" } 23 | vcd-ng = { version = "0.2.0", path = "eda-infra-rs/vcd-ng" } 24 | 25 | [build-dependencies] 26 | ucc = { version = "0.2.5", path = "eda-infra-rs/ucc" } 27 | 28 | [features] 29 | cuda = ["ulib/cuda"] 30 | 31 | [[bin]] 32 | name = "cuda_test" 33 | required-features = ["cuda"] 34 | 35 | [[bin]] 36 | name = "cuda_dummy_test" 37 | required-features = ["cuda"] 38 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 
26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. 
If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 191 | SPDX-License-Identifier: Apache-2.0 192 | 193 | Licensed under the Apache License, Version 2.0 (the "License"); 194 | you may not use this file except in compliance with the License. 195 | You may obtain a copy of the License at 196 | 197 | http://www.apache.org/licenses/LICENSE-2.0 198 | 199 | Unless required by applicable law or agreed to in writing, software 200 | distributed under the License is distributed on an "AS IS" BASIS, 201 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 202 | See the License for the specific language governing permissions and 203 | limitations under the License. 
204 | -------------------------------------------------------------------------------- /LICENSES.txt: -------------------------------------------------------------------------------- 1 | GEM uses the following third-party libraries: 2 | 3 | 1. eda-infra-rs 4 | - Repository: https://github.com/gzz2000/eda-infra-rs/tree/master 5 | - License: 6 | Apache License 7 | Version 2.0, January 2004 8 | http://www.apache.org/licenses/ 9 | 10 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 11 | 12 | 1. Definitions. 13 | 14 | "License" shall mean the terms and conditions for use, reproduction, 15 | and distribution as defined by Sections 1 through 9 of this document. 16 | 17 | "Licensor" shall mean the copyright owner or entity authorized by 18 | the copyright owner that is granting the License. 19 | 20 | "Legal Entity" shall mean the union of the acting entity and all 21 | other entities that control, are controlled by, or are under common 22 | control with that entity. For the purposes of this definition, 23 | "control" means (i) the power, direct or indirect, to cause the 24 | direction or management of such entity, whether by contract or 25 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 26 | outstanding shares, or (iii) beneficial ownership of such entity. 27 | 28 | "You" (or "Your") shall mean an individual or Legal Entity 29 | exercising permissions granted by this License. 30 | 31 | "Source" form shall mean the preferred form for making modifications, 32 | including but not limited to software source code, documentation 33 | source, and configuration files. 34 | 35 | "Object" form shall mean any form resulting from mechanical 36 | transformation or translation of a Source form, including but 37 | not limited to compiled object code, generated documentation, 38 | and conversions to other media types. 39 | 40 | "Work" shall mean the work of authorship, whether in Source or 41 | Object form, made available under the License, as indicated by a 42 | copyright notice that is included in or attached to the work 43 | (an example is provided in the Appendix below). 44 | 45 | "Derivative Works" shall mean any work, whether in Source or Object 46 | form, that is based on (or derived from) the Work and for which the 47 | editorial revisions, annotations, elaborations, or other modifications 48 | represent, as a whole, an original work of authorship. For the purposes 49 | of this License, Derivative Works shall not include works that remain 50 | separable from, or merely link (or bind by name) to the interfaces of, 51 | the Work and Derivative Works thereof. 52 | 53 | "Contribution" shall mean any work of authorship, including 54 | the original version of the Work and any modifications or additions 55 | to that Work or Derivative Works thereof, that is intentionally 56 | submitted to Licensor for inclusion in the Work by the copyright owner 57 | or by an individual or Legal Entity authorized to submit on behalf of 58 | the copyright owner. 
For the purposes of this definition, "submitted" 59 | means any form of electronic, verbal, or written communication sent 60 | to the Licensor or its representatives, including but not limited to 61 | communication on electronic mailing lists, source code control systems, 62 | and issue tracking systems that are managed by, or on behalf of, the 63 | Licensor for the purpose of discussing and improving the Work, but 64 | excluding communication that is conspicuously marked or otherwise 65 | designated in writing by the copyright owner as "Not a Contribution." 66 | 67 | "Contributor" shall mean Licensor and any individual or Legal Entity 68 | on behalf of whom a Contribution has been received by Licensor and 69 | subsequently incorporated within the Work. 70 | 71 | 2. Grant of Copyright License. Subject to the terms and conditions of 72 | this License, each Contributor hereby grants to You a perpetual, 73 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 74 | copyright license to reproduce, prepare Derivative Works of, 75 | publicly display, publicly perform, sublicense, and distribute the 76 | Work and such Derivative Works in Source or Object form. 77 | 78 | 3. Grant of Patent License. Subject to the terms and conditions of 79 | this License, each Contributor hereby grants to You a perpetual, 80 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 81 | (except as stated in this section) patent license to make, have made, 82 | use, offer to sell, sell, import, and otherwise transfer the Work, 83 | where such license applies only to those patent claims licensable 84 | by such Contributor that are necessarily infringed by their 85 | Contribution(s) alone or by combination of their Contribution(s) 86 | with the Work to which such Contribution(s) was submitted. If You 87 | institute patent litigation against any entity (including a 88 | cross-claim or counterclaim in a lawsuit) alleging that the Work 89 | or a Contribution incorporated within the Work constitutes direct 90 | or contributory patent infringement, then any patent licenses 91 | granted to You under this License for that Work shall terminate 92 | as of the date such litigation is filed. 93 | 94 | 4. Redistribution. 
You may reproduce and distribute copies of the 95 | Work or Derivative Works thereof in any medium, with or without 96 | modifications, and in Source or Object form, provided that You 97 | meet the following conditions: 98 | 99 | (a) You must give any other recipients of the Work or 100 | Derivative Works a copy of this License; and 101 | 102 | (b) You must cause any modified files to carry prominent notices 103 | stating that You changed the files; and 104 | 105 | (c) You must retain, in the Source form of any Derivative Works 106 | that You distribute, all copyright, patent, trademark, and 107 | attribution notices from the Source form of the Work, 108 | excluding those notices that do not pertain to any part of 109 | the Derivative Works; and 110 | 111 | (d) If the Work includes a "NOTICE" text file as part of its 112 | distribution, then any Derivative Works that You distribute must 113 | include a readable copy of the attribution notices contained 114 | within such NOTICE file, excluding those notices that do not 115 | pertain to any part of the Derivative Works, in at least one 116 | of the following places: within a NOTICE text file distributed 117 | as part of the Derivative Works; within the Source form or 118 | documentation, if provided along with the Derivative Works; or, 119 | within a display generated by the Derivative Works, if and 120 | wherever such third-party notices normally appear. The contents 121 | of the NOTICE file are for informational purposes only and 122 | do not modify the License. You may add Your own attribution 123 | notices within Derivative Works that You distribute, alongside 124 | or as an addendum to the NOTICE text from the Work, provided 125 | that such additional attribution notices cannot be construed 126 | as modifying the License. 127 | 128 | You may add Your own copyright statement to Your modifications and 129 | may provide additional or different license terms and conditions 130 | for use, reproduction, or distribution of Your modifications, or 131 | for any such Derivative Works as a whole, provided Your use, 132 | reproduction, and distribution of the Work otherwise complies with 133 | the conditions stated in this License. 134 | 135 | 5. Submission of Contributions. Unless You explicitly state otherwise, 136 | any Contribution intentionally submitted for inclusion in the Work 137 | by You to the Licensor shall be under the terms and conditions of 138 | this License, without any additional terms or conditions. 139 | Notwithstanding the above, nothing herein shall supersede or modify 140 | the terms of any separate license agreement you may have executed 141 | with Licensor regarding such Contributions. 142 | 143 | 6. Trademarks. This License does not grant permission to use the trade 144 | names, trademarks, service marks, or product names of the Licensor, 145 | except as required for reasonable and customary use in describing the 146 | origin of the Work and reproducing the content of the NOTICE file. 147 | 148 | 7. Disclaimer of Warranty. Unless required by applicable law or 149 | agreed to in writing, Licensor provides the Work (and each 150 | Contributor provides its Contributions) on an "AS IS" BASIS, 151 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 152 | implied, including, without limitation, any warranties or conditions 153 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 154 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 155 | appropriateness of using or redistributing the Work and assume any 156 | risks associated with Your exercise of permissions under this License. 157 | 158 | 8. Limitation of Liability. In no event and under no legal theory, 159 | whether in tort (including negligence), contract, or otherwise, 160 | unless required by applicable law (such as deliberate and grossly 161 | negligent acts) or agreed to in writing, shall any Contributor be 162 | liable to You for damages, including any direct, indirect, special, 163 | incidental, or consequential damages of any character arising as a 164 | result of this License or out of the use or inability to use the 165 | Work (including but not limited to damages for loss of goodwill, 166 | work stoppage, computer failure or malfunction, or any and all 167 | other commercial damages or losses), even if such Contributor 168 | has been advised of the possibility of such damages. 169 | 170 | 9. Accepting Warranty or Additional Liability. While redistributing 171 | the Work or Derivative Works thereof, You may choose to offer, 172 | and charge a fee for, acceptance of support, warranty, indemnity, 173 | or other liability obligations and/or rights consistent with this 174 | License. However, in accepting such obligations, You may act only 175 | on Your own behalf and on Your sole responsibility, not on behalf 176 | of any other Contributor, and only if You agree to indemnify, 177 | defend, and hold each Contributor harmless for any liability 178 | incurred by, or claims asserted against, such Contributor by reason 179 | of your accepting any such warranty or additional liability. 180 | 181 | END OF TERMS AND CONDITIONS 182 | 183 | APPENDIX: How to apply the Apache License to your work. 184 | 185 | To apply the Apache License to your work, attach the following 186 | boilerplate notice, with the fields enclosed by brackets "[]" 187 | replaced with your own identifying information. (Don't include 188 | the brackets!) The text should be enclosed in the appropriate 189 | comment syntax for the file format. We also recommend that a 190 | file or class name and description of purpose be included on the 191 | same "printed page" as the copyright notice for easier 192 | identification within third-party archives. 193 | 194 | Copyright [yyyy] [name of copyright owner] 195 | 196 | Licensed under the Apache License, Version 2.0 (the "License"); 197 | you may not use this file except in compliance with the License. 198 | You may obtain a copy of the License at 199 | 200 | http://www.apache.org/licenses/LICENSE-2.0 201 | 202 | Unless required by applicable law or agreed to in writing, software 203 | distributed under the License is distributed on an "AS IS" BASIS, 204 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 205 | See the License for the specific language governing permissions and 206 | limitations under the License. 207 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Welcome to GEM 2 | GEM is an open-source RTL logic simulator with CUDA acceleration, developed and maintained by NVIDIA Research. 3 | GEM can deliver a 5-40X speed-up compared to leading CPU-based RTL simulators. 4 | 5 | ## Compile and Run Your Design with GEM 6 | GEM works in a way similar to an FPGA-based RTL emulator. 
7 | It first synthesizes your design with a special and-inverter graph (AIG) process, and then maps the synthesized gate-level netlist to a virtual manycore Boolean processor which can be emulated on CUDA-compatible GPUs. 8 | 9 | The synthesis and mapping are slower than the compilation/elaboration process of CPU-based simulators. But this is a one-time cost: your design can be simulated under different testbenches without re-running the synthesis or mapping. 10 | 11 | **See [usage.md](./usage.md) for usage documentation.** 12 | 13 | ## Citation 14 | Please cite our paper if you find GEM useful. 15 | 16 | ``` bibtex 17 | @inproceedings{gem, 18 | author = {Guo, Zizheng and Zhang, Yanqing and Wang, Runsheng and Lin, Yibo and Ren, Haoxing}, 19 | booktitle = {Proceedings of the 62nd Annual Design Automation Conference 2025}, 20 | organization = {IEEE}, 21 | title = {{GEM}: {GPU}-Accelerated Emulator-Inspired {RTL} Simulation}, 22 | year = {2025} 23 | } 24 | ``` 25 | -------------------------------------------------------------------------------- /aigpdk/aigpdk.lib: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | // SPDX-License-Identifier: Apache-2.0 3 | library(aigpdk) { 4 | 5 | delay_model : table_lookup; 6 | in_place_swap_mode : match_footprint; 7 | 8 | /* unit attributes */ 9 | time_unit : "1ps"; 10 | voltage_unit : "1V"; 11 | current_unit : "1uA"; 12 | pulling_resistance_unit : "1kohm"; 13 | leakage_power_unit : "1nW"; 14 | capacitive_load_unit (1,pf); 15 | 16 | slew_upper_threshold_pct_rise : 80; 17 | slew_lower_threshold_pct_rise : 20; 18 | slew_upper_threshold_pct_fall : 80; 19 | slew_lower_threshold_pct_fall : 20; 20 | input_threshold_pct_rise : 50; 21 | input_threshold_pct_fall : 50; 22 | output_threshold_pct_rise : 50; 23 | output_threshold_pct_fall : 50; 24 | nom_process : 1; 25 | nom_voltage : 1.8; 26 | nom_temperature : 25; 27 | operating_conditions ( typical ) { 28 | process : 1; 29 | voltage : 1.8; 30 | temperature : 25; 31 | } 32 | default_operating_conditions : typical; 33 | default_max_capacitance: 999999999.9; 34 | 35 | type ( sram_addr_bus_13 ) { 36 | base_type : array ; 37 | data_type : bit ; 38 | bit_width : 13 ; 39 | bit_from : 12 ; 40 | bit_to : 0 ; 41 | downto : true ; 42 | } 43 | 44 | type ( sram_data_bus_32 ) { 45 | base_type : array ; 46 | data_type : bit ; 47 | bit_width : 32 ; 48 | bit_from : 31 ; 49 | bit_to : 0 ; 50 | downto : true ; 51 | } 52 | 53 | cell (AND2_00_0) { 54 | area : 32; 55 | cell_leakage_power : 0.0746794; 56 | pin(A) { 57 | direction : input; 58 | capacitance : 0.0129077; 59 | rise_capacitance : 0.0129077; 60 | fall_capacitance : 0.0128842; 61 | } 62 | pin(B) { 63 | direction : input; 64 | capacitance : 0.0125298; 65 | rise_capacitance : 0.0125298; 66 | fall_capacitance : 0.0122586; 67 | } 68 | pin(Y) { 69 | direction : output; 70 | capacitance : 0; 71 | rise_capacitance : 0; 72 | fall_capacitance : 0; 73 | function : "(A B)"; 74 | timing() { 75 | related_pin : "A"; 76 | timing_sense : positive_unate; 77 | cell_rise(scalar) { 78 | values ( "1.0" ); 79 | } 80 | rise_transition(scalar) { 81 | values ( "1.0" ); 82 | } 83 | cell_fall(scalar) { 84 | values ( "1.0" ); 85 | } 86 | fall_transition(scalar) { 87 | values ( "1.0" ); 88 | } 89 | } 90 | timing() { 91 | related_pin : "B"; 92 | timing_sense : positive_unate; 93 | cell_rise(scalar) { 94 | values ( "1.0" ); 95 | } 96 | rise_transition(scalar) { 97 | values ( 
"1.0" ); 98 | } 99 | cell_fall(scalar) { 100 | values ( "1.0" ); 101 | } 102 | fall_transition(scalar) { 103 | values ( "1.0" ); 104 | } 105 | } 106 | } 107 | } 108 | 109 | cell (AND2_01_0) { 110 | area : 32; 111 | cell_leakage_power : 0.0746794; 112 | pin(A) { 113 | direction : input; 114 | capacitance : 0.0129077; 115 | rise_capacitance : 0.0129077; 116 | fall_capacitance : 0.0128842; 117 | } 118 | pin(B) { 119 | direction : input; 120 | capacitance : 0.0125298; 121 | rise_capacitance : 0.0125298; 122 | fall_capacitance : 0.0122586; 123 | } 124 | pin(Y) { 125 | direction : output; 126 | capacitance : 0; 127 | rise_capacitance : 0; 128 | fall_capacitance : 0; 129 | function : "(A B')"; 130 | timing() { 131 | related_pin : "A"; 132 | timing_sense : positive_unate; 133 | cell_rise(scalar) { 134 | values ( "1.0" ); 135 | } 136 | rise_transition(scalar) { 137 | values ( "1.0" ); 138 | } 139 | cell_fall(scalar) { 140 | values ( "1.0" ); 141 | } 142 | fall_transition(scalar) { 143 | values ( "1.0" ); 144 | } 145 | } 146 | timing() { 147 | related_pin : "B"; 148 | timing_sense : positive_unate; 149 | cell_rise(scalar) { 150 | values ( "1.0" ); 151 | } 152 | rise_transition(scalar) { 153 | values ( "1.0" ); 154 | } 155 | cell_fall(scalar) { 156 | values ( "1.0" ); 157 | } 158 | fall_transition(scalar) { 159 | values ( "1.0" ); 160 | } 161 | } 162 | } 163 | } 164 | 165 | cell (AND2_10_0) { 166 | area : 32; 167 | cell_leakage_power : 0.0746794; 168 | pin(A) { 169 | direction : input; 170 | capacitance : 0.0129077; 171 | rise_capacitance : 0.0129077; 172 | fall_capacitance : 0.0128842; 173 | } 174 | pin(B) { 175 | direction : input; 176 | capacitance : 0.0125298; 177 | rise_capacitance : 0.0125298; 178 | fall_capacitance : 0.0122586; 179 | } 180 | pin(Y) { 181 | direction : output; 182 | capacitance : 0; 183 | rise_capacitance : 0; 184 | fall_capacitance : 0; 185 | function : "(A' B)"; 186 | timing() { 187 | related_pin : "A"; 188 | timing_sense : positive_unate; 189 | cell_rise(scalar) { 190 | values ( "1.0" ); 191 | } 192 | rise_transition(scalar) { 193 | values ( "1.0" ); 194 | } 195 | cell_fall(scalar) { 196 | values ( "1.0" ); 197 | } 198 | fall_transition(scalar) { 199 | values ( "1.0" ); 200 | } 201 | } 202 | timing() { 203 | related_pin : "B"; 204 | timing_sense : positive_unate; 205 | cell_rise(scalar) { 206 | values ( "1.0" ); 207 | } 208 | rise_transition(scalar) { 209 | values ( "1.0" ); 210 | } 211 | cell_fall(scalar) { 212 | values ( "1.0" ); 213 | } 214 | fall_transition(scalar) { 215 | values ( "1.0" ); 216 | } 217 | } 218 | } 219 | } 220 | 221 | cell (AND2_11_0) { 222 | area : 32; 223 | cell_leakage_power : 0.0746794; 224 | pin(A) { 225 | direction : input; 226 | capacitance : 0.0129077; 227 | rise_capacitance : 0.0129077; 228 | fall_capacitance : 0.0128842; 229 | } 230 | pin(B) { 231 | direction : input; 232 | capacitance : 0.0125298; 233 | rise_capacitance : 0.0125298; 234 | fall_capacitance : 0.0122586; 235 | } 236 | pin(Y) { 237 | direction : output; 238 | capacitance : 0; 239 | rise_capacitance : 0; 240 | fall_capacitance : 0; 241 | function : "(A' B')"; 242 | timing() { 243 | related_pin : "A"; 244 | timing_sense : positive_unate; 245 | cell_rise(scalar) { 246 | values ( "1.0" ); 247 | } 248 | rise_transition(scalar) { 249 | values ( "1.0" ); 250 | } 251 | cell_fall(scalar) { 252 | values ( "1.0" ); 253 | } 254 | fall_transition(scalar) { 255 | values ( "1.0" ); 256 | } 257 | } 258 | timing() { 259 | related_pin : "B"; 260 | timing_sense : positive_unate; 261 | 
cell_rise(scalar) { 262 | values ( "1.0" ); 263 | } 264 | rise_transition(scalar) { 265 | values ( "1.0" ); 266 | } 267 | cell_fall(scalar) { 268 | values ( "1.0" ); 269 | } 270 | fall_transition(scalar) { 271 | values ( "1.0" ); 272 | } 273 | } 274 | } 275 | } 276 | 277 | cell (AND2_11_1) { 278 | area : 32; 279 | cell_leakage_power : 0.0746794; 280 | pin(A) { 281 | direction : input; 282 | capacitance : 0.0129077; 283 | rise_capacitance : 0.0129077; 284 | fall_capacitance : 0.0128842; 285 | } 286 | pin(B) { 287 | direction : input; 288 | capacitance : 0.0125298; 289 | rise_capacitance : 0.0125298; 290 | fall_capacitance : 0.0122586; 291 | } 292 | pin(Y) { 293 | direction : output; 294 | capacitance : 0; 295 | rise_capacitance : 0; 296 | fall_capacitance : 0; 297 | function : "(A + B)"; 298 | timing() { 299 | related_pin : "A"; 300 | timing_sense : positive_unate; 301 | cell_rise(scalar) { 302 | values ( "1.0" ); 303 | } 304 | rise_transition(scalar) { 305 | values ( "1.0" ); 306 | } 307 | cell_fall(scalar) { 308 | values ( "1.0" ); 309 | } 310 | fall_transition(scalar) { 311 | values ( "1.0" ); 312 | } 313 | } 314 | timing() { 315 | related_pin : "B"; 316 | timing_sense : positive_unate; 317 | cell_rise(scalar) { 318 | values ( "1.0" ); 319 | } 320 | rise_transition(scalar) { 321 | values ( "1.0" ); 322 | } 323 | cell_fall(scalar) { 324 | values ( "1.0" ); 325 | } 326 | fall_transition(scalar) { 327 | values ( "1.0" ); 328 | } 329 | } 330 | } 331 | } 332 | 333 | cell (INV) { 334 | cell_footprint : inv; 335 | area : 0; 336 | cell_leakage_power : 0.0221741; 337 | pin(A) { 338 | direction : input; 339 | capacitance : 0.00932456; 340 | rise_capacitance : 0.00932196; 341 | fall_capacitance : 0.00932456; 342 | } 343 | pin(Y) { 344 | direction : output; 345 | capacitance : 0; 346 | rise_capacitance : 0; 347 | fall_capacitance : 0; 348 | function : "(!A)"; 349 | timing() { 350 | related_pin : "A"; 351 | timing_sense : negative_unate; 352 | cell_fall(scalar) { 353 | values ( "0.0001" ); 354 | } 355 | fall_transition(scalar) { 356 | values ( "0.0001" ); 357 | } 358 | cell_rise(scalar) { 359 | values ( "0.0001" ); 360 | } 361 | rise_transition(scalar) { 362 | values ( "0.0001" ); 363 | } 364 | } 365 | } 366 | } 367 | 368 | cell (BUF) { 369 | cell_footprint : buf; 370 | area : 0; 371 | cell_leakage_power : 0.0221741; 372 | pin(A) { 373 | direction : input; 374 | capacitance : 0.00932456; 375 | rise_capacitance : 0.00932196; 376 | fall_capacitance : 0.00932456; 377 | } 378 | pin(Y) { 379 | direction : output; 380 | capacitance : 0; 381 | rise_capacitance : 0; 382 | fall_capacitance : 0; 383 | function : "A"; 384 | timing() { 385 | related_pin : "A"; 386 | timing_sense : positive_unate; 387 | cell_fall(scalar) { 388 | values ( "0.0001" ); 389 | } 390 | fall_transition(scalar) { 391 | values ( "0.0001" ); 392 | } 393 | cell_rise(scalar) { 394 | values ( "0.0001" ); 395 | } 396 | rise_transition(scalar) { 397 | values ( "0.0001" ); 398 | } 399 | } 400 | } 401 | } 402 | 403 | /* ----------------- * 404 | * Design : DFFPOSX1 * 405 | * ----------------- */ 406 | cell (DFF) { 407 | area : 100; 408 | cell_leakage_power : 0.160725; 409 | ff (DS0000,P0002) { 410 | next_state : "D"; 411 | clocked_on : "CLK"; 412 | } 413 | pin(CLK) { 414 | direction : input; 415 | capacitance : 0.0279235; 416 | rise_capacitance : 0.0279235; 417 | fall_capacitance : 0.0274634; 418 | clock : true; 419 | min_pulse_width_high : 0.106969; 420 | min_pulse_width_low : 0.09927; 421 | } 422 | pin(D) { 423 | direction : input; 424 | 
capacitance : 0.00882947; 425 | rise_capacitance : 0.00882947; 426 | fall_capacitance : 0.00881001; 427 | timing() { 428 | related_pin : "CLK"; 429 | timing_type : hold_rising; 430 | rise_constraint(scalar) { 431 | values ( "0.0001" ); 432 | } 433 | fall_constraint(scalar) { 434 | values ( "0.0001" ); 435 | } 436 | } 437 | timing() { 438 | related_pin : "CLK"; 439 | timing_type : setup_rising; 440 | rise_constraint(scalar) { 441 | values ( "0.0001" ); 442 | } 443 | fall_constraint(scalar) { 444 | values ( "0.0001" ); 445 | } 446 | } 447 | } 448 | pin(Q) { 449 | direction : output; 450 | capacitance : 0; 451 | rise_capacitance : 0; 452 | fall_capacitance : 0; 453 | function : "DS0000"; 454 | timing() { 455 | related_pin : "CLK"; 456 | timing_sense : non_unate; 457 | timing_type : rising_edge; 458 | cell_fall(scalar) { 459 | values ( "0.0001" ); 460 | } 461 | fall_transition(scalar) { 462 | values ( "0.0001" ); 463 | } 464 | cell_rise(scalar) { 465 | values ( "0.0001" ); 466 | } 467 | rise_transition(scalar) { 468 | values ( "0.0001" ); 469 | } 470 | } 471 | } 472 | } 473 | 474 | /* -------------- * 475 | * Design : DFFSR * 476 | * -------------- */ 477 | cell (DFFSR) { 478 | area : 176; 479 | cell_leakage_power : 0.27727; 480 | ff (P0002,P0003) { 481 | next_state : "D"; 482 | clocked_on : "CLK"; 483 | clear : "(!R)"; 484 | preset : "(!S)"; 485 | clear_preset_var1 : L; 486 | } 487 | pin(CLK) { 488 | direction : input; 489 | capacitance : 0.00937511; 490 | rise_capacitance : 0.00932314; 491 | fall_capacitance : 0.00937511; 492 | clock : true; 493 | } 494 | pin(D) { 495 | direction : input; 496 | capacitance : 0.00940895; 497 | rise_capacitance : 0.00940895; 498 | fall_capacitance : 0.00932956; 499 | timing() { 500 | related_pin : "CLK"; 501 | timing_type : hold_rising; 502 | when : "S&R"; 503 | sdf_cond : "S\&R"; 504 | rise_constraint(scalar) { 505 | values ( "0.0001" ); 506 | } 507 | fall_constraint(scalar) { 508 | values ( "0.0001" ); 509 | } 510 | } 511 | timing() { 512 | related_pin : "CLK"; 513 | timing_type : setup_rising; 514 | when : "S&R"; 515 | sdf_cond : "S\&R"; 516 | rise_constraint(scalar) { 517 | values ( "0.0001" ); 518 | } 519 | fall_constraint(scalar) { 520 | values ( "0.0001" ); 521 | } 522 | } 523 | } 524 | pin(Q) { 525 | direction : output; 526 | capacitance : 0; 527 | rise_capacitance : 0; 528 | fall_capacitance : 0; 529 | function : "P0002"; 530 | timing() { 531 | related_pin : "CLK"; 532 | timing_sense : non_unate; 533 | timing_type : rising_edge; 534 | cell_fall(scalar) { 535 | values ( "0.0001" ); 536 | } 537 | fall_transition(scalar) { 538 | values ( "0.0001" ); 539 | } 540 | cell_rise(scalar) { 541 | values ( "0.0001" ); 542 | } 543 | rise_transition(scalar) { 544 | values ( "0.0001" ); 545 | } 546 | } 547 | timing() { 548 | related_pin : "R"; 549 | timing_sense : positive_unate; 550 | timing_type : clear; 551 | cell_fall(scalar) { 552 | values ( "0.0001" ); 553 | } 554 | fall_transition(scalar) { 555 | values ( "0.0001" ); 556 | } 557 | cell_rise(scalar) { 558 | values ( "0.0001" ); 559 | } 560 | rise_transition(scalar) { 561 | values ( "0.0001" ); 562 | } 563 | } 564 | timing() { 565 | related_pin : "S"; 566 | timing_sense : negative_unate; 567 | timing_type : preset; 568 | cell_rise(scalar) { 569 | values ( "0.0001" ); 570 | } 571 | rise_transition(scalar) { 572 | values ( "0.0001" ); 573 | } 574 | } 575 | } 576 | pin(R) { 577 | direction : input; 578 | capacitance : 0.0255048; 579 | rise_capacitance : 0.0255048; 580 | fall_capacitance : 0.0220338; 581 | 
min_pulse_width_low : 0.152176; 582 | timing() { 583 | related_pin : "CLK"; 584 | timing_type : recovery_rising; 585 | when : "D&S"; 586 | sdf_cond : "D\&S"; 587 | rise_constraint(scalar) { 588 | values ( "0.0001" ); 589 | } 590 | } 591 | timing() { 592 | related_pin : "S"; 593 | timing_type : recovery_rising; 594 | rise_constraint(scalar) { 595 | values ( "0.0001" ); 596 | } 597 | } 598 | timing() { 599 | related_pin : "CLK"; 600 | timing_type : removal_rising; 601 | when : "D&S"; 602 | sdf_cond : "D\&S"; 603 | rise_constraint(scalar) { 604 | values ( "0.0001" ); 605 | } 606 | } 607 | } 608 | pin(S) { 609 | direction : input; 610 | capacitance : 0.0230606; 611 | rise_capacitance : 0.0141532; 612 | fall_capacitance : 0.0230606; 613 | timing() { 614 | related_pin : "CLK"; 615 | timing_type : recovery_rising; 616 | when : "!D&R"; 617 | sdf_cond : "\~D\&R"; 618 | rise_constraint(scalar) { 619 | values ( "0.0001" ); 620 | } 621 | } 622 | timing() { 623 | related_pin : "R"; 624 | timing_type : recovery_rising; 625 | rise_constraint(scalar) { 626 | values ( "0.0001" ); 627 | } 628 | } 629 | timing() { 630 | related_pin : "CLK"; 631 | timing_type : removal_rising; 632 | when : "!D&R"; 633 | sdf_cond : "\~D\&R"; 634 | rise_constraint(scalar) { 635 | values ( "0.0001" ); 636 | } 637 | } 638 | } 639 | } 640 | 641 | cell ( $__RAMGEM_SYNC_ ) { 642 | memory () { 643 | type : ram ; 644 | address_width : 13 ; 645 | word_width : 32 ; 646 | } 647 | area : 10000 ; 648 | interface_timing : TRUE ; 649 | dont_use : TRUE ; 650 | dont_touch : TRUE ; 651 | map_only : TRUE ; 652 | is_macro_cell : TRUE ; 653 | 654 | pin(PORT_R_CLK) { 655 | direction : input; 656 | capacitance : 0.0279235; 657 | rise_capacitance : 0.0279235; 658 | fall_capacitance : 0.0274634; 659 | clock : true; 660 | min_pulse_width_high : 0.106969; 661 | min_pulse_width_low : 0.09927; 662 | } 663 | 664 | pin(PORT_W_CLK) { 665 | direction : input; 666 | capacitance : 0.0279235; 667 | rise_capacitance : 0.0279235; 668 | fall_capacitance : 0.0274634; 669 | clock : true; 670 | min_pulse_width_high : 0.106969; 671 | min_pulse_width_low : 0.09927; 672 | } 673 | 674 | bus(PORT_R_RD_DATA) { 675 | bus_type : sram_data_bus_32 ; 676 | direction : output; 677 | capacitance : 0.01; 678 | pin(PORT_R_RD_DATA[31:0]) { 679 | is_isolated : true; 680 | } 681 | memory_read () { 682 | address : PORT_R_ADDR ; 683 | } 684 | timing () { 685 | related_pin : "PORT_R_CLK" ; 686 | timing_type : rising_edge ; 687 | timing_sense : non_unate; 688 | cell_rise(scalar) { 689 | values ( "1.0" ); 690 | } 691 | rise_transition(scalar) { 692 | values ( "1.0" ); 693 | } 694 | cell_fall(scalar) { 695 | values ( "1.0" ); 696 | } 697 | fall_transition(scalar) { 698 | values ( "1.0" ); 699 | } 700 | } 701 | } 702 | 703 | bus(PORT_R_ADDR) { 704 | bus_type : sram_addr_bus_13 ; 705 | direction : input; 706 | capacitance : 0.01; 707 | pin(PORT_R_ADDR[12:0]) { 708 | is_isolated : true; 709 | } 710 | timing () { 711 | related_pin : "PORT_R_CLK" ; 712 | timing_type : hold_falling; 713 | rise_constraint(scalar) { 714 | values ( "0.0001" ); 715 | } 716 | fall_constraint(scalar) { 717 | values ( "0.0001" ); 718 | } 719 | } 720 | timing() { 721 | related_pin : "PORT_R_CLK"; 722 | timing_type : setup_falling; 723 | rise_constraint(scalar) { 724 | values ( "0.0001" ); 725 | } 726 | fall_constraint(scalar) { 727 | values ( "0.0001" ); 728 | } 729 | } 730 | } 731 | 732 | bus(PORT_W_ADDR) { 733 | bus_type : sram_addr_bus_13 ; 734 | direction : input; 735 | capacitance : 0.01; 736 | pin(PORT_W_ADDR[12:0]) 
{ 737 | is_isolated : true; 738 | } 739 | timing () { 740 | related_pin : "PORT_W_CLK" ; 741 | timing_type : hold_falling; 742 | rise_constraint(scalar) { 743 | values ( "0.0001" ); 744 | } 745 | fall_constraint(scalar) { 746 | values ( "0.0001" ); 747 | } 748 | } 749 | timing() { 750 | related_pin : "PORT_W_CLK"; 751 | timing_type : setup_falling; 752 | rise_constraint(scalar) { 753 | values ( "0.0001" ); 754 | } 755 | fall_constraint(scalar) { 756 | values ( "0.0001" ); 757 | } 758 | } 759 | } 760 | bus(PORT_W_WR_DATA) { 761 | bus_type : sram_data_bus_32 ; 762 | direction : input; 763 | capacitance : 0.01; 764 | pin(PORT_W_WR_DATA[31:0]) { 765 | is_isolated : true; 766 | } 767 | memory_write () { 768 | address : PORT_W_ADDR ; 769 | clocked_on : PORT_W_CLK ; 770 | } 771 | timing () { 772 | related_pin : "PORT_W_CLK" ; 773 | timing_type : hold_falling; 774 | rise_constraint(scalar) { 775 | values ( "0.0001" ); 776 | } 777 | fall_constraint(scalar) { 778 | values ( "0.0001" ); 779 | } 780 | } 781 | timing() { 782 | related_pin : "PORT_W_CLK"; 783 | timing_type : setup_falling; 784 | rise_constraint(scalar) { 785 | values ( "0.0001" ); 786 | } 787 | fall_constraint(scalar) { 788 | values ( "0.0001" ); 789 | } 790 | } 791 | } 792 | bus(PORT_W_WR_EN) { 793 | bus_type : sram_data_bus_32 ; 794 | direction : input; 795 | capacitance : 0.01; 796 | pin(PORT_W_WR_EN[31:0]) { 797 | is_isolated : true; 798 | } 799 | timing () { 800 | related_pin : "PORT_W_CLK" ; 801 | timing_type : hold_falling; 802 | rise_constraint(scalar) { 803 | values ( "0.0001" ); 804 | } 805 | fall_constraint(scalar) { 806 | values ( "0.0001" ); 807 | } 808 | } 809 | timing() { 810 | related_pin : "PORT_W_CLK"; 811 | timing_type : setup_falling; 812 | rise_constraint(scalar) { 813 | values ( "0.0001" ); 814 | } 815 | fall_constraint(scalar) { 816 | values ( "0.0001" ); 817 | } 818 | } 819 | } 820 | } 821 | 822 | cell (CKLNQD) { 823 | area : 100; 824 | cell_leakage_power : 0.160725; 825 | clock_gating_integrated_cell : latch_posedge_precontrol; 826 | 827 | statetable (" CP E", " QD ") { 828 | table : " L L : - : L , \ 829 | L H : - : H , \ 830 | H - : - : N "; 831 | } 832 | 833 | pin(CP) { 834 | clock : true; 835 | clock_gate_clock_pin : true; 836 | direction : input; 837 | capacitance : 0.0279235; 838 | } 839 | pin(E) { 840 | clock_gate_enable_pin : true; 841 | direction : input; 842 | capacitance : 0.00882947; 843 | timing () { 844 | related_pin : "CP"; 845 | timing_type : hold_rising; 846 | rise_constraint(scalar) { 847 | values ( "0.0001" ); 848 | } 849 | fall_constraint(scalar) { 850 | values ( "0.0001" ); 851 | } 852 | } 853 | timing () { 854 | related_pin : "CP"; 855 | timing_type : setup_rising; 856 | rise_constraint(scalar) { 857 | values ( "0.0001" ); 858 | } 859 | fall_constraint(scalar) { 860 | values ( "0.0001" ); 861 | } 862 | } 863 | } 864 | pin(Q) { 865 | clock_gate_out_pin : true; 866 | direction : output; 867 | state_function : "CP*QD"; 868 | timing () { 869 | related_pin : "CP"; 870 | timing_sense : positive_unate; 871 | cell_fall(scalar) { 872 | values ( "0.0001" ); 873 | } 874 | fall_transition(scalar) { 875 | values ( "0.0001" ); 876 | } 877 | cell_rise(scalar) { 878 | values ( "0.0001" ); 879 | } 880 | rise_transition(scalar) { 881 | values ( "0.0001" ); 882 | } 883 | } 884 | } 885 | pin(QD) { 886 | direction : internal; 887 | internal_node : "QD"; 888 | } 889 | } 890 | 891 | } 892 | -------------------------------------------------------------------------------- /aigpdk/aigpdk.v: 
-------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | // SPDX-License-Identifier: Apache-2.0 3 | module AND2_00_0 (A, B, Y); 4 | input A ; 5 | input B ; 6 | output Y ; 7 | 8 | assign Y = A & B; 9 | 10 | endmodule // AND2_00_0 11 | 12 | module AND2_01_0 (A, B, Y); 13 | input A ; 14 | input B ; 15 | output Y ; 16 | 17 | assign Y = A & ~B; 18 | 19 | endmodule // AND2_01_0 20 | 21 | module AND2_10_0 (A, B, Y); 22 | input A ; 23 | input B ; 24 | output Y ; 25 | 26 | assign Y = ~A & B; 27 | 28 | endmodule // AND2_10_0 29 | 30 | module AND2_11_0 (A, B, Y); 31 | input A ; 32 | input B ; 33 | output Y ; 34 | 35 | assign Y = ~A & ~B; 36 | 37 | endmodule // AND2_11_0 38 | 39 | module AND2_11_1 (A, B, Y); 40 | input A ; 41 | input B ; 42 | output Y ; 43 | 44 | assign Y = A | B; 45 | 46 | endmodule // AND2_11_1 47 | 48 | module INV (A, Y); 49 | input A ; 50 | output Y ; 51 | 52 | not (Y, A); 53 | 54 | endmodule // INV 55 | 56 | module BUF (A, Y); 57 | input A ; 58 | output Y ; 59 | 60 | assign Y = A; 61 | 62 | endmodule // BUF 63 | 64 | primitive udp_dff (out, in, clk, clr, set, NOTIFIER); 65 | output out; 66 | input in, clk, clr, set, NOTIFIER; 67 | reg out; 68 | 69 | table 70 | 71 | // in clk clr set NOT : Qt : Qt+1 72 | // 73 | 0 r ? 0 ? : ? : 0 ; // clock in 0 74 | 1 r 0 ? ? : ? : 1 ; // clock in 1 75 | 1 * 0 ? ? : 1 : 1 ; // reduce pessimism 76 | 0 * ? 0 ? : 0 : 0 ; // reduce pessimism 77 | ? f ? ? ? : ? : - ; // no changes on negedge clk 78 | * b ? ? ? : ? : - ; // no changes when in switches 79 | ? ? ? 1 ? : ? : 1 ; // set output 80 | ? b 0 * ? : 1 : 1 ; // cover all transitions on set 81 | 1 x 0 * ? : 1 : 1 ; // cover all transitions on set 82 | ? ? 1 0 ? : ? : 0 ; // reset output 83 | ? b * 0 ? : 0 : 0 ; // cover all transitions on clr 84 | 0 x * 0 ? : 0 : 0 ; // cover all transitions on clr 85 | ? ? ? ? * : ? : x ; // any notifier changed 86 | 87 | endtable 88 | endprimitive // udp_dff 89 | 90 | primitive udp_tlat (out, in, enable, clr, set, NOTIFIER); 91 | 92 | output out; 93 | input in, enable, clr, set, NOTIFIER; 94 | reg out; 95 | 96 | table 97 | 98 | // in enable clr set NOT : Qt : Qt+1 99 | // 100 | 1 1 0 ? ? : ? : 1 ; // 101 | 0 1 ? 0 ? : ? : 0 ; // 102 | 1 * 0 ? ? : 1 : 1 ; // reduce pessimism 103 | 0 * ? 0 ? : 0 : 0 ; // reduce pessimism 104 | * 0 ? ? ? : ? : - ; // no changes when in switches 105 | ? ? ? 1 ? : ? : 1 ; // set output 106 | ? 0 0 * ? : 1 : 1 ; // cover all transitions on set 107 | 1 ? 0 * ? : 1 : 1 ; // cover all transitions on set 108 | ? ? 1 0 ? : ? : 0 ; // reset output 109 | ? 0 * 0 ? : 0 : 0 ; // cover all transitions on clr 110 | 0 ? * 0 ? : 0 : 0 ; // cover all transitions on clr 111 | ? ? ? ? * : ? 
: x ; // any notifier changed 112 | 113 | endtable 114 | endprimitive // udp_tlat 115 | 116 | module DFF (CLK, D, Q); 117 | input CLK ; 118 | input D ; 119 | output Q ; 120 | reg NOTIFIER ; 121 | 122 | udp_dff (DS0000, D, CLK, 1'B0, 1'B0, NOTIFIER); 123 | not (P0002, DS0000); 124 | buf (Q, DS0000); 125 | 126 | endmodule // DFF 127 | 128 | module DFFSR (CLK, D, R, S, Q); 129 | input CLK ; 130 | input D ; 131 | input R ; 132 | input S ; 133 | output Q ; 134 | reg NOTIFIER ; 135 | 136 | not (I0_CLEAR, R); 137 | not (I0_SET, S); 138 | udp_dff (P0003, D_, CLK, I0_SET, I0_CLEAR, NOTIFIER); 139 | not (D_, D); 140 | not (P0002, P0003); 141 | buf (Q, P0002); 142 | and (\D&S , D, S); 143 | not (I7_out, D); 144 | and (\~D&R , I7_out, R); 145 | and (\S&R , S, R); 146 | 147 | endmodule // DFFSR 148 | 149 | // module LATCH (CLK, D, Q); 150 | // input CLK ; 151 | // input D ; 152 | // output Q ; 153 | // reg NOTIFIER ; 154 | 155 | // udp_tlat (DS0000, D, CLK, 1'B0, 1'B0, NOTIFIER); 156 | // not (P0000, DS0000); 157 | // buf (Q, DS0000); 158 | 159 | // endmodule 160 | 161 | module CKLNQD (CP, E, Q); 162 | (* gated_clock = "true" *) input CP; 163 | input E; 164 | output Q; 165 | reg QD; 166 | always @* begin 167 | if (~CP) QD <= E; 168 | end 169 | assign Q = CP & QD; 170 | endmodule 171 | -------------------------------------------------------------------------------- /aigpdk/memlib_yosys.txt: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # RAMGEM: virtual 13-bit address, 32-bit data ram. 5 | # size: 2**13 * 32 = 256k bits = 32 kB 6 | # one read port, one write port. (first read, then write) 7 | # read and write controlled by a single clock. 8 | 9 | ram block $__RAMGEM_SYNC_ { 10 | # Has 13 address bits 11 | abits 13; 12 | # The r/w width is 32 bits 13 | width 32; 14 | cost 1; 15 | # init any; 16 | byte 1; 17 | 18 | port sw "W" { 19 | clock posedge; 20 | } 21 | port sr "R" { 22 | clock posedge; 23 | } 24 | } 25 | 26 | # this is a trap cell: if it is used, we will print an error. 27 | # only sync memory is supported in the simulator. 28 | ram block $__RAMGEM_ASYNC_ { 29 | # Has 13 address bits 30 | abits 13; 31 | # The r/w width is 32 bits 32 | width 32; 33 | cost 100; 34 | # init any; 35 | byte 1; 36 | 37 | port sw "W" { 38 | clock posedge; 39 | } 40 | port ar "R" { 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /build.rs: -------------------------------------------------------------------------------- 1 | //! this build script compiles GEM kernels 2 | // SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
3 | // SPDX-License-Identifier: Apache-2.0 4 | 5 | fn main() { 6 | println!("Building cuda source files for GEM..."); 7 | println!("cargo:rerun-if-changed=csrc"); 8 | 9 | #[cfg(feature = "cuda")] { 10 | let csrc_headers = ucc::import_csrc(); 11 | let mut cl_cuda = ucc::cl_cuda(); 12 | cl_cuda.ccbin(false); 13 | cl_cuda.flag("-lineinfo"); 14 | cl_cuda.flag("-maxrregcount=128"); 15 | cl_cuda.debug(false).opt_level(3) 16 | .include(&csrc_headers) 17 | .files(["csrc/kernel_v1.cu"]); 18 | cl_cuda.compile("gemcu"); 19 | println!("cargo:rustc-link-lib=static=gemcu"); 20 | println!("cargo:rustc-link-lib=dylib=cudart"); 21 | ucc::bindgen(["csrc/kernel_v1.cu"], "kernel_v1.rs"); 22 | ucc::export_csrc(); 23 | ucc::make_compile_commands(&[&cl_cuda]); 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /csrc/kernel_v1.cu: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | #include "kernel_v1_impl.cuh" 5 | 6 | #define checkCudaErrors(call) \ 7 | do { \ 8 | cudaError_t err = call; \ 9 | if (err != cudaSuccess) { \ 10 | printf("CUDA error at %s %d: %s\n", __FILE__, __LINE__, \ 11 | cudaGetErrorString(err)); \ 12 | exit(EXIT_FAILURE); \ 13 | } \ 14 | } while (0) 15 | 16 | extern "C" 17 | void simulate_v1_noninteractive_simple_scan_cuda( 18 | usize num_blocks, 19 | usize num_major_stages, 20 | const usize *blocks_start, 21 | const u32 *blocks_data, 22 | u32 *sram_data, 23 | usize num_cycles, 24 | usize state_size, 25 | u32 *states_noninteractive 26 | ) 27 | { 28 | void *arg_ptrs[8] = { 29 | (void *)&num_blocks, (void *)&num_major_stages, 30 | (void *)&blocks_start, (void *)&blocks_data, 31 | (void *)&sram_data, (void *)&num_cycles, (void *)&state_size, 32 | (void *)&states_noninteractive 33 | }; 34 | checkCudaErrors(cudaLaunchCooperativeKernel( 35 | (void *)simulate_v1_noninteractive_simple_scan, num_blocks, 256, 36 | arg_ptrs, 0, (cudaStream_t)0 37 | )); 38 | } 39 | -------------------------------------------------------------------------------- /csrc/kernel_v1_impl.cuh: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | struct alignas(8) VectorRead2 { 9 | u32 c1, c2; 10 | 11 | __device__ __forceinline__ void read(const VectorRead2 *t) { 12 | *this = *t; 13 | } 14 | }; 15 | 16 | struct alignas(16) VectorRead4 { 17 | u32 c1, c2, c3, c4; 18 | 19 | __device__ __forceinline__ void read(const VectorRead4 *t) { 20 | *this = *t; 21 | } 22 | }; 23 | 24 | __device__ void simulate_block_v1( 25 | const u32 *__restrict__ script, 26 | usize script_size, 27 | const u32 *__restrict__ input_state, 28 | u32 *__restrict__ output_state, 29 | u32 *__restrict__ sram_data, 30 | u32 *__restrict__ shared_metadata, 31 | u32 *__restrict__ shared_writeouts, 32 | u32 *__restrict__ shared_state 33 | ) 34 | { 35 | int script_pi = 0; 36 | while(true) { 37 | VectorRead2 t2_1, t2_2; 38 | VectorRead4 t4_1, t4_2, t4_3, t4_4, t4_5; 39 | shared_metadata[threadIdx.x] = script[script_pi + threadIdx.x]; 40 | script_pi += 256; 41 | t2_1.read(((const VectorRead2 *)(script + script_pi)) + threadIdx.x); 42 | __syncthreads(); 43 | int num_stages = shared_metadata[0]; 44 | if(!num_stages) { 45 | break; 46 | } 47 | int is_last_part = shared_metadata[1]; 48 | int num_ios = shared_metadata[2]; 49 | int io_offset = shared_metadata[3]; 50 | int num_srams = shared_metadata[4]; 51 | int sram_offset = shared_metadata[5]; 52 | int num_global_read_rounds = shared_metadata[6]; 53 | int num_output_duplicates = shared_metadata[7]; 54 | u32 writeout_hook_i = shared_metadata[128 + threadIdx.x / 2]; 55 | if(threadIdx.x % 2 == 0) { 56 | writeout_hook_i = writeout_hook_i & ((1 << 16) - 1); 57 | } 58 | else { 59 | writeout_hook_i = writeout_hook_i >> 16; 60 | } 61 | 62 | t4_1.read((const VectorRead4 *)(script + script_pi + 256 * 2 * num_global_read_rounds) + threadIdx.x); 63 | t4_2.read((const VectorRead4 *)(script + script_pi + 256 * 2 * num_global_read_rounds + 256 * 4) + threadIdx.x); 64 | t4_3.read((const VectorRead4 *)(script + script_pi + 256 * 2 * num_global_read_rounds + 256 * 4 * 2) + threadIdx.x); 65 | t4_4.read((const VectorRead4 *)(script + script_pi + 256 * 2 * num_global_read_rounds + 256 * 4 * 3) + threadIdx.x); 66 | t4_5.read((const VectorRead4 *)(script + script_pi + 256 * 2 * num_global_read_rounds + 256 * 4 * 4) + threadIdx.x); 67 | u32 t_global_rd_state = 0; 68 | for(int gr_i = 0; gr_i < num_global_read_rounds; gr_i += 2) { 69 | u32 idx = t2_1.c1; 70 | u32 mask = t2_1.c2; 71 | script_pi += 256 * 2; 72 | t2_2.read(((const VectorRead2 *)(script + script_pi)) + threadIdx.x); 73 | if(mask) { 74 | const u32 *real_input_array; 75 | if(idx >> 31) real_input_array = output_state - (1 << 31); 76 | else real_input_array = input_state; 77 | u32 value = real_input_array[idx]; 78 | while(mask) { 79 | t_global_rd_state <<= 1; 80 | u32 lowbit = mask & -mask; 81 | if(value & lowbit) t_global_rd_state |= 1; 82 | mask ^= lowbit; 83 | } 84 | } 85 | 86 | if(gr_i + 1 >= num_global_read_rounds) break; 87 | idx = t2_2.c1; 88 | mask = t2_2.c2; 89 | script_pi += 256 * 2; 90 | t2_1.read(((const VectorRead2 *)(script + script_pi)) + threadIdx.x); 91 | if(mask) { 92 | const u32 *real_input_array; 93 | if(idx >> 31) real_input_array = output_state - (1 << 31); 94 | else real_input_array = input_state; 95 | u32 value = real_input_array[idx]; 96 | while(mask) { 97 | t_global_rd_state <<= 1; 98 | u32 lowbit = mask & -mask; 99 | if(value & lowbit) t_global_rd_state |= 1; 100 | mask ^= lowbit; 101 | } 102 | } 103 | } 104 | shared_state[threadIdx.x] = t_global_rd_state; 105 | __syncthreads(); 
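// Worked example of the gather loop above (illustrative values only, not a
// fixed part of the script format): suppose a thread reads value = 0b1011
// under mask = 0b0110. "mask & -mask" isolates the lowest set bit, so the
// loop walks mask from its least to its most significant set bit:
//   iteration 1: lowbit = 0b0010, value bit = 1 -> t_global_rd_state = 0b01
//   iteration 2: lowbit = 0b0100, value bit = 0 -> t_global_rd_state = 0b10
// The bits of value selected by mask are thus compacted into
// t_global_rd_state, with lower mask positions ending up in higher result
// bits because the accumulator is shifted left before each OR.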
106 | 107 | for(int bs_i = 0; bs_i < num_stages; ++bs_i) { 108 | u32 hier_input = 0, hier_flag_xora = 0, hier_flag_xorb = 0, hier_flag_orb = 0; 109 | #define GEMV1_SHUF_INPUT_K(k_outer, k_inner, t_shuffle) { \ 110 | u32 k = k_outer * 4 + k_inner; \ 111 | u32 t_shuffle_1_idx = t_shuffle & ((1 << 16) - 1); \ 112 | u32 t_shuffle_2_idx = t_shuffle >> 16; \ 113 | \ 114 | hier_input |= (shared_state[t_shuffle_1_idx >> 5] >> \ 115 | (t_shuffle_1_idx & 31) & 1) << (k * 2); \ 116 | hier_input |= (shared_state[t_shuffle_2_idx >> 5] >> \ 117 | (t_shuffle_2_idx & 31) & 1) << (k * 2 + 1); \ 118 | } 119 | #define GEMV1_SHUF_INPUT_K_4(k_outer, t_shuffle) { \ 120 | GEMV1_SHUF_INPUT_K(k_outer, 0, t_shuffle.c1); \ 121 | GEMV1_SHUF_INPUT_K(k_outer, 1, t_shuffle.c2); \ 122 | GEMV1_SHUF_INPUT_K(k_outer, 2, t_shuffle.c3); \ 123 | GEMV1_SHUF_INPUT_K(k_outer, 3, t_shuffle.c4); \ 124 | } 125 | script_pi += 256 * 4 * 5; 126 | GEMV1_SHUF_INPUT_K_4(0, t4_1); 127 | t4_1.read(((const VectorRead4 *)(script + script_pi)) + threadIdx.x); 128 | GEMV1_SHUF_INPUT_K_4(1, t4_2); 129 | t4_2.read(((const VectorRead4 *)(script + script_pi + 256 * 4)) + threadIdx.x); 130 | GEMV1_SHUF_INPUT_K_4(2, t4_3); 131 | t4_3.read(((const VectorRead4 *)(script + script_pi + 256 * 4 * 2)) + threadIdx.x); 132 | GEMV1_SHUF_INPUT_K_4(3, t4_4); 133 | t4_4.read(((const VectorRead4 *)(script + script_pi + 256 * 4 * 3)) + threadIdx.x); 134 | #undef GEMV1_SHUF_INPUT_K 135 | #undef GEMV1_SHUF_INPUT_K_4 136 | hier_flag_xora = t4_5.c1; 137 | hier_flag_xorb = t4_5.c2; 138 | hier_flag_orb = t4_5.c3; 139 | t4_5.read(((const VectorRead4 *)(script + script_pi + 256 * 4 * 4)) + threadIdx.x); 140 | 141 | __syncthreads(); 142 | shared_state[threadIdx.x] = hier_input; 143 | __syncthreads(); 144 | 145 | // hier[0] 146 | if(threadIdx.x >= 128) { 147 | u32 hier_input_a = shared_state[threadIdx.x - 128]; 148 | u32 hier_input_b = hier_input; 149 | u32 ret = (hier_input_a ^ hier_flag_xora) & ((hier_input_b ^ hier_flag_xorb) | hier_flag_orb); 150 | shared_state[threadIdx.x] = ret; 151 | } 152 | __syncthreads(); 153 | // hier[1..3] 154 | u32 tmp_cur_hi; 155 | for(int hi = 1; hi <= 3; ++hi) { 156 | int hier_width = 1 << (7 - hi); 157 | if(threadIdx.x >= hier_width && threadIdx.x < hier_width * 2) { 158 | u32 hier_input_a = shared_state[threadIdx.x + hier_width]; 159 | u32 hier_input_b = shared_state[threadIdx.x + hier_width * 2]; 160 | u32 ret = (hier_input_a ^ hier_flag_xora) & ((hier_input_b ^ hier_flag_xorb) | hier_flag_orb); 161 | tmp_cur_hi = ret; 162 | shared_state[threadIdx.x] = ret; 163 | } 164 | __syncthreads(); 165 | } 166 | // hier[4..7], within the first warp. 
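            // Annotation (added commentary): every level of this reduction tree
            // evaluates, per bit,
            //     ret = (a ^ xora) & ((b ^ xorb) | orb)
            // i.e. one AND gate with programmable per-bit input inversions:
            //     xora=0, xorb=0, orb=0  ->  a AND b
            //     xora=1, xorb=1, orb=0  ->  (~a) AND (~b)   (NOR)
            //     orb=1                  ->  a ^ xora        (b ignored; buffer/inverter)
            // Levels 4..7 below fit inside a single warp, so shared-memory traffic
            // is replaced by register shuffles (__shfl_down_sync).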
167 | if(threadIdx.x < 32) { 168 | for(int hi = 4; hi <= 7; ++hi) { 169 | int hier_width = 1 << (7 - hi); 170 | u32 hier_input_a = __shfl_down_sync(0xffffffff, tmp_cur_hi, hier_width); 171 | u32 hier_input_b = __shfl_down_sync(0xffffffff, tmp_cur_hi, hier_width * 2); 172 | if(threadIdx.x >= hier_width && threadIdx.x < hier_width * 2) { 173 | tmp_cur_hi = (hier_input_a ^ hier_flag_xora) & ((hier_input_b ^ hier_flag_xorb) | hier_flag_orb); 174 | } 175 | } 176 | u32 v1 = __shfl_down_sync(0xffffffff, tmp_cur_hi, 1); 177 | // hier[8..12] 178 | if(threadIdx.x == 0) { 179 | u32 r8 = ((v1 << 16) ^ hier_flag_xora) & ((v1 ^ hier_flag_xorb) | hier_flag_orb) & 0xffff0000; 180 | u32 r9 = ((r8 >> 8) ^ hier_flag_xora) & (((r8 >> 16) ^ hier_flag_xorb) | hier_flag_orb) & 0xff00; 181 | u32 r10 = ((r9 >> 4) ^ hier_flag_xora) & (((r9 >> 8) ^ hier_flag_xorb) | hier_flag_orb) & 0xf0; 182 | u32 r11 = ((r10 >> 2) ^ hier_flag_xora) & (((r10 >> 4) ^ hier_flag_xorb) | hier_flag_orb) & 12 /* 0b1100 */; 183 | u32 r12 = ((r11 >> 1) ^ hier_flag_xora) & (((r11 >> 2) ^ hier_flag_xorb) | hier_flag_orb) & 2 /* 0b10 */; 184 | tmp_cur_hi = r8 | r9 | r10 | r11 | r12; 185 | } 186 | shared_state[threadIdx.x] = tmp_cur_hi; 187 | } 188 | __syncthreads(); 189 | 190 | // write out 191 | if((writeout_hook_i >> 8) == bs_i) { 192 | shared_writeouts[threadIdx.x] = shared_state[writeout_hook_i & 255]; 193 | } 194 | } 195 | __syncthreads(); 196 | 197 | // sram & duplicate permutation 198 | u32 sram_duplicate_t = 0; 199 | #define GEMV1_SHUF_SRAM_DUPL_K(k_outer, k_inner, t_shuffle) { \ 200 | u32 k = k_outer * 4 + k_inner; \ 201 | u32 t_shuffle_1_idx = t_shuffle & ((1 << 16) - 1); \ 202 | u32 t_shuffle_2_idx = t_shuffle >> 16; \ 203 | \ 204 | sram_duplicate_t |= \ 205 | (shared_writeouts[t_shuffle_1_idx >> 5] >> \ 206 | (t_shuffle_1_idx & 31) & 1) << (k * 2); \ 207 | sram_duplicate_t |= \ 208 | (shared_writeouts[t_shuffle_2_idx >> 5] >> \ 209 | (t_shuffle_2_idx & 31) & 1) << (k * 2 + 1); \ 210 | } 211 | #define GEMV1_SHUF_SRAM_DUPL_K_4(k_outer, t_shuffle) { \ 212 | GEMV1_SHUF_SRAM_DUPL_K(k_outer, 0, t_shuffle.c1); \ 213 | GEMV1_SHUF_SRAM_DUPL_K(k_outer, 1, t_shuffle.c2); \ 214 | GEMV1_SHUF_SRAM_DUPL_K(k_outer, 2, t_shuffle.c3); \ 215 | GEMV1_SHUF_SRAM_DUPL_K(k_outer, 3, t_shuffle.c4); \ 216 | } 217 | script_pi += 256 * 4 * 5; 218 | GEMV1_SHUF_SRAM_DUPL_K_4(0, t4_1); 219 | t4_1.read(((const VectorRead4 *)(script + script_pi)) + threadIdx.x); 220 | GEMV1_SHUF_SRAM_DUPL_K_4(1, t4_2); 221 | t4_2.read(((const VectorRead4 *)(script + script_pi + 256 * 4)) + threadIdx.x); 222 | GEMV1_SHUF_SRAM_DUPL_K_4(2, t4_3); 223 | t4_3.read(((const VectorRead4 *)(script + script_pi + 256 * 4 * 2)) + threadIdx.x); 224 | GEMV1_SHUF_SRAM_DUPL_K_4(3, t4_4); 225 | t4_4.read(((const VectorRead4 *)(script + script_pi + 256 * 4 * 3)) + threadIdx.x); 226 | #undef GEMV1_SHUF_SRAM_DUPL_K_4 227 | #undef GEMV1_SHUF_SRAM_DUPL_K 228 | sram_duplicate_t = (sram_duplicate_t & ~t4_5.c2) ^ t4_5.c1; 229 | t4_5.read(((const VectorRead4 *)(script + script_pi + 256 * 4 * 4)) + threadIdx.x); 230 | 231 | // sram read fires here. 232 | u32 *ram = nullptr; 233 | u32 r, w0; 234 | u32 port_w_addr_iv, port_w_wr_en, port_w_wr_data_iv; 235 | if(threadIdx.x < num_srams * 4) { 236 | u32 addrs = sram_duplicate_t; 237 | u32 last_tid = 32 + threadIdx.x / 32 * 32; 238 | u32 mask = (last_tid <= num_srams * 4) 239 | ? 
0xffffffff : (0xffffffff >> (last_tid - num_srams * 4)); 240 | port_w_wr_en = __shfl_down_sync(mask, sram_duplicate_t, 1); 241 | port_w_wr_data_iv = __shfl_down_sync(mask, sram_duplicate_t, 2); 242 | 243 | if(threadIdx.x % 4 == 0) { 244 | u32 sram_i = threadIdx.x / 4; 245 | u32 sram_st = sram_offset + sram_i * (1 << 13); 246 | // u32 sram_ed = sram_st + (1 << 13); 247 | u32 port_r_addr_iv = addrs & 0xffff; 248 | port_w_addr_iv = addrs >> 16; 249 | 250 | ram = sram_data + sram_st; 251 | r = ram[port_r_addr_iv]; 252 | w0 = ram[port_w_addr_iv]; 253 | } 254 | } 255 | // __syncthreads(); 256 | 257 | // clock enable permutation 258 | u32 clken_perm = 0; 259 | #define GEMV1_SHUF_CLKEN_K(k_outer, k_inner, t_shuffle) { \ 260 | u32 k = k_outer * 4 + k_inner; \ 261 | u32 t_shuffle_1_idx = t_shuffle & ((1 << 16) - 1); \ 262 | u32 t_shuffle_2_idx = t_shuffle >> 16; \ 263 | \ 264 | clken_perm |= \ 265 | (shared_writeouts[t_shuffle_1_idx >> 5] >> \ 266 | (t_shuffle_1_idx & 31) & 1) << (k * 2); \ 267 | clken_perm |= \ 268 | (shared_writeouts[t_shuffle_2_idx >> 5] >> \ 269 | (t_shuffle_2_idx & 31) & 1) << (k * 2 + 1); \ 270 | } 271 | #define GEMV1_SHUF_CLKEN_K_4(k_outer, t_shuffle) { \ 272 | GEMV1_SHUF_CLKEN_K(k_outer, 0, t_shuffle.c1); \ 273 | GEMV1_SHUF_CLKEN_K(k_outer, 1, t_shuffle.c2); \ 274 | GEMV1_SHUF_CLKEN_K(k_outer, 2, t_shuffle.c3); \ 275 | GEMV1_SHUF_CLKEN_K(k_outer, 3, t_shuffle.c4); \ 276 | } 277 | script_pi += 256 * 4 * 5; 278 | GEMV1_SHUF_CLKEN_K_4(0, t4_1); 279 | GEMV1_SHUF_CLKEN_K_4(1, t4_2); 280 | GEMV1_SHUF_CLKEN_K_4(2, t4_3); 281 | GEMV1_SHUF_CLKEN_K_4(3, t4_4); 282 | #undef GEMV1_SHUF_CLKEN_K 283 | #undef GEMV1_SHUF_CLKEN_K_4 284 | 285 | // sram commit 286 | if(threadIdx.x < num_srams * 4) { 287 | if(threadIdx.x % 4 == 0) { 288 | u32 sram_i = threadIdx.x / 4; 289 | shared_writeouts[num_ios - num_srams + sram_i] = r; 290 | ram[port_w_addr_iv] = (w0 & ~port_w_wr_en) | (port_w_wr_data_iv & port_w_wr_en); 291 | } 292 | } 293 | else if(threadIdx.x < num_srams * 4 + num_output_duplicates) { 294 | shared_writeouts[num_ios - num_srams - num_output_duplicates + (threadIdx.x - num_srams * 4)] = sram_duplicate_t; 295 | } 296 | 297 | __syncthreads(); 298 | u32 writeout_inv = shared_writeouts[threadIdx.x]; 299 | 300 | clken_perm = (clken_perm & ~t4_5.c2) ^ t4_5.c1; 301 | writeout_inv ^= t4_5.c3; 302 | 303 | if(threadIdx.x < num_ios) { 304 | u32 old_wo = input_state[io_offset + threadIdx.x]; 305 | u32 wo = (old_wo & ~clken_perm) | (writeout_inv & clken_perm); 306 | output_state[io_offset + threadIdx.x] = wo; 307 | } 308 | 309 | if(is_last_part) break; 310 | } 311 | assert(script_size == script_pi); 312 | } 313 | 314 | __global__ void simulate_v1_noninteractive_simple_scan( 315 | usize num_blocks, 316 | usize num_major_stages, 317 | const usize *__restrict__ blocks_start, 318 | const u32 *__restrict__ blocks_data, 319 | u32 *__restrict__ sram_data, 320 | usize num_cycles, 321 | usize state_size, 322 | u32 *__restrict__ states_noninteractive 323 | ) 324 | { 325 | assert(num_blocks == gridDim.x); 326 | assert(256 == blockDim.x); 327 | __shared__ u32 shared_metadata[256]; 328 | __shared__ u32 shared_writeouts[256]; 329 | __shared__ u32 shared_state[256]; 330 | __shared__ u32 script_starts[32], script_sizes[32]; 331 | assert(num_major_stages <= 32); 332 | if(threadIdx.x < num_major_stages) { 333 | script_starts[threadIdx.x] = blocks_start[threadIdx.x * num_blocks + blockIdx.x]; 334 | script_sizes[threadIdx.x] = blocks_start[threadIdx.x * num_blocks + blockIdx.x + 1] - script_starts[threadIdx.x]; 335 | } 
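    // Annotation (added commentary): blocks_start appears to be laid out
    // stage-major, one offset per (stage, block) pair plus a trailing entry,
    // so the script of block b at stage s spans
    //     blocks_data[blocks_start[s * num_blocks + b] ..
    //                 blocks_start[s * num_blocks + b + 1]]
    // which is the difference computed above.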
335 |     }
336 |     __syncthreads();
337 |     for(usize cycle_i = 0; cycle_i < num_cycles; ++cycle_i) {
338 |         for(usize stage_i = 0; stage_i < num_major_stages; ++stage_i) {
339 |             simulate_block_v1(
340 |                 blocks_data + script_starts[stage_i],
341 |                 script_sizes[stage_i],
342 |                 states_noninteractive + cycle_i * state_size,
343 |                 states_noninteractive + (cycle_i + 1) * state_size,
344 |                 sram_data,
345 |                 shared_metadata, shared_writeouts, shared_state
346 |             );
347 |             cooperative_groups::this_grid().sync();
348 |         }
349 |     }
350 | }
351 | 
--------------------------------------------------------------------------------
/src/aig.rs:
--------------------------------------------------------------------------------
1 | // SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2 | // SPDX-License-Identifier: Apache-2.0
3 | //! And-inverter graph format
4 | //!
5 | //! An AIG is derived from netlistdb synthesized in AIGPDK.
6 | 
7 | use netlistdb::{NetlistDB, GeneralPinName, Direction};
8 | use indexmap::{IndexMap, IndexSet};
9 | use crate::aigpdk::AIGPDK_SRAM_ADDR_WIDTH;
10 | 
11 | /// A DFF.
12 | #[derive(Debug, Default, Clone)]
13 | pub struct DFF {
14 |     /// The D input pin with invert (last bit)
15 |     pub d_iv: usize,
16 |     /// If the DFF is enabled, i.e., if the clock, S, or R is active.
17 |     pub en_iv: usize,
18 |     /// The Q pin output with invert.
19 |     pub q: usize,
20 | }
21 | 
22 | /// A RAM block resembling the interface of `$__RAMGEM_SYNC_`.
23 | #[derive(Debug, Default, Clone)]
24 | pub struct RAMBlock {
25 |     pub port_r_addr_iv: [usize; AIGPDK_SRAM_ADDR_WIDTH],
26 | 
27 |     /// controls whether r_rd_data should update. (from read clock)
28 |     pub port_r_en_iv: usize,
29 |     pub port_r_rd_data: [usize; 32],
30 | 
31 |     pub port_w_addr_iv: [usize; AIGPDK_SRAM_ADDR_WIDTH],
32 |     /// controls whether memory should be updated.
33 |     ///
34 |     /// this is a combination of write enable and write clock.
35 |     pub port_w_wr_en_iv: [usize; 32],
36 |     pub port_w_wr_data_iv: [usize; 32],
37 | }
38 | 
39 | /// A type of endpoint group. Can be a primary output-related pin,
40 | /// a D flip-flop, or a RAM block.
41 | ///
42 | /// A group means a task for the partition to complete.
43 | /// For primary output pins, the task is just to store.
44 | /// For DFFs, the task is to store only when the clock is enabled.
45 | /// For RAMBlocks, the task is to simulate a sync SRAM.
46 | /// A StagedIOPin indicates a temporary live pin between different
47 | /// major stages that resides in the same simulated cycle.
48 | #[derive(Debug, Copy, Clone)]
49 | pub enum EndpointGroup<'i> {
50 |     PrimaryOutput(usize),
51 |     DFF(&'i DFF),
52 |     RAMBlock(&'i RAMBlock),
53 |     StagedIOPin(usize),
54 | }
55 | 
56 | impl EndpointGroup<'_> {
57 |     /// Enumerate all related aigpin inputs for this endpoint group.
58 |     ///
59 |     /// The enumerated inputs may have duplicates.
60 |     pub fn for_each_input(self, mut f_nz: impl FnMut(usize)) {
61 |         let mut f = |i| {
62 |             if i >= 1 { f_nz(i); }
63 |         };
64 |         match self {
65 |             Self::PrimaryOutput(idx) => f(idx >> 1),
66 |             Self::DFF(dff) => {
67 |                 f(dff.en_iv >> 1);
68 |                 f(dff.d_iv >> 1);
69 |             },
70 |             Self::RAMBlock(ram) => {
71 |                 f(ram.port_r_en_iv >> 1);
72 |                 for i in 0..13 {
73 |                     f(ram.port_r_addr_iv[i] >> 1);
74 |                     f(ram.port_w_addr_iv[i] >> 1);
75 |                 }
76 |                 for i in 0..32 {
77 |                     f(ram.port_w_wr_en_iv[i] >> 1);
78 |                     f(ram.port_w_wr_data_iv[i] >> 1);
79 |                 }
80 |             },
81 |             Self::StagedIOPin(idx) => f(idx),
82 |         }
83 |     }
84 | }
85 | 
86 | /// The driver type of an AIG pin.
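///
/// (Added annotation) Throughout this module an `_iv` value packs an AIG
/// pin id and an inversion flag as `iv = (aigpin << 1) | invert`: `iv >> 1`
/// recovers the pin and `iv ^ 1` flips the polarity. Since aigpin 0 is tied
/// to zero, the iv constants 0 and 1 denote constant false and constant true.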
87 | #[derive(Debug, Clone)]
88 | pub enum DriverType {
89 |     /// Driven by an and gate.
90 |     ///
91 |     /// The inversion bit is stored as the last bits in
92 |     /// two input indices.
93 |     ///
94 |     /// Only this type has combinational fan-in.
95 |     AndGate(usize, usize),
96 |     /// Driven by a primary input port (with its netlistdb id).
97 |     InputPort(usize),
98 |     /// Driven by a clock flag (with clock port netlistdb id, and pos/negedge)
99 |     InputClockFlag(usize, u8),
100 |     /// Driven by a DFF (with its index)
101 |     DFF(usize),
102 |     /// Driven by a 13-bit by 32-bit RAM block (with its index)
103 |     SRAM(usize),
104 |     /// Tie0: tied to zero. Only the 0-th aig pin is allowed to have this.
105 |     Tie0
106 | }
107 | 
108 | /// An AIG associated with a netlistdb.
109 | #[derive(Debug, Default)]
110 | pub struct AIG {
111 |     /// The number of AIG pins.
112 |     ///
113 |     /// This number might be smaller than num_pins in netlistdb,
114 |     /// because inverters and buffers are merged when possible.
115 |     /// It might also be larger because we may add mux circuits.
116 |     ///
117 |     /// AIG pins are numbered from 1 to num_aigpins inclusive.
118 |     /// The AIG pin id zero (0) is tied to 0.
119 |     ///
120 |     /// AIG pins are guaranteed to have topological order.
121 |     pub num_aigpins: usize,
122 |     /// The mapping from a netlistdb pin to an AIG pin.
123 |     ///
124 |     /// The inversion bit is stored as the last bit.
125 |     /// E.g., `pin2aigpin_iv[pin_id] = aigpin_id << 1 | invert`.
126 |     pub pin2aigpin_iv: Vec<usize>,
127 |     /// The clock pins map. Every clock pin has a pair of flag pins
128 |     /// showing if they are posedge/negedge.
129 |     ///
130 |     /// The flag pin can be empty which means the circuit is not
131 |     /// active with that edge.
132 |     pub clock_pin2aigpins: IndexMap<usize, (usize, usize)>,
133 |     /// The driver types of AIG pins.
134 |     pub drivers: Vec<DriverType>,
135 |     /// A cache for identical and gates.
136 |     pub and_gate_cache: IndexMap<(usize, usize), usize>,
137 |     /// Unique primary output aigpin indices
138 |     pub primary_outputs: IndexSet<usize>,
139 |     /// The D flip-flops (DFFs), indexed by cell id
140 |     pub dffs: IndexMap<usize, DFF>,
141 |     /// The SRAMs, indexed by cell id
142 |     pub srams: IndexMap<usize, RAMBlock>,
143 |     /// The fanout CSR start array.
144 |     pub fanouts_start: Vec<usize>,
145 |     /// The fanout CSR array.
146 |     pub fanouts: Vec<usize>,
147 | }
148 | 
149 | impl AIG {
150 |     fn add_aigpin(&mut self, driver: DriverType) -> usize {
151 |         self.num_aigpins += 1;
152 |         self.drivers.push(driver);
153 |         self.num_aigpins
154 |     }
155 | 
156 |     fn add_and_gate(&mut self, a: usize, b: usize) -> usize {
157 |         assert_ne!(a | 1, usize::MAX);
158 |         assert_ne!(b | 1, usize::MAX);
159 |         if a == 0 || b == 0 {
160 |             return 0
161 |         }
162 |         if a == 1 {
163 |             return b
164 |         }
165 |         if b == 1 {
166 |             return a
167 |         }
168 |         let (a, b) = if a < b { (a, b) } else { (b, a) };
169 |         if let Some(o) = self.and_gate_cache.get(&(a, b)) {
170 |             return o << 1;
171 |         }
172 |         let aigpin = self.add_aigpin(DriverType::AndGate(a, b));
173 |         self.and_gate_cache.insert((a, b), aigpin);
174 |         aigpin << 1
175 |     }
176 | 
177 |     /// given a clock pin, trace back to clock root and return its
178 |     /// enable signal (with invert bit).
179 |     ///
180 |     /// if the result is 0, that means the pin is dangling.
181 |     /// if an error occurs because of an undecipherable multi-input cell,
182 |     /// we will return in error the last output pin index of that cell.
183 |     fn trace_clock_pin(
184 |         &mut self,
185 |         netlistdb: &NetlistDB,
186 |         pinid: usize, is_negedge: bool,
187 |         // should we ignore cklnqd in this tracing.
188 |         // if set to true, we will treat cklnqd as a simple buffer.
189 |         // otherwise, we assert that cklnqd/en is already built in
190 |         // our aig mapping (pin2aigpin_iv).
191 |         ignore_cklnqd: bool,
192 |     ) -> Result<usize, usize> {
193 |         if netlistdb.pindirect[pinid] == Direction::I {
194 |             let netid = netlistdb.pin2net[pinid];
195 |             if Some(netid) == netlistdb.net_zero || Some(netid) == netlistdb.net_one {
196 |                 return Ok(0)
197 |             }
198 |             let root = netlistdb.net2pin.items[
199 |                 netlistdb.net2pin.start[netid]
200 |             ];
201 |             return self.trace_clock_pin(
202 |                 netlistdb, root, is_negedge,
203 |                 ignore_cklnqd
204 |             )
205 |         }
206 |         let cellid = netlistdb.pin2cell[pinid];
207 |         if cellid == 0 {
208 |             let clkentry = self.clock_pin2aigpins.entry(pinid)
209 |                 .or_insert((usize::MAX, usize::MAX));
210 |             let clksignal = match is_negedge {
211 |                 false => clkentry.0,
212 |                 true => clkentry.1
213 |             };
214 |             if clksignal != usize::MAX {
215 |                 return Ok(clksignal << 1)
216 |             }
217 |             let aigpin = self.add_aigpin(DriverType::InputClockFlag(pinid, is_negedge as u8));
218 |             let clkentry = self.clock_pin2aigpins.get_mut(&pinid).unwrap();
219 |             let clksignal = match is_negedge {
220 |                 false => &mut clkentry.0,
221 |                 true => &mut clkentry.1
222 |             };
223 |             *clksignal = aigpin;
224 |             return Ok(aigpin << 1)
225 |         }
226 |         let mut pin_a = usize::MAX;
227 |         let mut pin_cp = usize::MAX;
228 |         let mut pin_en = usize::MAX;
229 |         let celltype = netlistdb.celltypes[cellid].as_str();
230 |         if !matches!(celltype, "INV" | "BUF" | "CKLNQD") {
231 |             clilog::error!("cell type {} not supported on clock path; expecting only INV, BUF, or CKLNQD", celltype);
232 |             return Err(pinid)
233 |         }
234 |         for ipin in netlistdb.cell2pin.iter_set(cellid) {
235 |             if netlistdb.pindirect[ipin] == Direction::I {
236 |                 match netlistdb.pinnames[ipin].1.as_str() {
237 |                     "A" => pin_a = ipin,
238 |                     "CP" => pin_cp = ipin,
239 |                     "E" => pin_en = ipin,
240 |                     i @ _ => {
241 |                         clilog::error!("input pin {} unexpected for ck element {}", i, celltype);
242 |                         return Err(ipin)
243 |                     }
244 |                 }
245 |             }
246 |         }
247 |         match celltype {
248 |             "INV" => {
249 |                 assert_ne!(pin_a, usize::MAX);
250 |                 self.trace_clock_pin(
251 |                     netlistdb, pin_a, !is_negedge,
252 |                     ignore_cklnqd
253 |                 )
254 |             },
255 |             "BUF" => {
256 |                 assert_ne!(pin_a, usize::MAX);
257 |                 self.trace_clock_pin(
258 |                     netlistdb, pin_a, is_negedge,
259 |                     ignore_cklnqd
260 |                 )
261 |             },
262 |             "CKLNQD" => {
263 |                 assert_ne!(pin_cp, usize::MAX);
264 |                 assert_ne!(pin_en, usize::MAX);
265 |                 let ck_iv = self.trace_clock_pin(
266 |                     netlistdb, pin_cp, is_negedge,
267 |                     ignore_cklnqd
268 |                 )?;
269 |                 if ignore_cklnqd {
270 |                     return Ok(ck_iv)
271 |                 }
272 |                 let en_iv = self.pin2aigpin_iv[pin_en];
273 |                 assert_ne!(en_iv, usize::MAX, "clken not built");
274 |                 Ok(self.add_and_gate(ck_iv, en_iv))
275 |             },
276 |             _ => unreachable!()
277 |         }
278 |     }
279 | 
280 |     /// recursively add aig pins for netlistdb pins
281 |     ///
282 |     /// for sequential logic like DFF and RAM,
283 |     /// 1. their netlist pin inputs are not patched,
284 |     /// 2. their aig pin inputs (in dffs and srams arrays) will be
285 |     ///    patched to include mux -- but not inside this function.
286 |     /// 3. their netlist/aig outputs are directly built here,
287 |     ///    with possible patches for asynchronous DFFSR polyfill.
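    ///
    /// (Added annotation: the DFFSR patch below computes
    /// `q_out = (q OR NOT s_iv) AND r_iv`, with absent S/R pins defaulting
    /// to the constant-true iv 1, so they have no effect unless connected.)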
288 |     fn dfs_netlistdb_build_aig(
289 |         &mut self,
290 |         netlistdb: &NetlistDB,
291 |         topo_vis: &mut Vec<bool>,
292 |         topo_instack: &mut Vec<bool>,
293 |         pinid: usize
294 |     ) {
295 |         if topo_instack[pinid] {
296 |             panic!("circuit has a loop around pin {}",
297 |                    netlistdb.pinnames[pinid].dbg_fmt_pin());
298 |         }
299 |         if topo_vis[pinid] {
300 |             return
301 |         }
302 |         topo_vis[pinid] = true;
303 |         topo_instack[pinid] = true;
304 |         let netid = netlistdb.pin2net[pinid];
305 |         let cellid = netlistdb.pin2cell[pinid];
306 |         let celltype = netlistdb.celltypes[cellid].as_str();
307 |         if netlistdb.pindirect[pinid] == Direction::I {
308 |             if Some(netid) == netlistdb.net_zero {
309 |                 self.pin2aigpin_iv[pinid] = 0;
310 |             }
311 |             else if Some(netid) == netlistdb.net_one {
312 |                 self.pin2aigpin_iv[pinid] = 1;
313 |             }
314 |             else {
315 |                 let root = netlistdb.net2pin.items[
316 |                     netlistdb.net2pin.start[netid]
317 |                 ];
318 |                 self.dfs_netlistdb_build_aig(
319 |                     netlistdb, topo_vis, topo_instack,
320 |                     root
321 |                 );
322 |                 self.pin2aigpin_iv[pinid] = self.pin2aigpin_iv[root];
323 |                 if cellid == 0 {
324 |                     self.primary_outputs.insert(self.pin2aigpin_iv[pinid]);
325 |                 }
326 |             }
327 |         }
328 |         else if cellid == 0 {
329 |             let aigpin = self.add_aigpin(
330 |                 DriverType::InputPort(pinid)
331 |             );
332 |             self.pin2aigpin_iv[pinid] = aigpin << 1;
333 |         }
334 |         else if matches!(celltype, "DFF" | "DFFSR") {
335 |             let q = self.add_aigpin(DriverType::DFF(cellid));
336 |             let dff = self.dffs.entry(cellid).or_default();
337 |             dff.q = q;
338 |             let mut ap_s_iv = 1;
339 |             let mut ap_r_iv = 1;
340 |             let mut q_out = q << 1;
341 |             for pinid in netlistdb.cell2pin.iter_set(cellid) {
342 |                 if !matches!(netlistdb.pinnames[pinid].1.as_str(), "S" | "R") {
343 |                     continue
344 |                 }
345 |                 self.dfs_netlistdb_build_aig(
346 |                     netlistdb, topo_vis, topo_instack, pinid
347 |                 );
348 |                 let prev = self.pin2aigpin_iv[pinid];
349 |                 match netlistdb.pinnames[pinid].1.as_str() {
350 |                     "S" => ap_s_iv = prev,
351 |                     "R" => ap_r_iv = prev,
352 |                     _ => unreachable!()
353 |                 }
354 |             }
355 |             q_out = self.add_and_gate(q_out ^ 1, ap_s_iv) ^ 1;
356 |             q_out = self.add_and_gate(q_out, ap_r_iv);
357 |             self.pin2aigpin_iv[pinid] = q_out;
358 |         }
359 |         else if celltype == "LATCH" {
360 |             panic!("latches are intentionally UNSUPPORTED by GEM, \
361 |                     except in identified gated clocks. \n\
362 |                     you can link a FF&MUX-based LATCH module, \
363 |                     but most likely that is NOT the right solution. \n\
364 |                     check all your assignments inside always@(*) block \
365 |                     to make sure they cover all scenarios.");
366 |         }
367 |         else if celltype == "$__RAMGEM_SYNC_" {
368 |             let o = self.add_aigpin(DriverType::SRAM(cellid));
369 |             self.pin2aigpin_iv[pinid] = o << 1;
370 |             assert_eq!(netlistdb.pinnames[pinid].1.as_str(),
371 |                        "PORT_R_RD_DATA");
372 |             let sram = self.srams.entry(cellid).or_default();
373 |             sram.port_r_rd_data[netlistdb.pinnames[pinid].2.unwrap() as usize] = o;
374 |         }
375 |         else if celltype == "CKLNQD" {
376 |             let mut prev_cp = usize::MAX;
377 |             let mut prev_en = usize::MAX;
378 |             for pinid in netlistdb.cell2pin.iter_set(cellid) {
379 |                 match netlistdb.pinnames[pinid].1.as_str() {
380 |                     "CP" => prev_cp = pinid,
381 |                     "E" => prev_en = pinid,
382 |                     _ => {}
383 |                 }
384 |             }
385 |             assert_ne!(prev_cp, usize::MAX);
386 |             assert_ne!(prev_en, usize::MAX);
387 |             for prev in [prev_cp, prev_en] {
388 |                 self.dfs_netlistdb_build_aig(
389 |                     netlistdb, topo_vis, topo_instack,
390 |                     prev
391 |                 );
392 |             }
393 |             // do not define pin2aigpin_iv[pinid] which is CKLNQD/Q and unused in logic.
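            // (Added annotation: consumers of the gated clock get it from
            // trace_clock_pin instead, which rebuilds it as AND(CP, E).)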
394 | } 395 | else { 396 | let mut prev_a = usize::MAX; 397 | let mut prev_b = usize::MAX; 398 | for pinid in netlistdb.cell2pin.iter_set(cellid) { 399 | match netlistdb.pinnames[pinid].1.as_str() { 400 | "A" => prev_a = pinid, 401 | "B" => prev_b = pinid, 402 | _ => {} 403 | } 404 | } 405 | for prev in [prev_a, prev_b] { 406 | if prev != usize::MAX { 407 | self.dfs_netlistdb_build_aig( 408 | netlistdb, topo_vis, topo_instack, 409 | prev 410 | ); 411 | } 412 | } 413 | match celltype { 414 | "AND2_00_0" | "AND2_01_0" | "AND2_10_0" | "AND2_11_0" | "AND2_11_1" => { 415 | assert_ne!(prev_a, usize::MAX); 416 | assert_ne!(prev_b, usize::MAX); 417 | let name = netlistdb.celltypes[cellid].as_bytes(); 418 | let iv_a = name[5] - b'0'; 419 | let iv_b = name[6] - b'0'; 420 | let iv_y = name[8] - b'0'; 421 | let apid = self.add_and_gate( 422 | self.pin2aigpin_iv[prev_a] ^ (iv_a as usize), 423 | self.pin2aigpin_iv[prev_b] ^ (iv_b as usize), 424 | ) ^ (iv_y as usize); 425 | self.pin2aigpin_iv[pinid] = apid; 426 | }, 427 | "INV" => { 428 | assert_ne!(prev_a, usize::MAX); 429 | self.pin2aigpin_iv[pinid] = self.pin2aigpin_iv[prev_a] ^ 1; 430 | }, 431 | "BUF" => { 432 | assert_ne!(prev_a, usize::MAX); 433 | self.pin2aigpin_iv[pinid] = self.pin2aigpin_iv[prev_a]; 434 | }, 435 | _ => unreachable!() 436 | } 437 | } 438 | topo_instack[pinid] = false; 439 | } 440 | 441 | pub fn from_netlistdb(netlistdb: &NetlistDB) -> AIG { 442 | let mut aig = AIG { 443 | num_aigpins: 0, 444 | pin2aigpin_iv: vec![usize::MAX; netlistdb.num_pins], 445 | drivers: vec![DriverType::Tie0], 446 | ..Default::default() 447 | }; 448 | 449 | for cellid in 1..netlistdb.num_cells { 450 | if !matches!(netlistdb.celltypes[cellid].as_str(), 451 | "DFF" | "DFFSR" | "$__RAMGEM_SYNC_") { 452 | continue 453 | } 454 | for pinid in netlistdb.cell2pin.iter_set(cellid) { 455 | if !matches!(netlistdb.pinnames[pinid].1.as_str(), 456 | "CLK" | "PORT_R_CLK" | "PORT_W_CLK") { 457 | continue 458 | } 459 | if let Err(pinid) = aig.trace_clock_pin( 460 | netlistdb, pinid, false, 461 | true 462 | ) { 463 | use netlistdb::GeneralHierName; 464 | panic!("Tracing clock pin of cell {} error: \ 465 | there is a multi-input cell driving {} \ 466 | that clocks this sequential element. 
\
467 |                        Clock gating needs to be manually patched at the moment.",
468 |                        netlistdb.cellnames[cellid].dbg_fmt_hier(),
469 |                        netlistdb.pinnames[pinid].dbg_fmt_pin());
470 |                 }
471 |             }
472 |         }
473 |         for (&clk, &(flagr, flagf)) in &aig.clock_pin2aigpins {
474 |             clilog::info!(
475 |                 "inferred clock port {} ({})",
476 |                 netlistdb.pinnames[clk].dbg_fmt_pin(),
477 |                 match (flagr, flagf) {
478 |                     (_, usize::MAX) => "posedge",
479 |                     (usize::MAX, _) => "negedge",
480 |                     _ => "posedge & negedge"
481 |                 }
482 |             );
483 |         }
484 | 
485 |         let mut topo_vis = vec![false; netlistdb.num_pins];
486 |         let mut topo_instack = vec![false; netlistdb.num_pins];
487 | 
488 |         for pinid in 0..netlistdb.num_pins {
489 |             aig.dfs_netlistdb_build_aig(
490 |                 netlistdb, &mut topo_vis, &mut topo_instack,
491 |                 pinid
492 |             );
493 |         }
494 | 
495 |         for cellid in 0..netlistdb.num_cells {
496 |             if matches!(netlistdb.celltypes[cellid].as_str(), "DFF" | "DFFSR") {
497 |                 let mut ap_s_iv = 1;
498 |                 let mut ap_r_iv = 1;
499 |                 let mut ap_d_iv = 0;
500 |                 let mut ap_clken_iv = 0;
501 |                 for pinid in netlistdb.cell2pin.iter_set(cellid) {
502 |                     let pin_iv = aig.pin2aigpin_iv[pinid];
503 |                     match netlistdb.pinnames[pinid].1.as_str() {
504 |                         "D" => ap_d_iv = pin_iv,
505 |                         "S" => ap_s_iv = pin_iv,
506 |                         "R" => ap_r_iv = pin_iv,
507 |                         "CLK" => ap_clken_iv = aig.trace_clock_pin(
508 |                             netlistdb, pinid, false,
509 |                             false
510 |                         ).unwrap(),
511 |                         _ => {}
512 |                     }
513 |                 }
514 |                 let mut d_in = ap_d_iv;
515 | 
516 |                 d_in = aig.add_and_gate(d_in ^ 1, ap_s_iv) ^ 1;
517 |                 ap_clken_iv = aig.add_and_gate(ap_clken_iv ^ 1, ap_s_iv) ^ 1;
518 |                 d_in = aig.add_and_gate(d_in, ap_r_iv);
519 |                 ap_clken_iv = aig.add_and_gate(ap_clken_iv ^ 1, ap_r_iv) ^ 1;
520 |                 let dff = aig.dffs.entry(cellid).or_default();
521 |                 dff.en_iv = ap_clken_iv;
522 |                 dff.d_iv = d_in;
523 |                 assert_ne!(dff.q, 0);
524 |             }
525 |             else if netlistdb.celltypes[cellid].as_str() == "$__RAMGEM_SYNC_" {
526 |                 let mut sram = aig.srams.entry(cellid).or_default().clone();
527 |                 let mut write_clken_iv = 0;
528 |                 for pinid in netlistdb.cell2pin.iter_set(cellid) {
529 |                     let bit = netlistdb.pinnames[pinid].2.map(|i| i as usize);
530 |                     let pin_iv = aig.pin2aigpin_iv[pinid];
531 |                     match netlistdb.pinnames[pinid].1.as_str() {
532 |                         "PORT_R_ADDR" => {
533 |                             sram.port_r_addr_iv[bit.unwrap()] = pin_iv;
534 |                         },
535 |                         "PORT_R_CLK" => {
536 |                             sram.port_r_en_iv = aig.trace_clock_pin(
537 |                                 netlistdb, pinid, false,
538 |                                 false
539 |                             ).unwrap();
540 |                         },
541 |                         "PORT_W_ADDR" => {
542 |                             sram.port_w_addr_iv[bit.unwrap()] = pin_iv;
543 |                         }
544 |                         "PORT_W_CLK" => {
545 |                             write_clken_iv = aig.trace_clock_pin(
546 |                                 netlistdb, pinid, false,
547 |                                 false
548 |                             ).unwrap();
549 |                         },
550 |                         "PORT_W_WR_DATA" => {
551 |                             sram.port_w_wr_data_iv[bit.unwrap()] = pin_iv;
552 |                         },
553 |                         "PORT_W_WR_EN" => {
554 |                             sram.port_w_wr_en_iv[bit.unwrap()] = pin_iv;
555 |                         },
556 |                         _ => {}
557 |                     }
558 |                 }
559 |                 for i in 0..32 {
560 |                     let or_en = sram.port_w_wr_en_iv[i];
561 |                     let or_en = aig.add_and_gate(
562 |                         or_en, write_clken_iv
563 |                     );
564 |                     sram.port_w_wr_en_iv[i] = or_en;
565 |                 }
566 |                 *aig.srams.get_mut(&cellid).unwrap() = sram;
567 |             }
568 |         }
569 | 
570 |         aig.fanouts_start = vec![0; aig.num_aigpins + 2];
571 |         for (_i, driver) in aig.drivers.iter().enumerate() {
572 |             if let DriverType::AndGate(a, b) = *driver {
573 |                 if (a >> 1) != 0 {
574 |                     aig.fanouts_start[a >> 1] += 1;
575 |                 }
576 |                 if (b >> 1) != 0 {
577 |                     aig.fanouts_start[b >> 1] += 1;
578 |                 }
579 |             }
580 |         }
581 |         for i in 1..aig.num_aigpins + 2 {
582 |             aig.fanouts_start[i] += aig.fanouts_start[i - 1];
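            // (Added annotation: in-place inclusive prefix sum; fanouts_start[p]
            // becomes the end offset of pin p's fanout slice. The fill pass below
            // decrements each offset while scattering, leaving
            // fanouts_start[p]..fanouts_start[p+1] as p's slice -- the standard
            // two-pass CSR construction.)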
583 |         }
584 |         aig.fanouts = vec![0; aig.fanouts_start[aig.num_aigpins + 1]];
585 |         for (i, driver) in aig.drivers.iter().enumerate() {
586 |             if let DriverType::AndGate(a, b) = *driver {
587 |                 if (a >> 1) != 0 {
588 |                     let st = aig.fanouts_start[a >> 1] - 1;
589 |                     aig.fanouts_start[a >> 1] = st;
590 |                     aig.fanouts[st] = i;
591 |                 }
592 |                 if (b >> 1) != 0 {
593 |                     let st = aig.fanouts_start[b >> 1] - 1;
594 |                     aig.fanouts_start[b >> 1] = st;
595 |                     aig.fanouts[st] = i;
596 |                 }
597 |             }
598 |         }
599 | 
600 |         aig
601 |     }
602 | 
603 |     pub fn topo_traverse_generic(
604 |         &self,
605 |         endpoints: Option<&Vec<usize>>,
606 |         is_primary_input: Option<&IndexSet<usize>>,
607 |     ) -> Vec<usize> {
608 |         let mut vis = IndexSet::new();
609 |         let mut ret = Vec::new();
610 |         fn dfs_topo(aig: &AIG, vis: &mut IndexSet<usize>, ret: &mut Vec<usize>, is_primary_input: Option<&IndexSet<usize>>, u: usize) {
611 |             if vis.contains(&u) {
612 |                 return
613 |             }
614 |             vis.insert(u);
615 |             if let DriverType::AndGate(a, b) = aig.drivers[u] {
616 |                 if is_primary_input.map(|s| s.contains(&u)) != Some(true) {
617 |                     if (a >> 1) != 0 {
618 |                         dfs_topo(aig, vis, ret, is_primary_input, a >> 1);
619 |                     }
620 |                     if (b >> 1) != 0 {
621 |                         dfs_topo(aig, vis, ret, is_primary_input, b >> 1);
622 |                     }
623 |                 }
624 |             }
625 |             ret.push(u);
626 |         }
627 |         if let Some(endpoints) = endpoints {
628 |             for &endpoint in endpoints {
629 |                 dfs_topo(self, &mut vis, &mut ret, is_primary_input, endpoint);
630 |             }
631 |         }
632 |         else {
633 |             for i in 1..self.num_aigpins + 1 {
634 |                 dfs_topo(self, &mut vis, &mut ret, is_primary_input, i);
635 |             }
636 |         }
637 |         ret
638 |     }
639 | 
640 |     pub fn num_endpoint_groups(&self) -> usize {
641 |         self.primary_outputs.len() + self.dffs.len() + self.srams.len()
642 |     }
643 | 
644 |     pub fn get_endpoint_group(&self, endpt_id: usize) -> EndpointGroup {
645 |         if endpt_id < self.primary_outputs.len() {
646 |             EndpointGroup::PrimaryOutput(*self.primary_outputs.get_index(endpt_id).unwrap())
647 |         }
648 |         else if endpt_id < self.primary_outputs.len() + self.dffs.len() {
649 |             EndpointGroup::DFF(&self.dffs[endpt_id - self.primary_outputs.len()])
650 |         }
651 |         else {
652 |             EndpointGroup::RAMBlock(&self.srams[endpt_id - self.primary_outputs.len() - self.dffs.len()])
653 |         }
654 |     }
655 | }
656 | 
--------------------------------------------------------------------------------
/src/aigpdk.rs:
--------------------------------------------------------------------------------
1 | // SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2 | // SPDX-License-Identifier: Apache-2.0
3 | //! AIGPDK is a special artificial cell library used in GEM.
4 | 
5 | use netlistdb::{Direction, LeafPinProvider};
6 | use compact_str::CompactString;
7 | use sverilogparse::SVerilogRange;
8 | 
9 | /// This implements direction and width providers for
10 | /// AIG PDK cells.
11 | ///
12 | /// You can use it in netlistdb construction.
13 | pub struct AIGPDKLeafPins();
14 | 
15 | /// The addr width of an SRAM.
16 | ///
17 | /// The word width is always 32.
18 | /// If you change this, make sure to change all other occurrences in this
19 | /// project as well as the definitions in PDK libraries.
20 | pub const AIGPDK_SRAM_ADDR_WIDTH: usize = 13;
21 | 
22 | pub const AIGPDK_SRAM_SIZE: usize = 1 << 13;
23 | 
24 | impl LeafPinProvider for AIGPDKLeafPins {
25 |     fn direction_of(
26 |         &self,
27 |         macro_name: &CompactString,
28 |         pin_name: &CompactString, pin_idx: Option<isize> // generic reconstructed (lost in extraction)
29 |     ) -> Direction {
30 |         match (macro_name.as_str(), pin_name.as_str(), pin_idx) {
31 |             ("INV" | "BUF", "A", None) => Direction::I,
32 |             ("INV" | "BUF", "Y", None) => Direction::O,
33 | 
34 |             ("AND2_00_0" | "AND2_01_0" | "AND2_10_0" | "AND2_11_0" |
35 |              "AND2_11_1", "A" | "B", None) => Direction::I,
36 |             ("AND2_00_0" | "AND2_01_0" | "AND2_10_0" | "AND2_11_0" |
37 |              "AND2_11_1", "Y", None) => Direction::O,
38 | 
39 |             ("DFF" | "LATCH", "CLK" | "D", None) => Direction::I,
40 |             ("DFFSR", "CLK" | "D" | "S" | "R", None) => Direction::I,
41 |             ("DFF" | "DFFSR" | "LATCH", "Q", None) => Direction::O,
42 | 
43 |             ("CKLNQD", "CP" | "E", None) => Direction::I,
44 |             ("CKLNQD", "Q", None) => Direction::O,
45 | 
46 |             ("$__RAMGEM_ASYNC_", _, _) => {
47 |                 panic!("Async RAM (lib cell {}) not supported yet in GEM.", macro_name);
48 |             },
49 | 
50 |             ("$__RAMGEM_SYNC_",
51 |              "PORT_R_CLK" | "PORT_W_CLK",
52 |              None) => Direction::I,
53 |             ("$__RAMGEM_SYNC_",
54 |              "PORT_R_ADDR" | "PORT_W_ADDR",
55 |              Some(0..=12)) => Direction::I,
56 |             ("$__RAMGEM_SYNC_",
57 |              "PORT_W_WR_EN" | "PORT_W_WR_DATA",
58 |              Some(0..=31)) => Direction::I,
59 |             ("$__RAMGEM_SYNC_",
60 |              "PORT_R_RD_DATA",
61 |              Some(0..=31)) => Direction::O,
62 | 
63 |             _ => {
64 |                 use netlistdb::{GeneralPinName, HierName};
65 |                 panic!("Cannot recognize pin type {}, please make sure the verilog netlist is synthesized in GEM's aigpdk.",
66 |                        (HierName::single(macro_name.clone()),
67 |                         pin_name, pin_idx).dbg_fmt_pin());
68 |             }
69 |         }
70 |     }
71 | 
72 |     fn width_of(
73 |         &self,
74 |         macro_name: &CompactString,
75 |         pin_name: &CompactString
76 |     ) -> Option<SVerilogRange> {
77 |         match (macro_name.as_str(), pin_name.as_str()) {
78 |             ("INV" | "BUF", "A" | "Y") => None,
79 |             ("AND2_00_0" | "AND2_01_0" | "AND2_10_0" | "AND2_11_0" |
80 |              "AND2_11_1", "A" | "B" | "Y") => None,
81 |             ("DFF" | "DFFSR" | "LATCH", "CLK" | "D" | "Q" | "S" | "R") => None,
82 |             ("CKLNQD", "CP" | "E" | "Q") => None,
83 |             ("$__RAMGEM_SYNC_",
84 |              "PORT_R_CLK" | "PORT_W_CLK") => None,
85 |             ("$__RAMGEM_SYNC_",
86 |              "PORT_R_ADDR" | "PORT_W_ADDR")
87 |                 => Some(SVerilogRange(12, 0)),
88 |             ("$__RAMGEM_SYNC_",
89 |              "PORT_W_WR_EN" | "PORT_W_WR_DATA" | "PORT_R_RD_DATA")
90 |                 => Some(SVerilogRange(31, 0)),
91 |             _ => None
92 |         }
93 |     }
94 | }
95 | 
--------------------------------------------------------------------------------
/src/bin/boomerang_test.rs:
--------------------------------------------------------------------------------
1 | // SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2 | // SPDX-License-Identifier: Apache-2.0
3 | use std::path::PathBuf;
4 | use gem::aigpdk::AIGPDKLeafPins;
5 | use gem::aig::AIG;
6 | use gem::staging::build_staged_aigs;
7 | use gem::pe::process_partitions_from_hgr_parts_file;
8 | use netlistdb::NetlistDB;
9 | 
10 | #[derive(clap::Parser, Debug)]
11 | struct SimulatorArgs {
12 |     /// Gate-level verilog path synthesized in our provided library.
13 |     ///
14 |     /// If your design is still at RTL level, you should synthesize it
15 |     /// in yosys first.
16 |     netlist_verilog: PathBuf,
17 |     /// Top module type in netlist to analyze.
18 |     ///
19 |     /// If not specified, we will guess it from the hierarchy.
20 |     #[clap(long)]
21 |     top_module: Option<String>,
22 |     /// Level split thresholds.
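    ///
    /// (Added annotation) E.g. `--level-split 100,200` appears to produce
    /// staged sub-AIGs covering levels 0-100, 100-200, and 200-max, matching
    /// the `stage.{l}-{r}` part file names constructed below.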
23 |     #[clap(long, value_delimiter=',')]
24 |     level_split: Vec<usize>,
25 |     /// Input path for the partition result.
26 |     parts_dir: PathBuf,
27 |     #[clap(long, value_delimiter=',')]
28 |     parts_suffixes: Vec<usize>, // element type reconstructed; the original generic was lost in extraction
29 |     /// Output path for the serialized partitions.
30 |     parts_out: PathBuf,
31 |     /// The maximum number of layers allowed for merging-induced degradation.
32 |     ///
33 |     /// The default is 0, meaning no degradation is allowed.
34 |     #[clap(long, default_value_t=0)]
35 |     max_stage_degrad: usize,
36 | }
37 | 
38 | fn main() {
39 |     clilog::init_stderr_color_debug();
40 |     clilog::set_max_print_count(clilog::Level::Warn, "NL_SV_LIT", 1);
41 |     let args = <SimulatorArgs as clap::Parser>::parse();
42 |     clilog::info!("Simulator args:\n{:#?}", args);
43 | 
44 |     let netlistdb = NetlistDB::from_sverilog_file(
45 |         &args.netlist_verilog,
46 |         args.top_module.as_deref(),
47 |         &AIGPDKLeafPins()
48 |     ).expect("cannot build netlist");
49 | 
50 |     let aig = AIG::from_netlistdb(&netlistdb);
51 | 
52 |     let stageds = build_staged_aigs(&aig, &args.level_split);
53 | 
54 |     assert_eq!(stageds.len(), args.parts_suffixes.len(), "incorrect number of parts suffixes given");
55 | 
56 |     let stages_effective_parts = stageds.iter().zip(args.parts_suffixes.iter()).map(|(&(l, r, ref staged), &suffix)| {
57 |         let filename = format!("{}.stage.{}-{}.hgr.part.{}", netlistdb.name, l, match r {
58 |             usize::MAX => "max".to_string(),
59 |             r @ _ => format!("{}", r)
60 |         }, suffix);
61 |         let effective_parts = process_partitions_from_hgr_parts_file(
62 |             &aig, staged, &args.parts_dir.join(&filename),
63 |             args.max_stage_degrad,
64 |         ).expect("some partition failed to map. please increase granularity.");
65 | 
66 |         clilog::info!("# of effective partitions in {}: {}", filename, effective_parts.len());
67 |         effective_parts
68 |     }).collect::<Vec<_>>();
69 | 
70 |     let f = std::fs::File::create(&args.parts_out).unwrap();
71 |     let mut buf = std::io::BufWriter::new(f);
72 |     serde_bare::to_writer(&mut buf, &stages_effective_parts).unwrap();
73 | }
74 | 
--------------------------------------------------------------------------------
/src/bin/cuda_dummy_test.rs:
--------------------------------------------------------------------------------
1 | // SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2 | // SPDX-License-Identifier: Apache-2.0
3 | //! this binary only measures performance for a given number of dummy
4 | //! cycles. it does not input or output actual VCD.
5 | 
6 | use std::path::PathBuf;
7 | use gem::aigpdk::AIGPDKLeafPins;
8 | use gem::aig::{DriverType, AIG};
9 | use gem::staging::build_staged_aigs;
10 | use gem::pe::Partition;
11 | use gem::flatten::FlattenedScriptV1;
12 | use netlistdb::NetlistDB;
13 | use ulib::{Device, UVec};
14 | 
15 | #[derive(clap::Parser, Debug)]
16 | struct SimulatorArgs {
17 |     /// Gate-level verilog path synthesized in our provided library.
18 |     ///
19 |     /// If your design is still at RTL level, you should synthesize it
20 |     /// in yosys first.
21 |     netlist_verilog: PathBuf,
22 |     /// Top module type in netlist to analyze.
23 |     ///
24 |     /// If not specified, we will guess it from the hierarchy.
25 |     #[clap(long)]
26 |     top_module: Option<String>,
27 |     /// Level split thresholds.
28 |     #[clap(long, value_delimiter=',')]
29 |     level_split: Vec<usize>,
30 |     /// Input path for the serialized partitions.
31 |     gemparts: PathBuf,
32 |     /// the number of CUDA blocks to map and execute with.
33 |     ///
34 |     /// should not exceed GPU maximum simultaneous occupancy.
35 |     num_blocks: usize,
36 |     /// the number of dummy cycles to execute.
37 |     num_dummy_cycles: usize,
38 | }
39 | 
40 | mod ucci {
41 |     include!(concat!(env!("OUT_DIR"), "/uccbind/kernel_v1.rs"));
42 | }
43 | 
44 | fn main() {
45 |     clilog::init_stderr_color_debug();
46 |     clilog::enable_timer("cuda_dummy_test");
47 |     clilog::enable_timer("gem");
48 |     clilog::set_max_print_count(clilog::Level::Warn, "NL_SV_LIT", 1);
49 |     let args = <SimulatorArgs as clap::Parser>::parse();
50 |     clilog::info!("Simulator args:\n{:#?}", args);
51 | 
52 |     let netlistdb = NetlistDB::from_sverilog_file(
53 |         &args.netlist_verilog,
54 |         args.top_module.as_deref(),
55 |         &AIGPDKLeafPins()
56 |     ).expect("cannot build netlist");
57 | 
58 |     let aig = AIG::from_netlistdb(&netlistdb);
59 | 
60 |     // print some statistics for listing
61 |     let order = aig.topo_traverse_generic(None, None);
62 |     let mut level_id = vec![0; aig.num_aigpins + 1];
63 |     for &i in &order {
64 |         if let DriverType::AndGate(a, b) = aig.drivers[i] {
65 |             if a >= 2 {
66 |                 level_id[i] = level_id[i].max(level_id[a >> 1] + 1);
67 |             }
68 |             if b >= 2 {
69 |                 level_id[i] = level_id[i].max(level_id[b >> 1] + 1);
70 |             }
71 |         }
72 |     }
73 |     let max_level = level_id.iter().copied().max().unwrap();
74 |     println!("netlist has {} pins, {} aig pins, {} and gates",
75 |              netlistdb.num_pins, aig.num_aigpins, aig.and_gate_cache.len());
76 |     println!("netlist logic depth: {}", max_level);
77 | 
78 |     let stageds = build_staged_aigs(&aig, &args.level_split);
79 | 
80 |     let f = std::fs::File::open(&args.gemparts).unwrap();
81 |     let mut buf = std::io::BufReader::new(f);
82 |     let parts_in_stages: Vec<Vec<Partition>> = serde_bare::from_reader(&mut buf).unwrap();
83 |     clilog::info!("# of effective partitions in each stage: {:?}",
84 |                   parts_in_stages.iter().map(|ps| ps.len()).collect::<Vec<_>>());
85 | 
86 |     let mut input_layout = Vec::new();
87 |     for (i, driv) in aig.drivers.iter().enumerate() {
88 |         if let DriverType::InputPort(_) | DriverType::InputClockFlag(_, _) = driv {
89 |             input_layout.push(i);
90 |         }
91 |     }
92 | 
93 |     let script = FlattenedScriptV1::from(
94 |         &aig, &stageds.iter().map(|(_, _, staged)| staged).collect::<Vec<_>>(),
95 |         &parts_in_stages.iter().map(|ps| ps.as_slice()).collect::<Vec<_>>(),
96 |         args.num_blocks, input_layout
97 |     );
98 | 
99 |     use std::hash::{DefaultHasher, Hash, Hasher};
100 |     let mut s = DefaultHasher::new();
101 |     script.blocks_data.hash(&mut s);
102 |     println!("Script hash: {}", s.finish());
103 | 
104 |     // do simulation
105 |     clilog::info!("total number of cycles: {}", args.num_dummy_cycles);
106 |     let device = Device::CUDA(0);
107 |     let mut input_states_uvec = UVec::new_zeroed(script.reg_io_state_size as usize * (args.num_dummy_cycles + 1), device);
108 |     let mut sram_storage = UVec::new_zeroed(script.sram_storage_size as usize, device);
109 |     device.synchronize();
110 |     let timer_sim = clilog::stimer!("simulation (warm up)");
111 |     ucci::simulate_v1_noninteractive_simple_scan(
112 |         args.num_blocks,
113 |         script.num_major_stages,
114 |         &script.blocks_start, &script.blocks_data,
115 |         &mut sram_storage,
116 |         args.num_dummy_cycles,
117 |         script.reg_io_state_size as usize,
118 |         &mut input_states_uvec,
119 |         device
120 |     );
121 |     device.synchronize();
122 |     clilog::finish!(timer_sim);
123 |     let timer_sim = clilog::stimer!("simulation");
124 |     ucci::simulate_v1_noninteractive_simple_scan(
125 |         args.num_blocks,
126 |         script.num_major_stages,
127 |         &script.blocks_start, &script.blocks_data,
128 |         &mut sram_storage,
129 |         args.num_dummy_cycles,
130 |         script.reg_io_state_size as usize,
131 |         &mut input_states_uvec,
132 |         device
133 |     );
134 |     device.synchronize();
135 |     clilog::finish!(timer_sim);
136 | }
137 | 
--------------------------------------------------------------------------------
/src/bin/cut_map_interactive.rs:
--------------------------------------------------------------------------------
1 | // SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2 | // SPDX-License-Identifier: Apache-2.0
3 | //! This is an experimental interactive cutting-then-mapping
4 | //! implementation.
5 | //!
6 | //! The key idea is to only repartition the endpoint groups that
7 | //! are unable to be mapped.
8 | 
9 | use std::path::{Path, PathBuf};
10 | use gem::repcut::RCHyperGraph;
11 | use gem::aigpdk::AIGPDKLeafPins;
12 | use gem::aig::AIG;
13 | use gem::staging::build_staged_aigs;
14 | use gem::pe::{process_partitions, Partition};
15 | use netlistdb::NetlistDB;
16 | use rayon::prelude::*;
17 | 
18 | /// Call an external hypergraph partitioner
19 | fn run_par(hmetis_bin: &Path, hg: &RCHyperGraph, num_parts: usize) -> Vec<Vec<usize>> {
20 |     clilog::debug!("invoking partitioner (#parts {})", num_parts);
21 |     use std::io::{BufRead, BufReader, BufWriter, Write};
22 |     use std::fs::File;
23 | 
24 |     let tmp_dir = tempdir::TempDir::new("gemtemp").unwrap();
25 |     std::fs::create_dir_all(tmp_dir.path()).unwrap();
26 |     let hgr_path = tmp_dir.path().join("graph.hgr");
27 |     println!("hgr_path: {}", hgr_path.display());
28 |     let f = File::create(&hgr_path).unwrap();
29 |     let mut buf = BufWriter::new(f);
30 |     write!(buf, "{}", hg).unwrap();
31 |     buf.into_inner().unwrap().sync_all().unwrap();
32 | 
33 |     std::process::Command::new(hmetis_bin)
34 |         .arg(&hgr_path)
35 |         .arg(format!("{}", num_parts))
36 |         .spawn()
37 |         .expect("hmetis failed!")
38 |         .wait().unwrap();
39 | 
40 |     let path_parts = tmp_dir.path()
41 |         .join(format!("graph.hgr.part.{}", num_parts));
42 |     let mut parts = Vec::<Vec<usize>>::new();
43 |     let f_parts = File::open(&path_parts).unwrap();
44 |     let f_parts = BufReader::new(f_parts);
45 |     for (i, line) in f_parts.lines().enumerate() {
46 |         let line = line.unwrap();
47 |         if line.is_empty() { continue }
48 |         let part_id = line.parse::<usize>().unwrap();
49 |         while parts.len() <= part_id {
50 |             parts.push(vec![]);
51 |         }
52 |         parts[part_id].push(i);
53 |     }
54 |     clilog::info!("read parts file {} with {} parts",
55 |                   path_parts.display(), parts.len());
56 |     parts
57 | }
58 | 
59 | #[derive(clap::Parser, Debug)]
60 | struct SimulatorArgs {
61 |     /// Path to hmetis (or compatible partitioner) binary.
62 |     /// We will launch it with `/path/to/binary graph.hgr NUM_PARTS` and
63 |     /// expect a partition result with file name `graph.hgr.part.NUM_PARTS`.
64 |     ///
65 |     /// E.g.: `"/path/to/hmetis-2.0pre1/Linux-x86_64/hmetis2.0pre1"`
66 |     hmetis_bin: PathBuf,
67 |     /// Gate-level verilog path synthesized in our provided library.
68 |     ///
69 |     /// If your design is still at RTL level, you should synthesize it
70 |     /// in yosys first.
71 |     netlist_verilog: PathBuf,
72 |     /// Top module type in netlist to analyze.
73 |     ///
74 |     /// If not specified, we will guess it from the hierarchy.
75 |     #[clap(long)]
76 |     top_module: Option<String>,
77 |     /// Level split thresholds.
78 |     #[clap(long, value_delimiter=',')]
79 |     level_split: Vec<usize>,
80 |     /// Output path for the serialized partitions.
81 |     parts_out: PathBuf,
82 |     /// The maximum number of layers allowed for merging-induced degradation.
83 |     ///
84 |     /// The default is 0, meaning no degradation is allowed.
85 |     #[clap(long, default_value_t=0)]
86 |     max_stage_degrad: usize,
87 | }
88 | 
89 | fn main() {
90 |     clilog::init_stderr_color_debug();
91 |     clilog::set_max_print_count(clilog::Level::Warn, "NL_SV_LIT", 1);
92 |     let args = <SimulatorArgs as clap::Parser>::parse();
93 |     clilog::info!("Simulator args:\n{:#?}", args);
94 | 
95 |     let netlistdb = NetlistDB::from_sverilog_file(
96 |         &args.netlist_verilog,
97 |         args.top_module.as_deref(),
98 |         &AIGPDKLeafPins()
99 |     ).expect("cannot build netlist");
100 | 
101 |     let aig = AIG::from_netlistdb(&netlistdb);
102 |     println!("netlist has {} pins, {} aig pins, {} and gates",
103 |              netlistdb.num_pins, aig.num_aigpins, aig.and_gate_cache.len());
104 | 
105 |     let stageds = build_staged_aigs(&aig, &args.level_split);
106 | 
107 |     let stages_effective_parts = stageds.iter().map(|&(l, r, ref staged)| {
108 |         clilog::info!("interactive partitioning stage {}-{}", l, match r {
109 |             usize::MAX => "max".to_string(),
110 |             r @ _ => format!("{}", r)
111 |         });
112 | 
113 |         let mut parts_indices_good = Vec::new();
114 |         // staged output pins are always kept at the front.
115 |         let mut unrealized_endpoints = (0..staged.num_endpoint_groups()).collect::<Vec<_>>();
116 |         let mut division = 600;
117 | 
118 |         while !unrealized_endpoints.is_empty() {
119 |             division = (division / 2).max(1);
120 |             let num_parts = (unrealized_endpoints.len() + division - 1) / division;
121 |             clilog::info!("current: {} endpoints, try {} parts", unrealized_endpoints.len(), num_parts);
122 |             let staged_ur = staged.to_endpoint_subset(&unrealized_endpoints);
123 |             let hg_ur = RCHyperGraph::from_staged_aig(&aig, &staged_ur);
124 |             let mut parts_indices = run_par(&args.hmetis_bin, &hg_ur, num_parts);
125 |             for idcs in &mut parts_indices {
126 |                 for i in idcs {
127 |                     *i = unrealized_endpoints[*i];
128 |                 }
129 |             }
130 |             let parts_try = parts_indices.par_iter()
131 |                 .map(|endpts| Partition::build_one(&aig, staged, endpts))
132 |                 .collect::<Vec<_>>();
133 |             let mut new_unrealized_endpoints = Vec::new();
134 |             for (idx, part_opt) in parts_indices.into_iter().zip(parts_try.into_iter()) {
135 |                 match part_opt {
136 |                     Some(_part) => {
137 |                         parts_indices_good.push(idx);
138 |                     }
139 |                     None => {
140 |                         if idx.len() == 1 {
141 |                             panic!("A single endpoint still cannot map; you need to increase level cut granularity.");
142 |                         }
143 |                         for endpt_i in idx {
144 |                             new_unrealized_endpoints.push(endpt_i);
145 |                         }
146 |                     }
147 |                 }
148 |             }
149 |             new_unrealized_endpoints.sort_unstable();
150 |             unrealized_endpoints = new_unrealized_endpoints;
151 |         }
152 | 
153 |         clilog::info!("interactive partition completed: {} in total. merging started.",
154 |                       parts_indices_good.len());
155 | 
156 |         let effective_parts = process_partitions(
157 |             &aig, staged, parts_indices_good, args.max_stage_degrad
158 |         ).unwrap();
159 |         clilog::info!("after merging: {} parts.", effective_parts.len());
160 |         effective_parts
161 |     }).collect::<Vec<_>>();
162 | 
163 |     let f = std::fs::File::create(&args.parts_out).unwrap();
164 |     let mut buf = std::io::BufWriter::new(f);
165 |     serde_bare::to_writer(&mut buf, &stages_effective_parts).unwrap();
166 | }
167 | 
--------------------------------------------------------------------------------
/src/bin/level_test.rs:
--------------------------------------------------------------------------------
1 | // SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2 | // SPDX-License-Identifier: Apache-2.0
3 | use std::path::PathBuf;
4 | use gem::aigpdk::AIGPDKLeafPins;
5 | use gem::aig::{AIG, DriverType};
6 | use netlistdb::NetlistDB;
7 | 
8 | #[derive(clap::Parser, Debug)]
9 | struct SimulatorArgs {
10 |     /// Gate-level verilog path synthesized in our provided library.
11 |     ///
12 |     /// If your design is still at RTL level, you should synthesize it
13 |     /// in yosys first.
14 |     netlist_verilog: PathBuf,
15 |     /// Top module type in netlist to analyze.
16 |     ///
17 |     /// If not specified, we will guess it from the hierarchy.
18 |     #[clap(long)]
19 |     top_module: Option<String>,
20 | }
21 | 
22 | fn main() {
23 |     clilog::init_stderr_color_debug();
24 |     clilog::set_max_print_count(clilog::Level::Warn, "NL_SV_LIT", 1);
25 |     let args = <SimulatorArgs as clap::Parser>::parse();
26 |     clilog::info!("Simulator args:\n{:#?}", args);
27 | 
28 |     let netlistdb = NetlistDB::from_sverilog_file(
29 |         &args.netlist_verilog,
30 |         args.top_module.as_deref(),
31 |         &AIGPDKLeafPins()
32 |     ).expect("cannot build netlist");
33 | 
34 |     let aig = AIG::from_netlistdb(&netlistdb);
35 | 
36 |     let order = aig.topo_traverse_generic(None, None);
37 |     let mut level_id = vec![0; aig.num_aigpins + 1];
38 |     for &i in &order {
39 |         if let DriverType::AndGate(a, b) = aig.drivers[i] {
40 |             if a >= 2 {
41 |                 level_id[i] = level_id[i].max(level_id[a >> 1] + 1);
42 |             }
43 |             if b >= 2 {
44 |                 level_id[i] = level_id[i].max(level_id[b >> 1] + 1);
45 |             }
46 |         }
47 |     }
48 |     let max_level = level_id.iter().copied().max().unwrap();
49 |     let mut num_nodes_in_level = vec![0; max_level + 1];
50 |     for &i in &order {
51 |         num_nodes_in_level[level_id[i]] += 1;
52 |     }
53 | 
54 |     println!("Number of levels: {}", max_level);
55 |     for (i, &num_lvlnd) in num_nodes_in_level.iter().enumerate() {
56 |         print!("[{i}]: {num_lvlnd}, ");
57 |         if i % 6 == 5 {
58 |             println!();
59 |         }
60 |     }
61 |     println!();
62 | }
63 | 
--------------------------------------------------------------------------------
/src/bin/naive_sim.rs:
--------------------------------------------------------------------------------
1 | // SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2 | // SPDX-License-Identifier: Apache-2.0
3 | use std::path::PathBuf;
4 | use std::fs::File;
5 | use std::io::{BufReader, BufWriter, Seek, SeekFrom};
6 | use std::hash::Hash;
7 | use std::rc::Rc;
8 | use std::collections::{HashMap, HashSet};
9 | use compact_str::CompactString;
10 | use netlistdb::{Direction, GeneralHierName, GeneralPinName, NetlistDB};
11 | use sverilogparse::SVerilogRange;
12 | use itertools::Itertools;
13 | use vcd_ng::{Parser, ScopeItem, Var, Scope, FastFlow, FastFlowToken, FFValueChange, Writer, SimulationCommand};
14 | use gem::aigpdk::AIGPDKLeafPins;
15 | 
16 | #[derive(clap::Parser, Debug)]
17 | struct SimulatorArgs {
18 |     /// Gate-level verilog path synthesized in our provided library.
19 |     ///
20 |     /// If your design is still at RTL level, you should synthesize it
21 |     /// in yosys first.
22 |     netlist_verilog: PathBuf,
23 |     /// Top module type in netlist to analyze.
24 |     ///
25 |     /// If not specified, we will guess it from the hierarchy.
26 |     #[clap(long)]
27 |     top_module: Option<String>,
28 |     /// VCD input signal path
29 |     input_vcd: String,
30 |     /// The scope path of top module in the input VCD.
31 |     ///
32 |     /// If not specified, we will use a flat view.
33 |     /// (this view is often incorrect.)
34 |     #[clap(long)]
35 |     input_vcd_scope: Option<String>,
36 |     /// Output VCD path (must be writable)
37 |     output_vcd: String,
38 |     /// The scope path of top module in the output VCD.
39 |     ///
40 |     /// If not specified, we will use `gem_top_module`.
41 |     #[clap(long)]
42 |     output_vcd_scope: Option<String>,
43 |     /// Whether to output wire states as well (for more verbose debugging)
44 |     #[clap(long)]
45 |     include_wires: bool,
46 | }
47 | 
48 | /// Hierarchical name representation in VCD.
49 | #[derive(PartialEq, Eq, Clone, Debug)]
50 | struct VCDHier {
51 |     cur: CompactString,
52 |     prev: Option<Rc<VCDHier>>
53 | }
54 | 
55 | /// Reverse iterator of a [`VCDHier`], yielding cell names
56 | /// from the bottom to the top module.
57 | struct VCDHierRevIter<'i>(Option<&'i VCDHier>);
58 | 
59 | impl<'i> Iterator for VCDHierRevIter<'i> {
60 |     type Item = &'i CompactString;
61 | 
62 |     #[inline]
63 |     fn next(&mut self) -> Option<&'i CompactString> {
64 |         let name = self.0?;
65 |         if name.cur.is_empty() {
66 |             return None
67 |         }
68 |         let ret = &name.cur;
69 |         self.0 = name.prev.as_ref().map(|a| a.as_ref());
70 |         Some(ret)
71 |     }
72 | }
73 | 
74 | impl<'i> IntoIterator for &'i VCDHier {
75 |     type Item = &'i CompactString;
76 |     type IntoIter = VCDHierRevIter<'i>;
77 | 
78 |     #[inline]
79 |     fn into_iter(self) -> VCDHierRevIter<'i> {
80 |         VCDHierRevIter(Some(self))
81 |     }
82 | }
83 | 
84 | impl Hash for VCDHier {
85 |     fn hash<H: std::hash::Hasher>(&self, state: &mut H) { // generic bound reconstructed (lost in extraction)
86 |         for s in self.iter() {
87 |             s.hash(state);
88 |         }
89 |     }
90 | }
91 | 
92 | #[allow(dead_code)]
93 | impl VCDHier {
94 |     #[inline]
95 |     fn single(cur: CompactString) -> Self {
96 |         VCDHier { cur, prev: None }
97 |     }
98 | 
99 |     #[inline]
100 |     fn empty() -> Self {
101 |         VCDHier { cur: "".into(), prev: None }
102 |     }
103 | 
104 |     #[inline]
105 |     fn is_empty(&self) -> bool {
106 |         self.cur.as_str() == "" && self.prev.is_none()
107 |     }
108 | 
109 |     #[inline]
110 |     fn iter(&self) -> VCDHierRevIter {
111 |         (&self).into_iter()
112 |     }
113 | }
114 | 
115 | /// Try to match one component in a scope.
116 | /// If it succeeds, returns the remaining scope (which can be empty,
117 | /// indicating the whole path matched).
118 | /// If it fails, returns None.
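///
/// E.g. `match_scope_path("top/core", "top")` yields `Some("core")`,
/// while `match_scope_path("topx/core", "top")` yields `None`.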
119 | fn match_scope_path<'i>(mut scope: &'i str, cur: &str) -> Option<&'i str> {
120 |     if scope.len() == 0 { return Some("") }
121 |     if scope.starts_with('/') {
122 |         scope = &scope[1..];
123 |     }
124 |     if scope.len() == 0 { Some("") }
125 |     else if scope.starts_with(cur) {
126 |         if scope.len() == cur.len() { Some("") }
127 |         else if scope.as_bytes()[cur.len()] == b'/' {
128 |             Some(&scope[cur.len() + 1..])
129 |         }
130 |         else { None }
131 |     }
132 |     else { None }
133 | }
134 | 
135 | fn find_top_scope<'i>(
136 |     items: &'i [ScopeItem], top_scope: &'_ str
137 | ) -> Option<&'i Scope> {
138 |     for item in items {
139 |         if let ScopeItem::Scope(scope) = item {
140 |             if let Some(s1) = match_scope_path(
141 |                 top_scope, scope.identifier.as_str()
142 |             ) {
143 |                 return match s1 {
144 |                     "" => Some(scope),
145 |                     _ => find_top_scope(&scope.children[..], s1)
146 |                 };
147 |             }
148 |         }
149 |     }
150 |     None
151 | }
152 | 
153 | fn main() {
154 |     clilog::init_stderr_color_debug();
155 |     let args = <SimulatorArgs as clap::Parser>::parse();
156 |     clilog::info!("Simulator args:\n{:#?}", args);
157 | 
158 |     let netlistdb = NetlistDB::from_sverilog_file(
159 |         &args.netlist_verilog,
160 |         args.top_module.as_deref(),
161 |         &AIGPDKLeafPins()
162 |     ).expect("cannot build netlist");
163 | 
164 |     let mut posedge_monitor = HashSet::new();
165 |     for cellid in 1..netlistdb.num_cells {
166 |         if matches!(netlistdb.celltypes[cellid].as_str(),
167 |                     "DFF" | "$__RAMGEM_SYNC_") {
168 |             for pinid in netlistdb.cell2pin.iter_set(cellid) {
169 |                 if matches!(netlistdb.pinnames[pinid].1.as_str(),
170 |                             "CLK" | "PORT_R_CLK" | "PORT_W_CLK") {
171 |                     let netid = netlistdb.pin2net[pinid];
172 |                     if Some(netid) == netlistdb.net_zero || Some(netid) == netlistdb.net_one {
173 |                         continue
174 |                     }
175 |                     let root = netlistdb.net2pin.items[
176 |                         netlistdb.net2pin.start[netid]
177 |                     ];
178 |                     if netlistdb.pin2cell[root] != 0 {
179 |                         panic!("DFF {} driven by non-port pin {}: this pattern is not yet supported. please disable clock gating.",
180 |                                netlistdb.cellnames[cellid],
181 |                                netlistdb.pinnames[root].dbg_fmt_pin());
182 |                     }
183 |                     posedge_monitor.insert(root);
184 |                 }
185 |             }
186 |         }
187 |     }
188 |     clilog::info!(
189 |         "clock ports detected: {}",
190 |         posedge_monitor.iter()
191 |             .map(|&i| netlistdb.pinnames[i].dbg_fmt_pin())
192 |             .format(", "));
193 | 
194 |     let input_vcd = File::open(&args.input_vcd).unwrap();
195 |     let mut bufrd = BufReader::with_capacity(65536, input_vcd);
196 |     let mut vcd_parser = Parser::new(&mut bufrd);
197 |     let header = vcd_parser.parse_header().unwrap();
198 |     drop(vcd_parser);
199 |     let mut vcd_file = bufrd.into_inner();
200 |     vcd_file.seek(SeekFrom::Start(0)).unwrap();
201 |     let mut vcdflow = FastFlow::new(vcd_file, 65536);
202 | 
203 |     let top_scope = find_top_scope(
204 |         &header.items[..],
205 |         args.input_vcd_scope.as_deref().unwrap_or("")
206 |     ).expect("Specified top scope not found in VCD.");
207 | 
208 |     let mut vcd2inp = HashMap::new();
209 |     let mut inp_port_given = HashSet::new();
210 | 
211 |     let mut match_one_input = |var: &Var, i: Option<isize>, vcd_pos: usize| {
212 |         let key = (VCDHier::empty(), var.reference.as_str(), i);
213 |         if let Some(&id) = netlistdb.pinname2id.get(
214 |             &key as &dyn GeneralPinName
215 |         ) {
216 |             if netlistdb.pindirect[id] != Direction::O { return }
217 |             vcd2inp.insert((var.code.0, vcd_pos), id);
218 |             inp_port_given.insert(id);
219 |         }
220 |     };
221 |     for scope_item in &top_scope.children[..] {
{ 222 | if let ScopeItem::Var(var) = scope_item { 223 | use vcd_ng::ReferenceIndex::*; 224 | match var.index { 225 | None => match var.size { 226 | 1 => match_one_input(var, None, 0), 227 | w @ _ => { 228 | for (pos, i) in (0..w).rev() 229 | .enumerate() 230 | { 231 | match_one_input( 232 | var, Some(i as isize), pos) 233 | } 234 | } 235 | }, 236 | Some(BitSelect(i)) => match_one_input( 237 | var, Some(i as isize), 0), 238 | Some(Range(a, b)) => { 239 | for (pos, i) in SVerilogRange( 240 | a as isize, b as isize).enumerate() 241 | { 242 | match_one_input(var, Some(i), pos); 243 | } 244 | } 245 | } 246 | } 247 | } 248 | for i in netlistdb.cell2pin.iter_set(0) { 249 | if netlistdb.pindirect[i] != Direction::I && 250 | !inp_port_given.contains(&i) 251 | { 252 | clilog::warn!( 253 | GATESIM_VCDI_MISSING_PI, 254 | "Primary input port {:?} not present in \ 255 | the VCD input", 256 | netlistdb.pinnames[i]); 257 | } 258 | } 259 | 260 | let mut circ_state = vec![0u8; netlistdb.num_pins]; 261 | let mut srams = HashMap::new(); 262 | if let Some(netid) = netlistdb.net_one { 263 | for pinid in netlistdb.net2pin.iter_set(netid) { 264 | circ_state[pinid] = 1u8; 265 | } 266 | } 267 | let mut topo_vis = vec![false; netlistdb.num_pins]; 268 | let mut topo_instack = vec![false; netlistdb.num_pins]; 269 | let mut topo = Vec::new(); 270 | // mark all combinational circuit inputs 271 | for i in netlistdb.cell2pin.iter_set(0) { 272 | if netlistdb.pindirect[i] != Direction::I && !posedge_monitor.contains(&i) { 273 | topo_vis[i] = true; 274 | } 275 | } 276 | for cellid in 1..netlistdb.num_cells { 277 | if matches!(netlistdb.celltypes[cellid].as_str(), 278 | "DFF" | "$__RAMGEM_SYNC_") { 279 | for pinid in netlistdb.cell2pin.iter_set(cellid) { 280 | if matches!(netlistdb.pinnames[pinid].1.as_str(), 281 | "Q" | "PORT_R_RD_DATA") { 282 | topo_vis[pinid] = true; 283 | // do not add them to topo, but treat them separately before prop. 284 | } 285 | } 286 | } 287 | if netlistdb.celltypes[cellid].as_str() == "$__RAMGEM_SYNC_" { 288 | srams.insert(cellid, vec![0u32; 1 << 13]); 289 | } 290 | } 291 | fn dfs_topo(netlistdb: &NetlistDB, topo_vis: &mut Vec, topo_instack: &mut Vec, topo: &mut Vec, pinid: usize) { 292 | if topo_instack[pinid] { 293 | panic!("circuit has loop!"); 294 | } 295 | if topo_vis[pinid] { return } 296 | topo_vis[pinid] = true; 297 | topo_instack[pinid] = true; 298 | if netlistdb.pindirect[pinid] == Direction::I { 299 | let netid = netlistdb.pin2net[pinid]; 300 | if Some(netid) != netlistdb.net_zero && Some(netid) != netlistdb.net_one { 301 | let root = netlistdb.net2pin.items[ 302 | netlistdb.net2pin.start[netid] 303 | ]; 304 | dfs_topo(netlistdb, topo_vis, topo_instack, topo, root); 305 | } 306 | } 307 | else { 308 | let cellid = netlistdb.pin2cell[pinid]; 309 | for pinid in netlistdb.cell2pin.iter_set(cellid) { 310 | if matches!(netlistdb.pinnames[pinid].1.as_str(), 311 | "A" | "B") { 312 | dfs_topo(netlistdb, topo_vis, topo_instack, topo, pinid); 313 | } 314 | } 315 | } 316 | topo.push(pinid); 317 | topo_instack[pinid] = false; 318 | } 319 | // start from all comb. 
circuit outputs 320 | for pinid in netlistdb.cell2pin.iter_set(0) { 321 | if netlistdb.pindirect[pinid] == Direction::I { 322 | dfs_topo(&netlistdb, &mut topo_vis, &mut topo_instack, &mut topo, pinid); 323 | } 324 | } 325 | for cellid in 1..netlistdb.num_cells { 326 | if matches!(netlistdb.celltypes[cellid].as_str(), 327 | "DFF" | "$__RAMGEM_SYNC_") { 328 | for pinid in netlistdb.cell2pin.iter_set(cellid) { 329 | if matches!(netlistdb.pinnames[pinid].1.as_str(), 330 | "D" | "PORT_R_ADDR" | "PORT_W_WR_EN" | "PORT_W_ADDR" | "PORT_W_WR_DATA") { 331 | dfs_topo(&netlistdb, &mut topo_vis, &mut topo_instack, &mut topo, pinid); 332 | } 333 | } 334 | } 335 | } 336 | for &clk in &posedge_monitor { 337 | if topo_vis[clk] { 338 | clilog::error!("Clock {} is also used in combinational logic. This is unsupported and might lead to error.", 339 | netlistdb.pinnames[clk].dbg_fmt_pin()); 340 | } 341 | } 342 | // clilog::info!("topo size: {} / {}", topo.len(), netlistdb.num_pins); 343 | 344 | // for i in 0..netlistdb.num_pins { 345 | // if netlistdb.pinnames[i].0.cur.as_str() == "_46841_" { 346 | // println!("pin _01039_ ({}) id {} net {}", 347 | // netlistdb.pinnames[i].dbg_fmt_pin(), i, 348 | // netlistdb.pin2net[i]); 349 | // } 350 | // } 351 | 352 | // open out 353 | let write_buf = File::create(&args.output_vcd).unwrap(); 354 | let write_buf = BufWriter::new(write_buf); 355 | let mut writer = Writer::new(write_buf); 356 | if let Some((ratio, unit)) = header.timescale { 357 | writer.timescale(ratio, unit).unwrap(); 358 | } 359 | let output_vcd_scope = args.output_vcd_scope.as_deref().unwrap_or("gem_top_module"); 360 | let output_vcd_scope = output_vcd_scope.split('/').collect::>(); 361 | for &scope in &output_vcd_scope { 362 | writer.add_module(scope).unwrap(); 363 | } 364 | let mut out2vcd = netlistdb.cell2pin.iter_set(0).filter_map(|i| { 365 | if netlistdb.pindirect[i] == Direction::I { 366 | Some((i, writer.add_wire( 367 | 1, &format!("{}", netlistdb.pinnames[i].dbg_fmt_pin())).unwrap())) 368 | } 369 | else { None } 370 | }).collect::>(); 371 | if args.include_wires { 372 | out2vcd.extend((0..netlistdb.num_nets).filter_map(|i| { 373 | if Some(i) == netlistdb.net_zero || Some(i) == netlistdb.net_one { 374 | return None 375 | } 376 | let root = netlistdb.net2pin.items[netlistdb.net2pin.start[i]]; 377 | if netlistdb.pindirect[root] != Direction::O { 378 | return None 379 | } 380 | Some((root, writer.add_wire( 381 | 1, &format!("{}", netlistdb.netnames[i].dbg_fmt_pin()) 382 | ).unwrap())) 383 | })); 384 | } 385 | let mut last_val = vec![2; out2vcd.len()]; 386 | for _ in 0..output_vcd_scope.len() { 387 | writer.upscope().unwrap(); 388 | } 389 | writer.enddefinitions().unwrap(); 390 | writer.begin(SimulationCommand::Dumpvars).unwrap(); 391 | 392 | // do simulation. 393 | let mut vcd_time = u64::MAX; 394 | let mut last_vcd_time_rising_edge = false; 395 | while let Some(tok) = vcdflow.next_token().unwrap() { 396 | match tok { 397 | FastFlowToken::Timestamp(t) => { 398 | if t == vcd_time { continue } 399 | if last_vcd_time_rising_edge { 400 | clilog::debug!("simulating t={}", vcd_time); 401 | // latch the regs and srams. 
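                // On a detected rising edge, sequential state is committed first:
                // each DFF copies D (as computed at the previous timestamp) into Q,
                // and each $__RAMGEM_SYNC_ does its synchronous read (read-before-write)
                // followed by a write-enable-masked word write.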
402 | for cellid in 1..netlistdb.num_cells { 403 | if netlistdb.celltypes[cellid].as_str() == "DFF" { 404 | let mut pinid_d = usize::MAX; 405 | let mut pinid_q = usize::MAX; 406 | for pinid in netlistdb.cell2pin.iter_set(cellid) { 407 | match netlistdb.pinnames[pinid].1.as_str() { 408 | "D" => pinid_d = pinid, 409 | "Q" => pinid_q = pinid, 410 | _ => {} 411 | } 412 | } 413 | circ_state[pinid_q] = circ_state[pinid_d]; 414 | } 415 | else if netlistdb.celltypes[cellid].as_str() == "$__RAMGEM_SYNC_" { 416 | let sram = srams.get_mut(&cellid).unwrap(); 417 | let mut port_r_addr = 0usize; 418 | let mut port_w_addr = 0usize; 419 | let mut port_w_wr_en = 0u32; 420 | let mut port_w_wr_data = 0u32; 421 | for pinid in netlistdb.cell2pin.iter_set(cellid) { 422 | macro_rules! load_var { 423 | ($($pin_name:literal => $var_name:ident),+) => { 424 | match netlistdb.pinnames[pinid].1.as_str() { 425 | $($pin_name => { 426 | $var_name = ($var_name as u64 | ((circ_state[pinid] as u64) << netlistdb.pinnames[pinid].2.unwrap())).try_into().unwrap(); 427 | }),+, 428 | _ => {} 429 | } 430 | } 431 | } 432 | load_var! { 433 | "PORT_R_ADDR" => port_r_addr, 434 | "PORT_W_ADDR" => port_w_addr, 435 | "PORT_W_WR_EN" => port_w_wr_en, 436 | "PORT_W_WR_DATA" => port_w_wr_data 437 | } 438 | } 439 | let port_r_rd_data = sram[port_r_addr]; 440 | let port_w_old_data = sram[port_w_addr]; 441 | let port_w_data = (port_w_old_data & (!port_w_wr_en)) | (port_w_wr_data & port_w_wr_en); 442 | sram[port_w_addr] = port_w_data; 443 | if netlistdb.cellnames[cellid].dbg_fmt_hier().as_str() == "cpu.instruction_unit.icache.memories[0].way_0_data_ram.mem.0.0" { 444 | println!("our sram at time {vcd_time} port_r_addr {port_r_addr} port_w_addr {port_w_addr} port_w_wr_data {port_w_wr_data} -> port_r_rd_data {port_r_rd_data}"); 445 | } 446 | for pinid in netlistdb.cell2pin.iter_set(cellid) { 447 | macro_rules! save_var { 448 | ($($pin_name:literal <= $var_name:ident),+) => { 449 | match netlistdb.pinnames[pinid].1.as_str() { 450 | $($pin_name => { 451 | circ_state[pinid] = ($var_name >> netlistdb.pinnames[pinid].2.unwrap() & 1) as u8; 452 | }),+, 453 | _ => {} 454 | } 455 | } 456 | } 457 | save_var! 
{ 458 | "PORT_R_RD_DATA" <= port_r_rd_data 459 | } 460 | } 461 | } 462 | } 463 | // propagate 464 | for &pinid in &topo { 465 | // if netlistdb.pin2cell[pinid] == 0 { 466 | // println!("trying to visit port {}", netlistdb.pinnames[pinid].dbg_fmt_pin()); 467 | // } 468 | if netlistdb.pindirect[pinid] == Direction::I { 469 | let netid = netlistdb.pin2net[pinid]; 470 | if Some(netid) != netlistdb.net_zero && Some(netid) != netlistdb.net_one { 471 | let root = netlistdb.net2pin.items[ 472 | netlistdb.net2pin.start[netid] 473 | ]; 474 | circ_state[pinid] = circ_state[root]; 475 | // if netlistdb.pin2cell[pinid] == 0 { 476 | // println!("changing output for pin {} to {}", netlistdb.pinnames[pinid].dbg_fmt_pin(), circ_state[pinid]); 477 | // } 478 | } 479 | } 480 | else { 481 | let cellid = netlistdb.pin2cell[pinid]; 482 | let mut vala = 0; 483 | let mut valb = 0; 484 | for pinid_inp in netlistdb.cell2pin.iter_set(cellid) { 485 | match netlistdb.pinnames[pinid_inp].1.as_str() { 486 | "A" => vala = circ_state[pinid_inp], 487 | "B" => valb = circ_state[pinid_inp], 488 | "Y" => {}, 489 | _ => unreachable!() 490 | } 491 | } 492 | circ_state[pinid] = match netlistdb.celltypes[cellid].as_str() { 493 | "AND2_00_0" => vala & valb, 494 | "AND2_01_0" => vala & (valb ^ 1), 495 | "AND2_10_0" => (vala ^ 1) & valb, 496 | "AND2_11_0" => (vala | valb) ^ 1, 497 | "AND2_11_1" => vala | valb, 498 | "INV" => vala ^ 1, 499 | "BUF" => vala, 500 | _ => unreachable!() 501 | }; 502 | // if netlistdb.pin2net[pinid] == 1039 { 503 | // println!("d_we_o input gate: {} {} type {}", vala, valb, netlistdb.celltypes[cellid].as_str()); 504 | // } 505 | } 506 | } 507 | // write vcd vars out 508 | writer.timestamp(vcd_time).unwrap(); 509 | for (i, &(pinid, vid)) in out2vcd.iter().enumerate() { 510 | use vcd_ng::Value; 511 | let value_new = circ_state[pinid]; 512 | if value_new == last_val[i] { 513 | continue 514 | } 515 | last_val[i] = value_new; 516 | writer.change_scalar(vid, match value_new { 517 | 1 => Value::V1, 518 | _ => Value::V0 519 | }).unwrap(); 520 | } 521 | } 522 | // reset for next timestamp 523 | vcd_time = t; 524 | last_vcd_time_rising_edge = false; 525 | for &clk in &posedge_monitor { 526 | circ_state[clk] = 0; 527 | } 528 | }, 529 | FastFlowToken::Value(FFValueChange { id, bits }) => { 530 | for (pos, &b) in bits.iter().enumerate() { 531 | if let Some(&pin) = vcd2inp.get( 532 | &(id.0, pos) 533 | ) { 534 | if b == b'1' && posedge_monitor.contains(&pin) { 535 | last_vcd_time_rising_edge = true; 536 | } 537 | circ_state[pin] = match b { 538 | b'1' => 1, _ => 0 539 | }; 540 | } 541 | } 542 | } 543 | } 544 | } 545 | } 546 | -------------------------------------------------------------------------------- /src/bin/repcut_test.rs: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | // SPDX-License-Identifier: Apache-2.0 3 | use std::path::PathBuf; 4 | use gem::aigpdk::AIGPDKLeafPins; 5 | use gem::aig::AIG; 6 | use gem::repcut::RCHyperGraph; 7 | use gem::staging::build_staged_aigs; 8 | use netlistdb::NetlistDB; 9 | use std::io::Write; 10 | use std::fs; 11 | 12 | #[derive(clap::Parser, Debug)] 13 | struct SimulatorArgs { 14 | /// Gate-level verilog path synthesized in our provided library. 15 | /// 16 | /// If your design is still at RTL level, you should synthesize it 17 | /// in yosys first. 18 | netlist_verilog: PathBuf, 19 | /// Top module type in netlist to analyze. 
20 |     ///
21 |     /// If not specified, we will guess it from the hierarchy.
22 |     #[clap(long)]
23 |     top_module: Option<String>,
24 |     /// Level split thresholds.
25 |     #[clap(long, value_delimiter=',')]
26 |     level_split: Vec<usize>,
27 |     /// Output directory for hypergraph files.
28 |     hgr_output_dir: PathBuf,
29 | }
30 | 
31 | fn main() {
32 |     clilog::init_stderr_color_debug();
33 |     clilog::set_max_print_count(clilog::Level::Warn, "NL_SV_LIT", 1);
34 |     let args = <SimulatorArgs as clap::Parser>::parse();
35 |     clilog::info!("Simulator args:\n{:#?}", args);
36 | 
37 |     let netlistdb = NetlistDB::from_sverilog_file(
38 |         &args.netlist_verilog,
39 |         args.top_module.as_deref(),
40 |         &AIGPDKLeafPins()
41 |     ).expect("cannot build netlist");
42 | 
43 |     let aig = AIG::from_netlistdb(&netlistdb);
44 |     println!("netlist has {} pins, {} aig pins, {} and gates",
45 |              netlistdb.num_pins, aig.num_aigpins, aig.and_gate_cache.len());
46 | 
47 |     let stageds = build_staged_aigs(&aig, &args.level_split);
48 | 
49 |     if !args.hgr_output_dir.exists() {
50 |         fs::create_dir_all(&args.hgr_output_dir).unwrap();
51 |     }
52 |     for &(l, r, ref staged) in &stageds {
53 |         let hg = RCHyperGraph::from_staged_aig(&aig, staged);
54 | 
55 |         let filename = format!("{}.stage.{}-{}.hgr", netlistdb.name, l, match r {
56 |             usize::MAX => "max".to_string(),
57 |             r @ _ => format!("{}", r)
58 |         });
59 |         println!("writing {}", filename);
60 |         let path = args.hgr_output_dir.join(filename);
61 | 
62 |         let f = std::fs::File::create(&path).unwrap();
63 |         let mut buf = std::io::BufWriter::new(f);
64 |         write!(buf, "{}", hg).unwrap();
65 |     }
66 | }
67 | 
--------------------------------------------------------------------------------
/src/lib.rs:
--------------------------------------------------------------------------------
1 | // SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2 | // SPDX-License-Identifier: Apache-2.0
3 | pub mod aigpdk;
4 | 
5 | pub mod aig;
6 | 
7 | pub mod staging;
8 | 
9 | pub mod repcut;
10 | 
11 | pub mod pe;
12 | 
13 | pub mod flatten;
14 | 
--------------------------------------------------------------------------------
/src/pe.rs:
--------------------------------------------------------------------------------
1 | // SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2 | // SPDX-License-Identifier: Apache-2.0
3 | //! Partition executor
4 | 
5 | use crate::aig::{DriverType, AIG, EndpointGroup};
6 | use crate::staging::StagedAIG;
7 | use indexmap::{IndexMap, IndexSet};
8 | use serde::{Deserialize, Serialize};
9 | use std::collections::HashSet;
10 | use std::path::PathBuf;
11 | use rayon::prelude::*;
12 | 
13 | /// The number of boomerang stages.
14 | ///
15 | /// This determines the shuffle width, i.e., kernel width.
16 | /// `kernel width = (1 << BOOMERANG_NUM_STAGES)`.
17 | pub const BOOMERANG_NUM_STAGES: usize = 13;
18 | 
19 | const BOOMERANG_MAX_WRITEOUTS: usize = 1 << (BOOMERANG_NUM_STAGES - 5);
20 | 
21 | /// One Boomerang stage
22 | #[derive(Debug, Clone, Serialize, Deserialize)]
23 | pub struct BoomerangStage {
24 |     /// the boomerang hierarchy, 8192 -> 4096 -> ... -> 1.
25 |     ///
26 |     /// each element is an aigpin index (without iv).
27 |     /// its parent indices should either be a passthrough or an
28 |     /// and gate mapping.
29 |     pub hier: Vec<Vec<usize>>,
30 |     /// the 32-packed elements in the hierarchy where there should be
31 |     /// a pass-through.
32 |     pub write_outs: Vec<usize>,
33 | }
34 | 
35 | /// One partitioned block: a basic execution unit on GPU.
36 | /// 37 | /// A block is mapped to a GPU block with the following resource 38 | /// constraints: 39 | /// 1. the number of unique inputs should not exceed 8191. 40 | /// 2. the number of unique outputs should not exceed 8191. 41 | /// for srams and dffs, outputs include all enable pins and bus pins. 42 | /// there might be unusable holes but the effective capacity is at least 43 | /// 4095. 44 | /// 3. the number of intermediate pins alive at each stage should not 45 | /// exceed 4095. 46 | /// 4. the number of SRAM output groups should not exceed 64. 47 | /// 64 = 8192 / (32 * 4). 48 | #[derive(Debug, Clone, Serialize, Deserialize)] 49 | pub struct Partition { 50 | /// the endpoints that are realized by this partition. 51 | pub endpoints: Vec, 52 | /// the boomerang stages. 53 | /// 54 | /// between stages there will automatically be shuffles. 55 | pub stages: Vec, 56 | } 57 | 58 | /// build a single boomerang stage given the current inputs and 59 | /// outputs. 60 | fn build_one_boomerang_stage( 61 | aig: &AIG, 62 | unrealized_comb_outputs: &mut IndexSet, 63 | realized_inputs: &mut IndexSet, 64 | total_write_outs: &mut usize, 65 | num_reserved_writeouts: usize, 66 | ) -> Option { 67 | let mut hier = Vec::new(); 68 | for i in 0..=BOOMERANG_NUM_STAGES { 69 | hier.push(vec![usize::MAX; 1 << (BOOMERANG_NUM_STAGES - i)]); 70 | } 71 | 72 | // first discover the (remaining) subgraph to implement. 73 | let order = aig.topo_traverse_generic( 74 | Some( 75 | &unrealized_comb_outputs.iter().copied().collect() 76 | ), 77 | Some(&realized_inputs) 78 | ); 79 | let id2order: IndexMap<_, _> = order.iter().copied().enumerate() 80 | .map(|(order_i, i)| (i, order_i)) 81 | .collect(); 82 | let mut level = vec![0; order.len()]; 83 | for (order_i, i) in order.iter().copied().enumerate() { 84 | if realized_inputs.contains(&i) { continue } 85 | let mut lvli: usize = 0; 86 | if let DriverType::AndGate(a, b) = aig.drivers[i] { 87 | if a >= 2 { 88 | lvli = lvli.max(level[*id2order.get(&(a >> 1)).unwrap()] + 1); 89 | } 90 | if b >= 2 { 91 | lvli = lvli.max(level[*id2order.get(&(b >> 1)).unwrap()] + 1); 92 | } 93 | } 94 | level[order_i] = lvli; 95 | } 96 | let max_level = level.iter().copied().max().unwrap(); 97 | clilog::trace!("boomerang current max level: {}", max_level); 98 | 99 | fn place_bit( 100 | aig: &AIG, 101 | hier: &mut Vec>, 102 | hier_visited_nodes_count: &mut IndexMap, 103 | level: &Vec, 104 | id2order: &IndexMap, 105 | hi: usize, j: usize, nd: usize 106 | ) { 107 | hier[hi][j] = nd; 108 | if hi == 0 { return } 109 | *hier_visited_nodes_count.entry(nd).or_default() += 1; 110 | let lvlnd = level[*id2order.get(&nd).unwrap()]; 111 | assert!(lvlnd <= hi); 112 | if lvlnd != hi { 113 | place_bit(aig, hier, hier_visited_nodes_count, 114 | level, id2order, 115 | hi - 1, j, nd); 116 | } 117 | else { 118 | let (a, b) = match aig.drivers[nd] { 119 | DriverType::AndGate(a, b) => (a, b), 120 | _ => panic!() 121 | }; 122 | let hier_hi_len = hier[hi].len(); 123 | place_bit(aig, hier, hier_visited_nodes_count, 124 | level, id2order, 125 | hi - 1, j, a >> 1); 126 | place_bit(aig, hier, hier_visited_nodes_count, 127 | level, id2order, 128 | hi - 1, j + hier_hi_len, b >> 1); 129 | } 130 | } 131 | 132 | fn purge_bit( 133 | aig: &AIG, 134 | hier: &mut Vec>, 135 | hier_visited_nodes_count: &mut IndexMap, 136 | level: &Vec, 137 | id2order: &IndexMap, 138 | hi: usize, j: usize 139 | ) { 140 | if hier[hi][j] == usize::MAX { return } 141 | let nd = hier[hi][j]; 142 | hier[hi][j] = usize::MAX; 143 | if hi == 0 { return } 
144 | let hvc = hier_visited_nodes_count.get_mut(&nd).unwrap(); 145 | *hvc -= 1; 146 | if *hvc == 0 { 147 | hier_visited_nodes_count.swap_remove(&nd); 148 | } 149 | let hier_hi_len = hier[hi].len(); 150 | purge_bit(aig, hier, hier_visited_nodes_count, 151 | level, id2order, 152 | hi - 1, j); 153 | purge_bit(aig, hier, hier_visited_nodes_count, 154 | level, id2order, 155 | hi - 1, j + hier_hi_len); 156 | } 157 | 158 | // the nodes that are implemented in the hierarchy. 159 | // we only count for hierarchy[1 and more], [0] is not counted. 160 | let mut hier_visited_nodes_count: IndexMap = IndexMap::new(); 161 | let mut selected_level = max_level.min(BOOMERANG_NUM_STAGES); 162 | 163 | /// compute the maximum number of steps needed from this node 164 | /// to reach an endpoint node. 165 | /// 166 | /// during this path, except the starting point, no node should 167 | /// already be inside the boomerang hierarchy. 168 | fn compute_reverse_level( 169 | order: &Vec, 170 | id2order: &IndexMap, 171 | unrealized_comb_outputs: &IndexSet, 172 | realized_inputs: &IndexSet, 173 | hier_visited_nodes_count: &IndexMap, 174 | aig: &AIG 175 | ) -> Vec { 176 | let mut reverse_level = vec![usize::MAX; order.len()]; 177 | for &i in unrealized_comb_outputs.iter() { 178 | reverse_level[*id2order.get(&i).unwrap()] = 0; 179 | } 180 | for (order_i, i) in order.iter().copied().enumerate().rev() { 181 | if realized_inputs.contains(&i) || 182 | hier_visited_nodes_count.contains_key(&i) 183 | { 184 | continue 185 | } 186 | let rlvli = reverse_level[order_i]; 187 | if let DriverType::AndGate(a, b) = aig.drivers[i] { 188 | if a >= 2 { 189 | let a = *id2order.get(&(a >> 1)).unwrap(); 190 | let rlvla = &mut reverse_level[a]; 191 | if *rlvla == usize::MAX || *rlvla < rlvli + 1 { 192 | *rlvla = rlvli + 1; 193 | } 194 | } 195 | if b >= 2 { 196 | let b = *id2order.get(&(b >> 1)).unwrap(); 197 | let rlvlb = &mut reverse_level[b]; 198 | if *rlvlb == usize::MAX || *rlvlb < rlvli + 1 { 199 | *rlvlb = rlvli + 1; 200 | } 201 | } 202 | } 203 | } 204 | reverse_level 205 | } 206 | 207 | /// compute the set of nodes that must be implemented in level 1 208 | /// in addition to the current hierarchy. 209 | /// 210 | /// the necessary_level1 nodes can only come from level 0 or 211 | /// level 1. 212 | /// a level 1 node is necessary if it is not already 213 | /// implemented, and it still drives a downstream endpoint. 214 | /// a level 0 node is necessary if it is not already implemented, 215 | /// and it either (1) is needed by a level>=2 node, or (2) is 216 | /// itself an unrealized endpoint. 
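    ///
    /// (Context: level-1 nodes are single AND gates over already-available
    /// level-0 values, so they are cheap and placed last, after higher-level
    /// nodes have claimed their boomerang slots; this function computes what
    /// must still fit into that remaining level-1 space.)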
217 | fn compute_lvl1_necessary_nodes( 218 | order: &Vec, 219 | id2order: &IndexMap, 220 | level: &Vec, 221 | reverse_level: &Vec, 222 | aig: &AIG, 223 | unrealized_comb_outputs: &IndexSet, 224 | hier_visited_nodes_count: &IndexMap, 225 | ) -> IndexSet { 226 | let mut lvl1_necessary_nodes = IndexSet::new(); 227 | for order_i in 0..order.len() { 228 | if hier_visited_nodes_count.contains_key(&order[order_i]) { 229 | continue 230 | } 231 | if reverse_level[order_i] == usize::MAX { continue } 232 | if level[order_i] == 0 { 233 | if unrealized_comb_outputs.contains(&order[order_i]) { 234 | lvl1_necessary_nodes.insert(order[order_i]); 235 | } 236 | continue 237 | } 238 | if level[order_i] == 1 { 239 | lvl1_necessary_nodes.insert(order[order_i]); 240 | } 241 | else { 242 | let (a, b) = match aig.drivers[order[order_i]] { 243 | DriverType::AndGate(a, b) => (a, b), 244 | _ => panic!() 245 | }; 246 | if a >= 2 && 247 | level[*id2order.get(&(a >> 1)).unwrap()] == 0 && 248 | !hier_visited_nodes_count.contains_key(&(a >> 1)) 249 | { 250 | lvl1_necessary_nodes.insert(a >> 1); 251 | } 252 | if b >= 2 && 253 | level[*id2order.get(&(b >> 1)).unwrap()] == 0 && 254 | !hier_visited_nodes_count.contains_key(&(b >> 1)) 255 | { 256 | lvl1_necessary_nodes.insert(b >> 1); 257 | } 258 | } 259 | } 260 | lvl1_necessary_nodes 261 | } 262 | 263 | let mut reverse_level = compute_reverse_level( 264 | &order, &id2order, 265 | unrealized_comb_outputs, realized_inputs, 266 | &hier_visited_nodes_count, aig 267 | ); 268 | 269 | let mut last_lvl1_necessary_nodes = IndexSet::new(); 270 | 271 | while selected_level >= 2 { 272 | // find a valid slot to place a high level bit 273 | let mut slot_at_level = usize::MAX; 274 | for i in 0..hier[selected_level].len() { 275 | if hier[selected_level][i] == usize::MAX { 276 | slot_at_level = i; 277 | break 278 | } 279 | } 280 | if slot_at_level == usize::MAX { 281 | clilog::trace!("no space at level {}", selected_level); 282 | selected_level -= 1; 283 | continue 284 | } 285 | 286 | // find a valuable node to put into the above slot 287 | let mut selected_node_ord = usize::MAX; 288 | for order_i in 0..order.len() { 289 | if level[order_i] != selected_level { continue } 290 | if hier_visited_nodes_count.contains_key(&order[order_i]) || reverse_level[order_i] == usize::MAX { 291 | continue 292 | } 293 | if selected_node_ord == usize::MAX || 294 | reverse_level[selected_node_ord] < reverse_level[order_i] 295 | { 296 | selected_node_ord = order_i; 297 | } 298 | } 299 | if selected_node_ord == usize::MAX { 300 | clilog::trace!("no node at level {}", selected_level); 301 | selected_level -= 1; 302 | continue 303 | } 304 | let selected_node = order[selected_node_ord]; 305 | 306 | place_bit( 307 | aig, &mut hier, &mut hier_visited_nodes_count, 308 | &level, &id2order, 309 | selected_level, slot_at_level, selected_node 310 | ); 311 | 312 | let reverse_level_upd = compute_reverse_level( 313 | &order, &id2order, 314 | unrealized_comb_outputs, realized_inputs, 315 | &hier_visited_nodes_count, aig 316 | ); 317 | 318 | // store the nodes that need to be put on the 1-level 319 | // (simple ands). 320 | // they are periodically checked to ensure they have space. 
321 | let lvl1_necessary_nodes = compute_lvl1_necessary_nodes( 322 | &order, &id2order, &level, 323 | &reverse_level_upd, aig, &unrealized_comb_outputs, 324 | &hier_visited_nodes_count 325 | ); 326 | 327 | let num_lvl1_hier_taken = 328 | hier[1].iter().filter(|i| **i != usize::MAX).count(); 329 | 330 | clilog::trace!( 331 | "taken one node at level {}, used 1-level space {}, hier visited unique {}, num nodes necessary in lvl1 {}", 332 | selected_level, num_lvl1_hier_taken, 333 | hier_visited_nodes_count.len(), lvl1_necessary_nodes.len() 334 | ); 335 | 336 | if lvl1_necessary_nodes.len() + 337 | num_lvl1_hier_taken.max(hier_visited_nodes_count.len()) 338 | >= (1 << (BOOMERANG_NUM_STAGES - 1)) 339 | { 340 | clilog::trace!("REVERSED the plan due to overflow"); 341 | purge_bit( 342 | aig, &mut hier, &mut hier_visited_nodes_count, 343 | &level, &id2order, 344 | selected_level, slot_at_level 345 | ); 346 | selected_level -= 1; 347 | continue 348 | } 349 | 350 | reverse_level = reverse_level_upd; 351 | last_lvl1_necessary_nodes = lvl1_necessary_nodes; 352 | } 353 | 354 | if last_lvl1_necessary_nodes.is_empty() { 355 | last_lvl1_necessary_nodes = compute_lvl1_necessary_nodes( 356 | &order, &id2order, &level, 357 | &reverse_level, aig, &unrealized_comb_outputs, 358 | &hier_visited_nodes_count 359 | ); 360 | } 361 | 362 | // the hierarchy is now constructed except all 1-level nodes. 363 | // it's time to place them. during this process, we heuristically collect 364 | // endpoint nodes into consecutive space for early write-out. 365 | // 366 | // we first try to finalize all endpoints that have to appear in 367 | // level 1. 368 | // after that, we will try if we can write out all others scattered. 369 | let mut endpoints_lvl1 = Vec::new(); 370 | let mut endpoints_untouched = Vec::new(); 371 | let mut endpoints_hier = IndexSet::new(); 372 | for &endpt in unrealized_comb_outputs.iter() { 373 | if hier_visited_nodes_count.contains_key(&endpt) { 374 | endpoints_hier.insert(endpt); 375 | } 376 | else if last_lvl1_necessary_nodes.contains(&endpt) { 377 | endpoints_lvl1.push(endpt); 378 | } 379 | else { 380 | endpoints_untouched.push(endpt); 381 | } 382 | } 383 | 384 | // collect all 32-consecutive level 1 spaces. 385 | // (num occupied, i), will be sorted later. 386 | let mut spaces = Vec::new(); 387 | for i in 0..hier[1].len() / 32 { 388 | let mut num_occupied = 0u8; 389 | for j in i * 32..(i + 1) * 32 { 390 | if hier[1][j] != usize::MAX { 391 | num_occupied += 1; 392 | } 393 | } 394 | if num_occupied < 10 { 395 | spaces.push((num_occupied, i * 32)) 396 | } 397 | } 398 | spaces.sort(); 399 | let mut spaces_j = 0; 400 | let mut endpt_lvl1_i = 0; 401 | let mut realized_endpoints = IndexSet::new(); 402 | let mut write_outs = Vec::new(); 403 | // heuristically push level 1 endpoints. 
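        // The windows with the fewest occupied slots sort first, so each
        // write-out word we spend covers as many newly placed endpoints as
        // possible; we stop once the remaining endpoints no longer fill a
        // window, unless no untouched endpoints remain and we can try all.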
404 | while spaces_j < spaces.len() && 405 | (endpoints_untouched.is_empty() || // if we can try all 406 | endpoints_lvl1.len() - endpt_lvl1_i >= (32 - spaces[spaces_j].0) as usize) 407 | { 408 | let i = spaces[spaces_j].1; 409 | for j in i..i + 32 { 410 | if endpt_lvl1_i >= endpoints_lvl1.len() { break } 411 | if hier[1][j] == usize::MAX { 412 | let endpt_i = endpoints_lvl1[endpt_lvl1_i]; 413 | place_bit( 414 | aig, &mut hier, &mut hier_visited_nodes_count, 415 | &level, &id2order, 416 | 1, j, endpt_i 417 | ); 418 | realized_endpoints.insert(endpt_i); 419 | endpt_lvl1_i += 1; 420 | } 421 | else if unrealized_comb_outputs.contains(&hier[1][j]) { 422 | realized_endpoints.insert(hier[1][j]); 423 | } 424 | } 425 | *total_write_outs += 1; 426 | write_outs.push((i + hier[1].len()) / 32); 427 | spaces_j += 1; 428 | } 429 | 430 | if *total_write_outs > BOOMERANG_MAX_WRITEOUTS - num_reserved_writeouts { 431 | clilog::trace!("boomerang: write out overflowed"); 432 | return None 433 | } 434 | 435 | // then place all remaining lvl1 nodes in any order. 436 | // clilog::debug!("last_lvl1_necessary: {}, hier visited: {}, realized endpts: {}", last_lvl1_necessary_nodes.len(), hier_visited_nodes_count.len(), realized_endpoints.len()); 437 | let mut hier1_j = 0; 438 | for &nd in &last_lvl1_necessary_nodes { 439 | if hier_visited_nodes_count.contains_key(&nd) || 440 | realized_endpoints.contains(&nd) 441 | { 442 | continue 443 | } 444 | while hier[1][hier1_j] != usize::MAX { 445 | hier1_j += 1; 446 | if hier1_j >= hier[1].len() { 447 | clilog::trace!("boomerang: overflow putting lvl1"); 448 | return None 449 | } 450 | } 451 | place_bit( 452 | aig, &mut hier, &mut hier_visited_nodes_count, 453 | &level, &id2order, 454 | 1, hier1_j, nd 455 | ); 456 | } 457 | while hier[1][hier1_j] != usize::MAX { 458 | hier1_j += 1; 459 | if hier1_j >= hier[1].len() { 460 | clilog::trace!("boomerang: overflow putting lvl1 (just a zero pin..)"); 461 | return None 462 | } 463 | } 464 | 465 | // check if we can make this the last stage. 466 | if endpoints_untouched.is_empty() { 467 | let mut add_write_outs = IndexSet::new(); 468 | for hi in 1..=BOOMERANG_NUM_STAGES { 469 | for j in 0..hier[hi].len() { 470 | let nd = hier[hi][j]; 471 | if endpoints_hier.contains(&nd) && !realized_endpoints.contains(&nd) { 472 | add_write_outs.insert((j + hier[hi].len()) / 32); 473 | if add_write_outs.len() + *total_write_outs > BOOMERANG_MAX_WRITEOUTS - num_reserved_writeouts { 474 | break 475 | } 476 | } 477 | } 478 | } 479 | if add_write_outs.len() + *total_write_outs <= BOOMERANG_MAX_WRITEOUTS - num_reserved_writeouts { 480 | for wo in add_write_outs { 481 | write_outs.push(wo); 482 | *total_write_outs += 1; 483 | } 484 | for endpt in endpoints_hier { 485 | realized_endpoints.insert(endpt); 486 | } 487 | } 488 | } 489 | 490 | for (&i, _) in &hier_visited_nodes_count { 491 | realized_inputs.insert(i); 492 | } 493 | for &i in &realized_endpoints { 494 | assert!(unrealized_comb_outputs.swap_remove(&i)); 495 | } 496 | 497 | Some(BoomerangStage { 498 | hier, 499 | write_outs 500 | }) 501 | } 502 | 503 | impl Partition { 504 | /// build one partition given a set of endpoints to realize. 505 | /// 506 | /// if the resource is overflowed, None will be returned. 507 | /// see [Partition] for resource constraints. 
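    ///
    /// A minimal calling sketch (hypothetical variables; endpoint indices
    /// refer to `staged`'s endpoint groups):
    /// ```ignore
    /// let endpoints: Vec<usize> = (0..staged.num_endpoint_groups()).collect();
    /// if let Some(part) = Partition::build_one(&aig, &staged, &endpoints) {
    ///     println!("mapped into {} boomerang stages", part.stages.len());
    /// }
    /// ```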
508 |     pub fn build_one(
509 |         aig: &AIG,
510 |         staged: &StagedAIG,
511 |         endpoints: &Vec<usize>
512 |     ) -> Option<Partition> {
513 |         let mut unrealized_comb_outputs = IndexSet::new();
514 |         let mut realized_inputs = staged.primary_inputs.as_ref()
515 |             .cloned().unwrap_or_default();
516 |         let mut num_srams = 0;
517 |         let mut comb_outputs_activations = IndexMap::<usize, IndexSet<usize>>::new();
518 |         for &endpt_i in endpoints {
519 |             let edg = staged.get_endpoint_group(aig, endpt_i);
520 |             edg.for_each_input(|i| {
521 |                 unrealized_comb_outputs.insert(i);
522 |             });
523 |             match edg {
524 |                 EndpointGroup::DFF(dff) => {
525 |                     comb_outputs_activations.entry(dff.d_iv >> 1).or_default().insert(dff.en_iv << 1 | (dff.d_iv & 1));
526 |                 },
527 |                 EndpointGroup::PrimaryOutput(pin) => {
528 |                     comb_outputs_activations.entry(pin >> 1).or_default().insert(2 | (pin & 1));
529 |                 },
530 |                 EndpointGroup::RAMBlock(_) => {
531 |                     num_srams += 1;
532 |                 },
533 |                 EndpointGroup::StagedIOPin(pin) => {
534 |                     comb_outputs_activations.entry(pin).or_default().insert(2);
535 |                 },
536 |             }
537 |         }
538 |         let num_output_dups = comb_outputs_activations.iter()
539 |             .map(|(_, ckens)| ckens.len() - 1)
540 |             .sum::<usize>();
541 |         let num_reserved_writeouts = num_srams + (num_output_dups + 31) / 32;
542 |         if num_reserved_writeouts >= BOOMERANG_MAX_WRITEOUTS ||
543 |             num_srams * 4 + num_output_dups > BOOMERANG_MAX_WRITEOUTS
544 |         {
545 |             // overflowed writeout
546 |             return None
547 |         }
548 |         let mut stages = Vec::<BoomerangStage>::new();
549 |         let mut total_write_outs = 0;
550 |         while !unrealized_comb_outputs.is_empty() {
551 |             let stage = build_one_boomerang_stage(
552 |                 aig, &mut unrealized_comb_outputs,
553 |                 &mut realized_inputs, &mut total_write_outs,
554 |                 num_reserved_writeouts
555 |             )?;
556 |             stages.push(stage);
557 |         }
558 |         Some(Partition {
559 |             endpoints: endpoints.clone(),
560 |             stages
561 |         })
562 |     }
563 | }
564 | 
565 | /// Given an initial clustering solution of endpoints, generate and map a
566 | /// refined solution.
567 | ///
568 | /// The refined solution will have a smaller number of partitions,
569 | /// as we aggressively merge partitions when possible.
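///
/// `max_stage_degrad` bounds the slack: a merge is rejected when the
/// merged partition would need more boomerang stages than the deepest
/// original partition plus this allowance.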
570 | pub fn process_partitions( 571 | aig: &AIG, 572 | staged: &StagedAIG, 573 | mut parts: Vec>, 574 | max_stage_degrad: usize, 575 | ) -> Option> { 576 | let cnt_nodes = parts.par_iter().map(|v| { 577 | let mut comb_outputs = Vec::new(); 578 | for &endpt_i in v { 579 | staged.get_endpoint_group(aig, endpt_i).for_each_input(|i| { 580 | comb_outputs.push(i); 581 | }); 582 | } 583 | let order = aig.topo_traverse_generic( 584 | Some(&comb_outputs), 585 | staged.primary_inputs.as_ref(), 586 | ); 587 | order.len() 588 | }).collect::>(); 589 | 590 | let all_original_parts = parts.par_iter().enumerate().map(|(i, v)| { 591 | let part = Partition::build_one(aig, staged, v); 592 | if part.is_none() { 593 | clilog::error!("Partition {} exceeds resource constraint.", i); 594 | } 595 | part 596 | }).collect::>(); 597 | let all_original_parts = all_original_parts.into_iter().collect::>>()?; 598 | let max_original_nstages = all_original_parts.iter() 599 | .map(|p| p.stages.len()).max().unwrap(); 600 | 601 | let mut effective_parts = Vec::::new(); 602 | let max_trials = (all_original_parts.len() / 8).max(20); 603 | for (i, mut partition_self) in all_original_parts.into_iter().enumerate() { 604 | if parts[i].is_empty() { 605 | continue 606 | } 607 | let mut merge_blacklist = HashSet::::new(); 608 | let mut cnt_node_i = cnt_nodes[i]; 609 | loop { 610 | let mut comb_outputs = Vec::new(); 611 | for &endpt_i in &parts[i] { 612 | staged.get_endpoint_group(aig, endpt_i).for_each_input(|i| { 613 | comb_outputs.push(i); 614 | }); 615 | } 616 | 617 | let mut merge_choices = parts[i + 1..parts.len()].par_iter().enumerate().filter_map(|(j, v)| { 618 | if v.is_empty() { return None } 619 | if merge_blacklist.contains(&(i + j + 1)) { 620 | return None 621 | } 622 | let mut comb_outputs = comb_outputs.clone(); 623 | for &endpt_i in v { 624 | staged.get_endpoint_group(aig, endpt_i).for_each_input(|i| { 625 | comb_outputs.push(i); 626 | }); 627 | } 628 | let order = aig.topo_traverse_generic( 629 | Some(&comb_outputs), 630 | staged.primary_inputs.as_ref(), 631 | ); 632 | Some((order.len() - cnt_nodes[i + j + 1].max(cnt_node_i), 633 | order.len(), 634 | i + j + 1)) 635 | }).collect::>(); 636 | merge_choices.sort(); 637 | let mut merged = false; 638 | 639 | #[derive(Clone)] 640 | struct PartsPartitions { 641 | parts_ij: Vec, 642 | partition_ij: Option, 643 | } 644 | let mut merge_trials: Vec> = 645 | vec![None; merge_choices.len()]; 646 | let mut parallel_trial_stride = 4; 647 | 648 | for (merge_i, &(_cnt_diff, cnt_new, j)) in merge_choices.iter().enumerate() { 649 | if merge_trials[merge_i].is_none() { 650 | if merge_i > max_trials { 651 | break // do not try too more 652 | } 653 | let rhs = merge_trials.len().min( 654 | merge_i + parallel_trial_stride); 655 | merge_trials[merge_i..rhs].par_iter_mut().enumerate().for_each(|(merge_j, trial)| { 656 | let j = merge_choices[merge_i + merge_j].2; 657 | let parts_ij = parts[i].iter().chain(parts[j].iter()).copied().collect(); 658 | let partition_ij = Partition::build_one(aig, staged, &parts_ij); 659 | *trial = Some(PartsPartitions { 660 | parts_ij, partition_ij 661 | }); 662 | }); 663 | parallel_trial_stride *= 2; 664 | } 665 | 666 | let PartsPartitions { 667 | parts_ij, partition_ij 668 | } = merge_trials[merge_i].take().unwrap(); 669 | 670 | match partition_ij { 671 | None => { 672 | merge_blacklist.insert(j); 673 | } 674 | Some(partition) if partition.stages.len() > 675 | max_original_nstages + max_stage_degrad => 676 | { 677 | clilog::debug!("skipped merging {} with {} due 
to nstage degradation: \
678 |                                        {} > {}", i, j, partition.stages.len(),
679 |                                        max_original_nstages + max_stage_degrad);
680 |                     merge_blacklist.insert(j);
681 |                 }
682 |                 Some(partition) => {
683 |                     clilog::info!("merged partition {} with {}", i, j);
684 |                     parts[i] = parts_ij;
685 |                     parts[j] = vec![];
686 |                     partition_self = partition;
687 |                     merged = true;
688 |                     cnt_node_i = cnt_new;
689 |                     break
690 |                 },
691 |             }
692 |         }
693 |         if !merged { break }
694 |     }
695 | 
696 |     clilog::info!("part {}: #stages {}",
697 |                   i, partition_self.stages.len());
698 |     effective_parts.push(partition_self);
699 | }
700 | effective_parts.sort_by_key(|p| usize::MAX - p.stages.len());
701 | Some(effective_parts)
702 | }
703 | 
704 | /// Read a cluster solution from hgr.part.xx file.
705 | /// Then call [process_partitions].
706 | pub fn process_partitions_from_hgr_parts_file(
707 |     aig: &AIG,
708 |     staged: &StagedAIG,
709 |     hgr_parts_file: &PathBuf,
710 |     max_stage_degrad: usize,
711 | ) -> Option<Vec<Partition>> {
712 |     use std::io::{BufRead, BufReader};
713 |     use std::fs::File;
714 | 
715 |     let mut parts = Vec::<Vec<usize>>::new();
716 |     let f_parts = File::open(&hgr_parts_file).unwrap();
717 |     let f_parts = BufReader::new(f_parts);
718 |     for (i, line) in f_parts.lines().enumerate() {
719 |         let line = line.unwrap();
720 |         if line.is_empty() { continue }
721 |         let part_id = line.parse::<usize>().unwrap();
722 |         while parts.len() <= part_id {
723 |             parts.push(vec![]);
724 |         }
725 |         parts[part_id].push(i);
726 |     }
727 |     clilog::info!("read parts file {} with {} parts",
728 |                   hgr_parts_file.display(), parts.len());
729 | 
730 |     process_partitions(aig, staged, parts, max_stage_degrad)
731 | }
732 | 
--------------------------------------------------------------------------------
/src/repcut.rs:
--------------------------------------------------------------------------------
1 | // SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2 | // SPDX-License-Identifier: Apache-2.0
3 | //! RepCut implementation
4 | 
5 | use crate::aig::{DriverType, AIG};
6 | use crate::staging::StagedAIG;
7 | use indexmap::{IndexMap, IndexSet};
8 | use cachedhash::CachedHash;
9 | use std::collections::HashMap;
10 | use std::sync::Arc;
11 | use std::fmt;
12 | use rayon::prelude::*;
13 | use rand::prelude::*;
14 | use rand_chacha::ChaCha20Rng;
15 | 
16 | const REPCUT_HYPERGRAPH_EDGE_SIZE_LIMIT: usize = 1000;
17 | const REPCUT_BITSET_BLOCK_SIZE: usize = 4096;
18 | 
19 | #[derive(Hash, PartialEq, Eq, Debug)]
20 | struct EndpointSetSegment {
21 |     bs_set: [u64; REPCUT_BITSET_BLOCK_SIZE / 64],
22 | }
23 | 
24 | impl Default for EndpointSetSegment {
25 |     fn default() -> Self {
26 |         EndpointSetSegment {
27 |             bs_set: [0; REPCUT_BITSET_BLOCK_SIZE / 64]
28 |         }
29 |     }
30 | }
31 | 
32 | #[derive(Hash, PartialEq, Eq, Debug)]
33 | struct EndpointSet {
34 |     s: Vec<Option<Arc<CachedHash<EndpointSetSegment>>>>,
35 | }
36 | 
37 | pub struct RCHyperGraph {
38 |     clusters: IndexMap<CachedHash<EndpointSet>, usize>,
39 |     endpoint_weights: Vec<u64>,
40 | }
41 | 
42 | impl EndpointSet {
43 |     fn popcount(&self) -> usize {
44 |         self.s.iter().map(|o| {
45 |             match o {
46 |                 Some(ess) =>
47 |                     ess.bs_set.iter().map(|u| u.count_ones())
48 |                     .sum::<u32>() as usize,
49 |                 None => 0
50 |             }
51 |         }).sum()
52 |     }
53 | }
54 | 
55 | impl RCHyperGraph {
56 |     pub fn from_staged_aig(aig: &AIG, staged: &StagedAIG) -> RCHyperGraph {
57 |         let timer_repcut_endpoint_process = clilog::stimer!("repcut endpoint process");
58 |         let num_blocks = (
59 |             staged.num_endpoint_groups() + REPCUT_BITSET_BLOCK_SIZE - 1
60 |         ) / REPCUT_BITSET_BLOCK_SIZE;
61 |         let mut segments_blockid_nodeid = vec![
62 |             Vec::<Option<Arc<CachedHash<EndpointSetSegment>>>>::new();
63 |             num_blocks
64 |         ];
65 |         segments_blockid_nodeid.par_iter_mut().enumerate().for_each(|(i_block, vs)| {
66 |             *vs = vec![None; aig.num_aigpins + 1];
67 |             let endpoint_block_st = i_block * REPCUT_BITSET_BLOCK_SIZE;
68 |             let endpoint_block_ed = staged.num_endpoint_groups()
69 |                 .min(endpoint_block_st + REPCUT_BITSET_BLOCK_SIZE);
70 |             let mut endpoint_pins = Vec::new();
71 |             for endpt_i in endpoint_block_st..endpoint_block_ed {
72 |                 staged.get_endpoint_group(aig, endpt_i).for_each_input(|i| {
73 |                     endpoint_pins.push(i);
74 |                 });
75 |             }
76 |             let order_blk = aig.topo_traverse_generic(
77 |                 Some(&endpoint_pins),
78 |                 staged.primary_inputs.as_ref()
79 |             );
80 |             let mut unique_segs =
81 |                 IndexSet::<Arc<CachedHash<EndpointSetSegment>>>::new();
82 |             let mut ess_init: HashMap<usize, EndpointSetSegment> =
83 |                 HashMap::new();
84 |             for endpt_i in endpoint_block_st..endpoint_block_ed {
85 |                 let idx_offset = endpt_i - endpoint_block_st;
86 |                 staged.get_endpoint_group(aig, endpt_i).for_each_input(|i| {
87 |                     let ess = ess_init.entry(i).or_default();
88 |                     ess.bs_set[idx_offset / 64] |= 1 << (idx_offset % 64);
89 |                 });
90 |             }
91 |             for order_i in (0..order_blk.len()).rev() {
92 |                 let i = order_blk[order_i];
93 |                 let mut ess =
94 |                     ess_init.remove(&i).unwrap_or_default();
95 |                 let fs = aig.fanouts_start[i];
96 |                 let fe = aig.fanouts_start[i + 1];
97 |                 for fi in fs..fe {
98 |                     let j = aig.fanouts[fi];
99 |                     if let Some(vj) = &mut vs[j] {
100 |                         for bs_k in 0..REPCUT_BITSET_BLOCK_SIZE / 64 {
101 |                             ess.bs_set[bs_k] |= vj.bs_set[bs_k];
102 |                         }
103 |                     }
104 |                 }
105 |                 let ess = Arc::new(
106 |                     CachedHash::new(ess)
107 |                 );
108 |                 let (idx, _) = unique_segs.insert_full(ess);
109 |                 vs[i] = Some(unique_segs.get_index(idx).unwrap().clone());
110 |             }
111 |             // println!("vs: {:?}", vs);
112 |         });
113 |         // println!("sbn: {:?}", segments_blockid_nodeid);
114 |         let mut clusters = IndexMap::<_, usize>::new();
115 |         for i in 1..aig.num_aigpins {
116 |             let es = 
CachedHash::new(EndpointSet { 117 | s: (0..num_blocks) 118 | .map(|k| segments_blockid_nodeid[k][i] 119 | .clone()).collect() 120 | }); 121 | if es.popcount() >= 2 { 122 | *clusters.entry(es).or_default() += 1; 123 | } 124 | } 125 | clilog::finish!(timer_repcut_endpoint_process); 126 | 127 | let mut endpoint_pins_all = Vec::new(); 128 | for endpt_i in 0..staged.num_endpoint_groups() { 129 | staged.get_endpoint_group(aig, endpt_i).for_each_input(|i| { 130 | endpoint_pins_all.push(i); 131 | }); 132 | } 133 | let order_all = aig.topo_traverse_generic( 134 | Some(&endpoint_pins_all), 135 | staged.primary_inputs.as_ref() 136 | ); 137 | let mut node_weights = vec![0.0f32; aig.num_aigpins + 1]; 138 | for &i in &order_all { 139 | node_weights[i] = 1.; 140 | if let DriverType::AndGate(a, b) = aig.drivers[i] { 141 | if (a >> 1) != 0 { 142 | node_weights[i] += node_weights[a >> 1] / (( 143 | aig.fanouts_start[(a >> 1) + 1] - aig.fanouts_start[a >> 1] 144 | ) as f32); 145 | } 146 | if (b >> 1) != 0 { 147 | node_weights[i] += node_weights[b >> 1] / (( 148 | aig.fanouts_start[(b >> 1) + 1] - aig.fanouts_start[b >> 1] 149 | ) as f32); 150 | } 151 | } 152 | } 153 | let mut num_fanouts_to_endpt = vec![0usize; aig.num_aigpins + 1]; 154 | for endpt_i in 0..staged.num_endpoint_groups() { 155 | staged.get_endpoint_group(aig, endpt_i).for_each_input(|i| { 156 | num_fanouts_to_endpt[i] += 1; 157 | }); 158 | } 159 | let endpoint_weights = (0..staged.num_endpoint_groups()).map(|endpt_i| { 160 | let mut tot = 0.0; 161 | staged.get_endpoint_group(aig, endpt_i).for_each_input(|i| { 162 | tot += node_weights[i] / (num_fanouts_to_endpt[i] as f32) 163 | }); 164 | (tot + 0.5) as u64 165 | }).collect(); 166 | 167 | // println!("clusters: {:#?}, endpoint_weights: {:#?}", clusters, endpoint_weights); 168 | RCHyperGraph { 169 | clusters, endpoint_weights 170 | } 171 | } 172 | } 173 | 174 | impl fmt::Display for RCHyperGraph { 175 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 176 | let mut rng = ChaCha20Rng::seed_from_u64(8026727); 177 | writeln!(f, "{} {} 11", self.clusters.len(), self.endpoint_weights.len())?; 178 | for (s, v) in &self.clusters { 179 | let mut edgend = Vec::::new(); 180 | let mut num_prev_nodes = 0; 181 | edgend.reserve(REPCUT_HYPERGRAPH_EDGE_SIZE_LIMIT); 182 | for segment_i in 0..s.s.len() { 183 | let bs_set = match &s.s[segment_i] { 184 | Some(seg) => &seg.bs_set, 185 | None => continue 186 | }; 187 | for bs_i in 0..REPCUT_BITSET_BLOCK_SIZE / 64 { 188 | if bs_set[bs_i] == 0 { 189 | continue 190 | } 191 | for k in 0..64 { 192 | if (bs_set[bs_i] >> k & 1) != 0 { 193 | let nd = segment_i * REPCUT_BITSET_BLOCK_SIZE + bs_i * 64 + k + 1; 194 | if edgend.len() < REPCUT_HYPERGRAPH_EDGE_SIZE_LIMIT { 195 | edgend.push(nd); 196 | } 197 | else if rng.gen_range(0..num_prev_nodes) < REPCUT_HYPERGRAPH_EDGE_SIZE_LIMIT { 198 | edgend[rng.gen_range(0..REPCUT_HYPERGRAPH_EDGE_SIZE_LIMIT)] = nd; 199 | } 200 | num_prev_nodes += 1; 201 | } 202 | } 203 | } 204 | } 205 | write!(f, "{}", v)?; 206 | for nd in edgend { 207 | write!(f, " {}", nd)?; 208 | } 209 | writeln!(f)?; 210 | } 211 | for w in &self.endpoint_weights { 212 | writeln!(f, "{}", w)?; 213 | } 214 | Ok(()) 215 | } 216 | } 217 | -------------------------------------------------------------------------------- /src/staging.rs: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
2 | // SPDX-License-Identifier: Apache-2.0
3 | //! Splitting a deep circuit into major stages at global level indices.
4 | //!
5 | //! This is crucial in efficiently handling large and deep circuits
6 | //! with a limited processing element width.
7 | 
8 | use indexmap::IndexSet;
9 | use crate::aig::{AIG, EndpointGroup, DriverType};
10 | 
11 | /// A struct representing the boundaries of a staged AIG.
12 | pub struct StagedAIG {
13 |     /// the staged primary inputs from previous levels.
14 |     pub primary_inputs: Option<IndexSet<usize>>,
15 |     /// the staged primary output pins for next levels.
16 |     ///
17 |     /// these pins are active nodes at the level split.
18 |     pub primary_output_pins: Vec<usize>,
19 |     /// the endpoint indices of original AIG fulfilled by current level.
20 |     pub endpoints: Vec<usize>,
21 | }
22 | 
23 | impl StagedAIG {
24 |     /// Get the number of endpoint groups that should be fulfilled
25 |     /// with this staged AIG.
26 |     ///
27 |     /// This mimics the interface given by a raw AIG.
28 |     pub fn num_endpoint_groups(&self) -> usize {
29 |         self.primary_output_pins.len() + self.endpoints.len()
30 |     }
31 | 
32 |     /// Get the virtual endpoint group with an index.
33 |     ///
34 |     /// This mimics the interface given by a raw AIG.
35 |     pub fn get_endpoint_group<'aig>(&self, aig: &'aig AIG, endpt_id: usize) -> EndpointGroup<'aig> {
36 |         if endpt_id < self.primary_output_pins.len() {
37 |             EndpointGroup::StagedIOPin(self.primary_output_pins[endpt_id])
38 |         }
39 |         else {
40 |             aig.get_endpoint_group(self.endpoints[endpt_id - self.primary_output_pins.len()])
41 |         }
42 |     }
43 | 
44 |     /// build a staged AIG that consists of all levels.
45 |     pub fn from_full_aig(aig: &AIG) -> Self {
46 |         StagedAIG {
47 |             primary_inputs: None,
48 |             primary_output_pins: vec![],
49 |             endpoints: (0..aig.num_endpoint_groups()).collect()
50 |         }
51 |     }
52 | 
53 |     /// build a staged AIG by horizontal splitting given a subset
54 |     /// of endpoints.
55 |     ///
56 |     /// return built StagedAIG.
57 |     /// the endpoints are given as a slice of endpoint group indices,
58 |     /// that must have all staged primary output groups at the front
59 |     /// and original endpoints following. otherwise we panic.
60 |     ///
61 |     /// the result guarantees that the endpoint `i` corresponds to
62 |     /// the original staged's endpoint `endpoint_subset[i]`.
63 |     pub fn to_endpoint_subset(
64 |         &self,
65 |         endpoint_subset: &[usize]
66 |     ) -> StagedAIG {
67 |         let mut staged_sub = StagedAIG {
68 |             primary_inputs: self.primary_inputs.clone(),
69 |             primary_output_pins: vec![],
70 |             endpoints: vec![],
71 |         };
72 |         for &endpt_i in endpoint_subset {
73 |             if endpt_i < self.primary_output_pins.len() {
74 |                 staged_sub.primary_output_pins.push(
75 |                     self.primary_output_pins[endpt_i]
76 |                 );
77 |                 assert!(staged_sub.endpoints.is_empty(),
78 |                         "endpoint subset must be in order!");
79 |             }
80 |             else {
81 |                 staged_sub.endpoints.push(
82 |                     self.endpoints[endpt_i - self.primary_output_pins.len()]
83 |                 );
84 |             }
85 |         }
86 |         staged_sub
87 |     }
88 | 
89 |     /// build a staged AIG by vertical splitting at the given level id.
90 |     ///
91 |     /// return built StagedAIG.
92 |     /// the active middle nodes at split can be obtained from the
93 |     /// StagedAIG::primary_output_pins.
94 |     /// if this is empty, it means all endpoints are already satisfied
95 |     /// after this stage.
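    ///
    /// Sketch of chaining two vertical splits (hypothetical `endpoints` and
    /// `remaining` sets of unrealized endpoint group indices):
    /// ```ignore
    /// let stage0 = StagedAIG::from_split(&aig, &endpoints, None, 10);
    /// // nodes still alive at the cut become stage0's primary outputs and
    /// // serve as the primary inputs of the next stage:
    /// let cut: IndexSet<usize> =
    ///     stage0.primary_output_pins.iter().copied().collect();
    /// // `remaining` = `endpoints` minus `stage0.endpoints`
    /// let stage1 = StagedAIG::from_split(&aig, &remaining, Some(&cut), usize::MAX);
    /// ```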
96 | pub fn from_split( 97 | aig: &AIG, 98 | unrealized_orig_endpoints: &IndexSet, 99 | primary_inputs: Option<&IndexSet>, 100 | split_at_level: usize, 101 | ) -> Self { 102 | let mut unrealized_endpoint_nodes = Vec::new(); 103 | for &endpt in unrealized_orig_endpoints { 104 | aig.get_endpoint_group(endpt).for_each_input(|i| { 105 | unrealized_endpoint_nodes.push(i); 106 | }); 107 | } 108 | assert!(!unrealized_endpoint_nodes.is_empty()); 109 | let order = aig.topo_traverse_generic( 110 | Some(&unrealized_endpoint_nodes), 111 | primary_inputs 112 | ); 113 | let mut num_fanouts = vec![0; aig.num_aigpins + 1]; 114 | let mut level_id = vec![0; aig.num_aigpins + 1]; 115 | for &i in &order { 116 | if matches!(primary_inputs, Some(pi) if pi.contains(&i)) { 117 | continue 118 | } 119 | if let DriverType::AndGate(a, b) = aig.drivers[i] { 120 | if a >= 2 { 121 | num_fanouts[a >> 1] += 1; 122 | level_id[i] = level_id[i].max(level_id[a >> 1] + 1); 123 | } 124 | if b >= 2 { 125 | num_fanouts[b >> 1] += 1; 126 | level_id[i] = level_id[i].max(level_id[b >> 1] + 1); 127 | } 128 | } 129 | } 130 | let mut endpt_level_id = vec![0; aig.num_endpoint_groups()]; 131 | for &endpt in unrealized_orig_endpoints { 132 | aig.get_endpoint_group(endpt).for_each_input(|i| { 133 | num_fanouts[i] += 1; 134 | endpt_level_id[endpt] = endpt_level_id[endpt].max(level_id[i]); 135 | }); 136 | } 137 | let mut nodes_at_split = IndexSet::new(); 138 | for &i in &order { 139 | if level_id[i] > split_at_level { continue } 140 | nodes_at_split.insert(i); 141 | if matches!(primary_inputs, Some(pi) if pi.contains(&i)) { 142 | continue 143 | } 144 | if let DriverType::AndGate(a, b) = aig.drivers[i] { 145 | if a >= 2 { 146 | num_fanouts[a >> 1] -= 1; 147 | if num_fanouts[a >> 1] == 0 { 148 | assert!(nodes_at_split.swap_remove(&(a >> 1))); 149 | } 150 | } 151 | if b >= 2 { 152 | num_fanouts[b >> 1] -= 1; 153 | if num_fanouts[b >> 1] == 0 { 154 | assert!(nodes_at_split.swap_remove(&(b >> 1))); 155 | } 156 | } 157 | } 158 | } 159 | let mut endpoints_before_split = Vec::new(); 160 | for &endpt in unrealized_orig_endpoints { 161 | if endpt_level_id[endpt] > split_at_level { continue } 162 | endpoints_before_split.push(endpt); 163 | aig.get_endpoint_group(endpt).for_each_input(|i| { 164 | num_fanouts[i] -= 1; 165 | if num_fanouts[i] == 0 { 166 | assert!(nodes_at_split.swap_remove(&i)); 167 | } 168 | }); 169 | } 170 | 171 | StagedAIG { 172 | primary_inputs: primary_inputs.cloned(), 173 | primary_output_pins: nodes_at_split.iter().copied() 174 | .filter(|po| !matches!(primary_inputs, Some(pi) if pi.contains(po))) 175 | .collect(), 176 | endpoints: endpoints_before_split 177 | } 178 | } 179 | } 180 | 181 | /// Given the level split points, return a list of split stages. 182 | /// 183 | /// For example, given [10, 20], will return a list like this: 184 | /// [(0, 10, stage0_10), (10, 20, stage10_20), (20, MAX, stage20_MAX)] 185 | /// 186 | /// If the netlist ends early before all split points, the length might be 187 | /// shorter than expected. 
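/// 
/// A minimal driving sketch (mirrors `src/bin/repcut_test.rs`):
/// ```ignore
/// let aig = AIG::from_netlistdb(&netlistdb);
/// for (l, r, staged) in build_staged_aigs(&aig, &[10, 20]) {
///     // each `staged` covers the global levels between l and r
/// }
/// ```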
188 | pub fn build_staged_aigs(
189 |     aig: &AIG, level_split: &[usize]
190 | ) -> Vec<(usize, usize, StagedAIG)> {
191 |     let mut ret = Vec::new();
192 |     let mut unrealized_orig_endpoints = (0..aig.num_endpoint_groups()).collect::<IndexSet<usize>>();
193 |     let mut primary_inputs: Option<IndexSet<usize>> = None;
194 | 
195 |     for i in 0..level_split.len() {
196 |         let cur_split = level_split[i];
197 |         let last_split = match i {
198 |             0 => 0,
199 |             i @ _ => level_split[i - 1]
200 |         };
201 |         let staged = StagedAIG::from_split(
202 |             aig, &unrealized_orig_endpoints, primary_inputs.as_ref(),
203 |             cur_split - last_split
204 |         );
205 |         for &endpt in &staged.endpoints {
206 |             assert!(unrealized_orig_endpoints.swap_remove(&endpt));
207 |         }
208 |         let primary_inputs = primary_inputs.get_or_insert_with(|| Default::default());
209 |         for &inp in &staged.primary_output_pins {
210 |             primary_inputs.insert(inp);
211 |         }
212 |         if staged.primary_output_pins.is_empty() {
213 |             ret.push((last_split, usize::MAX, staged));
214 |             return ret
215 |         }
216 |         ret.push((last_split, cur_split, staged));
217 |     }
218 | 
219 |     let last_split = match level_split.len() {
220 |         0 => 0,
221 |         i @ _ => level_split[i - 1]
222 |     };
223 |     ret.push((last_split, usize::MAX, StagedAIG::from_split(
224 |         aig, &unrealized_orig_endpoints, primary_inputs.as_ref(),
225 |         usize::MAX
226 |     )));
227 | 
228 |     ret
229 | }
230 | 
--------------------------------------------------------------------------------
/usage.md:
--------------------------------------------------------------------------------
1 | # Getting Started to use GEM
2 | 
3 | **Caveats**: currently GEM only supports non-interactive testbenches. This means the input to the circuit needs to be a static waveform (e.g., VCD). Registers and clock gates inside the circuit are allowed, but latches and other asynchronous sequential logic are currently unsupported.
4 | 
5 | **Dataset**: Some input data (namely, netlists after the AIG transformation in Steps 1-2 below, and reference VCDs) is available [here](https://drive.google.com/drive/folders/1M42vFoVZhG4ZjyD1hqYD0Hrw8F1rwNXd?usp=drive_link).
6 | 
7 | ## Step 0. Download the AIG Process Kit
8 | Go to the [aigpdk](./aigpdk) directory and download `aigpdk.lib`, `aigpdk.v`, and `memlib_yosys.txt`. You will need them later in the flow.
9 | 
10 | Before continuing, make sure your design contains only synchronous logic.
11 | If your design has clock gates implemented in your RTL code, you need to replace them manually with instantiations of the `CKLNQD` module in `aigpdk.v`.
12 | Also, you are advised to know where memory blocks (e.g., caches) are implemented in your design, so you can later check that they are mapped correctly.
13 | 
14 | ## Step 1. Memory Synthesis with Yosys
15 | This step makes use of the open-source [Yosys](https://github.com/YosysHQ/yosys) synthesizer to recognize and map the memory blocks automatically.
16 | 
17 | Download and compile the latest version of Yosys. Then run a Yosys shell with the following synthesis script.
18 | 19 | ``` tcl 20 | # replace this with paths to your RTL code, and add `-I`, `-D`, `-sv` etc when necessary 21 | read_verilog xx.v yy.v top.v 22 | 23 | # replace TOP_MODULE with your top module name 24 | hierarchy -check -top TOP_MODULE 25 | 26 | # simplify design before mapping 27 | proc;; 28 | opt_expr; opt_dff; opt_clean 29 | memory -nomap 30 | 31 | # map the rams 32 | # point -lib path to your downloaded memlib_yosys.txt 33 | memory_libmap -lib path/to/memlib_yosys.txt -logic-cost-rom 100 -logic-cost-ram 100 34 | ``` 35 | 36 | The `memory_libmap` command will output a list of RAMs it found and mapped. 37 | 38 | - If you see `$__RAMGEM_SYNC_`, it means the mapping is successful. 39 | - If you see `$__RAMGEM_ASYNC_`, it means this RAM is found to have asynchronous READ port. You need to confirm if it is the case. 40 | - If it is a synchronous one but accidentally recognized as asynchronous, you might need to patch the RTL code to fix it. There might be multiple reasons it cannot be recognized as synchronous. For example, [when the read and write clocks are different](https://github.com/YosysHQ/yosys/issues/4521). 41 | - If it is indeed asynchronous, check its size. If its size is very small and affordable to be synthesized using registers and mux trees (which is *very* expensive for large RAM banks), you can remove the `$__RAMGEM_ASYNC_` block in `memlib_yosys.txt`, re-run Yosys to force the use of registers. 42 | - If you see `using FF mapping for memory`, it means the memory is recognized, but due to it being nonstandard (e.g., special global reset or nontrivial initialization), GEM will fall back to registers and mux trees. If the size of the memory is small, this is usually not an issue. Otherwise, you are advised to try other implementations. 43 | 44 | After a successful mapping, use the following command to write out the mapped RTL as a single Verilog file. 45 | ``` tcl 46 | write_verilog memory_mapped.v 47 | ``` 48 | 49 | Check the correctness of this step by simulating `memory_mapped.v` with your reference CPU simulator. 50 | 51 | ## Step 2. Logic Synthesis 52 | This step maps all combinational and sequential logic into a special set of standard cells we defined in `aigpdk.lib`. 53 | The quality of synthesis is directly tied to GEM's final performance, so we suggest you use a commercial synthesis tool like DC. You can also use Yosys to complete this if you do not have access to a commercial synthesis tool. 54 | 55 | Check the correctness of this step by simulating `gatelevel.gv` with your reference CPU simulator. 56 | 57 | ### Use Synopsys DC 58 | First, you need to compile `aigpdk.lib` to `aigpdk.db` using Library Compiler. 59 | 60 | With that, you synthesize the `memory_mapped.v` obtained before under `aigpdk.db`. 61 | 62 | Some key commands you may use on top of your existing DC flow: 63 | 64 | ``` tcl 65 | # change path/to/aigpdk.db to a correct path. same for other commands. 66 | set_app_var link_path path/to/aigpdk.db 67 | set_app_var target_library path/to/aigpdk.db 68 | read_file -format db $target_library 69 | 70 | # elaborate TOP_MODULE 71 | # current_design TOP_MODULE 72 | 73 | # timing settings like create_clock ... are recommended. GEM benefits from timing-driven synthesis. 
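# For example (hypothetical clock port `clk` and 2 ns target period;
# replace with your design's real constraints):
create_clock -name clk -period 2.0 [get_ports clk]
set_input_delay 0.2 -clock clk [remove_from_collection [all_inputs] [get_ports clk]]
set_output_delay 0.2 -clock clk [all_outputs]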
74 | 
75 | compile_ultra -no_seq_output_inversion -no_autoungroup
76 | optimize_netlist -area
77 | 
78 | write -format verilog -hierarchy -out gatelevel.gv
79 | ```
80 | 
81 | ### Use Yosys: Example script
82 | ``` tcl
83 | # if you exited Yosys after Step 1, read your memory_mapped.v back in first.
84 | # read_verilog memory_mapped.v
85 | # hierarchy -check -top TOP_MODULE
86 | 
87 | # synthesis
88 | synth -flatten
89 | delete t:$print
90 | 
91 | # change path/to/aigpdk.lib to a correct path. same for other commands.
92 | dfflibmap -liberty path/to/aigpdk.lib
93 | opt_clean -purge
94 | abc -liberty path/to/aigpdk.lib
95 | opt_clean -purge
96 | techmap
97 | abc -liberty path/to/aigpdk.lib
98 | opt_clean -purge
99 | 
100 | # write out
101 | write_verilog gatelevel.gv
102 | ```
103 | 
104 | ## Step 3. Download and Compile GEM
105 | Make sure CUDA is installed on your Linux machine.
106 | 
107 | Download and install the Rust toolchain. This is as simple as a one-liner in your terminal. We recommend [https://rustup.rs](https://rustup.rs/).
108 | 
109 | Clone GEM along with its dependencies.
110 | ``` sh
111 | git clone https://github.com/NVlabs/GEM.git
112 | cd GEM
113 | git submodule update --init --recursive
114 | ```
115 | 
116 | GEM comes with a `cut_map_interactive` command and a `cuda_test` command, which correspond to the `compile` and `simulate` steps of a classical CPU simulator. See their help usage with the following commands under `GEM`:
117 | ``` sh
118 | cargo run -r --features cuda --bin cut_map_interactive -- --help
119 | 
120 | cargo run -r --features cuda --bin cuda_test -- --help
121 | ```
122 | 
123 | ## Map the Design with GEM
124 | GEM depends on an external hypergraph partitioner binary. We recommend hmetis 2.0. You can download its binary and put it in a proper location.
125 | 
126 | Then, run the following command to start the Boolean processor mapping.
127 | 
128 | ``` sh
129 | cargo run -r --features cuda --bin cut_map_interactive -- path/to/hmetis/Linux-x86_64/hmetis2.0pre1 path/to/gatelevel.gv path/to/result.gemparts
130 | ```
131 | 
132 | The mapped result will be stored in a binary file `result.gemparts`.
133 | 
134 | If the mapping fails because a deep circuit cannot be partitioned (this often shows up as an attempt to partition a circuit with only 0 or 1 endpoints), try adding a `--level-split` option to force a stage split, for example `--level-split 30` or `--level-split 20,40`. If you use this, remember to pass the same `--level-split` option when you simulate.
135 | 
136 | ## Simulate the Design
137 | Run the following. Replace `NUM_BLOCKS` with twice the number of physical streaming multiprocessors (SMs) of your GPU. If the ports in your `input.vcd` are not at the top level, add `--input-vcd-scope` to specify their scope.
138 | ``` sh
139 | cargo run -r --features cuda --bin cuda_test -- path/to/gatelevel.gv path/to/result.gemparts path/to/input.vcd path/to/output.vcd NUM_BLOCKS
140 | ```
141 | 
142 | The simulated output port values will be stored in `output.vcd`.
143 | 
144 | **Caveat**: The actual GPU simulation runtime is also printed. You might see a long delay before the GPU portion starts, caused by reading and parsing `input.vcd`. We recommend developing your own pipeline to feed input waveforms directly into the GEM CUDA kernels.
145 | 
--------------------------------------------------------------------------------