├── RAMP-slides
    ├── 10-2-design-verif-mike-butts-rtl-simulation-taxonomy.pdf
    ├── Derek_201008_RAMPWrap (Slides, 8-25-2010).pptx
    ├── FPT14_Keynote_MButts_2Dec14.pdf
    ├── JHoe_RAMPWrap_Aug10A (Slides, 8-25-2010).pdf
    ├── KVissers_Ramp_wrap_slides (Slides, 8-25-2010).pdf
    ├── Krste_RAMP-20100825-GoldWrap (Slides, 8-25-2010).pptx
    ├── Krste_RAMP-20100825-WrapWrap (Slides, 8-25-2010).pptx
    ├── MAdler_201008_RAMP (Slides, 8-25-2010).pptx
    ├── RAMP2010_MButts20Aug (Slides, 8-25-2010).pptx
    ├── RAMPretroPatterson (Slides, 8-25-2010).ppt
    ├── Thacker_RAMPing Down (Slides, 8-25-2010).pdf
    └── ramp-retro.kozyrakis (Slides, 8-25-2010).pptx
├── README.md
└── resources
    ├── GPU-accelerated-rtl-sim-with-loop-unrolling.pdf
    └── emulation
        ├── 03_Jonathan_Bachrach_--_Rethinking_Sketching_with_Chisel_and_DREAMER.pdf
        ├── 2023.ash.micro.pdf
        ├── A_Survey_of_Hardware_Accelerators_Used_in_Computer-Aided_Design.pdf
        ├── RAMP2010_MButts20Aug (Slides, 8-25-2010).pptx
        ├── RTL-emulation-uarch.pptx
        ├── RTL-power-analysis-using-GL-cell-power-models.pdf
        ├── Speeding up lookup table driven logic simulation.pdf
        ├── The_Yorktown_Simulation_Engine.pdf
        ├── US5551013-processor-design.pdf
        ├── US5551013.pdf
        ├── US6035117.pdf
        ├── US7047179.pdf
        ├── US7555423.pdf
        ├── cyclist-chisel-emulator.pdf
        ├── ibm_logic_engine.pdf
        ├── ibm_lse_2.pdf
        ├── malibu-fpga2011.pdf
        └── palladium-dynamic-power-anal.pdf


/RAMP-slides/10-2-design-verif-mike-butts-rtl-simulation-taxonomy.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/euphoric-hardware/rtl-simulation-reading/be4baf8dd63a7b8ade143b0705b8859b412d4259/RAMP-slides/10-2-design-verif-mike-butts-rtl-simulation-taxonomy.pdf


--------------------------------------------------------------------------------
/RAMP-slides/Derek_201008_RAMPWrap (Slides, 8-25-2010).pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/euphoric-hardware/rtl-simulation-reading/be4baf8dd63a7b8ade143b0705b8859b412d4259/RAMP-slides/Derek_201008_RAMPWrap (Slides, 8-25-2010).pptx


--------------------------------------------------------------------------------
/RAMP-slides/FPT14_Keynote_MButts_2Dec14.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/euphoric-hardware/rtl-simulation-reading/be4baf8dd63a7b8ade143b0705b8859b412d4259/RAMP-slides/FPT14_Keynote_MButts_2Dec14.pdf


--------------------------------------------------------------------------------
/RAMP-slides/JHoe_RAMPWrap_Aug10A (Slides, 8-25-2010).pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/euphoric-hardware/rtl-simulation-reading/be4baf8dd63a7b8ade143b0705b8859b412d4259/RAMP-slides/JHoe_RAMPWrap_Aug10A (Slides, 8-25-2010).pdf


--------------------------------------------------------------------------------
/RAMP-slides/KVissers_Ramp_wrap_slides (Slides, 8-25-2010).pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/euphoric-hardware/rtl-simulation-reading/be4baf8dd63a7b8ade143b0705b8859b412d4259/RAMP-slides/KVissers_Ramp_wrap_slides (Slides, 8-25-2010).pdf


--------------------------------------------------------------------------------
/RAMP-slides/Krste_RAMP-20100825-GoldWrap (Slides, 8-25-2010).pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/euphoric-hardware/rtl-simulation-reading/be4baf8dd63a7b8ade143b0705b8859b412d4259/RAMP-slides/Krste_RAMP-20100825-GoldWrap (Slides, 8-25-2010).pptx


--------------------------------------------------------------------------------
/RAMP-slides/Krste_RAMP-20100825-WrapWrap (Slides, 8-25-2010).pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/euphoric-hardware/rtl-simulation-reading/be4baf8dd63a7b8ade143b0705b8859b412d4259/RAMP-slides/Krste_RAMP-20100825-WrapWrap (Slides, 8-25-2010).pptx


--------------------------------------------------------------------------------
/RAMP-slides/MAdler_201008_RAMP (Slides, 8-25-2010).pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/euphoric-hardware/rtl-simulation-reading/be4baf8dd63a7b8ade143b0705b8859b412d4259/RAMP-slides/MAdler_201008_RAMP (Slides, 8-25-2010).pptx


--------------------------------------------------------------------------------
/RAMP-slides/RAMP2010_MButts20Aug (Slides, 8-25-2010).pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/euphoric-hardware/rtl-simulation-reading/be4baf8dd63a7b8ade143b0705b8859b412d4259/RAMP-slides/RAMP2010_MButts20Aug (Slides, 8-25-2010).pptx


--------------------------------------------------------------------------------
/RAMP-slides/RAMPretroPatterson (Slides, 8-25-2010).ppt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/euphoric-hardware/rtl-simulation-reading/be4baf8dd63a7b8ade143b0705b8859b412d4259/RAMP-slides/RAMPretroPatterson (Slides, 8-25-2010).ppt


--------------------------------------------------------------------------------
/RAMP-slides/Thacker_RAMPing Down (Slides, 8-25-2010).pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/euphoric-hardware/rtl-simulation-reading/be4baf8dd63a7b8ade143b0705b8859b412d4259/RAMP-slides/Thacker_RAMPing Down (Slides, 8-25-2010).pdf


--------------------------------------------------------------------------------
/RAMP-slides/ramp-retro.kozyrakis (Slides, 8-25-2010).pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/euphoric-hardware/rtl-simulation-reading/be4baf8dd63a7b8ade143b0705b8859b412d4259/RAMP-slides/ramp-retro.kozyrakis (Slides, 8-25-2010).pptx


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # RTL Simulation Reading Group Notes
  2 | 
  3 | ## Papers / Patents
  4 | 
  5 | ### Emulation HW
  6 | 
  7 | Custom ASIC-based emulation hardware produced by industry.
  8 | 
  9 | - [Yorktown simulation engine](https://ieeexplore.ieee.org/document/1585479) (DAC 1982, IBM)
 10 | - [A survey of HW accelerators used in CAD](https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=5005647) (IEEE Design and Test 1984, Tom Blank (Stanford))
 11 | - [Logic simulation engines in Japan](https://ieeexplore.ieee.org/abstract/document/43078) (IEEE Design and Test 1989, NEC / Fujitsu)
 12 | - [Multiprocessor for HW emulation](https://patents.google.com/patent/US5551013A/en) (Patent 1994, IBM / Cadence)
 13 | - [Emulating multi-ported memory circuits](https://patents.google.com/patent/US5940603A/en) (Patent 1997, Quickturn / Cadence)
 14 | - [Speeding Up Look-up-Table Driven Logic Simulation](https://link.springer.com/chapter/10.1007/978-0-387-35498-9_34) (Springer 1999, Fujitsu)
 15 | - [Sahara: Massively Parallel Dedicated Hardware for Cycle-Based Logic Simulations](https://onlinelibrary.wiley.com/doi/epdf/10.1002/ecjc.20193) (Wiley 2005, Fujitsu)
 16 | - [ibm logic engine](./resources/emulation/ibm_logic_engine.pdf)
 17 | - [ibm logic engine 2](./resources/emulation/ibm_lse_2.pdf)
 18 | 
 19 | ### FPGA Overlays
 20 | 
 21 | FPGA overlay-oriented emulation hardware and techniques for word-level FPGA compilation.
 22 | 
 23 | - [Time multiplexed FPGA architecture for logic emulation](https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=518231) (CICC 1995, UToronto)
 24 | - [A CAD framework for Malibu: an FPGA with time-multiplexed coarse-grained elements](https://dl.acm.org/doi/abs/10.1145/1950413.1950441) (FPGA 2011, UBC)
 25 | - [Hoplite: Building Austere Overlay NoCs for FPGAs](https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=7293956) (FPL 2015)
 26 | - [Overgen](https://polyarch.cs.ucla.edu/papers/micro2022-overgen.pdf) (MICRO 2022)
 27 | - [grvi-phalanx](https://fpga.org/grvi-phalanx/)
 28 |     - FPGA efficient implementation of a RISC-V processor
 29 |     - 2/3 stage
 30 |     - Each processor takes about 320 6-input LUTs and the design closes timing at 300MHz
 31 | - [hoplite](https://fpga.org/hoplite/)
 32 |     - FPGA efficient implementation of a NoC
 33 |     - Seems like they support the mesh topology, would need to profile the design to decide the amount of fanout
 34 |     - Scheduling becomes much more difficult with this constraint (which is also why this is interesting)
 35 | 
 36 | ### Academic Attempts at Emulation
 37 | 
 38 | Academic efforts to create emulation hardware either using an FPGA overlay or modeling a custom emulation ASIC.
 39 | 
 40 | - [Cyclist](https://dl.acm.org/doi/abs/10.1109/ICCAD.2017.8203892) (ICCAD 2017)
 41 |   - [flo-llvm](https://github.com/palmer-dabbelt/flo-llvm), [libflo](https://github.com/palmer-dabbelt/libflo)
 42 |   - [Chisel DREAMER emulation platform](https://wiki.eecs.berkeley.edu/dreamer/Main/20141203Notes)
 43 | - [Accelerating RTL Simulation with Hardware-Software Co-Design](https://dl.acm.org/doi/abs/10.1145/3613424.3614257) (MICRO 2023)
 44 | - [Manticore: Hardware-Accelerated RTL Simulation with Static Bulk-Synchronous Parallelism](https://dl.acm.org/doi/10.1145/3623278.3624750) (ASPLOS 2023)
 45 |   - [A 475 MHz Manycore FPGA Accelerator for RTL Simulation - Manticore implementation paper](https://dl.acm.org/doi/pdf/10.1145/3626202.3637579) (FPGA 2024)
 46 | 
 47 | ### Compiler Partitioning Strategy
 48 | 
 49 | - [Yorktown simulation SW support](https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=1585481) (DAC 1982, IBM)
 50 | - [Load and Communications Balancing on Multiprocessor Logic Simulation Engines](https://web.archive.org/web/20170222020308id_/http://openscholarship.wustl.edu/cgi/viewcontent.cgi?article=1814&context=cse_research) (1987)
 51 | - [Efficient circuit partitioning algorithms for parallel logic simulation](https://dl.acm.org/doi/abs/10.1145/76263.76303) (SC 1989, UIUC)
 52 | - [Performance analysis of a parallel logic simulation machine](https://www.sciencedirect.com/science/article/pii/0743731589900294?) (1989)
 53 | - [Multiple-level partitioning: an application to the very large-scale hardware simulator](https://ieeexplore.ieee.org/abstract/document/78241) (JSSC 1991)
 54 | - [Jim's graph partitioning algorithm lecture](https://people.eecs.berkeley.edu/~demmel/lecture22a_partition_demmel22.pdf)
 55 | 
 56 | ### FireSim ancestors
 57 | 
 58 | - [FPGA-Accelerated Simulation Technologies (FAST): Fast, Full-System, Cycle-Accurate Simulators](https://ieeexplore.ieee.org/abstract/document/4408260) (MICRO 2007)
 59 | - [RAMP: Research Accelerator for Multiple Processors](https://ieeexplore.ieee.org/abstract/document/4287395) (IEEE Micro 2007)
 60 | - [RAMP Blue: A Message-Passing Manycore System in FPGAs](https://ieeexplore.ieee.org/abstract/document/4380625) (FPL 2007)
 61 | - [RAMP Blue: Implementation of a Manycore 1008 Processor FPGA System](http://people.eecs.berkeley.edu/~krste/papers/Burke_RAMP_Blue_RSSI_2008.pdf) (RSSI 2008)
 62 | - [A-Ports: an efficient abstraction for cycle-accurate performance models on FPGAs](https://dl.acm.org/doi/pdf/10.1145/1344671.1344685) (FPGA 2008)
 63 | - [Quick Performance Models Quickly: Closely-Coupled Partitioned Simulation on FPGAs](https://ieeexplore.ieee.org/abstract/document/4510733) (ISPASS 2008)
 64 | - [A-Port Networks: Preserving the Timed Behavior of Synchronous Systems for Modeling on FPGAs](https://dl.acm.org/doi/abs/10.1145/1575774.1575775) (2009)
 65 | - [ProtoFlex: Towards Scalable, Full-System Multiprocessor Simulations Using FPGAs](https://dl.acm.org/doi/abs/10.1145/1534916.1534925) (2009)
 66 | - [The Future of Architectural Simulation](https://ieeexplore.ieee.org/abstract/document/5506934) (IEEE Micro 2010)
 67 | - [RAMP gold: an FPGA-based architecture simulator for multiprocessors](https://dl.acm.org/doi/abs/10.1145/1837274.1837390) (DAC 2010)
 68 | - [A case for FAME: FPGA architecture model execution](https://dl.acm.org/doi/abs/10.1145/1815961.1815999) (ISCA 2010)
 69 | - [HAsim: FPGA-based high-detail multicore simulation using time-division multiplexing](https://ieeexplore.ieee.org/abstract/document/5749747) (HPCA 2011)
 70 | - [Leveraging latency-insensitivity to ease multiple FPGA design](https://dl.acm.org/doi/abs/10.1145/2145694.2145725) (FPGA 2012)
 71 | - [A cycle-accurate, cycle-reproducible multi-FPGA system for accelerating multi-core processor simulation](https://dl.acm.org/doi/abs/10.1145/2145694.2145720) (FPGA 2012)
 72 | - [Golden Gate: Bridging The Resource-Efficiency Gap Between ASICs and FPGA Prototypes](https://ieeexplore.ieee.org/abstract/document/8942087) (ICCAD 2019)
 73 | - [FASED: FPGA-Accelerated Simulation and Evaluation of DRAM](https://dl.acm.org/doi/abs/10.1145/3289602.3293894) (FPGA 2019)
 74 | 
 75 | ### Power, gate level Simulation
 76 | 
 77 | - [Using a hardware simulation engine for custom MOS structured designs](https://ieeexplore.ieee.org/abstract/document/5390298) (IBM 1984)
 78 | - CPF_palladium
 79 | - LowPowerCPF-Simulation-Guide
 80 | 
 81 | ### Software RTL Simulation
 82 | 
 83 | - [VLSI Logic and Fault Simulation on General-Purpose Parallel Computers](https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=215006) (TCAD 1993, IBM)
 84 | - [LiveSim: A Fast Hot Reload Simulator for HDLs](https://ieeexplore.ieee.org/abstract/document/9238634) (ISPASS 2020)
 85 | - [A Case for Accelerating Software RTL Simulation by Scott Beamer](https://ieeexplore.ieee.org/abstract/document/9099598) (IEEE Micro 2020)
 86 | - [ESSENT: Efficiently Exploiting Low Activity Factors to Accelerate RTL Simulation](https://ieeexplore.ieee.org/abstract/document/9218632) (DAC 2020)
 87 | - [Fast behavioural rtl simulation of 10b transistor soc designs with metro-mpi](https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=10137080) (DATE 2023)
 88 | - [RepCut: Superlinear Parallel RTL Simulation with Replication-Aided Partitioning](https://dl.acm.org/doi/abs/10.1145/3582016.3582034) (ASPLOS 23)
 89 | - [On Accelerating PyRTL Simulation with Essential Signal Simulation Techniques](https://ieeexplore.ieee.org/abstract/document/10218453) (ISEDA 23)
 90 | - [Khronos: Fusing Memory Access for Improved Hardware RTL Simulation](https://dl.acm.org/doi/abs/10.1145/3613424.3614301) (MICRO 23)
 91 | - [TaroRTL: Accelerating RTL Simulation using Coroutine-based Heterogeneous Task Graph Scheduling](https://jsm.ece.wisc.edu/docs/lin-europar2024.pdf) (Euro-Par 24)
 92 | 
 93 | ### HW-Accelerated (Non-FPGA) RTL Simulation
 94 | 
 95 | - [RTLFlow: From RTL to CUDA: A GPU Acceleration Flow for RTL Simulation with Batch Stimulus](https://dl.acm.org/doi/abs/10.1145/3545008.3545091) (ICPP 22)
 96 | - [Parendi: Thousand-Way Parallel RTL Simulation](https://arxiv.org/abs/2403.04714) (Arxiv Preprint 2024)
 97 | - [GPU accelerated RTL simulations with loop unrolling](http://joces.nudt.edu.cn/EN/article/downloadArticleFile.do?attachType=PDF&id=18075)
 98 |     - Can we unroll + use FAME-5 to amortize resource overhead while enabling higher decoupling btw partitions?
 99 | 
100 | ### etc
101 | 
102 | - [Rents rule](./RAMP-slides/FPT14_Keynote_MButts_2Dec14.pdf)
103 | - [Mike butts RTL verification taxonomy](./RAMP-slides/10-2-design-verif-mike-butts-rtl-simulation-taxonomy.pdf)
104 | 
105 | ---
106 | 
107 | ## Tentative schedule
108 | 
109 | ## Week 1 - uarch
110 | 
111 | - [Multiprocessor for HW emulation](https://patents.google.com/patent/US5551013A/en)
112 | 
113 | ### Summary
114 | 
115 | #### Processor design
116 | 
117 | - Instructions execute step by step (no control flow, fixed set of instructions in IMEM). Each iteration through the instruction memory corresponds to one target cycle
118 | - Two data memories (denoted as the input stack and the data stack)
119 |     - In each step, the function bit out (FBO) is stored in the data stack
120 |     - In each step, the input from the switch is stored in the input stack. Instruction encodes which other processor to accept the incoming switch bit from (a bit can be ignored or broadcasted to X other processors)
121 |     - Need to perform logic computation in a BFS manner in order to use the values created from previous steps
122 | - LUTs are configured to simulate arbitrary N-1 gates. Operands are read from the input/data stack.
123 | - Bits can be forwarded to nearby processor (N-3 ~ N+3) instead of going through the network.
124 |     - Way of saving one cycle: if the bit goes over the network, to use it, the processor has to store it in the input stack and use it in the following cycle
125 | - Instruction memory is split into two parts: left and right
126 |     - For logic emulation, left and right both encode the operation to perform
127 |     - For memory (SRAM?) emulation, the right instruction is essentially the data array. 16 processors are grouped and a bit from each group is used to generate the address for the memory operation
128 | 
129 | #### Emulation module, board, platform
130 | 
131 | - Module
132 |     - 64 processors are grouped together as a module
133 |     - All the processors within a module are connected as a crossbar
134 | - Board
135 |     - Collection of emulation modules
136 |     - Module ports are connected in a pre-configured fashion
137 | - Platform
138 |     - Collection of boards, DRAM(?), host communication logic, and other platform control logic
139 | - Need to synchronize on every cycle across all boards. How should this global synchronization be achieved? Also, can we allow certain parts to slip ahead of this global synchronization barrier (I think we can, but the benefit might not be significant due to straggler effects)
140 | 
141 | ---
142 | 
143 | - Can connect multiple processors to simulate logic where the logic depth is larger than the max steps
144 |     - The performance degradation as the target design size increases is gradual
145 | - Inter board communication has to happen in fixed latency / compiler is aware of the latency (to the compiler, the link doesn't really matter except that the scheduling might change a little bit)
146 | - Need to have a core that can run testbench code near the machine (display messages, assertions, C++ models ...)
147 | - For 4 state, just use software and inject state
148 |     - However, there are other cases where 4 state sim make sense : external IP can inject 4 state, low power simulation...
149 |     - Cadence added support for X-prop in their latest Palladium
150 |     - Problem with X-prop is that you have to use 2 bits to simulate a single bit (00 -> 0, 01 -> 1, 10 -> X, 11 -> Z), which can be very area inefficient (especially since X's are a rare state compared to just 0 & 1)
151 |     - But for the problems that we are trying to deal with (functional & performance verification) 2 state simulation may be sufficient
152 | - Expanding SRAM depth is cheap because we can use custom macros
153 |     - So when you can increase the frequency of the design, you would want to increase the SRAM depth so that each processor can emulate more gates w/o performance loss
154 |     - However, if the frequency is fixed, increasing the number of steps per cycle translates to lower simulation perf
155 |     - So that seems to be the reason that the IBM people settled on 128 steps
156 |     - Need to find the optimal step for FPGA & ASIC w/ modern technology nodes.
157 | - One implementation option: add the processor grid as a FireSim LI-BDN where the interface is fixed (e.g., your tile)
158 |     - Can share the FireSim bridge/IO infrastructure
159 |     - Can save FPGA resources by mapping parts of the design directly on the FPGA and only the part where you anticipate RTL changes on the emulation processors
160 |     - For FPGA overlay (300 MHz), may have to make the network more simple to save FPGA routing resources
161 |         - The compiler has to be aware of the network latency (network has to be designed to have static latency & maps well onto an FPGA in such a way it matches switch boxes well)
162 |         - The compiler has to be able to pipeline instructions to hide extra network latency
163 |     - GCD is a good place to start
164 |     - FMR will increase (perhaps similar to when running TracerV)
165 |         - Jerry's opinion is that we shouldn't try to compromise on performance
166 |         - In my opinion, this is somewhat inevitable and not too bad
167 | - Approximating how many gates we can emulate when using an FPGA overlay
168 |     - FPGA can simulate N ASIC gates
169 |     - Each emulation processor corresponds to M ASIC gates, and has max T steps (T gates)
170 |         - M has to take account of the network
171 |     - Gates that can be emulated is approximately (N / M * T)
172 |     - Need to measure T/M by implementing a dummy module and building a bitstream with it
173 | 
174 | ### Discussion/Questions
175 | 
176 | - Should the compiler always cut across register boundaries?
177 |     - If an RTL block mapped to a single processor contains sequential logic, the processor cannot use the bit in the data stack that corresponds to the FF as it will be overwritten. So that bit must go across the network and come back and the compiler would have to insert NOPs. -> utilization vs performance tradeoff
178 |     - Alternatively, can double the on-chip memory so that each half can work like a master (producing bits) and slave (storing bits for the next cycle). This enables more partitioning flexibility in the compiler but decreases area efficiency of the processors
179 | - How many processors can fit in a single FPGA & how many processors/modules/boards would we need to simulate a reasonably sized CY SoC?
180 | - What are some problems that might show up when scaling this system up in such a way that it can support a billion gate simulation?
181 | - (Since this word seems like some magic keyword to people) Heterogeneous integration of processor designs? Can we design certain modules/blocks to have different number of operands, bitwidth, ... to optimize for area & performance?
182 | - How to do X-propagation? We can encode that by just using 2 bits instead of 1 bit but that will have a significant area overhead. However, the most recent palladium started supporting X-propagation as well. Maybe they only have certain processors that have X-modeling while most processors only support 2 state simulation? Static analysis to identify gates that will not be X's for certain.
183 | 
184 | ## Week 2 - uarch
185 | 
186 | <!-- - [Yorktown simulation engine](https://ieeexplore.ieee.org/document/1585479) -->
187 | - [Yorktown simulation engine](https://dl.acm.org/doi/pdf/10.5555/800263.809186)
188 | 
189 | ### Discussion
190 | - What is the GDM for? Why not just use LUTs like in the patent
191 | - Go over SRAM emulation
192 | 
193 | - What does it mean to propagate the clock distribution logic?
194 |     - It must be due to how their clock distribution network is designed
195 |     - Pre multi-clock domain ages
196 |     - Can simulate clock gating, however there is no performance benefit from logic skipping
197 |     - You can have logic in the clock tree -> the clock input is a combinational function of some data & clock
198 |     - So you can simulate a FF at the transistor, gate, or functional level; as you go down the abstraction levels, it takes more cycles to simulate a single FF
199 | - 4 state simulation support
200 |     - Can model x optimism & pessimism
201 | - Very different from cyclist
202 | - If there is slipping, it has to be fixed amount because you will need memory to perform some sort of bookkeeping
203 | - Skipping has to happen in a coarse-grained manner & the parts that can be skipped at the same time has to be pre-determined
204 |     - Also the amount of host cycles that can be skipped has to be predetermined & known by the compiler -> kind of reaches a multicore simulation logic
205 |     - However, the simulation throughput is determined by the worst case processor steps
206 | - Core functional logic
207 |     - GDM : it is for x-prop pess & opt for tuning (for 4 state simulation)
208 |         - x opt: X & 1 -> 0 or 1
209 |         - x pess: X & 1 -> X
210 |         - x symbolic: if the output can be proved, use that value (even with this, there are cases when you need x prop for registers in RR-arbiter)
211 |         - interrupt logic
212 |             - propagation of X when happens after a certain point in simulation to check if x prop breaks stuff
213 |             - can be used to generate trigger conditions
214 | - vs Quickturn
215 |     - much more effort was put to perform 4 state sim & clock gating modeling
216 |     - possibly because they had low confidence about their digital logic
217 |     - z3 has 4 state -> (emulated 4 state by using multiple bits, probably x-optimistic simulation)
218 | - How clock trees are modeled in modern palladiums
219 | 
220 | ## Week 3 - uarch
221 | 
222 | - [Logic simulation engines in Japan](https://ieeexplore.ieee.org/abstract/document/43078?casa_token=nD2xnLdyzTYAAAAA:rY_2eFFqS8Imhzso9TwMOKM2qQ6E5eQ0rZVc54LK_iRS4cVwM2CNewPATFflru2O-nGR-r7kvNg)
223 | - [Sahara: Massively parallel dedicated hardware for cycle-based logic simulations, Hanamura et al., 2005](https://onlinelibrary.wiley.com/doi/epdf/10.1002/ecjc.20193)
224 | 
225 | ## Week 4 - compiler
226 | 
227 | - [Yorktown simulation SW support](https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=1585481)
228 | - [ibm logic engine](./resources/emulation/ibm_logic_engine.pdf)
229 | 
230 | - Compiler/HW complexity tradeoffs
231 |     - Unit delay model vs rank order
232 |     - How does this tradeoff space differ from FPGAs vs ASIC?
233 | - Partitioning & instruction scheduling
234 |     - When partitioning, should we try to partition across register boundaries? Or if we have a partition that is balanced & minimizes communication, would that also be a nice partition?
235 |     - Linker: can it link arbitrary boundaries or are there any conditions for these link boundaries? How can we use the permuters for incremental compilation flows?
236 | - What is a nice interface/method to load the compiled instructions into these processors?
237 |     - FESVR -> too slow?
238 |     - ???
239 | 
240 | ## Week 5 - academic
241 | 
242 | - [Cyclist](https://dl.acm.org/doi/abs/10.1109/ICCAD.2017.8203892) (ICCAD 2017)
243 | 
244 | ### Cyclist
245 | - No traction at all, sad
246 | - Related work
247 |     - Compared against EVE, YSE
248 |     - Palladium 100 million gates an hour
249 |         - compilation performance strong scaling with more cores -> most of the compilation time is in partitioning
250 |     - **Malibu (another related work)**
251 | - simulates in the RTL operator level -> datapath width vs simulator platform capacity tradeoff
252 | - Uarch
253 |     - Modified Rocket, 32 bit wide instruction
254 |     - No custom logic function, it uses ALUs to perform computation
255 |     - ISA
256 |         - log2: are they recovering the RTL semantics to find use cases for log2 (find highest bit, because in chisel, this circuit is blasted out)
257 |         - cat: more of a consequence of FIRRTL having Cat & made implementation easier
258 |         - mul: extreme -> may be area inefficient to have in every single emulation core
259 |     - 32 architectural registers
260 |         - They didn't want to spend too much time
261 |     - Explicit NOPs to resolve data hazards
262 |     - Only neighbor to neighbor routing -> a lot of cycles are spent routing data across the network
263 |     - Can broadcast outputs to all the neighbors
264 | - Debug
265 |     - Nice engineering
266 |     - Capture IO traces and replay them later
267 | - Utilization only 4%
268 | - Pay as you go
269 |     - perform annealing to come up with a better compilation output while loading & running the simulation
270 |     - high engineering effort, but not impossible
271 |     - must maintain a mapping of new compilation, done on the host
272 | - Interactive visibility
273 |     - Find signal at a particular point in time
274 |     - Take periodic snapshots & replay
275 |     - Only 12% perf slowdown (in Palladium, it is like 2 ~ 5x)
276 | 
277 | ## Week 6 - academic
278 | 
279 | - [Manticore: Hardware-Accelerated RTL Simulation with Static Bulk-Synchronous Parallelism](https://dl.acm.org/doi/10.1145/3623278.3624750) (ASPLOS 2023)
280 | 
281 | - Recap:
282 |     - Cyclist: msg passing btw cores (mesh), low utilization
283 |     - neighbor to neighbor results in low utilization
284 | - Manticore:
285 |     - No message passing
286 |         - Bulk synchronous parallelism (separate bit shuffling phase)
287 |         - 2D torus network
288 |         - Leads to low utilization
289 |         - Enables the compiler to perform core local scheduling of instructions
290 |         - However, while performing the communication phase the compiler still has to be aware of the NoC traffic and make sure things don't collide
291 |     - Statically scheduled via compiler
292 |     - Verilator is the baseline, but not a fair comparison, but similar to verilator and repcut, it uses a bulk synchronous execution model
293 |     - Each tile is larger because of the above execution model
294 |         - State has to be duplicated & maintained within each tile
295 |     - 14 stage pipeline
296 |         - Specialized to FPGAs
297 |         - No interlocks, compiler is inserting NOPs
298 |     - Large datapath & low utilization
299 |     - Custom function unit
300 |         - Particular design
301 |     - Results
302 |         - 2x compared to Rocket / cannot extract out enough parallelism to compete with Xeons
303 |         - No utilization, NOPs...
304 | 
305 | - Taxonomy
306 |     - Event driven vs static -> where is the static dynamic boundary? accessing SRAM?
307 |     - Bulk synch vs fine-grained msg passing
308 |     - Core compute element (LUT vs ALU)... degree to how close it looks like a LUT / datapath width
309 |     - Synch vs intra cycle Timing
310 |     - 4 state simulation support
311 |     - Memory and encoding support / how SRAMs are mapped
312 | 
313 | 
314 | 
315 | ## Week 7 - misc
316 | 
317 | - [Malibu](https://people.ece.ubc.ca/lemieux/publications/grant-fpga2011.pdf)
318 | - [Nexus](https://woset-workshop.github.io/PDFs/2022/1-Birch-paper.pdf)
319 | 
320 | ## Week 8 - Power & gate level simulation
321 | 
322 | - CPF_palladium (cadence manual)
323 | - LowPowerCPF-Simulation-Guide (cadence manual)
324 | 
325 | ## Week 9 - FPGA overlay
326 | 
327 | - [Overgen](https://polyarch.cs.ucla.edu/papers/micro2022-overgen.pdf)
328 | 
329 | ## Week 10 - FPGA based emulation
330 | 
331 | - [Time multiplexed FPGA architecture for logic emulation](https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=518231)
332 | - [CAD framework for Malibu: an FPGA with time-multiplexed coarse-grained elements](https://dl.acm.org/doi/abs/10.1145/1950413.1950441)
333 | 
334 | ---
335 | 
336 | ## [ParSGCN: Bridging the Gap Between Emulation Partitioning and Scheduling](https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=10663266)
337 | 
338 | - A good partition doesn't mean the scheduling results will be good
339 | - They found out that "good partitions" usually contain certain nets in the partition cuts
340 |     - What does "certain nets" mean in this context?
341 |     - Probably is some structural characteristic of the graph
342 | - They train a GCN to obtain a probability `P(e)` where `e` represents the net and `P` represents the probability that it will be included in the partition cut
343 | - During the partitioning process, they use the GCN to guide partitioning decisions so that the scheduling quality will be high
344 | - To limit the explosion of compute requirements, they only apply the above technique in the final partitioning step (where subpartitions are again partitioned onto emulation processors)
345 | - But what are the characteristics of the nodes that have high `P(e)` vs the ones that do not? This isn't revealed in the paper
346 | - The results look quite promising, and they seem to have used the Palladium compiler as the baseline (avg 10% fewer steps than the Palladium compiler for open-source designs, max up to 33% fewer steps)
347 | - It seems like the `KaHyPar` partitioner provides pretty decent compilation results as well though
348 | 
349 | ## [Sphinx: A Hybrid Boolean Processor-FPGA Hardware Emulation System](https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=10323694)
350 | 
351 | - FPGA + boolean processor approach
352 | 
353 | ## [HW design and CAD for processor based logic emulation systems](https://core.ac.uk/download/pdf/127678602.pdf)
354 | 


--------------------------------------------------------------------------------
/resources/GPU-accelerated-rtl-sim-with-loop-unrolling.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/euphoric-hardware/rtl-simulation-reading/be4baf8dd63a7b8ade143b0705b8859b412d4259/resources/GPU-accelerated-rtl-sim-with-loop-unrolling.pdf


--------------------------------------------------------------------------------
/resources/emulation/03_Jonathan_Bachrach_--_Rethinking_Sketching_with_Chisel_and_DREAMER.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/euphoric-hardware/rtl-simulation-reading/be4baf8dd63a7b8ade143b0705b8859b412d4259/resources/emulation/03_Jonathan_Bachrach_--_Rethinking_Sketching_with_Chisel_and_DREAMER.pdf


--------------------------------------------------------------------------------
/resources/emulation/2023.ash.micro.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/euphoric-hardware/rtl-simulation-reading/be4baf8dd63a7b8ade143b0705b8859b412d4259/resources/emulation/2023.ash.micro.pdf


--------------------------------------------------------------------------------
/resources/emulation/A_Survey_of_Hardware_Accelerators_Used_in_Computer-Aided_Design.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/euphoric-hardware/rtl-simulation-reading/be4baf8dd63a7b8ade143b0705b8859b412d4259/resources/emulation/A_Survey_of_Hardware_Accelerators_Used_in_Computer-Aided_Design.pdf


--------------------------------------------------------------------------------
/resources/emulation/RAMP2010_MButts20Aug (Slides, 8-25-2010).pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/euphoric-hardware/rtl-simulation-reading/be4baf8dd63a7b8ade143b0705b8859b412d4259/resources/emulation/RAMP2010_MButts20Aug (Slides, 8-25-2010).pptx


--------------------------------------------------------------------------------
/resources/emulation/RTL-emulation-uarch.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/euphoric-hardware/rtl-simulation-reading/be4baf8dd63a7b8ade143b0705b8859b412d4259/resources/emulation/RTL-emulation-uarch.pptx


--------------------------------------------------------------------------------
/resources/emulation/RTL-power-analysis-using-GL-cell-power-models.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/euphoric-hardware/rtl-simulation-reading/be4baf8dd63a7b8ade143b0705b8859b412d4259/resources/emulation/RTL-power-analysis-using-GL-cell-power-models.pdf


--------------------------------------------------------------------------------
/resources/emulation/Speeding up lookup table driven logic simulation.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/euphoric-hardware/rtl-simulation-reading/be4baf8dd63a7b8ade143b0705b8859b412d4259/resources/emulation/Speeding up lookup table driven logic simulation.pdf


--------------------------------------------------------------------------------
/resources/emulation/The_Yorktown_Simulation_Engine.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/euphoric-hardware/rtl-simulation-reading/be4baf8dd63a7b8ade143b0705b8859b412d4259/resources/emulation/The_Yorktown_Simulation_Engine.pdf


--------------------------------------------------------------------------------
/resources/emulation/US5551013-processor-design.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/euphoric-hardware/rtl-simulation-reading/be4baf8dd63a7b8ade143b0705b8859b412d4259/resources/emulation/US5551013-processor-design.pdf


--------------------------------------------------------------------------------
/resources/emulation/US5551013.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/euphoric-hardware/rtl-simulation-reading/be4baf8dd63a7b8ade143b0705b8859b412d4259/resources/emulation/US5551013.pdf


--------------------------------------------------------------------------------
/resources/emulation/US6035117.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/euphoric-hardware/rtl-simulation-reading/be4baf8dd63a7b8ade143b0705b8859b412d4259/resources/emulation/US6035117.pdf


--------------------------------------------------------------------------------
/resources/emulation/US7047179.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/euphoric-hardware/rtl-simulation-reading/be4baf8dd63a7b8ade143b0705b8859b412d4259/resources/emulation/US7047179.pdf


--------------------------------------------------------------------------------
/resources/emulation/US7555423.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/euphoric-hardware/rtl-simulation-reading/be4baf8dd63a7b8ade143b0705b8859b412d4259/resources/emulation/US7555423.pdf


--------------------------------------------------------------------------------
/resources/emulation/cyclist-chisel-emulator.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/euphoric-hardware/rtl-simulation-reading/be4baf8dd63a7b8ade143b0705b8859b412d4259/resources/emulation/cyclist-chisel-emulator.pdf


--------------------------------------------------------------------------------
/resources/emulation/ibm_logic_engine.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/euphoric-hardware/rtl-simulation-reading/be4baf8dd63a7b8ade143b0705b8859b412d4259/resources/emulation/ibm_logic_engine.pdf


--------------------------------------------------------------------------------
/resources/emulation/ibm_lse_2.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/euphoric-hardware/rtl-simulation-reading/be4baf8dd63a7b8ade143b0705b8859b412d4259/resources/emulation/ibm_lse_2.pdf


--------------------------------------------------------------------------------
/resources/emulation/malibu-fpga2011.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/euphoric-hardware/rtl-simulation-reading/be4baf8dd63a7b8ade143b0705b8859b412d4259/resources/emulation/malibu-fpga2011.pdf


--------------------------------------------------------------------------------
/resources/emulation/palladium-dynamic-power-anal.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/euphoric-hardware/rtl-simulation-reading/be4baf8dd63a7b8ade143b0705b8859b412d4259/resources/emulation/palladium-dynamic-power-anal.pdf


--------------------------------------------------------------------------------