├── .env.toolchain
├── .gitignore
├── README.mkdn
├── boards
    ├── __init__.py
    └── icoboard.py
├── dhrystone
    ├── .gitignore
    ├── Makefile
    ├── README.mkdn
    ├── dhry.h
    ├── dhry_1.c
    ├── dhry_2.c
    ├── sections.lds
    ├── start.S
    └── stdlib.c
├── doc
    ├── chonk.mkdn
    └── instruction-cycle.svg
├── hapenny
    ├── __init__.py
    ├── bus.py
    ├── chonk
    │   ├── __init__.py
    │   ├── cpu.py
    │   ├── ewbox.py
    │   ├── fdbox.py
    │   ├── gpio32.py
    │   ├── mem32.py
    │   ├── regfile32.py
    │   ├── sbox.py
    │   └── serial32.py
    ├── cpu.py
    ├── decoder.py
    ├── ewbox.py
    ├── extsram.py
    ├── fdbox.py
    ├── gpio.py
    ├── mem.py
    ├── regfile16.py
    ├── rvfi.py
    ├── sbox.py
    └── serial.py
├── icestick-chonk.py
├── icestick-smallest.py
├── icesticktest.py
├── icoboard-large.py
├── icolarge-bootloader.bin
├── montool
    ├── .gitignore
    ├── Cargo.lock
    ├── Cargo.toml
    ├── README.mkdn
    └── src
    │   └── main.rs
├── notes
    ├── 20231001.mkdn
    ├── 20231002.mkdn
    ├── 20231003.mkdn
    ├── 20231004.mkdn
    ├── 20231005.mkdn
    └── 20231006.mkdn
├── pdm.lock
├── pyproject.toml
├── sim-chonk.py
├── sim-cpu.py
├── smallest-toggle.bin
├── tiny-bootloader.bin
├── tinyboot-upduino-chonk.bin
├── tinyboot
    ├── .cargo
    │   └── config
    ├── Cargo.lock
    ├── Cargo.toml
    ├── README.mkdn
    ├── build.rs
    ├── link.x
    ├── rust-toolchain.toml
    └── src
    │   └── main.rs
├── upduino-bootloader.bin
├── upduino-chonk.py
└── upduino-large.py


/.env.toolchain:
--------------------------------------------------------------------------------
1 | AMARANTH_USE_YOSYS=builtin
2 | YOSYS=yowasp-yosys
3 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__/
2 | *.gtkw
3 | *.vcd
4 | .pdm-python
5 | build/
6 | sim-cpu.v
7 | 


--------------------------------------------------------------------------------
/README.mkdn:
--------------------------------------------------------------------------------
  1 | # `hapenny`: a half-width RISC-V
  2 | 
  3 | `hapenny` is a 32-bit RISC-V CPU implementation that operates internally on
  4 | 16-bit chunks. This means it takes longer to do things, but uses less space.
  5 | 
  6 | This approach was inspired by the MC68000 (1979), which also implemented a
  7 | 32-bit instruction set using a 16-bit datapath. (`hapenny` uses about half as
  8 | many cycles per instruction as the MC68000, after optimization.)
  9 | 
 10 | `hapenny` was written to evaluate the Amaranth HDL.
 11 | 
 12 | (The current `hapenny` was formerly version 2; once it became mature enough I
 13 | removed version 1.)
 14 | 
 15 | ## Bullet points
 16 | 
 17 | - Over 12M inst/sec on iCE40 HX1K, while occupying under 800 LCs, or less than
 18 |   63% of the chip. (Throughput compares favorably to some 32-bit implementations
 19 |   occupying twice the area.)
 20 | - Native 16-bit bus allows for simpler peripherals and external RAMs. (Can run
 21 |   out of external 16-bit SRAM with no penalty.)
 22 | - Parameterized with knobs for trading off size vs capability.
 23 | - Implements the RV32I unprivileged instruction set (currently missing FENCE and
 24 |   SYSTEM).
 25 | - Optional interrupt support in the older core. (yet to come in the revised one)
 26 | - Written in Python using Amaranth.
 27 | 
 28 | ## But why
 29 | 
 30 | There are a bazillion open-source RISC-V CPU implementations out there, which is
 31 | what happens when you release a well-designed and free-to-implement instruction
 32 | set spec -- nerds like me will crank out implementations.
 33 | 
 34 | I wrote `hapenny` as an experiment to see if I could target the space between
 35 | the PicoRV32 core and the SERV core, in terms of size and performance. I
 36 | specifically wanted to produce a CPU with decent performance that could fit into
 37 | an iCE40 HX1K part (like on the Icestick evaluation board) with enough space
 38 | left over for useful logic. PicoRV32 doesn't quite fit on that chip; SERV fits
 39 | but takes 32-64 cycles per instruction.
 40 | 
 41 | | Property                       | PicoRV32-small | `hapenny` | SERV |
 42 | | ------------------------------ | -------------- | --------- | ---- |
 43 | | Datapath width (bits)          | 32             | **16**    | 1 |
 44 | | External data bus width        | 32             | **16**    | 32 |
 45 | | Average cycles per instruction | 5.429          | **5.525** | 40-ish |
 46 | | Minimal size on iCE40 (LCs)    | 1500-ish       | **796**   | 200-ish |
 47 | | Typical MHz on iCE40           | 40s?           | **72+**   | 40s? |
 48 | 
 49 | (Cycles/instruction is measured on Dhrystone. Minimal size is the output
 50 | produced by the `icestick-smallest.py` script. I would appreciate help getting
 51 | apples-to-apples comparison numbers!)
 52 | 
 53 | So, basically,
 54 | 
 55 | - `hapenny` is significantly smaller than a similarly-configured PicoRV32 core
 56 |   for only 1.7% less performance per clock. (Of course, PicoRV32 is a far more
 57 |   general and well-tested processor, and in practice you'd configure it with
 58 |   performance-enhancing features like a dual-port register file and faster
 59 |   shifts.)
 60 | 
 61 | - `hapenny` is much faster than SERV, but also about 4x larger. (SERV is also
 62 |   better tested than `hapenny`.)
 63 | 
 64 | `hapenny` is easy to interface to 16-bit peripherals and external memory with no
 65 | (additional) performance loss. This can result in smaller overall designs and
 66 | simpler boards. For instance, `hapenny` can run at full rate out of the 16-bit
 67 | SRAM on the Icoboard.
 68 | 
 69 | Independent from the datapath width, I also did some fairly aggressive manual
 70 | register retiming in the decoder and datapath, which means `hapenny` can often
 71 | close timing at higher Fmax than other simple RV32 cores. (I miss automatic
 72 | retiming from ASIC toolchains.)
 73 | 
 74 | 
 75 | ## Details
 76 | 
 77 | `hapenny` executes (most of) the RV32I instruction set in 16-bit pieces. It uses
 78 | 16-bit memory, a 16-bit (single-ported) register file, and a 16-bit ALU. To
 79 | perform 32-bit operations, it uses the same techniques a programmer might use in
 80 | software on a 16-bit computer, e.g. "chaining" operations using preserved
 81 | carry/zero bits.
 82 | 
 83 | All memory interfaces in `hapenny` are synchronous, including the register file,
 84 | which is another reason why operations take more cycles. The RV32I register file
 85 | is comparatively large (at 1024 bits), and using a synchronous register file
 86 | ensures that it can be mapped into an FPGA block RAM if desired.
 87 | 
 88 | Here's what the CPU does during the timing of a typical instruction like `ADD`.
 89 | I've color/brightness-coded three different executions that are in flight during
 90 | this diagram.
 91 | 
 92 | ![A timing diagram showing a typical instruction cycle.](doc/instruction-cycle.svg)
 93 | 
 94 | - The "FD-Box" is responsible for fetch and decode, and is always working on the
 95 |   _next_ instruction. It requires three cycles to fetch both halfwords of an
 96 |   instruction, and then uses the `DECODE` cycle to do initial instruction
 97 |   decoding and start the read of rs1's low half. (It spends one cycle out of
 98 |   four essentially idle to make the state machines line up conveniently.)
 99 | - The "EW-Box" is responsible for execute and writeback. It goes through at
100 |   least four states in every instruction:
101 |     - `R2L` starts the load of the low half of rs2 from the register file.
102 |     - `OPL` operates on the low halves of rs1 and rs2 (or rs1 and an immediate),
103 |       and also starts the load of the high half of rs1.
104 |     - `R2H` and `OPH` do the same thing for the high half.
105 | 
106 | Most instructions take four cycles, as shown in that diagram. Some take more if
107 | they need to do additional things (by adding states), or if they change control
108 | flow such that the FD-Box's speculative fetch was wrong. The CPU test bench
109 | (`sim-cpu.py`) measures the cycle timing for every instruction; here's where
110 | things currently stand:
111 | 
112 | | Instruction  | Cycles | Notes |
113 | | ------------ | ------ | ----- |
114 | | AUIPC        | 4      | |
115 | | LUI          | 4      | |
116 | | JAL          | 8      | Includes four-cycle re-fetch penalty |
117 | | JALR         | 8      | Includes four-cycle re-fetch penalty |
118 | | Branch       | 5/10   | Not Taken / Taken |
119 | | Load         | 6      | |
120 | | SW           | 5      | |
121 | | SB/SH        | 4      | |
122 | | SLT(I)(U)    | 6      | |
123 | | Shift        | 6 + N  | N is number of bits shifted |
124 | | Other ALU op | 4      | |
125 | 
126 | On the instruction mix in Dhrystone, this yields an average of 5.525
127 | cycles/instruction.
128 | 
129 | ## Interfaces
130 | 
131 | `hapenny` uses a very simple bus interface with up to 32-bit addressing. In
132 | practice, applications will wire up fewer than 32 address lines, which saves
133 | space.
134 | 
135 | | Signal     | Driver | Width    | Description |
136 | | ---------- | ------ | -------- | ----------- |
137 | | `addr`     | CPU    | up to 31 | addresses a halfword, i.e. LSB missing |
138 | | `data_out` | CPU    | 16       | carries data for a write |
139 | | `lanes`    | CPU    | 2        | signals a write of either or both bytes in a halfword; zero means a load |
140 | | `valid`    | CPU    | 1        | when high, indicates that the signals above are valid and starts a bus transaction. |
141 | | `response` | device | 16       | on the cycle after a load, carries back data from the addressed device. |
142 | 
143 | The PC can be shrunk separately from the address bus if you know that all
144 | program memory appears in e.g. the bottom half of the address space. This
145 | further saves space.
146 | 
147 | The bus interface does not support wait states, to reduce complexity. This makes
148 | it difficult to interface to things like XIP SPI Flash or SDRAM. `hapenny` is
149 | really intended for applications that don't rely on such things.
150 | 
151 | `hapenny` exposes a fairly flexible debug interface capable of inspecting
152 | processor state and reading and writing the register file. These features are
153 | only available when the processor is halted, which can be achieved by holding
154 | `halt_request` high until the processor confirms (at the next instruction
155 | boundary) by asserting `halted`. Release `halt_request` to resume.
156 | 
157 | Finally, `hapenny` has an RVFI (RISC-V Formal Interface) trace port for
158 | generating a trace of instruction effects, though I haven't wired up the actual
159 | test suite.
160 | 
161 | ## Interrupt options
162 | 
163 | Currently, `hapenny` does not support interrupts, but I'm planning on changing
164 | this. (An earlier version did; support was removed when I rearchitected the core
165 | for v2.)
166 | 
167 | ## Drawbacks
168 | 
169 | - Written by someone who pretends to be an electrical engineer as a way to
170 |   procrastinate finishing his slides for a talk.
171 | 
172 | - Used for exactly one thing so far, so not exactly battle-hardened.
173 | 
174 | - Less general than more mature implementations like PicoRV32 -- e.g. no support
175 |   for wait states, hardware multiply, coprocessors, or (currently) interrupts.
176 | 
177 | - 16-bit external data bus means that, currently, 32-bit reads/writes are not
178 |   atomic -- a problem when interfacing with peripherals with 32-bit
179 |   memory-mapped registers. (Peripherals with 16-bit memory-mapped registers work
180 |   fine, however.)
181 | 
182 | - Not exactly well factored/commented.
183 | 
184 | - Written in Python, so chances are pretty good the code won't keep working
185 |   across OS updates / minor runtime versions.
186 | 
187 | ## What's with the name
188 | 
189 | `hapenny` is implemented using about half the logic of other cheap RV32 cores.
190 | 
191 | The half-penny, or "ha'penny," is a historical English coin worth (as the name
192 | implies) half a penny. So if the other cheap cores cost a penny, this is a
193 | ha'penny.
194 | 


--------------------------------------------------------------------------------
/boards/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cbiffle/hapenny/10d6af538bb47feb26f5fa807f0c2ae6b64f2e9e/boards/__init__.py


--------------------------------------------------------------------------------
/boards/icoboard.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import subprocess
 3 | from pathlib import Path
 4 | 
 5 | from amaranth.build import *
 6 | from amaranth.vendor import *
 7 | from amaranth_boards.resources import *
 8 | 
 9 | 
10 | __all__ = ["IcoboardPlatform"]
11 | 
12 | 
13 | class IcoboardPlatform(LatticeICE40Platform):
14 |     device      = "iCE40HX8K"
15 |     package     = "CT256"
16 |     default_clk = "clk100"
17 |     resources   = [
18 |         Resource("clk100", 0, Pins("R9", dir="i"),
19 |                  Clock(100e6), Attrs(GLOBAL=True, IO_STANDARD="SB_LVCMOS")),
20 | 
21 |         *LEDResources(pins="C8 F7 K9", attrs=Attrs(IO_STANDARD="SB_LVCMOS")),
22 | 
23 |         *ButtonResources(pins="K11 P13", attrs=Attrs(IO_STANDARD="SB_LVCMOS")),
24 | 
25 |         SRAMResource(0,
26 |             cs_n="M7", oe_n="L5", we_n="T7",
27 |             a="N2 K5 J5 M5 P4 N5 P5 P7 M6 P6 T8 T1 P2 R1 N3 P1 M11 P10 P8",
28 |             d="T2 R3 T3 R4 R5 T5 R6 T6 N4 M4 L6 M3 L4 L3 K4 K3",
29 |             dm_n="J4 J3",
30 |             attrs=Attrs(IO_STANDARD="SB_LVCMOS"),
31 |         ),
32 | 
33 |         *SPIFlashResources(0,
34 |             cs_n="R12", clk="R11", copi="P12", cipo="P11",
35 |             attrs=Attrs(IO_STANDARD="SB_LVCMOS")
36 |         ),
37 |     ]
38 |     connectors  = [
39 |         Connector("pmod", 1, "D8 B9 B10 B11 - - B8 A9 A10 A11 - -"),
40 |         Connector("pmod", 2, "A5 A2 C3 B4 - - B7 B6 B3 B5 - -"),
41 |         Connector("pmod", 3, "L9 G5 L7 N6 - - N9 P9 M8 N7 - -"),
42 |         Connector("pmod", 4, "T15 T14 T11 R10 - - R14 T13 T10 T9 - -"),
43 |     ]
44 | 
45 |     def toolchain_program(self, products, name):
46 |         icoprog = os.environ.get("ICOPROG", "icoprog")
47 |         with products.extract("{}.bin".format(name)) as bitstream_filename:
48 |             bitstream = Path(bitstream_filename).read_bytes()
49 |             subprocess.run([icoprog, "-p"], input=bitstream, check=True)
50 | 
51 | 
52 | if __name__ == "__main__":
53 |     from amaranth_boards.test.blinky import *
54 |     IcoboardPlatform().build(Blinky(), do_program=True)
55 | 


--------------------------------------------------------------------------------
/dhrystone/.gitignore:
--------------------------------------------------------------------------------
1 | dhry.bin
2 | dhry.elf
3 | dhry.map
4 | *.d
5 | *.o
6 | 


--------------------------------------------------------------------------------
/dhrystone/Makefile:
--------------------------------------------------------------------------------
 1 | UARTADDR ?= 0x18000
 2 | RUNS ?= 1000
 3 | STACK ?= 0x8000
 4 | OBJS = dhry_1.o dhry_2.o stdlib.o start.o
 5 | CFLAGS = -MD -O3 -mabi=ilp32 -march=rv32i -DTIME -DRISCV -g3
 6 | TOOLCHAIN_PREFIX = riscv64-unknown-elf-
 7 | 
 8 | CFLAGS += -DUSE_MYSTDLIB -ffreestanding -nostdlib -DUARTADDR=$(UARTADDR) -DSTACK=$(STACK) -DRUNS=$(RUNS)
 9 | 
10 | dhry.bin: dhry.elf
11 | 	$(TOOLCHAIN_PREFIX)objcopy -Obinary $^ $@
12 | 
13 | dhry.elf: $(OBJS) sections.lds
14 | 	$(TOOLCHAIN_PREFIX)gcc $(CFLAGS) -Wl,-Bstatic,-T,sections.lds,-Map,dhry.map,--strip-debug -o $@ $(OBJS) -lgcc
15 | 	chmod -x $@
16 | 
17 | %.o: %.c
18 | 	$(TOOLCHAIN_PREFIX)gcc -c $(CFLAGS) $<
19 | 
20 | %.o: %.S
21 | 	$(TOOLCHAIN_PREFIX)gcc -c $(CFLAGS) $<
22 | 
23 | dhry_1.o dhry_2.o: CFLAGS += -Wno-implicit-int -Wno-implicit-function-declaration
24 | 
25 | clean:
26 | 	rm -rf *.o *.d dhry.elf dhry.map dhry.bin dhry.hex
27 | 
28 | .PHONY: test clean
29 | 
30 | -include *.d
31 | 
32 | 


--------------------------------------------------------------------------------
/dhrystone/README.mkdn:
--------------------------------------------------------------------------------
 1 | # Hacked up Dhrystone
 2 | 
 3 | This is the classical Dhrystone benchmark, fitted with code to support a hapenny
 4 | SoC. This code and the Makefile are derived from the PicoRV32 Dhrystone test
 5 | bench, but further modified and simplified.
 6 | 
 7 | The Dhrystone sources appear to be in the public domain. I've borrowed those
 8 | bits and left the non-public-domain bits from PicoRV32 behind, as far as I know.
 9 | 
10 | By default, this builds an image compatible with the `upduino-large` example
11 | SoC. That's the only example currently in the repo that has enough RAM to run
12 | Dhrystone (you need at least 18 kiB).
13 | 
14 | ## Current results
15 | 
16 | For the `upduino-large` SoC example using the newer (`box`) CPU revision, with
17 | the integer overflows fixed in the C code for printing cycle counts (boo C!), we
18 | get:
19 | 
20 | ```
21 | Number_Of_Runs: 10000
22 | User_Time: 21440119 cycles, 3880025 insn
23 | Cycles_Per_Instruction: 5.525
24 | Dhrystones_Per_Second_Per_MHz: 466
25 | DMIPS_Per_MHz: 0.265
26 | ```
27 | 
28 | 
29 | ## Building and running it
30 | 
31 | First, make sure you have an SoC running `tinyboot` and a working serial cable.
32 | Your SoC should respond to the `ping` subcommand in `montool`.
33 | 
34 | Build Dhrystone by running `make`. This will produce (among other things) a file
35 | called `dhry.bin`.
36 | 
37 | Go into the `montool` directory and run
38 | 
39 | ```
40 | cargo run -q YOURPORT write 0 dhry.bin
41 | cargo run -q YOURPORT call 0 --then-echo
42 | ```
43 | 
44 | ...where `YOURPORT` should be the name of the serial port on your system (e.g.
45 | `/dev/ttyUSB0` or `COM1:`) and `0` is the address to load. If that address isn't
46 | right for your SoC, see the next section.
47 | 
48 | The `call` subcommand will activate Dhrystone and print its output to your
49 | terminal. Once it says `DONE`, it will appear to hang; just abort the command at
50 | this point. If you'd like to run it more than once, just `call` again.
51 | 
52 | 
53 | ## Adapting to your SoC
54 | 
55 | The Makefile's behavior can be customized by passing two variables:
56 | 
57 | - `UARTADDR` is the address of the UART (default: 0x18000).
58 | - `STACK` is the initial stack pointer (default: 0x8000).
59 | 
60 | If your program RAM is not at address 0, you'll need to modify the linker
61 | script.
62 | 
63 | 


--------------------------------------------------------------------------------
/dhrystone/dhry_1.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |  ****************************************************************************
  3 |  *
  4 |  *                   "DHRYSTONE" Benchmark Program
  5 |  *                   -----------------------------
  6 |  *
  7 |  *  Version:    C, Version 2.1
  8 |  *
  9 |  *  File:       dhry_1.c (part 2 of 3)
 10 |  *
 11 |  *  Date:       May 25, 1988
 12 |  *
 13 |  *  Author:     Reinhold P. Weicker
 14 |  *
 15 |  ****************************************************************************
 16 |  */
 17 | 
 18 | #include "dhry.h"
 19 | 
 20 | #ifdef USE_MYSTDLIB
 21 | extern char     *malloc ();
 22 | #else
 23 | #  include <stdlib.h>
 24 | #  include <string.h>
 25 | #endif
 26 | 
 27 | /* Global Variables: */
 28 | 
 29 | Rec_Pointer     Ptr_Glob,
 30 |                 Next_Ptr_Glob;
 31 | int             Int_Glob;
 32 | Boolean         Bool_Glob;
 33 | char            Ch_1_Glob,
 34 |                 Ch_2_Glob;
 35 | int             Arr_1_Glob [50];
 36 | int             Arr_2_Glob [50] [50];
 37 | 
 38 | Enumeration     Func_1 ();
 39 |   /* forward declaration necessary since Enumeration may not simply be int */
 40 | 
 41 | #ifndef REG
 42 |         Boolean Reg = false;
 43 | #define REG
 44 |         /* REG becomes defined as empty */
 45 |         /* i.e. no register variables   */
 46 | #else
 47 |         Boolean Reg = true;
 48 | #endif
 49 | 
 50 | /* variables for time measurement: */
 51 | 
 52 | #ifdef IGN_TIMES
 53 | struct tms      time_info;
 54 | extern  int     times ();
 55 |                 /* see library function "times" */
 56 | #define Too_Small_Time 120
 57 |                 /* Measurements should last at least about 2 seconds */
 58 | #endif
 59 | #ifdef TIME
 60 | extern long     time();
 61 | #ifdef RISCV
 62 | extern long     insn();
 63 | #endif
 64 |                 /* see library function "time"  */
 65 | #define Too_Small_Time 2
 66 |                 /* Measurements should last at least 2 seconds */
 67 | #endif
 68 | 
 69 | long            Begin_Time,
 70 |                 End_Time,
 71 |                 User_Time;
 72 | #ifdef RISCV
 73 | long            Begin_Insn,
 74 |                 End_Insn,
 75 |                 User_Insn;
 76 | #endif
 77 | float           Microseconds,
 78 |                 Dhrystones_Per_Second;
 79 | 
 80 | /* end of variables for time measurement */
 81 | 
 82 | 
 83 | main ()
 84 | /*****/
 85 | 
 86 |   /* main program, corresponds to procedures        */
 87 |   /* Main and Proc_0 in the Ada version             */
 88 | {
 89 |         One_Fifty       Int_1_Loc;
 90 |   REG   One_Fifty       Int_2_Loc;
 91 |         One_Fifty       Int_3_Loc;
 92 |   REG   char            Ch_Index;
 93 |         Enumeration     Enum_Loc;
 94 |         Str_30          Str_1_Loc;
 95 |         Str_30          Str_2_Loc;
 96 |   REG   int             Run_Index;
 97 |   REG   int             Number_Of_Runs;
 98 | 
 99 |   /* Initializations */
100 | 
101 |   Next_Ptr_Glob = (Rec_Pointer) malloc (sizeof (Rec_Type));
102 |   Ptr_Glob = (Rec_Pointer) malloc (sizeof (Rec_Type));
103 | 
104 |   Ptr_Glob->Ptr_Comp                    = Next_Ptr_Glob;
105 |   Ptr_Glob->Discr                       = Ident_1;
106 |   Ptr_Glob->variant.var_1.Enum_Comp     = Ident_3;
107 |   Ptr_Glob->variant.var_1.Int_Comp      = 40;
108 |   strcpy (Ptr_Glob->variant.var_1.Str_Comp,
109 |           "DHRYSTONE PROGRAM, SOME STRING");
110 |   strcpy (Str_1_Loc, "DHRYSTONE PROGRAM, 1'ST STRING");
111 | 
112 |   Arr_2_Glob [8][7] = 10;
113 |         /* Was missing in published program. Without this statement,    */
114 |         /* Arr_2_Glob [8][7] would have an undefined value.             */
115 |         /* Warning: With 16-Bit processors and Number_Of_Runs > 32000,  */
116 |         /* overflow may occur for this array element.                   */
117 | 
118 |   printf ("\n");
119 |   printf ("Dhrystone Benchmark, Version 2.1 (Language: C)\n");
120 |   printf ("\n");
121 |   if (Reg)
122 |   {
123 |     printf ("Program compiled with 'register' attribute\n");
124 |     printf ("\n");
125 |   }
126 |   else
127 |   {
128 |     printf ("Program compiled without 'register' attribute\n");
129 |     printf ("\n");
130 |   }
131 |   printf ("Please give the number of runs through the benchmark: ");
132 |   {
133 |     // int n;
134 |     // scanf ("%d", &n);
135 |     Number_Of_Runs = RUNS;
136 |   }
137 |   printf ("\n");
138 | 
139 |   printf ("Execution starts, %d runs through Dhrystone\n", Number_Of_Runs);
140 | 
141 |   /***************/
142 |   /* Start timer */
143 |   /***************/
144 | 
145 | #ifdef IGN_TIMES
146 |   times (&time_info);
147 |   Begin_Time = (long) time_info.tms_utime;
148 | #endif
149 | #ifdef TIME
150 |   Begin_Time = time ( (long *) 0);
151 | #ifdef RISCV
152 |   Begin_Insn = insn ( (long *) 0);
153 | #endif
154 | #endif
155 | 
156 |   for (Run_Index = 1; Run_Index <= Number_Of_Runs; ++Run_Index)
157 |   {
158 | 
159 |     Proc_5();
160 |     Proc_4();
161 |       /* Ch_1_Glob == 'A', Ch_2_Glob == 'B', Bool_Glob == true */
162 |     Int_1_Loc = 2;
163 |     Int_2_Loc = 3;
164 |     strcpy (Str_2_Loc, "DHRYSTONE PROGRAM, 2'ND STRING");
165 |     Enum_Loc = Ident_2;
166 |     Bool_Glob = ! Func_2 (Str_1_Loc, Str_2_Loc);
167 |       /* Bool_Glob == 1 */
168 |     while (Int_1_Loc < Int_2_Loc)  /* loop body executed once */
169 |     {
170 |       Int_3_Loc = 5 * Int_1_Loc - Int_2_Loc;
171 |         /* Int_3_Loc == 7 */
172 |       Proc_7 (Int_1_Loc, Int_2_Loc, &Int_3_Loc);
173 |         /* Int_3_Loc == 7 */
174 |       Int_1_Loc += 1;
175 |     } /* while */
176 |       /* Int_1_Loc == 3, Int_2_Loc == 3, Int_3_Loc == 7 */
177 |     Proc_8 (Arr_1_Glob, Arr_2_Glob, Int_1_Loc, Int_3_Loc);
178 |       /* Int_Glob == 5 */
179 |     Proc_1 (Ptr_Glob);
180 |     for (Ch_Index = 'A'; Ch_Index <= Ch_2_Glob; ++Ch_Index)
181 |                              /* loop body executed twice */
182 |     {
183 |       if (Enum_Loc == Func_1 (Ch_Index, 'C'))
184 |           /* then, not executed */
185 |         {
186 |         Proc_6 (Ident_1, &Enum_Loc);
187 |         strcpy (Str_2_Loc, "DHRYSTONE PROGRAM, 3'RD STRING");
188 |         Int_2_Loc = Run_Index;
189 |         Int_Glob = Run_Index;
190 |         }
191 |     }
192 |       /* Int_1_Loc == 3, Int_2_Loc == 3, Int_3_Loc == 7 */
193 |     Int_2_Loc = Int_2_Loc * Int_1_Loc;
194 |     Int_1_Loc = Int_2_Loc / Int_3_Loc;
195 |     Int_2_Loc = 7 * (Int_2_Loc - Int_3_Loc) - Int_1_Loc;
196 |       /* Int_1_Loc == 1, Int_2_Loc == 13, Int_3_Loc == 7 */
197 |     Proc_2 (&Int_1_Loc);
198 |       /* Int_1_Loc == 5 */
199 | 
200 |   } /* loop "for Run_Index" */
201 | 
202 |   /**************/
203 |   /* Stop timer */
204 |   /**************/
205 | 
206 | #ifdef IGN_TIMES
207 |   times (&time_info);
208 |   End_Time = (long) time_info.tms_utime;
209 | #endif
210 | #ifdef TIME
211 |   End_Time = time ( (long *) 0);
212 | #ifdef RISCV
213 |   End_Insn = insn ( (long *) 0);
214 | #endif
215 | #endif
216 | 
217 |   printf ("Execution ends\n");
218 |   printf ("\n");
219 |   printf ("Final values of the variables used in the benchmark:\n");
220 |   printf ("\n");
221 |   printf ("Int_Glob:            %d\n", Int_Glob);
222 |   printf ("        should be:   %d\n", 5);
223 |   printf ("Bool_Glob:           %d\n", Bool_Glob);
224 |   printf ("        should be:   %d\n", 1);
225 |   printf ("Ch_1_Glob:           %c\n", Ch_1_Glob);
226 |   printf ("        should be:   %c\n", 'A');
227 |   printf ("Ch_2_Glob:           %c\n", Ch_2_Glob);
228 |   printf ("        should be:   %c\n", 'B');
229 |   printf ("Arr_1_Glob[8]:       %d\n", Arr_1_Glob[8]);
230 |   printf ("        should be:   %d\n", 7);
231 |   printf ("Arr_2_Glob[8][7]:    %d\n", Arr_2_Glob[8][7]);
232 |   printf ("        should be:   Number_Of_Runs + 10\n");
233 |   printf ("Ptr_Glob->\n");
234 |   printf ("  Ptr_Comp:          %d\n", (int) Ptr_Glob->Ptr_Comp);
235 |   printf ("        should be:   (implementation-dependent)\n");
236 |   printf ("  Discr:             %d\n", Ptr_Glob->Discr);
237 |   printf ("        should be:   %d\n", 0);
238 |   printf ("  Enum_Comp:         %d\n", Ptr_Glob->variant.var_1.Enum_Comp);
239 |   printf ("        should be:   %d\n", 2);
240 |   printf ("  Int_Comp:          %d\n", Ptr_Glob->variant.var_1.Int_Comp);
241 |   printf ("        should be:   %d\n", 17);
242 |   printf ("  Str_Comp:          %s\n", Ptr_Glob->variant.var_1.Str_Comp);
243 |   printf ("        should be:   DHRYSTONE PROGRAM, SOME STRING\n");
244 |   printf ("Next_Ptr_Glob->\n");
245 |   printf ("  Ptr_Comp:          %d\n", (int) Next_Ptr_Glob->Ptr_Comp);
246 |   printf ("        should be:   (implementation-dependent), same as above\n");
247 |   printf ("  Discr:             %d\n", Next_Ptr_Glob->Discr);
248 |   printf ("        should be:   %d\n", 0);
249 |   printf ("  Enum_Comp:         %d\n", Next_Ptr_Glob->variant.var_1.Enum_Comp);
250 |   printf ("        should be:   %d\n", 1);
251 |   printf ("  Int_Comp:          %d\n", Next_Ptr_Glob->variant.var_1.Int_Comp);
252 |   printf ("        should be:   %d\n", 18);
253 |   printf ("  Str_Comp:          %s\n",
254 |                                 Next_Ptr_Glob->variant.var_1.Str_Comp);
255 |   printf ("        should be:   DHRYSTONE PROGRAM, SOME STRING\n");
256 |   printf ("Int_1_Loc:           %d\n", Int_1_Loc);
257 |   printf ("        should be:   %d\n", 5);
258 |   printf ("Int_2_Loc:           %d\n", Int_2_Loc);
259 |   printf ("        should be:   %d\n", 13);
260 |   printf ("Int_3_Loc:           %d\n", Int_3_Loc);
261 |   printf ("        should be:   %d\n", 7);
262 |   printf ("Enum_Loc:            %d\n", Enum_Loc);
263 |   printf ("        should be:   %d\n", 1);
264 |   printf ("Str_1_Loc:           %s\n", Str_1_Loc);
265 |   printf ("        should be:   DHRYSTONE PROGRAM, 1'ST STRING\n");
266 |   printf ("Str_2_Loc:           %s\n", Str_2_Loc);
267 |   printf ("        should be:   DHRYSTONE PROGRAM, 2'ND STRING\n");
268 |   printf ("\n");
269 | 
270 |   User_Time = End_Time - Begin_Time;
271 | 
272 | #ifdef RISCV
273 |   User_Insn = End_Insn - Begin_Insn;
274 | 
275 |   printf("Number_Of_Runs: %d\n", Number_Of_Runs);
276 |   printf("User_Time: %d cycles, %d insn\n", User_Time, User_Insn);
277 | 
278 |   long long Cycles_Per_Instruction_x1000 = (1000 * (long long)User_Time) / User_Insn;
279 |   printf("Cycles_Per_Instruction: %d.%d%d%d\n", (int) (Cycles_Per_Instruction_x1000 / 1000),
280 | 		(int) (Cycles_Per_Instruction_x1000 / 100) % 10,
281 | 		(int) (Cycles_Per_Instruction_x1000 / 10) % 10,
282 | 		(int) (Cycles_Per_Instruction_x1000 / 1) % 10);
283 | 
284 |   int Dhrystones_Per_Second_Per_MHz = ((long long) Number_Of_Runs * 1000000) / User_Time;
285 |   printf("Dhrystones_Per_Second_Per_MHz: %d\n", (int) Dhrystones_Per_Second_Per_MHz);
286 | 
287 |   int DMIPS_Per_MHz_x1000 = (1000 * (long long) Dhrystones_Per_Second_Per_MHz) / 1757;
288 |   printf("DMIPS_Per_MHz: %d.%d%d%d\n", (int) (DMIPS_Per_MHz_x1000 / 1000),
289 | 		(int) (DMIPS_Per_MHz_x1000 / 100) % 10,
290 | 		(int) (DMIPS_Per_MHz_x1000 / 10) % 10,
291 | 		(int) (DMIPS_Per_MHz_x1000 / 1) % 10);
292 | #else
293 |   if (User_Time < Too_Small_Time)
294 |   {
295 |     printf ("Measured time too small to obtain meaningful results\n");
296 |     printf ("Please increase number of runs\n");
297 |     printf ("\n");
298 |   }
299 |   else
300 |   {
301 | #ifdef TIME
302 |     Microseconds = (float) User_Time * Mic_secs_Per_Second
303 |                         / (float) Number_Of_Runs;
304 |     Dhrystones_Per_Second = (float) Number_Of_Runs / (float) User_Time;
305 | #else
306 |     Microseconds = (float) User_Time * Mic_secs_Per_Second
307 |                         / ((float) HZ * ((float) Number_Of_Runs));
308 |     Dhrystones_Per_Second = ((float) HZ * (float) Number_Of_Runs)
309 |                         / (float) User_Time;
310 | #endif
311 |     printf ("Microseconds for one run through Dhrystone: ");
312 |     printf ("%6.1f \n", Microseconds);
313 |     printf ("Dhrystones per Second:                      ");
314 |     printf ("%6.1f \n", Dhrystones_Per_Second);
315 |     printf ("\n");
316 |   }
317 | #endif
318 | 
319 | }
320 | 
321 | 
/*
 * Proc_1: record-manipulation step of the benchmark.
 *
 * Ptr_Val_Par points at a record; the record it links to through Ptr_Comp is
 * overwritten from *Ptr_Glob and then has its integer, enumeration and
 * pointer fields updated through Proc_3, Proc_6 and Proc_7.
 *
 * K&R-style definition: no declared return type and no return statement.
 * NOTE(review): structassign is not defined in this part of the file; it is
 * presumably the "destination, source" struct-assignment macro from dhry.h
 * -- confirm there.
 */
Proc_1 (Ptr_Val_Par)
/******************/

REG Rec_Pointer Ptr_Val_Par;
    /* executed once */
{
  REG Rec_Pointer Next_Record = Ptr_Val_Par->Ptr_Comp;
                                        /* == Ptr_Glob_Next */
  /* Local variable, initialized with Ptr_Val_Par->Ptr_Comp,    */
  /* corresponds to "rename" in Ada, "with" in Pascal           */

  /* Overwrite the linked record (the one Next_Record aliases) from *Ptr_Glob. */
  structassign (*Ptr_Val_Par->Ptr_Comp, *Ptr_Glob);
  Ptr_Val_Par->variant.var_1.Int_Comp = 5;
  Next_Record->variant.var_1.Int_Comp
        = Ptr_Val_Par->variant.var_1.Int_Comp;
  Next_Record->Ptr_Comp = Ptr_Val_Par->Ptr_Comp;
  /* Proc_3 stores through the pointer it is given, rewriting Next_Record->Ptr_Comp. */
  Proc_3 (&Next_Record->Ptr_Comp);
    /* Ptr_Val_Par->Ptr_Comp->Ptr_Comp
                        == Ptr_Glob->Ptr_Comp */
  if (Next_Record->Discr == Ident_1)
    /* then, executed */
  {
    Next_Record->variant.var_1.Int_Comp = 6;
    Proc_6 (Ptr_Val_Par->variant.var_1.Enum_Comp,
           &Next_Record->variant.var_1.Enum_Comp);
    Next_Record->Ptr_Comp = Ptr_Glob->Ptr_Comp;
    /* Int_Comp is both an input and the output location of this call. */
    Proc_7 (Next_Record->variant.var_1.Int_Comp, 10,
           &Next_Record->variant.var_1.Int_Comp);
  }
  else /* not executed */
    structassign (*Ptr_Val_Par, *Ptr_Val_Par->Ptr_Comp);
} /* Proc_1 */
354 | 
355 | 
/*
 * Proc_2: integer update through a pointer parameter.
 *
 * Reads *Int_Par_Ref, and when Ch_1_Glob == 'A' writes back
 * (*Int_Par_Ref + 10 - 1) - Int_Glob.
 *
 * Enum_Loc is assigned only inside the if-body, so the loop condition reads
 * an unassigned variable (and the loop cannot terminate normally) unless
 * Ch_1_Glob == 'A'.
 */
Proc_2 (Int_Par_Ref)
/******************/
    /* executed once */
    /* *Int_Par_Ref == 1, becomes 4 */

One_Fifty   *Int_Par_Ref;
{
  One_Fifty  Int_Loc;
  Enumeration   Enum_Loc;

  Int_Loc = *Int_Par_Ref + 10;
  do /* executed once */
    if (Ch_1_Glob == 'A')
      /* then, executed */
    {
      Int_Loc -= 1;
      *Int_Par_Ref = Int_Loc - Int_Glob;
      Enum_Loc = Ident_1;
    } /* if */
  while (Enum_Loc != Ident_1); /* false once the if-body has run, so the loop exits */
} /* Proc_2 */
377 | 
378 | 
/*
 * Proc_3: pointer assignment through a pointer-to-pointer parameter.
 *
 * When Ptr_Glob is not Null, stores Ptr_Glob->Ptr_Comp into *Ptr_Ref_Par.
 * Then calls Proc_7 (10, Int_Glob, ...) with the global record's Int_Comp as
 * the output location. That call dereferences Ptr_Glob whether or not the
 * Null check above succeeded.
 */
Proc_3 (Ptr_Ref_Par)
/******************/
    /* executed once */
    /* Ptr_Ref_Par becomes Ptr_Glob */

Rec_Pointer *Ptr_Ref_Par;

{
  if (Ptr_Glob != Null)
    /* then, executed */
    *Ptr_Ref_Par = Ptr_Glob->Ptr_Comp;
  Proc_7 (10, Int_Glob, &Ptr_Glob->variant.var_1.Int_Comp);
} /* Proc_3 */
392 | 
393 | 
/*
 * Proc_4: boolean/character update on globals only.
 *
 * ORs (Ch_1_Glob == 'A') into Bool_Glob and sets Ch_2_Glob to 'B'.
 */
Proc_4 () /* without parameters */
/*******/
    /* executed once */
{
  Boolean Bool_Loc;

  Bool_Loc = Ch_1_Glob == 'A';
  Bool_Glob = Bool_Loc | Bool_Glob;  /* bitwise OR, not logical || */
  Ch_2_Glob = 'B';
} /* Proc_4 */
404 | 
405 | 
/*
 * Proc_5: resets two globals -- Ch_1_Glob to 'A' and Bool_Glob to false.
 */
Proc_5 () /* without parameters */
/*******/
    /* executed once */
{
  Ch_1_Glob = 'A';
  Bool_Glob = false;
} /* Proc_5 */
413 | 
414 | 
415 |         /* Procedure for the assignment of structures,          */
416 |         /* if the C compiler doesn't support this feature       */
#ifdef  NOSTRUCTASSIGN
/*
 * Byte-by-byte copy of l bytes from s to d, compiled only when
 * NOSTRUCTASSIGN is defined.
 *
 * K&R-style definition with no declared return type and no return statement,
 * so callers must not use its result.
 */
memcpy (d, s, l)
register char   *d;
register char   *s;
register int    l;
{
        /* l is tested before the decrement, so exactly l bytes are copied for l >= 0. */
        while (l--) *d++ = *s++;
}
#endif
426 | 
427 | 
428 | 


--------------------------------------------------------------------------------
/dhrystone/dhry_2.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |  ****************************************************************************
  3 |  *
  4 |  *                   "DHRYSTONE" Benchmark Program
  5 |  *                   -----------------------------
  6 |  *
  7 |  *  Version:    C, Version 2.1
  8 |  *
  9 |  *  File:       dhry_2.c (part 3 of 3)
 10 |  *
 11 |  *  Date:       May 25, 1988
 12 |  *
 13 |  *  Author:     Reinhold P. Weicker
 14 |  *
 15 |  ****************************************************************************
 16 |  */
 17 | 
 18 | #include "dhry.h"
 19 | 
 20 | #ifndef REG
 21 | #define REG
 22 |         /* REG becomes defined as empty */
 23 |         /* i.e. no register variables   */
 24 | #endif
 25 | 
 26 | extern  int     Int_Glob;
 27 | extern  char    Ch_1_Glob;
 28 | 
 29 | 
/*
 * Proc_6: maps an enumeration value to another one, written to *Enum_Ref_Par.
 *
 * The result is first set to Enum_Val_Par, replaced by Ident_4 when
 * Func_3 (Enum_Val_Par) is false, and finally overridden by the switch below
 * for every case except Ident_4, which leaves it as it is.
 */
Proc_6 (Enum_Val_Par, Enum_Ref_Par)
/*********************************/
    /* executed once */
    /* Enum_Val_Par == Ident_3, Enum_Ref_Par becomes Ident_2 */

Enumeration  Enum_Val_Par;
Enumeration *Enum_Ref_Par;
{
  *Enum_Ref_Par = Enum_Val_Par;
  if (! Func_3 (Enum_Val_Par))
    /* then, not executed */
    *Enum_Ref_Par = Ident_4;
  switch (Enum_Val_Par)
  {
    case Ident_1:
      *Enum_Ref_Par = Ident_1;
      break;
    case Ident_2:
      /* The only case that depends on global state. */
      if (Int_Glob > 100)
        /* then */
      *Enum_Ref_Par = Ident_1;
      else *Enum_Ref_Par = Ident_4;
      break;
    case Ident_3: /* executed */
      *Enum_Ref_Par = Ident_2;
      break;
    case Ident_4: break;
    case Ident_5:
      *Enum_Ref_Par = Ident_3;
      break;
  } /* switch */
} /* Proc_6 */
 62 | 
 63 | 
/*
 * Proc_7: stores Int_1_Par_Val + 2 + Int_2_Par_Val into *Int_Par_Ref.
 *
 * Reads no globals. K&R-style definition with no return value.
 */
Proc_7 (Int_1_Par_Val, Int_2_Par_Val, Int_Par_Ref)
/**********************************************/
    /* executed three times                                      */
    /* first call:      Int_1_Par_Val == 2, Int_2_Par_Val == 3,  */
    /*                  Int_Par_Ref becomes 7                    */
    /* second call:     Int_1_Par_Val == 10, Int_2_Par_Val == 5, */
    /*                  Int_Par_Ref becomes 17                   */
    /* third call:      Int_1_Par_Val == 6, Int_2_Par_Val == 10, */
    /*                  Int_Par_Ref becomes 18                   */
One_Fifty       Int_1_Par_Val;
One_Fifty       Int_2_Par_Val;
One_Fifty      *Int_Par_Ref;
{
  One_Fifty Int_Loc;

  Int_Loc = Int_1_Par_Val + 2;
  *Int_Par_Ref = Int_2_Par_Val + Int_Loc;
} /* Proc_7 */
 82 | 
 83 | 
/*
 * Proc_8: array-indexing step of the benchmark.
 *
 * Writes elements of the one- and two-dimensional arrays at positions derived
 * from Int_Loc = Int_1_Par_Val + 5, then sets Int_Glob to 5.
 * The largest indices used are Int_Loc+30 (first array) and Int_Loc+20
 * (first dimension of the second array).
 * NOTE(review): the array bounds come from Arr_1_Dim / Arr_2_Dim in dhry.h,
 * which is not shown here -- confirm the indices stay in range for other
 * argument values.
 */
Proc_8 (Arr_1_Par_Ref, Arr_2_Par_Ref, Int_1_Par_Val, Int_2_Par_Val)
/*********************************************************************/
    /* executed once      */
    /* Int_1_Par_Val == 3 */
    /* Int_2_Par_Val == 7 */
Arr_1_Dim       Arr_1_Par_Ref;
Arr_2_Dim       Arr_2_Par_Ref;
int             Int_1_Par_Val;
int             Int_2_Par_Val;
{
  REG One_Fifty Int_Index;
  REG One_Fifty Int_Loc;

  Int_Loc = Int_1_Par_Val + 5;
  Arr_1_Par_Ref [Int_Loc] = Int_2_Par_Val;
  Arr_1_Par_Ref [Int_Loc+1] = Arr_1_Par_Ref [Int_Loc];
  Arr_1_Par_Ref [Int_Loc+30] = Int_Loc;
  /* Two iterations: columns Int_Loc and Int_Loc+1 of row Int_Loc. */
  for (Int_Index = Int_Loc; Int_Index <= Int_Loc+1; ++Int_Index)
    Arr_2_Par_Ref [Int_Loc] [Int_Index] = Int_Loc;
  /* Read-modify-write of an element the loop above did not touch. */
  Arr_2_Par_Ref [Int_Loc] [Int_Loc-1] += 1;
  Arr_2_Par_Ref [Int_Loc+20] [Int_Loc] = Arr_1_Par_Ref [Int_Loc];
  Int_Glob = 5;
} /* Proc_8 */
107 | 
108 | 
/*
 * Func_1: character comparison.
 *
 * Returns Ident_1 when the two characters differ. When they are equal, it
 * stores the character in Ch_1_Glob and returns Ident_2.
 */
Enumeration Func_1 (Ch_1_Par_Val, Ch_2_Par_Val)
/*************************************************/
    /* executed three times                                         */
    /* first call:      Ch_1_Par_Val == 'H', Ch_2_Par_Val == 'R'    */
    /* second call:     Ch_1_Par_Val == 'A', Ch_2_Par_Val == 'C'    */
    /* third call:      Ch_1_Par_Val == 'B', Ch_2_Par_Val == 'C'    */

Capital_Letter   Ch_1_Par_Val;
Capital_Letter   Ch_2_Par_Val;
{
  Capital_Letter        Ch_1_Loc;
  Capital_Letter        Ch_2_Loc;

  /* Both locals end up holding Ch_1_Par_Val; the copies are part of the workload. */
  Ch_1_Loc = Ch_1_Par_Val;
  Ch_2_Loc = Ch_1_Loc;
  if (Ch_2_Loc != Ch_2_Par_Val)
    /* then, executed */
    return (Ident_1);
  else  /* not executed */
  {
    Ch_1_Glob = Ch_1_Loc;
    return (Ident_2);
   }
} /* Func_1 */
133 | 
134 | 
/*
 * Func_2: string comparison step of the benchmark.
 *
 * Compares character 2 of the first string with character 3 of the second
 * through Func_1, then returns true only if Ch_Loc == 'R' or
 * strcmp (Str_1_Par_Ref, Str_2_Par_Ref) > 0. In the strcmp case Int_Glob is
 * also updated.
 *
 * The while loop advances Int_Loc only when Func_1 returns Ident_1, and
 * Ch_Loc is assigned only inside that loop body. The function therefore
 * relies on the two compared characters being different.
 */
Boolean Func_2 (Str_1_Par_Ref, Str_2_Par_Ref)
/*************************************************/
    /* executed once */
    /* Str_1_Par_Ref == "DHRYSTONE PROGRAM, 1'ST STRING" */
    /* Str_2_Par_Ref == "DHRYSTONE PROGRAM, 2'ND STRING" */

Str_30  Str_1_Par_Ref;
Str_30  Str_2_Par_Ref;
{
  REG One_Thirty        Int_Loc;
      Capital_Letter    Ch_Loc;

  Int_Loc = 2;
  while (Int_Loc <= 2) /* loop body executed once */
    if (Func_1 (Str_1_Par_Ref[Int_Loc],
                Str_2_Par_Ref[Int_Loc+1]) == Ident_1)
      /* then, executed */
    {
      Ch_Loc = 'A';
      Int_Loc += 1;
    } /* if, while */
  if (Ch_Loc >= 'W' && Ch_Loc < 'Z')
    /* then, not executed */
    Int_Loc = 7;
  if (Ch_Loc == 'R')
    /* then, not executed */
    return (true);
  else /* executed */
  {
    if (strcmp (Str_1_Par_Ref, Str_2_Par_Ref) > 0)
      /* then, not executed */
    {
      Int_Loc += 7;
      Int_Glob = Int_Loc;
      return (true);
    }
    else /* executed */
      return (false);
  } /* if Ch_Loc */
} /* Func_2 */
175 | 
176 | 
/*
 * Func_3: returns true when Enum_Par_Val is Ident_3, false otherwise.
 */
Boolean Func_3 (Enum_Par_Val)
/***************************/
    /* executed once        */
    /* Enum_Par_Val == Ident_3 */
Enumeration Enum_Par_Val;
{
  Enumeration Enum_Loc;

  /* The copy into a local is part of the benchmark's statement mix. */
  Enum_Loc = Enum_Par_Val;
  if (Enum_Loc == Ident_3)
    /* then, executed */
    return (true);
  else /* not executed */
    return (false);
} /* Func_3 */
192 | 
193 | 


--------------------------------------------------------------------------------
/dhrystone/sections.lds:
--------------------------------------------------------------------------------
 1 | /*
 2 | This is free and unencumbered software released into the public domain.
 3 | 
 4 | Anyone is free to copy, modify, publish, use, compile, sell, or
 5 | distribute this software, either in source code form or as a compiled
 6 | binary, for any purpose, commercial or non-commercial, and by any
 7 | means.
 8 | */
 9 | 
/*
 * Everything is linked into one output section, .memory. No MEMORY regions
 * or load address are set here, so placement comes from the linker command
 * line or its defaults.
 */
SECTIONS {
	.memory : {
		/* .text from input files whose name matches start* goes first. */
		start*(.text);
		/* Then the .text sections of all remaining input files. */
		*(.text);
		/* Then every other input section, whatever its name. */
		*(*);
		/* Symbol marking the first address after the image. */
		end = .;
	}
}
18 | 


--------------------------------------------------------------------------------
/dhrystone/start.S:
--------------------------------------------------------------------------------
	/*
	 * Entry stub for the benchmark image.
	 *
	 * Prints "START\r\n", points sp at STACK, saves the incoming return
	 * address, calls main, prints "DONE\r\n" and returns through the saved
	 * return address.
	 *
	 * Characters are sent by storing a halfword at offset 2 from UARTADDR.
	 * Before most stores the code reads the same location and spins until
	 * it reads zero.
	 * NOTE(review): a nonzero read presumably means "transmitter busy" --
	 * confirm against the UART peripheral.
	 * NOTE(review): UARTADDR and STACK are not defined in this file; they
	 * are presumably supplied by the build (preprocessor defines) -- confirm
	 * in the Makefile.
	 */
	.section .text
	.global start
	.global main

start:
	/* print "START\r\n" */
	li a0, UARTADDR
	li a1, 'S'
	li a2, 'T'
	li a3, 'A'
	li a4, 'R'
	li a5, '\r'
	li a6, '\n'
	/* The first character is stored without polling first. */
	sh a1, 2(a0)
1:      lh t0, 2(a0)
        bnez t0, 1b

	sh a2, 2(a0)
1:      lh t0, 2(a0)
        bnez t0, 1b

	sh a3, 2(a0)
1:      lh t0, 2(a0)
        bnez t0, 1b

	sh a4, 2(a0)
1:      lh t0, 2(a0)
        bnez t0, 1b

	/* a2 again: the second 'T' of "START" */
	sh a2, 2(a0)
1:      lh t0, 2(a0)
        bnez t0, 1b

	sh a5, 2(a0)
1:      lh t0, 2(a0)
        bnez t0, 1b

	/* No poll after the final '\n'. */
	sh a6, 2(a0)

	/* set stack pointer */
	li sp, STACK

        # store return address
        addi sp, sp, -4
        sw ra, (sp)

	/* jump to main C code */
	jal ra,main

	/* print "DONE\r\n" */
	li a0,UARTADDR
	addi a1,zero,'D'
	addi a2,zero,'O'
	addi a3,zero,'N'
	addi a4,zero,'E'
	addi a5,zero,'\r'
	addi a6,zero,'\n'

	/* Poll before the first store here, unlike the START sequence above. */
1:      lh t0, 2(a0)
        bnez t0, 1b

	sh a1, 2(a0)
1:      lh t0, 2(a0)
        bnez t0, 1b

	sh a2, 2(a0)
1:      lh t0, 2(a0)
        bnez t0, 1b

	sh a3, 2(a0)
1:      lh t0, 2(a0)
        bnez t0, 1b

	sh a4, 2(a0)
1:      lh t0, 2(a0)
        bnez t0, 1b

	sh a5, 2(a0)
1:      lh t0, 2(a0)
        bnez t0, 1b

	sh a6, 2(a0)

        # return to monitor
	/*
	 * NOTE(review): ra is reloaded from the slot at STACK-4, but sp is
	 * neither advanced back by 4 nor restored to the value it had on
	 * entry. Confirm the caller does not depend on sp surviving this call.
	 */
        lw ra, (sp)
        ret
87 | 
88 | 


--------------------------------------------------------------------------------
/dhrystone/stdlib.c:
--------------------------------------------------------------------------------
  1 | // This is free and unencumbered software released into the public domain.
  2 | //
  3 | // Anyone is free to copy, modify, publish, use, compile, sell, or
  4 | // distribute this software, either in source code form or as a compiled
  5 | // binary, for any purpose, commercial or non-commercial, and by any
  6 | // means.
  7 | 
  8 | #include <stdarg.h>
  9 | #include <stdint.h>
 10 | 
 11 | extern long time();
 12 | extern long insn();
 13 | 
 14 | #ifdef USE_MYSTDLIB
 15 | extern char *malloc();
 16 | extern int printf(const char *format, ...);
 17 | 
 18 | extern void *memcpy(void *dest, const void *src, long n);
 19 | extern char *strcpy(char *dest, const char *src);
 20 | extern int strcmp(const char *s1, const char *s2);
 21 | 
 22 | char heap_memory[1024];
 23 | int heap_memory_used = 0;
 24 | #endif
 25 | 
 26 | long time()
 27 | {
 28 | 	int cycles;
 29 | 	asm volatile ("rdcycle %0" : "=r"(cycles));
 30 |         //printf("[time() -> %d]\n", cycles);
 31 | 	return cycles;
 32 | }
 33 | 
 34 | long insn()
 35 | {
 36 | 	int insns;
 37 | 	asm volatile ("rdinstret %0" : "=r"(insns));
 38 | 	//printf("[insn() -> %d]\n", insns);
 39 | 	return insns;
 40 | }
 41 | 
 42 | #ifdef USE_MYSTDLIB
 43 | char *malloc(int size)
 44 | {
 45 | 	char *p = heap_memory + heap_memory_used;
 46 | 	// printf("[malloc(%d) -> %d (%d..%d)]", size, (int)p, heap_memory_used, heap_memory_used + size);
 47 | 	heap_memory_used += size;
 48 | 	if (heap_memory_used > 1024)
 49 | 		asm volatile ("ebreak");
 50 | 	return p;
 51 | }
 52 | 
 53 | static void printf_c(int c)
 54 | {
 55 |     while (*((short volatile *)(UARTADDR + 2)) != 0) {}
 56 | 
 57 |     if (c == '\n') {
 58 |         // Add carriage returns for serial terminal compatibility.
 59 | 	*((volatile short*)(UARTADDR + 2)) = '\r';
 60 |         while (*((short volatile *)(UARTADDR + 2)) != 0) {}
 61 |     }
 62 | 
 63 |     *((volatile short*)(UARTADDR + 2)) = c;
 64 | }
 65 | 
 66 | static void printf_s(char *p)
 67 | {
 68 | 	while (*p) {
 69 |             printf_c(*(p++));
 70 |         }
 71 | }
 72 | 
 73 | static void printf_d(int val)
 74 | {
 75 | 	char buffer[32];
 76 | 	char *p = buffer;
 77 | 	if (val < 0) {
 78 | 		printf_c('-');
 79 | 		val = -val;
 80 | 	}
 81 | 	while (val || p == buffer) {
 82 | 		*(p++) = '0' + val % 10;
 83 | 		val = val / 10;
 84 | 	}
 85 | 	while (p != buffer)
 86 | 		printf_c(*(--p));
 87 | }
 88 | 
 89 | int printf(const char *format, ...)
 90 | {
 91 | 	int i;
 92 | 	va_list ap;
 93 | 
 94 | 	va_start(ap, format);
 95 | 
 96 | 	for (i = 0; format[i]; i++)
 97 | 		if (format[i] == '%') {
 98 | 			while (format[++i]) {
 99 | 				if (format[i] == 'c') {
100 | 					printf_c(va_arg(ap,int));
101 | 					break;
102 | 				}
103 | 				if (format[i] == 's') {
104 | 					printf_s(va_arg(ap,char*));
105 | 					break;
106 | 				}
107 | 				if (format[i] == 'd') {
108 | 					printf_d(va_arg(ap,int));
109 | 					break;
110 | 				}
111 | 			}
112 | 		} else
113 | 			printf_c(format[i]);
114 | 
115 | 	va_end(ap);
116 | }
117 | 
/*
 * Copies n bytes from bb to aa one byte at a time, in ascending address
 * order, and returns aa.
 */
void *memcpy(void *aa, const void *bb, long n)
{
	char *out = aa;
	const char *in = bb;

	for (; n != 0; n--)
		*out++ = *in++;
	return aa;
}
126 | 
/*
 * Copies the NUL-terminated string src, terminator included, to dst and
 * returns dst.
 *
 * Bytes are copied singly while either pointer is not 4-byte aligned. After
 * that the string is moved one 32-bit word at a time. The word containing the
 * terminator is stored byte by byte, lowest byte first, so nothing beyond the
 * NUL is written to dst.
 */
char *strcpy(char* dst, const char* src)
{
	char *const start = dst;

	while ((((uint32_t)dst | (uint32_t)src) & 3) != 0)
	{
		const char ch = *src++;

		*dst++ = ch;
		if (ch == '\0')
			return start;
	}

	for (;;)
	{
		uint32_t word = *(uint32_t*)src;

		/* Nonzero exactly when one of the four bytes of word is zero. */
		if (__builtin_expect((((word) - 0x01010101UL) & ~(word) & 0x80808080UL), 0))
		{
			int k;

			for (k = 0; k < 4; k++)
			{
				dst[k] = word & 0xff;
				if ((word & 0xff) == 0)
					break;
				word = word >> 8;
			}
			return start;
		}

		*(uint32_t*)dst = word;
		src += 4;
		dst += 4;
	}
}
168 | 
/*
 * Compares two NUL-terminated strings and returns -1, 0 or +1.
 *
 * Bytes are compared singly while either pointer is not 4-byte aligned, then
 * one 32-bit word at a time. When two words differ, their bytes are examined
 * lowest byte first to find the first difference or a shared terminator.
 * Characters are compared as plain char.
 */
int strcmp(const char *s1, const char *s2)
{
	while ((((uint32_t)s1 | (uint32_t)s2) & 3) != 0)
	{
		const char a = *s1++;
		const char b = *s2++;

		if (a != b)
			return a < b ? -1 : +1;
		if (a == '\0')
			return 0;
	}

	for (;;)
	{
		uint32_t w1 = *(uint32_t*)s1;
		uint32_t w2 = *(uint32_t*)s2;

		if (__builtin_expect(w1 != w2, 0))
		{
			int k;

			for (k = 0; k < 4; k++)
			{
				const char a = w1 & 0xff;
				const char b = w2 & 0xff;

				if (a != b)
					return a < b ? -1 : +1;
				if (a == '\0')
					return 0;
				w1 = w1 >> 8;
				w2 = w2 >> 8;
			}
			return 0;
		}

		/* Equal words containing a zero byte: both strings end here. */
		if (__builtin_expect((((w1) - 0x01010101UL) & ~(w1) & 0x80808080UL), 0))
			return 0;

		s1 += 4;
		s2 += 4;
	}
}
218 | #endif
219 | 
220 | 


--------------------------------------------------------------------------------
/doc/chonk.mkdn:
--------------------------------------------------------------------------------
  1 | # Cost-Benefit of Half-Width Datapath
  2 | 
  3 | Part of my goal with `hapenny` was to try and determine how the speed and size
  4 | of an RV32 CPU changes when it uses a half-width datapath. Comparisons against
  5 | other RV32 CPUs are a good start, but it's hard to do apples-to-apples
  6 | comparisons, because the different CPUs have different goals, different
  7 | microarchitectures, and different bus interfaces.
  8 | 
  9 | To more accurately compare the datapath widths, I've modified the `hapenny` v2
 10 | microarchitecture to produce a similarly-designed CPU with a 32-bit datapath,
 11 | and I've given it the obvious name.
 12 | 
 13 | ## `chonk`: Oh Lawd, He Comin'
 14 | 
 15 | `chonk` is a copy-paste-edit of the `hapenny` v2 core. The diffs between the
 16 | cores are fairly compact, and much logic is shared:
 17 | 
 18 | - The main datapath, including the adder, is now 32 bits wide.
 19 | - The register file can still only read one register per cycle, though the
 20 |   reads and writes are now 32 bits wide as well.
 21 | - The data bus is now 32 bits wide.
 22 | - Most decoding logic is shared and the state machine is similar, but with fewer
 23 |   states.
 24 | 
 25 | You can load `chonk` onto an Icestick eval board using the `icestick-chonk.py`
 26 | script.
 27 | 
 28 | ## Ways this comparison isn't great
 29 | 
 30 | `chonk` uses a 32-bit data bus, which means 32-bit-wide RAM and peripherals.
 31 | This makes peripherals slightly more expensive, and doubles the resource usage
 32 | of the smallest possible RAM. While `hapenny` can happily run at full speed out
 33 | of a single 16-bit block RAM, `chonk` needs at least two.
 34 | 
 35 | Of course, on FPGAs with wider 32-36 bit block RAMs, this is fine.
 36 | 
 37 | `chonk` uses a register file with doubled bandwidth: still only one read port,
 38 | but only one read is required to get the full contents of any 32-bit register.
 39 | This is largely responsible for its lower clocks-per-instruction.
 40 | 
 41 | ## Effects of widening the datapath
 42 | 
 43 | | Parameter              | `hapenny` 2 | `chonk` | Change |
 44 | | ---------------------- | ----------- | ------- | ------ |
 45 | | LCs on iCE40           | 796         | 971     | +22%   |
 46 | | Fmax (MHz)             | 72          | 62      | -14%   |
 47 | | Cycles/instruction     | 5.525       | 2.925   | -47%   |
 48 | | M instructions/second  | 13.032      | 21.197  | +63%   |
 49 | 
 50 | (Comparison of the output of `icestick-smallest` vs `icestick-chonk`.
 51 | Cycles/instruction numbers are from Dhrystone and will vary depending on
 52 | instruction mix.)
 53 | 
 54 | Observations:
 55 | 
 56 | - `chonk` tends to have a lower Fmax than `hapenny` because of longer carry
 57 |   chains in additions and comparisons. `hapenny` is far more amenable to having
 58 |   its critical path rearranged.
 59 | 
 60 | - `chonk` is only about 22% larger, rather than twice as large, because much of
 61 |   the control logic is unchanged, and a lot of datapath control logic is removed
 62 |   compared to the 16-bit version.
 63 | 
 64 | - Even with the lower Fmax, `chonk` gets significantly higher performance in
 65 |   terms of RV32 instructions executed per second.
 66 | 
 67 | Instruction timing:
 68 | 
 69 | | Instruction   | `hapenny` 2 | `chonk` | Change  |
 70 | | ------------- | ----------- | ------- | ------- |
 71 | | AUIPC         | 4           | 2       | -50%    |
 72 | | LUI           | 4           | 2       | -50%    |
 73 | | JAL           | 8           | 4       | -50%    |
 74 | | JALR          | 8           | 4       | -50%    |
 75 | | Branch        | 5/10        | 3/5     | -40/50% |
 76 | | Load          | 6           | 3       | -50%    |
 77 | | SW            | 5           | 2       | -60%    |
 78 | | SB/SH         | 4           | 2       | -50%    |
 79 | | SLT(I)(U)     | 6           | 3       | -50%    |
 80 | | Shift         | 6 + N       | 3 + N   | -8-50%  |
 81 | | Other ALU op  | 4           | 2       | -50%    |
 82 | | division test | 956         | 519     | -46%    |
 83 | 
 84 | As you can see from this table, most instructions on `chonk` take half as many
 85 | cycles as on `hapenny`, because both cores are fundamentally restricted by register
 86 | file bandwidth. There are some instructions that don't show that degree of
 87 | improvement, which is why the average instructions per clock on Dhrystone isn't
 88 | exactly 2x:
 89 | 
 90 | - Not-taken branches are only 40% faster.
 91 | - Shifts still take one cycle per bit moved, on either core, so the 50%
 92 |   advantage when shifting by zero bits drops to an 8% advantage at 31.
 93 | 
 94 | On both Dhrystone and the division test case from the testbenches (which is an
 95 | extract of libgcc and a hot path in Dhrystone), we see about a 46% reduction in
 96 | cycles required to execute a given workload.
 97 | 
 98 | 
 99 | ## Conclusions
100 | 
101 | In FPGAs, where adders are relatively inexpensive and flops plentiful, cutting
102 | the datapath of an RV32 implementation in half doesn't save quite as much area
103 | as you might expect -- about 17% (972 LCs down to 813). It causes RV32
104 | instructions to execute at roughly half the speed, since two steps are required
105 | for any 32-bit operation. (Excluding shifts -- these cores implement shifts
106 | naively.)
107 | 
108 | This leaves us with three main benefits to the approach:
109 | 
110 | 1. A fully-realized SoC built out of 16-bit memories and peripherals will tend
111 |    to use less of an FPGA -- in other words, the area advantage grows with
112 |    system complexity.
113 | 
114 | 2. The 16-bit version can often close timing at higher frequencies, due in large
115 |    part to the shorter carry chains. (The 16-bit design is basically equivalent
116 |    to a 32-bit design with a register in the middle of the adder -- only smaller
117 |    and more complex.)
118 | 
119 | 3. The ability to use 16-bit memory without further performance penalty has its
120 |    own advantages, such as the ability to run out of 16-bit external SRAM. On
121 |    FPGAs with 16-bit (or 18-bit) block RAMs, a 16-bit implementation can use
122 |    fewer of them, leaving others available for other things.
123 | 


--------------------------------------------------------------------------------
/hapenny/__init__.py:
--------------------------------------------------------------------------------
  1 | from amaranth import *
  2 | from amaranth.lib import data, enum, wiring
  3 | from amaranth.lib.enum import Enum
  4 | from amaranth.lib.wiring import In, Out
  5 | from amaranth.lib.data import Struct
  6 | 
  7 | from functools import reduce
  8 | 
  9 | class StreamSig(wiring.Signature):
 10 |     def __init__(self, payload_shape):
 11 |         super().__init__({
 12 |             'payload': Out(payload_shape),
 13 |             'valid': Out(1),
 14 |             'ready': In(1),
 15 |         })
 16 | 
 17 | class AlwaysReady(wiring.Signature):
 18 |     def __init__(self, payload_shape):
 19 |         super().__init__({
 20 |             'payload': Out(payload_shape),
 21 |             'valid': Out(1),
 22 |         })
 23 | 
 24 | # Builds a mux but out of AND and OR, which often generates cheaper logic on
 25 | # 4LUT devices.
 26 | def mux(select, one, zero):
 27 |     if isinstance(one, Enum):
 28 |         one = one.value
 29 |     if isinstance(one, int):
 30 |         one = Const(one)
 31 |     if isinstance(one, Struct):
 32 |         one = Value.cast(one)
 33 |     if isinstance(zero, Enum):
 34 |         zero = zero.value
 35 |     if isinstance(zero, int):
 36 |         zero = Const(zero)
 37 |     if isinstance(zero, Struct):
 38 |         zero = Value.cast(zero)
 39 |     n = max(one.shape().width, zero.shape().width)
 40 |     select = select.any() # force to 1 bit
 41 |     return (
 42 |         (select.replicate(n) & one) | (~select.replicate(n) & zero)
 43 |     )
 44 | 
 45 | # Builds an output net that chooses between options based on a onehot control
 46 | # signal.
 47 | #
 48 | # onehot_sig should be a signal of N bits, and options should be a dict with at
 49 | # most N entries. Each key in the dict is a bit number in onehot_sig, or a
 50 | # tuple of bit numbers, and the corresponding value will be produced as output
 51 | # when the indicated bit(s) are set in the state.
 52 | #
 53 | # If a default is provided, it will be used if none of the explicit conditions
 54 | # in the options map fires. By default, the default is zero.
 55 | #
 56 | # This assumes all bits in the onehot_sig are mutually exclusive, and combines
 57 | # each path using a bitwise OR instead of muxes, which is often cheaper on 4LUT
 58 | # devices. However, this means if the onehot invariant is violated, you'll get
 59 | # nonsense output. If that concerns you, see oneof instead.
 60 | def onehot_choice(onehot_sig, options, default = None):
 61 |     assert len(options) > 0
 62 |     output = []
 63 |     matches = []
 64 |     for (choice, result) in options.items():
 65 |         if isinstance(choice, Enum):
 66 |             choice = choice.value
 67 |         if isinstance(choice, list) or isinstance(choice, tuple):
 68 |             pass
 69 |         else:
 70 |             # Force choice to be a sequence
 71 |             choice = [choice]
 72 |         if isinstance(result, Enum):
 73 |             result = result.value
 74 |         if isinstance(result, int):
 75 |             result = Const(result)
 76 | 
 77 |         condition = reduce(lambda a, b: a | b, map(lambda s: onehot_sig[s],
 78 |                                                    choice))
 79 |         matches.append(condition)
 80 | 
 81 |         case = condition.replicate(result.shape().width) & result
 82 | 
 83 |         output.append(case)
 84 | 
 85 |     if default is not None:
 86 |         if isinstance(default, Enum):
 87 |             default = default.value
 88 |         if isinstance(default, int):
 89 |             default = Const(default)
 90 |         no_match = ~reduce(lambda a, b: a | b, matches)
 91 |         output.append(no_match.replicate(default.shape().width) & default)
 92 | 
 93 |     return reduce(lambda a, b: a | b, output)
 94 | 
 95 | # Builds a chained mux that selects between a set of options, which must be
 96 | # mutually exclusive.
 97 | #
 98 | # 'options' is a list of pairs. The first element in each pair is evaluated as a
 99 | # boolean condition. If 1, the second element is OR'd into the result.
100 | #
101 | # This means if more than one condition is true simultaneously, the result will
102 | # bitwise OR the results together. It is up to you to ensure that all
103 | # conditions are mutually exclusive.
104 | #
105 | # If a default is provided, it will be used when no other conditions match.
106 | # Otherwise, the default is zero.
107 | #
108 | # If you've got a onehot control signal instead of a bunch of separate condition
109 | # strobes, see onehot_choice.
def oneof(options, default = None):
    """OR together the results of every `(condition, result)` pair that fires.

    A pair fires when any bit of its condition is set. Results of pairs that
    fire together are ORed bitwise, so the caller must keep the conditions
    mutually exclusive. If `default` is given, it contributes when no
    condition fires.
    """
    assert len(options) > 0

    def as_value(item):
        if isinstance(item, Enum):
            item = item.value
        if isinstance(item, int):
            item = Const(item)
        return item

    def any_of(signals):
        return reduce(lambda a, b: a|b, signals)

    strobes = []
    terms = []
    for condition, result in options:
        if isinstance(condition, int):
            condition = Const(condition)
        value = as_value(result)

        strobe = condition.any()
        strobes.append(strobe)
        terms.append(strobe.replicate(value.shape().width) & value)

    if default is not None:
        fallback = as_value(default)
        none_fired = ~any_of(strobes)
        terms.append(none_fired.replicate(fallback.shape().width) & fallback)

    return any_of(terms)
137 | 
def hihalf(signal):
    """Return `signal` from position 16 upward (`signal[16:]`)."""
    half_width = 16
    return signal[half_width:]
140 | 
def lohalf(signal):
    """Return positions 0 through 15 of `signal` (`signal[:16]`)."""
    half_width = 16
    return signal[:half_width]
143 | 
144 | # Selects between the halfwords of (32-bit) signal: if hi is 1, chooses the
145 | # high half, otherwise the low half.
def choosehalf(hi, signal):
    """Return `hihalf(signal)` when `hi` is set, otherwise `lohalf(signal)`."""
    upper = hihalf(signal)
    lower = lohalf(signal)
    return mux(hi, upper, lower)
148 | 
149 | # Combines a list of signals using binary function 'fun', organizing them into
150 | # a balanced binary tree instead of a linked list like reduce/foldl would.
def treeduce(fun, items):
    """Combine `items` with binary function `fun` as a balanced binary tree.

    The sequence is split in half, each half is reduced recursively, and the
    two results are combined with `fun(left, right)`. Element order is
    preserved. When the length is odd, the left half is the shorter one.

    `items` must support `len()` and slicing (e.g. a list or tuple).

    Raises:
        ValueError: if `items` is empty. There is no identity element to
            return, and recursing on an empty sequence would never terminate.
    """
    count = len(items)
    if count == 0:
        raise ValueError("treeduce() requires at least one item")
    if count == 1:
        return items[0]

    # count >= 2 here, so both halves are non-empty.
    partition = count // 2
    left = items[:partition]
    right = items[partition:]
    return fun(treeduce(fun, left), treeduce(fun, right))
159 | 
160 | 


--------------------------------------------------------------------------------
/hapenny/bus.py:
--------------------------------------------------------------------------------
  1 | from functools import reduce
  2 | 
  3 | from amaranth import *
  4 | from amaranth.lib.wiring import *
  5 | from amaranth.lib.enum import *
  6 | from amaranth.lib.coding import Encoder, Decoder
  7 | 
  8 | from hapenny import StreamSig, AlwaysReady, treeduce
  9 | 
 10 | class BusCmd(Signature):
 11 |     def __init__(self, *, addr, data):
 12 |         if isinstance(data, int):
 13 |             lanes = (data + 7) // 8
 14 |         else:
 15 |             lanes = (data.width + 7) // 8
 16 |         super().__init__({
 17 |             'addr': Out(addr),
 18 |             'lanes': Out(lanes),
 19 |             'data': Out(data)
 20 |         })
 21 | 
 22 | class BusPort(Signature):
 23 |     def __init__(self, *, addr, data):
 24 |         super().__init__({
 25 |             'cmd': Out(AlwaysReady(BusCmd(addr=addr, data=data))),
 26 |             'resp': In(data),
 27 |         })
 28 | 
 29 | def partial_decode(m, bus, width):
 30 |     assert width >= bus.cmd.payload.addr.shape().width, \
 31 |             "can't use partial_decode to make a bus narrower"
 32 |     port = BusPort(addr = width, data = bus.cmd.payload.data.shape()).flip().create()
 33 |     m.d.comb += [
 34 |         bus.cmd.payload.addr.eq(port.cmd.payload.addr),
 35 |         bus.cmd.payload.data.eq(port.cmd.payload.data),
 36 |         bus.cmd.payload.lanes.eq(port.cmd.payload.lanes),
 37 |         bus.cmd.valid.eq(port.cmd.valid),
 38 | 
 39 |         port.resp.eq(bus.resp),
 40 |     ]
 41 |     return port
 42 | 
 43 | def narrow_addr(m, bus, width):
 44 |     assert width <= bus.cmd.payload.addr.shape().width, \
 45 |             "can't use narrow_addr to make a bus wider"
 46 |     port = BusPort(addr = width, data = bus.cmd.payload.data.shape()).flip().create()
 47 |     m.d.comb += [
 48 |         bus.cmd.payload.addr.eq(port.cmd.payload.addr),
 49 |         bus.cmd.payload.data.eq(port.cmd.payload.data),
 50 |         bus.cmd.payload.lanes.eq(port.cmd.payload.lanes),
 51 |         bus.cmd.valid.eq(port.cmd.valid),
 52 | 
 53 |         port.resp.eq(bus.resp),
 54 |     ]
 55 |     return port
 56 | 
 57 | class SimpleFabric(Elaboratable):
 58 |     def __init__(self, devices):
 59 |         assert len(devices) > 0
 60 |         data_bits = max(p.cmd.payload.data.shape().width for p in devices)
 61 |         addr_bits = max(p.cmd.payload.addr.shape().width for p in devices)
 62 |         sig = BusPort(addr = addr_bits, data = data_bits).flip()
 63 |         print(f"fabric configured for {addr_bits} addr bits, {data_bits} data bits")
 64 |         for i, d in enumerate(devices):
 65 |             assert sig.is_compliant(d), \
 66 |                     f"device #{i} does not have {addr_bits} addr bits: {d.cmd.payload.addr.shape()}"
 67 |         self.devices = devices
 68 |         self.extra_bits = (len(devices) - 1).bit_length()
 69 |         self.addr_bits = addr_bits
 70 |         self.data_bits = data_bits
 71 | 
 72 |         self.bus = BusPort(addr = addr_bits + self.extra_bits, data =
 73 |                                  data_bits).flip().create()
 74 | 
 75 |     def elaborate(self, platform):
 76 |         m = Module()
 77 | 
 78 |         # index of the currently selected device.
 79 |         devid = Signal(self.extra_bits)
 80 |         m.d.comb += devid.eq(self.bus.cmd.payload.addr[self.addr_bits:])
 81 | 
 82 |         # index of the last selected device (registered).
 83 |         last_id = Signal(self.extra_bits)
 84 |         # Since the setting of the response mux is ignored if the CPU isn't
 85 |         # expecting data back, we can just capture the address lines on every
 86 |         # cycle whether it's valid or not.
 87 |         m.d.sync += last_id.eq(devid)
 88 | 
 89 |         for (i, d) in enumerate(self.devices):
 90 |             # Fan out the incoming address, data, and lanes to every device.
 91 |             m.d.comb += [
 92 |                 d.cmd.payload.addr.eq(self.bus.cmd.payload.addr),
 93 |                 d.cmd.payload.data.eq(self.bus.cmd.payload.data),
 94 |                 d.cmd.payload.lanes.eq(self.bus.cmd.payload.lanes),
 95 |             ]
 96 |             # Only propagate cmd valid to the specific addressed device.
 97 |             dv = Signal(1, name = f"valid_{i}")
 98 |             m.d.comb += [
 99 |                 dv.eq(self.bus.cmd.valid & (devid == i)),
100 |                 d.cmd.valid.eq(dv),
101 |             ]
102 | 
103 |         # Fan the response data in based on who we're listening to.
104 |         response_data = []
105 |         for (i, d) in enumerate(self.devices):
106 |             data = d.resp & (last_id == i).replicate(self.data_bits)
107 |             response_data.append(data)
108 | 
109 |         m.d.comb += self.bus.resp.eq(treeduce(lambda a, b: a | b, response_data))
110 | 
111 |         return m
112 | 


--------------------------------------------------------------------------------
/hapenny/chonk/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cbiffle/hapenny/10d6af538bb47feb26f5fa807f0c2ae6b64f2e9e/hapenny/chonk/__init__.py


--------------------------------------------------------------------------------
/hapenny/chonk/cpu.py:
--------------------------------------------------------------------------------
  1 | # A baseline implementation of an RV32 processor for comparison,
  2 | # sharing microarchitectural details with hapenny.
  3 | 
  4 | from amaranth import *
  5 | from amaranth.lib.wiring import *
  6 | from amaranth.lib.enum import *
  7 | import amaranth.lib.coding
  8 | 
  9 | from hapenny import StreamSig, AlwaysReady, mux, oneof, onehot_choice
 10 | from hapenny.decoder import ImmediateDecoder, Decoder, DecodeSignals
 11 | from hapenny.chonk.regfile32 import RegFile, RegWrite
 12 | from hapenny.bus import BusPort, BusCmd
 13 | from hapenny.chonk.sbox import SBox, STATE_COUNT
 14 | from hapenny.chonk.fdbox import FDBox
 15 | from hapenny.chonk.ewbox import EWBox
 16 | from hapenny.rvfi import Rvfi, Mode, Ixl
 17 | 
 18 | # Note: all debug port signals are directional from the perspective of the DEBUG
 19 | # PROBE, not the CPU.
 20 | DebugPort = Signature({
 21 |     # Register read port. The CPU asserts READY on this port when it is halted
 22 |     # and the register file is available for inspection. Debug probes should
 23 |     # place a register number on the payload signals and assert VALID; the
 24 |     # response will come on reg_value on the next cycle.
 25 |     'reg_read': Out(StreamSig(5)),
 26 |     # Value that was read from the reg_read port above.
 27 |     'reg_value': In(32),
 28 |     # Register write command. Works roughly like reg_read, i.e. only READY when
 29 |     # the CPU is halted.
 30 |     'reg_write': Out(StreamSig(RegWrite(5))),
 31 |     # PC output from CPU. This is always valid. If the CPU's PC is narrower
 32 |     # than 32 bits (the prog_addr_width parameter) then its value is
 33 |     # zero-extended on this port.
 34 |     'pc': In(32),
 35 |     # PC override signal. Becomes READY when the CPU is halted; assert a new
 36 |     # value with VALID here to change the next instruction that will be
 37 |     # fetched. If the PC is narrower than 32 bits (the prog_addr_width
 38 |     # parameter) then the higher bits in this path are ignored.
 39 |     'pc_write': Out(StreamSig(32)),
 40 |     # State output from CPU. This is a one-hot encoding of the CPU's internal
 41 |     # execution state, mostly intended for testbenches.
 42 |     'state': In(STATE_COUNT),
 43 | })
 44 | 
 45 | class Cpu(Component):
 46 |     """A basic RV32I core.
 47 | 
 48 |     Parameters
 49 |     ----------
 50 |     reset_vector (int): forwarded unchanged to the EW-box; presumably the
 51 |         address execution starts from (confirm in ewbox.py). Default 0.
 52 |     addr_width (int): number of low-order bits that are significant in memory
 53 |         addresses, counted in byte addresses (the numbers RV32I software deals
 54 |         with). The default is 32; if reduced, memory and I/O devices appear to
 55 |         repeat at higher addresses because the top bits aren't decoded. The
 56 |         bus port has addr_width-2 address lines because it addresses words.
 57 |     counters (bool): forwarded unchanged to the EW-box. Default False.
 58 |     prog_addr_width (int): number of low-order bits that are significant in
 59 |         instruction addresses; sets the width of the PC register(s) and fetch
 60 |         path. If program storage is low in the address range and I/O devices
 61 |         higher, set this below addr_width to save area. Default: addr_width.
 62 | 
 63 |     Attributes
 64 |     ----------
 65 |     bus (both): connection to the bus, 32 bit data path and `addr_width - 2`
 66 |         address bits.
 67 |     debug (both): debug port for testing or development.
 68 |     halt_request (in): when asserted (1), requests that the CPU stop at the
 69 |         next instruction boundary. Release (0) to resume.
 70 |     halted (out): raised when the CPU has halted.
 71 |     rvfi (out): RISC-V Formal Interface trace port.
 72 |     """
 73 |     halt_request: In(1)
 74 |     halted: Out(1)
 75 | 
 76 |     debug: In(DebugPort)
 77 |     rvfi: Out(AlwaysReady(Rvfi()))
 78 | 
 79 |     def __init__(self, *,
 80 |                  reset_vector = 0,
 81 |                  addr_width = 32,
 82 |                  counters = False,
 83 |                  prog_addr_width = None):
 84 |         super().__init__()
 85 | 
 86 |         # Capture and derive parameter values
 87 |         self.addr_width = addr_width
 88 |         self.prog_addr_width = prog_addr_width or addr_width
 89 | 
 90 |         # Create our parameterized ports and modules
 91 |         self.bus = BusPort(addr = addr_width - 2, data = 32).create()
 92 | 
 93 |         self.s = SBox()
 94 |         self.rf = RegFile()
 95 |         self.fd = FDBox(
 96 |             prog_addr_width = self.prog_addr_width,
 97 |         )
 98 |         self.ew = EWBox(
 99 |             reset_vector = reset_vector,
100 |             addr_width = addr_width,
101 |             prog_addr_width = self.prog_addr_width,
102 |             counters = counters,
103 |         )
104 | 
105 |     def elaborate(self, platform):
106 |         m = Module()
107 | 
108 |         # Make the elaborator aware of all our submodules, and wire them up.
109 |         m.submodules.regfile = rf = self.rf
110 |         m.submodules.s = s = self.s
111 |         m.submodules.fd = fd = self.fd
112 |         m.submodules.ew = ew = self.ew
113 | 
114 |         m.d.comb += [
115 |             fd.onehot_state.eq(s.onehot_state),
116 |             fd.pc.eq(ew.pc_next),
117 |             fd.from_the_top.eq(ew.from_the_top),
118 | 
119 |             ew.onehot_state.eq(s.onehot_state),
120 |             ew.inst_next.eq(fd.inst_next),
121 |             ew.debug_pc_write.valid.eq(self.debug.pc_write.valid),
122 |             # Drop the bottom two bits of any incoming PC before feeding to EW.
123 |             ew.debug_pc_write.payload.eq(self.debug.pc_write.payload[2:]),
124 | 
125 |             s.from_the_top.eq(ew.from_the_top),
126 |             s.halt_request.eq(self.halt_request),
127 |             s.not_a_bubble.eq(ew.full),
128 |             s.hold.eq(ew.hold),
129 | 
130 |             self.halted.eq(s.halted),
131 | 
132 |             self.debug.reg_value.eq(rf.read_resp),
133 |             self.debug.state.eq(s.onehot_state),
134 |             # Internal PCs never have bits 0/1, but the debug port deals in
135 |             # 32-bit addresses, so add LSBs when exposing the PC:
136 |             self.debug.pc.eq(Cat(0, 0, ew.pc)),
137 |             self.debug.pc_write.ready.eq(ew.debug_pc_write.ready),
138 |         ]
139 | 
140 |         # Combine the register file write ports from EW (primary) and the debug
141 |         # interface (secondary). We use an actual mux here instead of OR-ing to
142 |         # keep the debug port from disrupting execution.
143 |         m.d.comb += [
144 |             rf.write_cmd.valid.eq(
145 |                 mux(
146 |                     s.halted,
147 |                     self.debug.reg_write.valid,
148 |                     ew.rf_write_cmd.valid,
149 |                 ),
150 |             ),
151 |             rf.write_cmd.payload.reg.eq(
152 |                 mux(
153 |                     s.halted,
154 |                     self.debug.reg_write.payload.reg,
155 |                     ew.rf_write_cmd.payload.reg,
156 |                 ),
157 |             ),
158 |             rf.write_cmd.payload.value.eq(
159 |                 mux(
160 |                     s.halted,
161 |                     self.debug.reg_write.payload.value,
162 |                     ew.rf_write_cmd.payload.value,
163 |                 ),
164 |             ),
165 |             self.debug.reg_write.ready.eq(s.halted),
166 |         ]
167 | 
168 |         # Combine the register file read ports from EW, FD, and debug. We OR
169 |         # the EW/FD ports together because those modules are well behaved, but
170 |         # explicitly gate signals from the debug port to only work when we're
171 |         # halted.
172 |         m.d.comb += [
173 |             rf.read_cmd.valid.eq(
174 |                 fd.rf_cmd.valid | ew.rf_read_cmd.valid
175 |                 | (self.debug.reg_read.valid & s.halted)
176 |             ),
177 |             rf.read_cmd.payload.eq(
178 |                 fd.rf_cmd.payload | ew.rf_read_cmd.payload
179 |                 | oneof([(s.halted, self.debug.reg_read.payload)])
180 |             ),
181 |             ew.rf_resp.eq(rf.read_resp),
182 |             self.debug.reg_read.ready.eq(s.halted),
183 |         ]
184 |         # Combine the bus access ports. The debug port can't drive our bus, so
185 |         # this is simpler.
186 |         m.d.comb += [
187 |             self.bus.cmd.valid.eq(
188 |                 fd.bus.cmd.valid | ew.bus.cmd.valid
189 |             ),
190 |             # Note that this will implicitly zero-extend the FD address if it's
191 |             # shorter than the full bus (because prog_addr_width is dialed
192 |             # back).
193 |             self.bus.cmd.payload.addr.eq(
194 |                 fd.bus.cmd.payload.addr | ew.bus.cmd.payload.addr
195 |             ),
196 |             self.bus.cmd.payload.data.eq(
197 |                 fd.bus.cmd.payload.data | ew.bus.cmd.payload.data
198 |             ),
199 |             self.bus.cmd.payload.lanes.eq(
200 |                 fd.bus.cmd.payload.lanes | ew.bus.cmd.payload.lanes
201 |             ),
202 | 
203 |             fd.bus.resp.eq(self.bus.resp),
204 |             ew.bus.resp.eq(self.bus.resp),
205 |         ]
206 | 
207 |         # Trace port
208 |         m.submodules.rvfi_adapter = rvfi = RvfiPort()
209 |         m.d.comb += [
210 |             rvfi.state.eq(s.onehot_state),
211 |             rvfi.full.eq(ew.full),
212 |             rvfi.end_of_instruction.eq(ew.from_the_top),
213 |             rvfi.pc.eq(Cat(0, 0, ew.pc)),
214 |             rvfi.pc_next.eq(Cat(0, 0, ew.pc_next)),
215 |             rvfi.insn.eq(ew.debug_inst),
216 |             rvfi.rf_read_resp_snoop.eq(rf.read_resp),
217 | 
218 |             rvfi.rf_read_snoop.valid.eq(rf.read_cmd.valid),
219 |             rvfi.rf_read_snoop.payload.eq(rf.read_cmd.payload),
220 | 
221 |             rvfi.rf_write_snoop.valid.eq(rf.write_cmd.valid),
222 |             rvfi.rf_write_snoop.payload.reg.eq(rf.write_cmd.payload.reg),
223 |             rvfi.rf_write_snoop.payload.value.eq(rf.write_cmd.payload.value),
224 | 
225 |             rvfi.bus_snoop.valid.eq(self.bus.cmd.valid),
226 |             rvfi.bus_snoop.payload.addr.eq(self.bus.cmd.payload.addr),
227 |             rvfi.bus_snoop.payload.data.eq(self.bus.cmd.payload.data),
228 |             rvfi.bus_snoop.payload.lanes.eq(self.bus.cmd.payload.lanes),
229 |             rvfi.bus_resp_snoop.eq(self.bus.resp),
230 |         ]
231 |         connect(m, rvfi.rvfi_out, flipped(self.rvfi))
232 | 
233 |         return m
234 | 
235 | class RvfiPort(Component):
236 |     state: In(STATE_COUNT)
237 |     full: In(1)
238 |     end_of_instruction: In(1)
239 |     pc: In(32)
240 |     pc_next: In(32)
241 |     insn: In(32)
242 |     # The *_snoop inputs below mirror register file and bus traffic.
243 |     rf_read_snoop: In(AlwaysReady(5))
244 |     rf_read_resp_snoop: In(32)
245 | 
246 |     rf_write_snoop: In(AlwaysReady(RegWrite(5)))
247 | 
248 |     bus_snoop: In(AlwaysReady(BusCmd(addr = 30, data = 32)))
249 |     bus_resp_snoop: In(32)
250 | 
251 |     rvfi_out: Out(AlwaysReady(Rvfi()))
252 | 
253 |     def elaborate(self, platform):
254 |         m = Module()
255 |         # Builds up rvfi_out field by field from the snooped inputs.
256 |         m.d.comb += [
257 |             self.rvfi_out.payload.ixl.eq(Ixl._32),
258 |             self.rvfi_out.payload.mode.eq(Mode.M),
259 |         ]
260 | 
261 |         load_expected = Signal(1)  # set by a lanes == 0 bus command; next resp is its data
262 |         after_end = Signal()  # end_of_instruction delayed by one cycle
263 |         rs1_addr_d = Signal(5)  # register read address seen at end_of_instruction
264 | 
265 |         m.d.sync += after_end.eq(self.end_of_instruction)
266 |         # At end of instruction: flag the record valid (if full) for one cycle.
267 |         with m.If(self.end_of_instruction):
268 |             m.d.sync += [
269 |                 self.rvfi_out.valid.eq(self.full),
270 |                 self.rvfi_out.payload.order.eq(self.rvfi_out.payload.order + 1),
271 |                 self.rvfi_out.payload.pc_wdata.eq(self.pc_next),
272 | 
273 |                 rs1_addr_d.eq(self.rf_read_snoop.payload),
274 |             ]
275 |         with m.Else():
276 |             m.d.sync += self.rvfi_out.valid.eq(0)
277 |         # One cycle later: reset the accumulating fields for the next record.
278 |         with m.If(after_end):
279 |             m.d.sync += [
280 |                 # Clear the things that accumulate (later assignments below win)
281 |                 self.rvfi_out.payload.halt.eq(0),
282 |                 self.rvfi_out.payload.mem_wmask.eq(0),
283 |                 self.rvfi_out.payload.mem_wdata.eq(0),
284 |                 self.rvfi_out.payload.mem_rmask.eq(0),
285 |                 self.rvfi_out.payload.mem_rdata.eq(0),
286 |                 self.rvfi_out.payload.rd_addr.eq(0),
287 |                 self.rvfi_out.payload.rd_wdata.eq(0),
288 | 
289 |                 self.rvfi_out.payload.rs1_addr.eq(rs1_addr_d),
290 |             ]
291 |         # Fields are only accumulated while `full` is asserted.
292 |         with m.If(self.full):
293 |             with m.If(self.state[0]):
294 |                 m.d.sync += [
295 |                     self.rvfi_out.payload.rs1_rdata.eq(self.rf_read_resp_snoop),
296 |                     self.rvfi_out.payload.rs2_addr.eq(self.rf_read_snoop.payload),
297 | 
298 |                     self.rvfi_out.payload.pc_rdata.eq(self.pc),
299 | 
300 |                     self.rvfi_out.payload.insn.eq(self.insn),
301 |                 ]
302 | 
303 |             with m.If(self.state[1]):
304 |                 m.d.sync += [
305 |                     self.rvfi_out.payload.rs2_rdata.eq(self.rf_read_resp_snoop),
306 |                 ]
307 | 
308 |             with m.If(self.rf_write_snoop.valid):
309 |                 m.d.sync += [
310 |                     self.rvfi_out.payload.rd_wdata.eq(self.rf_write_snoop.payload.value),
311 |                     self.rvfi_out.payload.rd_addr.eq(self.rf_write_snoop.payload.reg),
312 |                 ]
313 | 
314 |             with m.If(load_expected):
315 |                 m.d.sync += load_expected.eq(0)
316 |                 m.d.sync += self.rvfi_out.payload.mem_rdata.eq(
317 |                     self.bus_resp_snoop
318 |                 )
319 | 
320 |             # Ignore bus activity in state 0 as RVFI doesn't consider fetch
321 |             # traffic.
322 |             with m.If(self.bus_snoop.valid & ~self.state[0]):
323 |                 m.d.sync += [
324 |                     # Present addresses word-aligned
325 |                     self.rvfi_out.payload.mem_addr.eq(Cat(0, 0, self.bus_snoop.payload.addr)),
326 |                     # Set masks.
327 |                     self.rvfi_out.payload.mem_wmask.eq(self.bus_snoop.payload.lanes),
328 |                     self.rvfi_out.payload.mem_rmask.eq((~self.bus_snoop.payload.lanes.any()).replicate(4)),
329 |                 ]
330 |                 with m.If(self.bus_snoop.payload.lanes[0]):
331 |                     m.d.sync += self.rvfi_out.payload.mem_wdata[:8].eq(
332 |                         self.bus_snoop.payload.data[:8]
333 |                     )
334 |                 with m.If(self.bus_snoop.payload.lanes[1]):
335 |                     m.d.sync += self.rvfi_out.payload.mem_wdata[8:16].eq(
336 |                         self.bus_snoop.payload.data[8:16]
337 |                     )
338 |                 with m.If(self.bus_snoop.payload.lanes[2]):
339 |                     m.d.sync += self.rvfi_out.payload.mem_wdata[16:24].eq(
340 |                         self.bus_snoop.payload.data[16:24]
341 |                     )
342 |                 with m.If(self.bus_snoop.payload.lanes[3]):
343 |                     m.d.sync += self.rvfi_out.payload.mem_wdata[24:].eq(
344 |                         self.bus_snoop.payload.data[24:]
345 |                     )
346 |                 with m.If(self.bus_snoop.payload.lanes == 0):
347 |                     m.d.sync += load_expected.eq(1)
348 | 
349 |         return m
350 | 


--------------------------------------------------------------------------------
/hapenny/chonk/fdbox.py:
--------------------------------------------------------------------------------
  1 | # The FD-Box, responsible for fetch and decode during execution.
  2 | 
  3 | from amaranth import *
  4 | from amaranth.lib.wiring import *
  5 | from amaranth.lib.enum import *
  6 | from amaranth.lib.coding import Encoder, Decoder
  7 | 
  8 | from hapenny import StreamSig, AlwaysReady, onehot_choice, mux, oneof
  9 | from hapenny.chonk.sbox import STATE_COUNT
 10 | from hapenny.bus import BusPort
 11 | 
 12 | class FDBox(Component):
 13 |     """The FD-Box fetches and decodes instructions.
 14 | 
 15 |     Based on a PC (provided by the EW-box) the FD-box generates bus
 16 |     transactions to collect both halfwords of an instruction, and then provides
 17 |     it on an output signal to the EW-box.
 18 | 
 19 |     Parameters
 20 |     ----------
 21 |     prog_addr_width (integer): number of bits in a program address, 32 by default
 22 |         but can be shrunk to save logic.
 23 | 
 24 |     Attributes
 25 |     ----------
 26 |     onehot_state (input): state input from the S-Box
 27 |     pc (input): program counter from EW-box.
 28 |     rf_cmd (output): read command to the register file, intended to be OR'd.
 29 |     inst_next (output): instruction word for EW to use next time we restart
 30 |         from the top.
 31 |     bus (port): our connection to the memory fabric.
 32 |     from_the_top (input): signal from EW indicating that this is the final
 33 |         cycle of the instruction. We use this to gate register reads.
 34 |     """
 35 |     onehot_state: In(STATE_COUNT)
 36 |     rf_cmd: Out(AlwaysReady(5))
 37 |     inst_next: Out(32)
 38 |     from_the_top: In(1)
 39 | 
 40 |     def __init__(self, *,
 41 |                  prog_addr_width = 32,
 42 |                  ):
 43 |         super().__init__()
 44 | 
 45 |         # Create a bus port of sufficient width to fetch instructions only.
 46 |         # (Width is -2 because we're addressing words.)
 47 |         self.bus = BusPort(addr = prog_addr_width - 2, data = 32).create()
 48 | 
 49 |         # The PC width is -2 because it's addressing words.
 50 |         self.pc = Signal(prog_addr_width - 2)
 51 | 
 52 |         self.inst = Signal(32)
 53 | 
 54 |     def elaborate(self, platform):
 55 |         m = Module()
 56 | 
 57 |         # State 0: we start the fetch.
 58 |         # State 1: we receive the instruction word and begin a register read.
 59 |         # State 2+: we don't do anything.
 60 | 
 61 |         m.d.comb += [
 62 |             # We issue bus transactions in state 0 only.
 63 |             self.bus.cmd.valid.eq(self.onehot_state[0]),
 64 |             # In that state we put the PC on the bus.
 65 |             self.bus.cmd.payload.addr.eq(onehot_choice(self.onehot_state, {
 66 |                 0: self.pc,
 67 |             })),
 68 | 
 69 |             # We access the register file only in the last cycle.
 70 |             self.rf_cmd.valid.eq(self.from_the_top),
 71 |             # If the last cycle is state 1, our fetch is still completing, so
 72 |             # we need to forward the bus response to the register file. If it
 73 |             # isn't state 1, we can serve out of our inst register.
 74 |             # (It's important to send zeros in other states instead of
 75 |             # hardwiring this so that we can OR.)
 76 |             self.rf_cmd.payload.eq(oneof([
 77 |                 (self.from_the_top & self.onehot_state[1], self.bus.resp[15:20]),
 78 |                 (self.from_the_top & ~self.onehot_state[1], self.inst[15:20]),
 79 |             ])),
 80 | 
 81 |             # Forward the instruction through so it's valid in states 1+. In
 82 |             # other states, serve up the contents of our registers. EW's not
 83 |             # supposed to look at this in state 0.
 84 |             self.inst_next.eq(mux(
 85 |                 self.onehot_state[1],
 86 |                 self.bus.resp,
 87 |                 self.inst,
 88 |             )),
 89 |         ]
 90 | 
 91 |         m.d.sync += [
 92 |             # Latch the bottom half of the instruction at the end of state 1.
 93 |             self.inst.eq(mux(
 94 |                 self.onehot_state[1],
 95 |                 self.bus.resp,
 96 |                 self.inst,
 97 |             )),
 98 |         ]
 99 | 
100 |         return m
101 | 


--------------------------------------------------------------------------------
/hapenny/chonk/gpio32.py:
--------------------------------------------------------------------------------
 1 | from amaranth import *
 2 | from amaranth.lib.wiring import *
 3 | from amaranth.lib.enum import *
 4 | from amaranth.lib.coding import Encoder, Decoder
 5 | 
 6 | from hapenny.bus import BusPort
 7 | 
 8 | class OutputPort32(Component):
 9 |     bus: In(BusPort(addr = 0, data = 32))
10 | 
11 |     def __init__(self, pins):
12 |         super().__init__()
13 |         self.pins = Signal(pins)
14 | 
15 |     def elaborate(self, platform):
16 |         m = Module()
17 | 
18 |         with m.If(self.bus.cmd.valid & self.bus.cmd.payload.lanes[0]):
19 |             m.d.sync += self.pins[:8].eq(self.bus.cmd.payload.data[:8])
20 |         with m.If(self.bus.cmd.valid & self.bus.cmd.payload.lanes[1]):
21 |             m.d.sync += self.pins[8:16].eq(self.bus.cmd.payload.data[8:16])
22 |         with m.If(self.bus.cmd.valid & self.bus.cmd.payload.lanes[2]):
23 |             m.d.sync += self.pins[16:24].eq(self.bus.cmd.payload.data[16:24])
24 |         with m.If(self.bus.cmd.valid & self.bus.cmd.payload.lanes[3]):
25 |             m.d.sync += self.pins[24:].eq(self.bus.cmd.payload.data[24:])
26 | 
27 |         return m
28 | 


--------------------------------------------------------------------------------
/hapenny/chonk/mem32.py:
--------------------------------------------------------------------------------
  1 | # Reusable memory with our bus interface.
  2 | 
  3 | from amaranth import *
  4 | from amaranth.lib.wiring import *
  5 | from amaranth.lib.enum import *
  6 | from amaranth.lib.coding import Encoder, Decoder
  7 | 
  8 | from hapenny import StreamSig, AlwaysReady, mux
  9 | from hapenny.bus import BusPort
 10 | 
 11 | import hapenny.mem # for stitching together 16-bit primitives
 12 | 
 13 | class BasicMemory(Elaboratable):
 14 |     """A dead-simple 32-bit-wide memory with the Hapenny bus interface.
 15 | 
 16 |     This uses an Amaranth generic memory internally, which relies on inference
 17 |     in the synthesis tools to map to a specific type of resource such as block
 18 |     RAM. In practice it won't map to uninitialized RAM (like the iCE40UP5K's
 19 |     SPRAM) because Amaranth insists on generating it with an initializer; for
 20 |     that you'll need another module.
 21 | 
 22 |     Parameters
 23 |     ----------
 24 |     depth (integer): number of 32-bit words in the memory. If omitted,
 25 |         contents must be provided, and depth is inferred from len(contents).
 26 |     contents (list of integer): initialization contents of the memory. If
 27 |         omitted, depth must be provided, and the RAM is implicitly zeroed.
 28 |     read_only (boolean): if overridden to True, the memory will not respond to
 29 |         write strobes. This is useful for using an initialized block RAM as a
 30 |         program ROM.
 31 | 
 32 |     Attributes
 33 |     ----------
 34 |     bus: a BusPort with the minimum number of addr bits required to address
 35 |         'depth' words, and a 32-bit data path.
 36 |     """
 37 | 
 38 |     def __init__(self, *,
 39 |                  depth = None,
 40 |                  contents = [],
 41 |                  read_only = False):
 42 |         super().__init__()
 43 | 
 44 |         if depth is None:
 45 |             assert len(contents) > 0, "either depth or contents must be provided"
 46 |             depth = len(contents)
 47 | 
 48 |         addr_bits = (depth - 1).bit_length()
 49 | 
 50 |         self.bus = BusPort(addr = addr_bits, data = 32).flip().create()
 51 | 
 52 |         self.m = Memory(
 53 |             width = 32,
 54 |             depth = depth,
 55 |             name = "basicram",
 56 |             init = contents,
 57 |         )
 58 | 
 59 |         self.read_only = False
 60 | 
 61 |     def elaborate(self, platform):
 62 |         m = Module()
 63 | 
 64 |         m.submodules.m = self.m
 65 | 
 66 |         rp = self.m.read_port(transparent = False)
 67 | 
 68 | 
 69 |         m.d.comb += [
 70 |             rp.addr.eq(self.bus.cmd.payload.addr),
 71 |             rp.en.eq(self.bus.cmd.valid & (self.bus.cmd.payload.lanes == 0)),
 72 |             self.bus.resp.eq(rp.data),
 73 |         ]
 74 | 
 75 |         if not self.read_only:
 76 |             wp = self.m.write_port(granularity = 8)
 77 |             m.d.comb += [
 78 |                 wp.addr.eq(self.bus.cmd.payload.addr),
 79 |                 wp.data.eq(self.bus.cmd.payload.data),
 80 |             ]
 81 |             for i, lane in enumerate(self.bus.cmd.payload.lanes):
 82 |                 m.d.comb += wp.en[i].eq(self.bus.cmd.valid & lane)
 83 | 
 84 |         return m
 85 | 
 86 | class SpramMemory(Component):
 87 |     """A pair of 256 kiB / 32 kiB SPRAMs on the UP5K, joined to make a 32-bit
 88 |     wide memory.
 89 | 
 90 |     This module exists because getting Amaranth to generate a memory that Yosys
 91 |     is willing to map to SPRAM is currently hard.
 92 | 
 93 |     SPRAMs are uninitialized at reset and can retain content across both design
 94 |     and device resets. As a result, this module doesn't support a read_only
 95 |     mode, because its contents would be indeterminate (yet not random enough to
 96 |     be interesting).
 97 | 
 98 |     Attributes
 99 |     ----------
100 |     bus: bus interface with 14 address bits.
101 |     """
102 |     bus: In(BusPort(addr = 14, data = 32))
103 | 
104 |     def elaborate(self, platform):
105 |         m = Module()
106 | 
107 |         m.submodules.lo = lo = hapenny.mem.SpramMemory()
108 |         m.submodules.hi = hi = hapenny.mem.SpramMemory()
109 | 
110 |         m.d.comb += [
111 |             lo.bus.cmd.valid.eq(self.bus.cmd.valid),
112 |             lo.bus.cmd.payload.addr.eq(self.bus.cmd.payload.addr),
113 |             lo.bus.cmd.payload.data.eq(self.bus.cmd.payload.data[:16]),
114 |             lo.bus.cmd.payload.lanes.eq(self.bus.cmd.payload.lanes[:2]),
115 | 
116 |             hi.bus.cmd.valid.eq(self.bus.cmd.valid),
117 |             hi.bus.cmd.payload.addr.eq(self.bus.cmd.payload.addr),
118 |             hi.bus.cmd.payload.data.eq(self.bus.cmd.payload.data[16:]),
119 |             hi.bus.cmd.payload.lanes.eq(self.bus.cmd.payload.lanes[2:]),
120 | 
121 |             self.bus.resp[:16].eq(lo.bus.resp),
122 |             self.bus.resp[16:].eq(hi.bus.resp),
123 |         ]
124 | 
125 |         return m
126 | 
127 | 


--------------------------------------------------------------------------------
/hapenny/chonk/regfile32.py:
--------------------------------------------------------------------------------
 1 | # 32-bit x 32 register file for a full-width RV32 implementation.
 2 | 
 3 | from amaranth import *
 4 | from amaranth.lib.wiring import *
 5 | from amaranth.lib.enum import *
 6 | 
 7 | from hapenny import StreamSig, AlwaysReady
 8 | 
 9 | def RegWrite(addrbits = 5):
10 |     return Signature({
11 |         'reg': Out(addrbits),
12 |         'value': Out(32),
13 |     })
14 | 
class RegFile(Component):
    """A 32-bit wide register file with 32 registers per bank.

    Parameters
    ----------
    banks (integer): number of 32-register banks, 1 by default.

    Attributes
    ----------
    read_cmd (port): register select for a read; the low 5 bits choose
        x0..x31 and any bits above that choose the bank.
    read_resp (output): contents of the register selected by read_cmd.
    write_cmd (port): register select plus 32-bit value to write. Writes to
        x0 of any bank are discarded.
    """
    read_resp: Out(32)

    def __init__(self, *,
                 banks = 1):
        super().__init__()

        self.banks = banks

        # 5 bits for x0..x31, then bank bits
        select_bits = 5 + (banks - 1).bit_length()

        self.read_cmd = AlwaysReady(select_bits).flip().create()
        self.write_cmd = AlwaysReady(RegWrite(select_bits)).flip().create()

    def elaborate(self, platform):
        m = Module()

        nregs = 32 * self.banks
        # Recognizable debug fill pattern, currently unused because the init
        # below is commented out. It is laid out bank-major so that entry
        # (bank * 32 + n) matches the select layout chosen in __init__, and x0
        # of every bank is zero.
        contents = [0xDEAD_0000 | n | (b << 8)
                    for b in range(self.banks)
                    for n in range(32)]
        for b in range(self.banks):
            contents[32 * b] = 0

        m.submodules.mem = mem = Memory(
            width = 32,
            depth = nregs,
            name = "regfile",
            #init = contents,
        )

        # The 32-bit core can read a register at the same time that it's
        # writing it, so we have to make this transparent to bypass.
        rp = mem.read_port(transparent = True)
        wp = mem.write_port()

        m.d.comb += [
            rp.addr.eq(self.read_cmd.payload),
            rp.en.eq(self.read_cmd.valid),

            self.read_resp.eq(rp.data),

            wp.addr.eq(self.write_cmd.payload.reg),
            wp.data.eq(self.write_cmd.payload.value),
            # Block writes to x0 in all banks: only the low 5 bits name the
            # register, so the bank bits must not take part in the test.
            wp.en.eq((self.write_cmd.payload.reg[:5] != 0)
                     & self.write_cmd.valid),
        ]

        return m
62 | 


--------------------------------------------------------------------------------
/hapenny/chonk/sbox.py:
--------------------------------------------------------------------------------
 1 | # The S-Box, responsible for state sequencing of other boxes.
 2 | 
 3 | from amaranth import *
 4 | from amaranth.lib.wiring import *
 5 | from amaranth.lib.enum import *
 6 | from amaranth.lib.coding import Encoder, Decoder
 7 | 
 8 | from hapenny import StreamSig, AlwaysReady
 9 | from hapenny.bus import BusPort
10 | 
# Maximum number of (unique) states needed by any instruction, plus one
# additional for halt. (Note that repeated states when e.g. shifting do not
# count as unique states.) The halt state is the highest-numbered one, i.e.
# bit STATE_COUNT - 1 of the one-hot state vector.
STATE_COUNT = 3 + 1
15 | 
class SBox(Component):
    """Sequencer for the other boxes.

    Holds a one-hot state counter that steps through the unique states an
    instruction may need. Asserting from_the_top restarts the count, marking
    the end of one instruction and the start of the next. Both the counter
    and the output are one-hot; the highest bit is the halted state.

    Attributes
    ----------
    from_the_top (input): restarts the count for the next instruction.
    hold (input): request from the EW-box to stay in the current state. Only
        safe for use after state 3 to avoid weird side effects.
    halt_request (input): while high, the next restart goes to the halted
        state instead of state 0.
    not_a_bubble (input): indicates the CPU is doing useful work rather than
        just fetching. Gates entry to the halted state so that
        single-stepping always makes forward progress.
    onehot_state (output): one bit per possible state.
    halted (output): a handy synonym for the last onehot_state bit.
    """
    from_the_top: In(1)
    hold: In(1)
    halt_request: In(1)
    not_a_bubble: In(1)

    onehot_state: Out(STATE_COUNT)
    halted: Out(1)

    def __init__(self):
        super().__init__()

        self.onehot_state.reset = 1

    def elaborate(self, platform):
        m = Module()

        # The transition logic is spelled out bit by bit so that it is built
        # as a one-hot FSM; nothing here tells Amaranth that the bits are
        # mutually exclusive.

        final = STATE_COUNT - 1
        in_final = self.onehot_state[final]

        m.d.comb += self.halted.eq(in_final)

        # Leaving halt is detected without extra registers: we are in the
        # halted state and the request has gone away.
        end_of_halt = Signal(1)
        m.d.comb += end_of_halt.eq(in_final & ~self.halt_request)

        restart = self.from_the_top | end_of_halt
        enter_halt = self.halt_request & self.not_a_bubble

        for index in range(STATE_COUNT):
            bit = self.onehot_state[index]
            with m.If(restart):
                # A restart overrides normal advancement. Every bit clears
                # itself except the one being restarted into.
                with m.If(enter_halt):
                    m.d.sync += bit.eq(index == final)
                with m.Else():
                    m.d.sync += bit.eq(index == 0)
            with m.Elif(bit & ~self.hold):
                # Normal advance: hand the token to the next bit. The final
                # state is sticky -- it has no wraparound, and is only left
                # through the restart path above.
                if index != final:
                    m.d.sync += [
                        bit.eq(0),
                        self.onehot_state[index + 1].eq(1),
                    ]

        return m
93 | 


--------------------------------------------------------------------------------
/hapenny/chonk/serial32.py:
--------------------------------------------------------------------------------
  1 | from amaranth import *
  2 | from amaranth.lib.wiring import *
  3 | from amaranth.lib.enum import *
  4 | from amaranth.lib.coding import Encoder, Decoder
  5 | 
  6 | from hapenny import StreamSig, AlwaysReady, mux, oneof
  7 | from hapenny.bus import BusPort
  8 | 
  9 | class ReceiveCore(Component):
 10 |     rx: In(1)
 11 |     sample_clock: In(1)
 12 |     rdr: Out(8)
 13 |     empty: Out(1)
 14 |     read_strobe: In(1)
 15 | 
 16 |     def __init__(self, oversample = 16):
 17 |         super().__init__()
 18 | 
 19 |         self.oversample = oversample
 20 | 
 21 |     def elaborate(self, platform):
 22 |         m = Module()
 23 | 
 24 |         state = Signal(range(4))
 25 |         bits_left = Signal(range(8))
 26 |         timer = Signal(range(self.oversample))
 27 |         have_data = Signal(1)
 28 | 
 29 |         m.d.comb += [
 30 |             self.empty.eq(~have_data),
 31 |         ]
 32 | 
 33 |         m.d.sync += timer.eq(oneof([
 34 |             # Set to delay half a bit period from initial negative edge.
 35 |             (self.sample_clock & (state == 0), (self.oversample // 2) - 1),
 36 |             # Count down in all other states until we reach 0.
 37 |             (self.sample_clock & (state != 0) & (timer != 0), timer - 1),
 38 |             # Once we reach 0, reset to a full bit time.
 39 |             (self.sample_clock & (state != 0) & (timer == 0), self.oversample - 1),
 40 |         ], default = timer))
 41 | 
 42 |         m.d.sync += state.eq(oneof([
 43 |             # Leave state 0 if we see the falling edge.
 44 |             (self.sample_clock & (state == 0), ~self.rx),
 45 |             # If it's still low at the midpoint of the start bit, proceed.
 46 |             # Otherwise, treat it as a glitch and reset.
 47 |             (self.sample_clock & (state == 1) & (timer == 0), mux(~self.rx, 2, 0)),
 48 |             # Automatically advance when we've done all the bits in state 2.
 49 |             (self.sample_clock & (state == 2) & (timer == 0), mux(bits_left == 0, 3, 2)),
 50 |             # Automatically advance at the end of the stop bit.
 51 |             (self.sample_clock & (state == 3) & (timer == 0), 0),
 52 |         ], default = state))
 53 | 
 54 |         m.d.sync += bits_left.eq(oneof([
 55 |             # Configure for 7 bits after the first one.
 56 |             (self.sample_clock & (timer == 0), mux(state == 1, 7, bits_left - 1)),
 57 |         ], default = bits_left))
 58 | 
 59 |         m.d.sync += self.rdr.eq(oneof([
 60 |             (self.sample_clock & (state == 2) & (timer == 0), Cat(self.rdr[1:], self.rx)),
 61 |         ], default = self.rdr))
 62 | 
 63 |         m.d.sync += have_data.eq(oneof([
 64 |             # The way this is expressed, newly arriving data will override the
 65 |             # read strobe -- the two cases will OR if they occur
 66 |             # simultaneously, and the 0 loses.
 67 |             (self.sample_clock & (state == 3) & (timer == 0), self.rx),
 68 |             (self.read_strobe, 0),
 69 |         ], default = have_data))
 70 | 
 71 |         return m
 72 | 
 73 | 
 74 | class TransmitCore(Component):
 75 |     tx: Out(1)
 76 |     sample_clock: In(1)
 77 |     thr_write: In(AlwaysReady(8))
 78 |     busy: Out(1)
 79 | 
 80 |     def __init__(self, oversample = 16):
 81 |         super().__init__()
 82 | 
 83 |         self.oversample = oversample
 84 | 
 85 |     def elaborate(self, platform):
 86 |         m = Module()
 87 | 
 88 |         # We use this as a shift register containing: start bit, 8 data bits, 2
 89 |         # stop bits. Its LSB is our output state, so it's important that it
 90 |         # reset to 1; the other bits can reset to whatever value.
 91 |         thr = Signal(1 + 8, reset = 1)
 92 | 
 93 |         tx_bits_left = Signal(range(1 + 8 + 2))
 94 |         tx_timer = Signal(range(self.oversample))
 95 | 
 96 |         with m.If(self.sample_clock):
 97 |             with m.If(tx_bits_left != 0):
 98 |                 with m.If(tx_timer == 0):
 99 |                     m.d.sync += [
100 |                         thr.eq(Cat(thr[1:], 1)),
101 |                         tx_timer.eq(self.oversample - 1),
102 |                         tx_bits_left.eq(tx_bits_left - 1),
103 |                     ]
104 |                 with m.Else():
105 |                     m.d.sync += tx_timer.eq(tx_timer - 1)
106 | 
107 |         # Transmit output
108 |         m.d.comb += self.tx.eq(thr[0])
109 | 
110 |         # Control register interface.
111 |         m.d.comb += self.busy.eq(tx_bits_left != 0)
112 | 
113 |         with m.If(self.thr_write.valid):
114 |             m.d.sync += [
115 |                 # Load THR with the start bit.
116 |                 thr.eq(Cat(0, self.thr_write.payload)),
117 |                 tx_bits_left.eq(1 + 8 + 2),
118 |                 tx_timer.eq(self.oversample - 1),
119 |             ]
120 | 
121 |         return m
122 | 
123 | 
class OversampleClock(Component):
    """Divides the system clock down to the UART sampling rate.

    Parameters
    ----------
    baud_rate (integer): desired baud rate.
    oversample (integer): number of sample pulses per bit period.
    clock_freq (number): system clock frequency in Hz. If omitted, the
        platform's default_clk_frequency is used instead.

    Attributes
    ----------
    out (output): high for one system clock cycle per sample period.
    """
    out: Out(1)

    def __init__(self, baud_rate = 19200, oversample = 16, clock_freq = None):
        super().__init__()
        self.baud_rate = baud_rate
        self.oversample = oversample
        self.clock_freq = clock_freq

    def elaborate(self, platform):
        m = Module()

        # We divide the system clock to our baud rate * oversample and use that
        # clock for sampling. This is a compromise between low cost transmit
        # (where we could divide the clock all the way down to the baud rate
        # without issue) and accurate receive (where higher sampling rates are
        # better but cost more flops).
        clock_freq = self.clock_freq or platform.default_clk_frequency
        our_freq = self.baud_rate * self.oversample
        divisor = int(round(clock_freq / our_freq))
        if divisor < 1:
            # The input clock is too slow to be divided down at all; report
            # that directly instead of dividing by zero below.
            raise ValueError(
                f"input clock {clock_freq} is too slow for baud rate "
                f"{self.baud_rate} with {self.oversample}x oversampling"
            )
        print(f"UART configured for {self.baud_rate} from input clock {clock_freq}, divisor = {divisor}")
        actual_freq = clock_freq / self.oversample / divisor
        freq_error = abs(actual_freq - self.baud_rate) / self.baud_rate
        print(f"Actual baud rate will be: {actual_freq} (error: {freq_error * 100:.3}%)")
        assert freq_error< 0.01, "Error: cannot achieve requested UART frequency"

        sample_counter = Signal(range(divisor))
        # Generate a pulse on every sample period for one (fast) clock cycle.
        m.d.comb += self.out.eq(sample_counter == 0)

        # Reload on the pulse, otherwise count down toward the next one.
        m.d.sync += sample_counter.eq(mux(self.out, divisor - 1, sample_counter - 1))

        return m
158 | 
class TransmitOnlyUart(Component):
    """The world's crappiest UART!

    The low byte of any write goes into the transmit holding register and will
    be sent out promptly.

    Reads return a status register where bit 0 indicates BUSY.

    Parameters
    ----------
    baud_rate (integer): desired baud rate.
    oversample (integer): number of sample pulses per bit period.
    clock_freq (number): system clock frequency, passed on to the clock
        divider.

    Attributes
    ----------
    bus (port): connection to the SoC bus.
    tx (output): serial line.
    """
    bus: In(BusPort(addr = 0, data = 32))
    tx: Out(1)

    def __init__(self, baud_rate = 19200, oversample = 16, clock_freq = None):
        super().__init__()

        self.baud_rate = baud_rate
        self.oversample = oversample
        self.clock_freq = clock_freq

    def elaborate(self, platform):
        m = Module()
        m.submodules.clkdiv = clkdiv = OversampleClock(
            baud_rate = self.baud_rate,
            oversample = self.oversample,
            clock_freq = self.clock_freq,
        )

        m.submodules.txr = txr = TransmitCore(oversample = self.oversample)
        m.d.comb += [
            txr.sample_clock.eq(clkdiv.out),
            self.tx.eq(txr.tx),
            self.bus.resp.eq(txr.busy),

            # The write data lives in the command payload of the bus port.
            txr.thr_write.payload.eq(self.bus.cmd.payload.data[:8]),
            # Any command that writes the low byte lane starts a frame.
            txr.thr_write.valid.eq(
                self.bus.cmd.valid & self.bus.cmd.payload.lanes[0]
            ),
        ]

        return m
198 | 
class ReceiveOnlyUart(Component):
    """The world's other crappiest UART!

    This can receive a single frame and hold it in registers.

    On any read, this will return the frame in the low 8 bits, plus the top
    bit of the response set if the receiver is empty, i.e. there is no new
    data. Putting the flag in the MSB lets software test it with a
    sign-testing branch such as bltz.

    And, read sensitive, why not.

    Parameters
    ----------
    baud_rate (integer): desired baud rate.
    oversample (integer): number of sample pulses per bit period.
    clock_freq (number): system clock frequency, passed on to the clock
        divider.

    Attributes
    ----------
    bus (port): connection to the SoC bus.
    rx (input): serial line.
    """
    bus: In(BusPort(addr = 0, data = 32))
    rx: In(1)

    def __init__(self, baud_rate = 19200, oversample = 16, clock_freq = None):
        super().__init__()

        self.baud_rate = baud_rate
        # elaborate() needs this for both the clock divider and the receiver.
        self.oversample = oversample
        self.clock_freq = clock_freq

    def elaborate(self, platform):
        m = Module()

        m.submodules.clkdiv = clkdiv = OversampleClock(
            baud_rate = self.baud_rate,
            oversample = self.oversample,
            clock_freq = self.clock_freq,
        )

        m.submodules.rxr = rxr = ReceiveCore(oversample = self.oversample)
        m.d.comb += [
            rxr.rx.eq(self.rx),
            rxr.sample_clock.eq(clkdiv.out),
            # A bus command with no write lanes set is a read; reading
            # consumes the data.
            rxr.read_strobe.eq(self.bus.cmd.valid & ~self.bus.cmd.payload.lanes.any()),
        ]

        # The response is registered, so it reflects the receiver state from
        # before the read strobe takes effect.
        m.d.sync += [
            self.bus.resp[:8].eq(rxr.rdr),
            self.bus.resp[-1].eq(rxr.empty),
        ]

        return m
241 | 
class BidiUart(Component):
    """A slightly less crappy UART.

    This combines the transmit and receive logic using a shared clock divider,
    to save some space if you need both directions.

    Register Layout
    ---------------
    0x0000   RDR - data in low 8 bits, empty flag in the top bit of the
                   response (bit 31 on this 32-bit port), read-sensitive
    0x0004   THR - reads as 0 if TX is idle, writes send low 8 bits

    Parameters
    ----------
    baud_rate (integer): desired baud rate.
    oversample (integer): number of sample pulses per bit period.
    clock_freq (number): system clock frequency, passed on to the clock
        divider.

    Attributes
    ----------
    bus (port): connection to the SoC bus.
    tx (output): serial line out.
    rx (input): serial line in.
    """
    bus: In(BusPort(addr = 1, data = 32))
    # tx is driven by this component, so it is an output.
    tx: Out(1)
    rx: In(1)

    def __init__(self, baud_rate = 19200, oversample = 16, clock_freq = None):
        super().__init__()

        self.baud_rate = baud_rate
        self.oversample = oversample
        self.clock_freq = clock_freq

    def elaborate(self, platform):
        m = Module()

        # Clock divider for sampling
        m.submodules.clkdiv = clkdiv = OversampleClock(
            baud_rate = self.baud_rate,
            oversample = self.oversample,
            clock_freq = self.clock_freq,
        )

        # Receive state machine.
        m.submodules.rxr = rxr = ReceiveCore(oversample = self.oversample)
        m.d.comb += [
            rxr.rx.eq(self.rx),
            rxr.sample_clock.eq(clkdiv.out),
        ]

        # Transmit machine.

        m.submodules.txr = txr = TransmitCore(oversample = self.oversample)
        m.d.comb += [
            txr.sample_clock.eq(clkdiv.out),
            self.tx.eq(txr.tx),
        ]

        # Bus read port. We register this so that state doesn't change by the
        # time the output is read. This is particularly a problem for the
        # read-sensitive RDR register.
        m.d.sync += [
            # Address bit 0 selects THR (busy status) over RDR (data).
            self.bus.resp[:8].eq(mux(
                self.bus.cmd.payload.addr[0],
                txr.busy,
                rxr.rdr,
            )),
            # The empty flag only appears when RDR is addressed.
            self.bus.resp[-1].eq(
                ~self.bus.cmd.payload.addr[0] & rxr.empty
            ),
        ]

        # Read-sense logic for receive side.
        m.d.comb += rxr.read_strobe.eq(
            self.bus.cmd.valid
            & ~self.bus.cmd.payload.lanes.any()
            & ~self.bus.cmd.payload.addr[0]
        )

        # Write logic for TX side.
        m.d.comb += txr.thr_write.payload.eq(self.bus.cmd.payload.data[:8])

        m.d.comb += txr.thr_write.valid.eq(
            self.bus.cmd.valid
            & self.bus.cmd.payload.lanes[0]
            & self.bus.cmd.payload.addr[0]
        )

        return m
320 | 
321 | 


--------------------------------------------------------------------------------
/hapenny/decoder.py:
--------------------------------------------------------------------------------
  1 | # Combinational decode logic.
  2 | 
  3 | from amaranth import *
  4 | from amaranth.lib.wiring import *
  5 | from amaranth.lib.enum import *
  6 | from amaranth.lib.data import *
  7 | import amaranth.lib.coding
  8 | 
class Opcode(Enum):
    """Major opcode values, as compared against instruction bits 2 through 6.

    The two lowest instruction bits are not part of these values.
    """
    LUI = 0b01101
    AUIPC = 0b00101
    JAL = 0b11011
    JALR = 0b11001
    # Conditional branches (decoded as is_b).
    Bxx = 0b11000
    # Loads (decoded as is_load).
    Lxx = 0b00000
    # Stores (decoded as is_store).
    Sxx = 0b01000
    # ALU operations with an immediate operand (decoded as is_alu_ri).
    ALUIMM = 0b00100
    # ALU operations with two register operands (decoded as is_alu_rr).
    ALUREG = 0b01100
    SYSTEM = 0b11100
    CUSTOM0 = 0b00001
 21 | 
class DecodeSignals(Struct):
    """Bundle of control signals the Decoder derives from one instruction."""
    # The instruction word itself.
    inst: unsigned(32)

    # Fields sliced directly out of the instruction word.
    opcode: unsigned(5)
    funct3: unsigned(3)
    rs1: unsigned(5)
    rs2: unsigned(5)
    rd: unsigned(5)

    # Opcode class flags. is_alu is the OR of is_alu_rr and is_alu_ri; the
    # others each correspond to a single Opcode value.
    is_auipc: unsigned(1)
    is_lui: unsigned(1)
    is_jal: unsigned(1)
    is_jalr: unsigned(1)
    is_b: unsigned(1)
    is_load: unsigned(1)
    is_store: unsigned(1)
    is_alu: unsigned(1)
    is_alu_rr: unsigned(1)
    is_alu_ri: unsigned(1)
    is_system: unsigned(1)
    is_custom0: unsigned(1)

    # derived signals to make it easier to move the functions before a register.
    is_auipc_or_lui: unsigned(1)
    is_auipc_or_jal: unsigned(1)
    is_auipc_or_lui_or_jal: unsigned(1)
    is_jal_or_jalr: unsigned(1)
    is_load_or_jalr: unsigned(1)
    is_csr: unsigned(1)
    writes_rd_normally: unsigned(1)
    is_imm_i: unsigned(1)
    is_neg_imm_i: unsigned(1)
    is_any_imm_i: unsigned(1)
    is_neg_reg_to_adder: unsigned(1)
    is_reg_to_adder: unsigned(1)
    is_any_reg_to_adder: unsigned(1)
    is_shift: unsigned(1)
    is_slt: unsigned(1)
    is_sw: unsigned(1)
    is_adder_rhs_complemented: unsigned(1)
    writes_adder_to_reg: unsigned(1)

    # one-hot decode of funct3
    funct3_is: unsigned(8)
 66 | 
 67 | class Decoder(Component):
 68 |     """The Decoder is a circuit that breaks an instruction into the various
 69 |     control signals. It's used by the larger components.
 70 | 
 71 |     Attributes
 72 |     ----------
 73 |     inst (input): instruction word.
 74 |     out (output): group of decode signals, see DecodeSignals struct.
 75 |     """
 76 |     inst: In(32)
 77 | 
 78 |     out: Out(DecodeSignals)
 79 | 
 80 |     def __init__(self, 
 81 |                  ):
 82 |         super().__init__()
 83 | 
 84 |     def elaborate(self, platform):
 85 |         m = Module()
 86 | 
 87 |         m.submodules.funct3_decode = f3d = amaranth.lib.coding.Decoder(8)
 88 | 
 89 |         m.d.comb += f3d.i.eq(self.inst[12:15])
 90 | 
 91 |         opcode = Signal(5)
 92 |         m.d.comb += opcode.eq(self.inst[2:7])
 93 | 
 94 |         m.d.comb += [
 95 |             self.out.inst.eq(self.inst),
 96 |             self.out.opcode.eq(opcode),
 97 |             self.out.funct3.eq(self.inst[12:15]),
 98 |             self.out.rs1.eq(self.inst[15:20]),
 99 |             self.out.rs2.eq(self.inst[20:25]),
100 |             self.out.rd.eq(self.inst[7:12]),
101 |             self.out.is_auipc.eq(opcode == Opcode.AUIPC),
102 |             self.out.is_lui.eq(opcode == Opcode.LUI),
103 |             self.out.is_jal.eq(opcode == Opcode.JAL),
104 |             self.out.is_jalr.eq(opcode == Opcode.JALR),
105 |             self.out.is_b.eq(opcode == Opcode.Bxx),
106 |             self.out.is_load.eq(opcode == Opcode.Lxx),
107 |             self.out.is_store.eq(opcode == Opcode.Sxx),
108 |             self.out.is_alu_rr.eq(opcode == Opcode.ALUREG),
109 |             self.out.is_alu_ri.eq(opcode == Opcode.ALUIMM),
110 |             self.out.is_system.eq(opcode == Opcode.SYSTEM),
111 |             self.out.is_custom0.eq(opcode == Opcode.CUSTOM0),
112 |             self.out.funct3_is.eq(f3d.o),
113 |         ]
114 | 
115 |         # derived signals
116 |         m.d.comb += [
117 |             self.out.is_alu.eq(
118 |                 self.out.is_alu_rr | self.out.is_alu_ri
119 |             ),
120 |             self.out.is_auipc_or_lui.eq(
121 |                 self.out.is_auipc | self.out.is_lui
122 |             ),
123 |             self.out.is_auipc_or_jal.eq(
124 |                 self.out.is_auipc | self.out.is_jal
125 |             ),
126 |             self.out.is_auipc_or_lui_or_jal.eq(
127 |                 self.out.is_auipc | self.out.is_lui | self.out.is_jal
128 |             ),
129 |             self.out.is_jal_or_jalr.eq(
130 |                 self.out.is_jal | self.out.is_jalr
131 |             ),
132 |             self.out.is_load_or_jalr.eq(
133 |                 self.out.is_load | self.out.is_jalr
134 |             ),
135 |             self.out.is_csr.eq((opcode == Opcode.SYSTEM) & ~self.out.funct3_is[0b000]),
136 | 
137 |             self.out.writes_rd_normally.eq(
138 |                 self.out.is_jal
139 |                 | self.out.is_jalr
140 |                 | self.out.is_lui
141 |                 | self.out.is_auipc
142 |                 | self.out.is_alu
143 |             ),
144 |             self.out.is_imm_i.eq(
145 |                 (opcode == Opcode.Lxx) | (opcode == Opcode.JALR)
146 |                 | ((opcode == Opcode.ALUIMM) & ~(self.out.funct3_is[0b010] |
147 |                                                  self.out.funct3_is[0b011]))
148 |                 | (opcode == Opcode.CUSTOM0)
149 |             ),
150 |             self.out.is_neg_imm_i.eq(
151 |                 (opcode == Opcode.ALUIMM) & (self.out.funct3_is[0b010] |
152 |                                              self.out.funct3_is[0b011])
153 |             ),
154 |             self.out.is_any_imm_i.eq(
155 |                 (opcode == Opcode.Lxx) | (opcode == Opcode.JALR)
156 |                 | (opcode == Opcode.ALUIMM)
157 |                 | (opcode == Opcode.CUSTOM0)
158 |             ),
159 |             self.out.is_neg_reg_to_adder.eq(
160 |                 (opcode == Opcode.Bxx)
161 |                 | ((opcode == Opcode.ALUREG) & (self.out.funct3_is[0b010] |
162 |                                                 self.out.funct3_is[0b011]))
163 |             ),
164 |             self.out.is_reg_to_adder.eq(
165 |                 ((opcode == Opcode.ALUREG) & ~(self.out.funct3_is[0b010] |
166 |                                                self.out.funct3_is[0b011]))
167 |             ),
168 |             self.out.is_any_reg_to_adder.eq(
169 |                 (opcode == Opcode.Bxx)
170 |                 | (opcode == Opcode.ALUREG)
171 |             ),
172 |             self.out.is_shift.eq(
173 |                 self.out.is_alu & (self.out.funct3_is[0b001] |
174 |                                    self.out.funct3_is[0b101])
175 |             ),
176 |             self.out.is_slt.eq(
177 |                 self.out.is_alu & (self.out.funct3_is[0b010] |
178 |                                    self.out.funct3_is[0b011])
179 |             ),
180 |             self.out.is_sw.eq(
181 |                 self.out.is_store & self.out.funct3_is[0b010]
182 |             ),
183 |             self.out.is_adder_rhs_complemented.eq(
184 |                 self.out.is_neg_reg_to_adder
185 |                 | self.out.is_neg_imm_i
186 |                 | (self.out.is_reg_to_adder & self.out.inst[30])
187 |             ),
188 |             self.out.writes_adder_to_reg.eq(
189 |                 self.out.is_auipc_or_lui | (self.out.is_alu &
190 |                                             self.out.funct3_is[0b000])
191 |             ),
192 |         ]
193 | 
194 |         return m
195 | 
class ImmediateDecoder(Component):
    """Unpacks every immediate format from an instruction word at once.

    Each output is a full 32-bit value; the formats that do not fill all 32
    bits from the instruction are extended with copies of instruction bit 31.
    It's used by the larger components.

    Attributes
    ----------
    inst (input): instruction word.
    i (output): I-format immediate.
    s (output): S-format immediate.
    b (output): B-format immediate.
    u (output): U-format immediate.
    j (output): J-format immediate.
    """
    inst: In(32)

    i: Out(32)
    s: Out(32)
    b: Out(32)
    u: Out(32)
    j: Out(32)

    def elaborate(self, platform):
        m = Module()

        word = self.inst
        sign = word[31]

        # Pieces are listed from least significant to most significant.
        imm_i = Cat(word[20:31], sign.replicate(21))
        imm_s = Cat(word[7:12], word[25:31], sign.replicate(21))
        # The B and J formats always have a zero in bit 0.
        imm_b = Cat(0, word[8:12], word[25:31], word[7], sign.replicate(20))
        imm_j = Cat(0, word[21:31], word[20], word[12:20],
                    sign.replicate(12))
        # The U format is the top 20 bits left in place.
        imm_u = word & 0xFFFFF000

        m.d.comb += self.i.eq(imm_i)
        m.d.comb += self.s.eq(imm_s)
        m.d.comb += self.b.eq(imm_b)
        m.d.comb += self.u.eq(imm_u)
        m.d.comb += self.j.eq(imm_j)

        return m
232 | 
233 | 


--------------------------------------------------------------------------------
/hapenny/extsram.py:
--------------------------------------------------------------------------------
  1 | from amaranth import *
  2 | from amaranth.lib.wiring import *
  3 | from amaranth.lib.enum import *
  4 | from amaranth.lib.coding import Encoder, Decoder
  5 | 
  6 | from hapenny import StreamSig, AlwaysReady
  7 | from hapenny.bus import BusPort
  8 | 
  9 | class ExternalSRAM(Component):
 10 |     """An interface to 16-bit-wide external asynchronous SRAM.
 11 | 
 12 |     Parameters
 13 |     ----------
 14 |     address_bits (integer): number of implemented address bits at the physical
 15 |         interface -- so, not including address bit 0 since the memory is 16
 16 |         bits wide.
 17 | 
 18 |     Attributes
 19 |     ----------
 20 |     sram_oe (out): output enable to SRAM, active high. Enables the SRAM's
 21 |         output drivers during a read cycle.
 22 |     sram_we (out): write enable to SRAM, active high.
 23 |     sram_lanes (out): byte select lines to SRAM, a 1 during a write means the
 24 |         corresponding byte is written, a 0 leaves it untouched.
 25 |     addr_to_sram (out): address, width determined by the 'address_bits'
 26 |         parameter.
 27 |     data_to_sram (out): 16-bit data path to SRAM. Unidirectional because FPGAs
 28 |         like that, becomes bidirectional at the I/O pin.
 29 |     data_from_sram (in): 16-bit data path from SRAM.
 30 | 
 31 |     bus (port): connection to the SoC bus.
 32 |     """
 33 |     clock_90: In(1)
 34 | 
 35 |     sram_oe: Out(1)
 36 |     sram_we: Out(1)
 37 |     sram_lanes: Out(2)
 38 |     data_to_sram: In(16)
 39 |     data_from_sram: Out(16)
 40 | 
 41 |     def __init__(self, *, address_bits):
 42 |         super().__init__()
 43 |         self.bus = BusPort(addr = address_bits, data = 16).flip().create()
 44 | 
 45 |         self.addr_to_sram = Signal(address_bits)
 46 | 
 47 |         self.address_bits = address_bits
 48 | 
 49 |     def elaborate(self, platform):
 50 |         m = Module()
 51 | 
 52 |         # Register the bus inputs when we're selected, so that we can maintain
 53 |         # stable outputs. Note that we register "lanes" separately from the
 54 |         # "write" signal because the SRAM requires lanes to be asserted to
 55 |         # read!
 56 |         r_addr = Signal(self.address_bits)
 57 |         r_data_to_sram = Signal(16)
 58 |         r_lanes = Signal(2)
 59 |         r_write = Signal()
 60 |         r_read = Signal()
 61 | 
 62 |         # Automatically clear any write request on the cycle after it occurs, so
 63 |         # that we don't sit there repeatedly writing from this interface while
 64 |         # the CPU is off doing other things.
 65 |         with m.If(r_write):
 66 |             m.d.sync += [
 67 |                 r_write.eq(0),
 68 |                 r_read.eq(0),
 69 |             ]
 70 |        
 71 |         # Copy any bus transaction into the registers. This will override the
 72 |         # clearing above.
 73 |         with m.If(self.bus.cmd.valid):
 74 |             m.d.sync += [
 75 |                 r_addr.eq(self.bus.cmd.payload.addr),
 76 |                 r_data_to_sram.eq(self.bus.cmd.payload.data),
 77 |                 r_write.eq(self.bus.cmd.payload.lanes.any()),
 78 |                 r_read.eq(~self.bus.cmd.payload.lanes.any()),
 79 |                 # Our bus doesn't use lane signals on read. The external bus
 80 |                 # does. Convert.
 81 |                 r_lanes.eq(self.bus.cmd.payload.lanes
 82 |                     | (~self.bus.cmd.payload.lanes.any()).replicate(2)),
 83 |             ]
 84 | 
 85 |         # Present transactions from our registers on the bus output.
 86 |         m.d.comb += [
 87 |             self.addr_to_sram.eq(r_addr),
 88 |             self.data_to_sram.eq(r_data_to_sram),
 89 |             self.sram_lanes.eq(r_lanes),
 90 | 
 91 |             # Assert the (active high) output enable line whenever we're not
 92 |             # writing. Conversely, deassert it on write cycles. The phase
 93 |             # offset of our write-enable output gives the drivers time to turn
 94 |             # off.
 95 |             self.sram_oe.eq(r_read),
 96 |             # Combine our write enable (active high) with the incoming
 97 |             # phase-shifted clock to generate a write pulse in the center of
 98 |             # each write cycle.
 99 |             self.sram_we.eq(r_write & self.clock_90),
100 |         ]
101 |         # Responses come back combinationally, in what is likely to be the slow
102 |         # path.
103 |         m.d.comb += self.bus.resp.eq(self.data_from_sram)
104 | 
105 |         return m
106 | 


--------------------------------------------------------------------------------
/hapenny/fdbox.py:
--------------------------------------------------------------------------------
  1 | # The FD-Box, responsible for fetch and decode during execution.
  2 | 
  3 | from amaranth import *
  4 | from amaranth.lib.wiring import *
  5 | from amaranth.lib.enum import *
  6 | from amaranth.lib.coding import Encoder, Decoder
  7 | 
  8 | from hapenny import StreamSig, AlwaysReady, onehot_choice, mux, oneof
  9 | from hapenny.sbox import STATE_COUNT
 10 | from hapenny.bus import BusPort
 11 | 
 12 | class FDBox(Component):
 13 |     """The FD-Box fetches and decodes instructions.
 14 | 
 15 |     Based on a PC (provided by the EW-box) the FD-box generates bus
 16 |     transactions to collect both halfwords of an instruction, and then provides
 17 |     it on an output signal to the EW-box.
 18 | 
 19 |     Parameters
 20 |     ----------
 21 |     prog_addr_width (integer): number of bits in a program address, 32 by default
 22 |         but can be shrunk to save logic.
 23 | 
 24 |     Attributes
 25 |     ----------
 26 |     onehot_state (input): state input from the S-Box
 27 |     pc (input): program counter from EW-box; includes a 'valid' signal that
 28 |         determines whether a fetch happens, to avoid wild fetches of nonsense
 29 |         addresses.
 30 |     rf_cmd (output): read command to the register file, intended to be OR'd.
 31 |     inst_next (output): instruction word for EW to use next time we restart
 32 |         from the top.
 33 |     bus (port): our connection to the memory fabric.
 34 |     from_the_top (input): signal from EW indicating that this is the final
 35 |         cycle of the instruction. We use this to gate register reads.
 36 |     """
 37 |     onehot_state: In(STATE_COUNT)
 38 |     rf_cmd: Out(AlwaysReady(6))
 39 |     inst_next: Out(32)
 40 |     from_the_top: In(1)
 41 | 
 42 |     def __init__(self, *,
 43 |                  prog_addr_width = 32,
 44 |                  ):
 45 |         super().__init__()
 46 | 
 47 |         # Create a bus port of sufficient width to fetch instructions only.
 48 |         # (Width is -1 because we're addressing halfwords.)
 49 |         self.bus = BusPort(addr = prog_addr_width - 1, data = 16).create()
 50 | 
 51 |         # The PC width is -2 because it's addressing words.
 52 |         self.pc = AlwaysReady(prog_addr_width - 2).flip().create()
 53 | 
 54 |         self.inst = Signal(32)
 55 | 
 56 |     def elaborate(self, platform):
 57 |         m = Module()
 58 | 
 59 |         # State 0: we don't really do anything.
 60 |         # State 1: we start the low half fetch.
 61 |         # State 2: we receive the low half of the instruction word and issue
 62 |         # the high half fetch.
 63 |         # State 3: we receive the high half fetch and begin a register read.
 64 |         # State 4+: we don't do anything.
 65 | 
 66 |         m.d.comb += [
 67 |             # We issue bus transactions in states 1 and 2 only.
 68 |             self.bus.cmd.valid.eq(
 69 |                 self.pc.valid & (self.onehot_state[1] | self.onehot_state[2])
 70 |             ),
 71 |             # In those states we select the bottom and top halves of the
 72 |             # instruction, respectively.
 73 |             self.bus.cmd.payload.addr.eq(onehot_choice(self.onehot_state, {
 74 |                 1: Cat(0, self.pc.payload),
 75 |                 2: Cat(1, self.pc.payload),
 76 |             })),
 77 | 
 78 |             # We access the register file only in the last cycle.
 79 |             self.rf_cmd.valid.eq(self.from_the_top),
 80 |             # If the last cycle is state 3, our fetch is still completing, so
 81 |             # we need to forward the bus response to the register file. If it
 82 |             # isn't state 3, we can serve out of our inst register.
 83 |             # (It's important to send zeros in other states instead of
 84 |             # hardwiring this so that we can OR.)
 85 |             self.rf_cmd.payload.eq(oneof([
 86 |                 (self.from_the_top & self.onehot_state[3],
 87 |                  Cat(self.inst[15], self.bus.resp[0:4], 0)),
 88 |                 (self.from_the_top & ~self.onehot_state[3],
 89 |                  Cat(self.inst[15:20], 0)),
 90 |             ])),
 91 | 
 92 |             # Forward the instruction through so it's valid in states 3+. In
 93 |             # state 3 specifically, forward the top half from the bus. In other
 94 |             # states, serve up the contents of our registers. EW's not supposed
 95 |             # to look at this in states 0-2.
 96 |             self.inst_next[:16].eq(self.inst[:16]),
 97 |             self.inst_next[16:].eq(mux(
 98 |                 self.onehot_state[3],
 99 |                 self.bus.resp,
100 |                 self.inst[16:],
101 |             )),
102 |         ]
103 | 
104 |         m.d.sync += [
105 |             # Latch the bottom half of the instruction at the end of state 2.
106 |             self.inst[:16].eq(mux(
107 |                 self.onehot_state[2],
108 |                 self.bus.resp,
109 |                 self.inst[:16],
110 |             )),
111 |             # Latch the top half at the end of state 3.
112 |             self.inst[16:].eq(mux(
113 |                 self.onehot_state[3],
114 |                 self.bus.resp,
115 |                 self.inst[16:],
116 |             )),
117 |         ]
118 | 
119 |         return m
120 | 


--------------------------------------------------------------------------------
/hapenny/gpio.py:
--------------------------------------------------------------------------------
  1 | from amaranth import *
  2 | from amaranth.lib.wiring import *
  3 | from amaranth.lib.enum import *
  4 | from amaranth.lib.coding import Encoder, Decoder
  5 | 
  6 | from hapenny import StreamSig, AlwaysReady, mux, oneof
  7 | from hapenny.bus import BusPort
  8 | 
  9 | class MinimalOutputPort(Component):
 10 |     """An absolutely dead-simple output port. Pipes any data written through to
 11 |     pins. Does not support reading back the state of the pins, or any fancy
 12 |     manipulation.
 13 | 
 14 |     Use this when space is at a premium; otherwise, see OutputPort. Also,
 15 |     measure the actual area requirement of the port -- in many cases, OutputPort
 16 |     is the same cost for more functionality.
 17 | 
 18 |     Memory Map
 19 |     ----------
 20 |     +00: pins (byte write supported)
 21 | 
 22 |     Parameters
 23 |     ----------
 24 |     pins (integer): number of pins to implement, 0-16.
 25 | 
 26 |     Attributes
 27 |     ----------
 28 |     bus (port): connection to the fabric.
 29 |     pins (signal array): output to pins.
 30 |     """
 31 |     bus: In(BusPort(addr = 0, data = 16))
 32 | 
 33 |     def __init__(self, pins):
 34 |         super().__init__()
 35 |         self.pins = Signal(pins)
 36 | 
 37 |     def elaborate(self, platform):
 38 |         m = Module()
 39 | 
 40 |         m.d.sync += self.pins[:8].eq(mux(
 41 |             self.bus.cmd.valid & self.bus.cmd.payload.lanes[0],
 42 |             self.bus.cmd.payload.data[:8],
 43 |             self.pins[:8],
 44 |         ))
 45 |         m.d.sync += self.pins[8:].eq(mux(
 46 |             self.bus.cmd.valid & self.bus.cmd.payload.lanes[1],
 47 |             self.bus.cmd.payload.data[8:],
 48 |             self.pins[8:],
 49 |         ))
 50 | 
 51 |         return m
 52 | 
 53 | class OutputPort(Component):
 54 |     """A block of general-purpose outputs that can be changed simultaneously.
 55 | 
 56 |     Memory Map
 57 |     ----------
 58 |     +0  sets pins when written
 59 |     +2  ORs value with current pin state
 60 |     +4  ANDs the complement of the value written with the current pin state.
 61 |     +6  XORs value with the current pin state
 62 | 
 63 |     All registers support byte writes to affect only half the pins.
 64 | 
 65 |     Parameters
 66 |     ----------
 67 |     pins (integer): number of pins to implement (1-16)
 68 |     read_back (boolean): when True (default), the state of the pins can be read
 69 |         back. When False, reads always return zero. Turning off read-back can
 70 |         save some space.
 71 | 
 72 |     Attributes
 73 |     ----------
 74 |     bus (port): connection to bus fabric
 75 |     pins (signal array): the output pins
 76 |     """
 77 |     bus: In(BusPort(addr = 2, data = 16))
 78 | 
 79 |     def __init__(self, pins, read_back = True):
 80 |         super().__init__()
 81 |         self.pins = Signal(pins)
 82 |         self.read_back = read_back
 83 | 
 84 |     def elaborate(self, platform):
 85 |         m = Module()
 86 | 
 87 |         a = self.bus.cmd.payload.addr
 88 |         d = self.bus.cmd.payload.data
 89 | 
 90 |         m.d.sync += self.pins[:8].eq(mux(
 91 |             self.bus.cmd.valid & self.bus.cmd.payload.lanes[0],
 92 |             oneof([
 93 |                 (a == 1, self.pins[:8] | d[:8]),
 94 |                 (a == 2, self.pins[:8] & ~d[:8]),
 95 |                 (a == 3, self.pins[:8] ^ d[:8]),
 96 |              ], default = d[:8]),
 97 |             self.pins,
 98 |         ))
 99 |         m.d.sync += self.pins[8:].eq(mux(
100 |             self.bus.cmd.valid & self.bus.cmd.payload.lanes[1],
101 |             oneof([
102 |                 (a == 1, self.pins[8:] | d[8:]),
103 |                 (a == 2, self.pins[8:] & ~d[8:]),
104 |                 (a == 3, self.pins[8:] ^ d[8:]),
105 |             ], default = d[8:]),
106 |             self.pins,
107 |         ))
108 | 
109 |         if self.read_back:
110 |             # We can service reads trivially by just permanently connecting our
111 |             # register to the bus.
112 |             m.d.comb += self.bus.resp.eq(self.pins)
113 | 
114 |         return m
115 | 
class InputPort(Component):
    """A read-only input port: reads return the registered state of the pins.

    Memory Map
    ----------
    +00: pins (read only, writes ignored)

    Parameters
    ----------
    pins (integer): number of pins to implement, 0-16.

    Attributes
    ----------
    bus (port): connection to the fabric.
    pins (signal array): input from pins.
    """
    bus: In(BusPort(addr = 0, data = 16))

    def __init__(self, pins):
        super().__init__()
        self.pins = Signal(pins)

    def elaborate(self, platform):
        m = Module()

        # One register stage between the pins and the bus. This cuts the
        # path from the pins and is intended to keep metastability from
        # leaking into the fabric.
        pins_r = Signal(len(self.pins))
        m.d.sync += pins_r.eq(self.pins)

        # The response is permanently driven from that register, i.e. from
        # the pin state one cycle ago. A read therefore returns the pins as
        # they were when the read was issued, not when it completed. The
        # command side of the bus is never examined, so writes have no
        # effect.
        m.d.comb += self.bus.resp.eq(pins_r)

        return m
154 | 


--------------------------------------------------------------------------------
/hapenny/mem.py:
--------------------------------------------------------------------------------
  1 | # Reusable memory with our bus interface.
  2 | 
  3 | from amaranth import *
  4 | from amaranth.lib.wiring import *
  5 | from amaranth.lib.enum import *
  6 | from amaranth.lib.coding import Encoder, Decoder
  7 | 
  8 | from hapenny import StreamSig, AlwaysReady, mux
  9 | from hapenny.bus import BusPort
 10 | 
 11 | class BasicMemory(Elaboratable):
 12 |     """A dead-simple 16-bit-wide memory with the Hapenny bus interface.
 13 | 
 14 |     This uses an Amaranth generic memory internally, which relies on inference
 15 |     in the synthesis tools to map to a specific type of resource such as block
 16 |     RAM. In practice it won't map to uninitialized RAM (like the iCE40UP5K's
 17 |     SPRAM) because Amaranth insists on generating it with an initializer; for
 18 |     that you'll need another module.
 19 | 
 20 |     Parameters
 21 |     ----------
 22 |     depth (integer): number of 16-bit halfwords in the memory. If omitted,
 23 |         contents must be provided, and depth is inferred from len(contents).
 24 |     contents (list of integer): initialization contents of the memory. If
 25 |         omitted, depth must be provided, and the RAM is implicitly zeroed.
 26 |     read_only (boolean): if overridden to True, the memory will not respond to
 27 |         write strobes. This is useful for using an initialized block RAM as a
 28 |         program ROM.
 29 | 
 30 |     Attributes
 31 |     ----------
 32 |     bus: a BusPort with the minimum number of addr bits required to address
 33 |         'depth' words, and a 16-bit data path.
 34 |     """
 35 | 
 36 |     def __init__(self, *,
 37 |                  depth = None,
 38 |                  contents = [],
 39 |                  read_only = False):
 40 |         super().__init__()
 41 | 
 42 |         if depth is None:
 43 |             assert len(contents) > 0, "either depth or contents must be provided"
 44 |             depth = len(contents)
 45 | 
 46 |         addr_bits = (depth - 1).bit_length()
 47 | 
 48 |         self.bus = BusPort(addr = addr_bits, data = 16).flip().create()
 49 | 
 50 |         self.m = Memory(
 51 |             width = 16,
 52 |             depth = depth,
 53 |             name = "basicram",
 54 |             init = contents,
 55 |         )
 56 | 
 57 |         self.read_only = False
 58 | 
 59 |     def elaborate(self, platform):
 60 |         m = Module()
 61 | 
 62 |         m.submodules.m = self.m
 63 | 
 64 |         rp = self.m.read_port(transparent = False)
 65 | 
 66 | 
 67 |         m.d.comb += [
 68 |             rp.addr.eq(self.bus.cmd.payload.addr),
 69 |             rp.en.eq(self.bus.cmd.valid & (self.bus.cmd.payload.lanes == 0)),
 70 |             self.bus.resp.eq(rp.data),
 71 |         ]
 72 | 
 73 |         if not self.read_only:
 74 |             wp = self.m.write_port(granularity = 8)
 75 |             m.d.comb += [
 76 |                 wp.addr.eq(self.bus.cmd.payload.addr),
 77 |                 wp.data.eq(self.bus.cmd.payload.data),
 78 |                 wp.en[0].eq(self.bus.cmd.valid & self.bus.cmd.payload.lanes[0]),
 79 |                 wp.en[1].eq(self.bus.cmd.valid & self.bus.cmd.payload.lanes[1]),
 80 |             ]
 81 | 
 82 |         return m
 83 | 
 84 | class SpramMemory(Component):
 85 |     """A single 256 kiB / 32 kiB SPRAM on the UP5K.
 86 | 
 87 |     This module exists because getting Amaranth to generate a memory that Yosys
 88 |     is willing to map to SPRAM is currently hard.
 89 | 
 90 |     SPRAMs are uninitialized at reset and can retain content across both design
 91 |     and device resets. As a result, this module doesn't support a read_only
 92 |     mode, because its contents would be indeterminate (yet not random enough to
 93 |     be interesting).
 94 | 
 95 |     Attributes
 96 |     ----------
 97 |     bus: bus interface with 14 address bits.
 98 |     """
 99 |     bus: In(BusPort(addr = 14, data = 16))
100 | 
101 |     def elaborate(self, platform):
102 |         m = Module()
103 | 
104 |         m.submodules.spram = Instance(
105 |             "SB_SPRAM256KA",
106 |             i_CLOCK = ClockSignal("sync"),
107 |             i_ADDRESS = self.bus.cmd.payload.addr,
108 |             i_DATAIN = self.bus.cmd.payload.data,
109 |             # Weirdly, write enables are at the nibble level.
110 |             i_MASKWREN = Cat(
111 |                 self.bus.cmd.payload.lanes[0],
112 |                 self.bus.cmd.payload.lanes[0],
113 |                 self.bus.cmd.payload.lanes[1],
114 |                 self.bus.cmd.payload.lanes[1],
115 |             ),
116 |             i_WREN = self.bus.cmd.payload.lanes != 0,
117 |             i_CHIPSELECT = self.bus.cmd.valid,
118 |             i_POWEROFF = 1, # active fucking low
119 |             i_STANDBY = 0,
120 |             i_SLEEP = 0,
121 |             o_DATAOUT = self.bus.resp,
122 |         )
123 | 
124 |         return m
125 | 


--------------------------------------------------------------------------------
/hapenny/regfile16.py:
--------------------------------------------------------------------------------
 1 | # 16-bit x 64 register file for narrow datapath RV32 implementation.
 2 | 
 3 | from amaranth import *
 4 | from amaranth.lib.wiring import *
 5 | from amaranth.lib.enum import *
 6 | 
 7 | from hapenny import StreamSig, AlwaysReady
 8 | 
 9 | def RegWrite(addrbits = 5):
10 |     return Signature({
11 |         'reg': Out(addrbits),
12 |         'value': Out(16),
13 |     })
14 | 
class RegFile16(Component):
    """Register file storing 32-bit registers as pairs of 16-bit halfwords.

    Each bank holds 32 registers, and every register occupies two 16-bit
    entries, so the backing memory is 64 entries deep per bank. It has one
    read port and one write port. Writes whose low five selector bits are
    zero are dropped, which protects both halves of x0 in every bank.

    Parameters
    ----------
    banks (integer): number of 32-register banks to implement, 1 by default.

    Attributes
    ----------
    read_cmd (input): selector of the halfword to read.
    read_resp (output): the halfword produced by the read port.
    write_cmd (input): selector and value of the halfword to write.
    """
    read_resp: Out(16)

    def __init__(self, *,
                 banks = 1):
        super().__init__()

        self.banks = banks

        # Selector layout: 5 bits for x0..x31, 1 bit for top vs bottom half,
        # then however many bits are needed to pick a bank.
        bank_bits = (banks - 1).bit_length()
        select_bits = 5 + 1 + bank_bits

        self.read_cmd = AlwaysReady(select_bits).flip().create()
        self.write_cmd = AlwaysReady(RegWrite(select_bits)).flip().create()

    def elaborate(self, platform):
        m = Module()

        nregs = 32 * self.banks

        # Two halfword entries per register; ask the tools for block RAM.
        mem = Memory(
            width = 16,
            depth = 2 * nregs,
            name = "regfile",
            attrs = {
                'ram_style': 'block',
            },
        )
        m.submodules.mem = mem

        rp = mem.read_port(transparent = False)
        wp = mem.write_port()

        # Read side: pass the command straight to the port and the data
        # straight back out.
        m.d.comb += rp.addr.eq(self.read_cmd.payload)
        m.d.comb += rp.en.eq(self.read_cmd.valid)
        m.d.comb += self.read_resp.eq(rp.data)

        # Write side: as above, except that the enable is suppressed for
        # register number zero so that x0 can never be modified.
        target = self.write_cmd.payload.reg
        m.d.comb += wp.addr.eq(target)
        m.d.comb += wp.data.eq(self.write_cmd.payload.value)
        m.d.comb += wp.en.eq((target[:5] != 0) & self.write_cmd.valid)

        return m
60 | 


--------------------------------------------------------------------------------
/hapenny/rvfi.py:
--------------------------------------------------------------------------------
 1 | from functools import reduce
 2 | 
 3 | from amaranth import *
 4 | from amaranth.lib.wiring import *
 5 | from amaranth.lib.enum import *
 6 | from amaranth.lib.coding import Encoder, Decoder
 7 | 
 8 | from hapenny import StreamSig, AlwaysReady
 9 | 
class Mode(Enum, shape = unsigned(2)):
    """Two-bit encoding used for the 'mode' member of the Rvfi signature.

    NOTE(review): U/S/M presumably name the RISC-V user, supervisor and
    machine privilege levels -- confirm against the RVFI specification.
    """
    U = 0
    S = 1
    # 2 is reserved
    M = 3
15 | 
class Ixl(Enum, shape = unsigned(2)):
    """Two-bit encoding used for the 'ixl' member of the Rvfi signature.

    NOTE(review): the members presumably denote a 32-bit or 64-bit integer
    register width -- confirm against the RVFI specification.
    """
    _32 = 1
    _64 = 2
19 | 
def Rvfi(ilen = 32, xlen = 32):
    """Build the signature of an RVFI trace port.

    ilen (integer): width of the instruction word, 32 by default.
    xlen (integer): width of registers, addresses and memory data, 32 by
        default. The memory masks get one bit per byte of xlen.
    """
    mask_bits = xlen // 8

    members = {}

    # Per-instruction metadata. 'order' is the instruction index: unique per
    # instruction retired, with no gaps.
    members.update({
        'order': Out(64),
        'insn': Out(ilen),
        'trap': Out(1),
        'halt': Out(1),
        'intr': Out(1),
        'mode': Out(Mode),
        'ixl': Out(Ixl, reset = Ixl._32),
    })

    # Source registers.
    members.update({
        'rs1_addr': Out(5),
        'rs2_addr': Out(5),
        'rs1_rdata': Out(xlen),
        'rs2_rdata': Out(xlen),
    })

    # Destination register.
    members.update({
        'rd_addr': Out(5),
        'rd_wdata': Out(xlen),
    })

    # Program counter before and after the instruction.
    members.update({
        'pc_rdata': Out(xlen),
        'pc_wdata': Out(xlen),
    })

    # Memory access.
    members.update({
        'mem_addr': Out(xlen),
        'mem_rmask': Out(mask_bits),
        'mem_wmask': Out(mask_bits),
        'mem_rdata': Out(xlen),
        'mem_wdata': Out(xlen),
    })

    return Signature(members)
48 | 


--------------------------------------------------------------------------------
/hapenny/sbox.py:
--------------------------------------------------------------------------------
 1 | # The S-Box, responsible for state sequencing of other boxes.
 2 | 
 3 | from amaranth import *
 4 | from amaranth.lib.wiring import *
 5 | from amaranth.lib.enum import *
 6 | from amaranth.lib.coding import Encoder, Decoder
 7 | 
 8 | from hapenny import StreamSig, AlwaysReady
 9 | from hapenny.bus import BusPort
10 | 
# Maximum number of (unique) states needed by any instruction, plus one
# additional for halt. (Note that repeated states when e.g. shifting do not
# count as unique states.) This is the width of the S-Box's one-hot state
# vector; the halt state is the highest bit.
STATE_COUNT = 6 + 1
15 | 
class SBox(Component):
    """The S-Box sequences the other components.

    The S-Box implements a state counter that counts up through the maximum
    number of unique states required by any instruction. The count can be reset,
    signaling the end of one instruction and the beginning of the next, by
    asserting the from_the_top input.

    The state counter, and output, are both one-hot. The highest state bit is
    the halted state; it is entered instead of state 0 when a halt has been
    requested, and left again once the request is withdrawn.

    Attributes
    ----------
    from_the_top (input): restarts the count for the next instruction.
    hold (input): input from EW-box to keep doing this same state. Only safe for
        use after state 3 to avoid weird side effects.
    halt_request (input): when high, redirects the next from_the_top assertion
        to go to the halted state instead.
    not_a_bubble (input): indicates that the CPU is doing useful work and not
        just fetching. Used to gate transitions to halt state to ensure forward
        progress during single-stepping.
    onehot_state (output): one bit per possible state.
    halted (output): a handy synonym for the last onehot_state bit.
    """
    from_the_top: In(1)
    hold: In(1)
    halt_request: In(1)
    not_a_bubble: In(1)

    onehot_state: Out(STATE_COUNT)
    halted: Out(1)

    def __init__(self):
        super().__init__()

        # Come out of reset in state 0 (only the lowest bit set).
        self.onehot_state.reset = 1

    def elaborate(self, platform):
        m = Module()

        # This module is doing a lot of things by hand, because as far as I can
        # tell, Amaranth doesn't really know anything about one-hot encoding.
        # Like, there's no way to indicate that the bits are exclusive. So in an
        # attempt to get this managed like a one-hot FSM rather than a
        # STATE_COUNT-wide base-2 FSM, I'm rolling circuits by hand.

        # Inexpensive way to detect that we're leaving a halt request without
        # requiring more registers: we are in the halted state but the request
        # is no longer asserted.
        end_of_halt = Signal(1)
        m.d.comb += end_of_halt.eq(
            self.onehot_state[STATE_COUNT - 1] & ~self.halt_request
        )

        # Generate one-hot counter transition circuit. In each state we clear
        # one bit and set another to advance. This can be overridden if we get
        # the signal to start again from the top.
        #
        # Note that this loop runs at elaboration time and emits one If/Elif
        # block per state bit. The blocks are order-sensitive: a later sync
        # assignment to the same bit takes precedence over an earlier one.
        for state_num in range(STATE_COUNT):
            with m.If(self.from_the_top | end_of_halt):
                with m.If(self.halt_request & self.not_a_bubble):
                    # Each bit must clear itself except for the highest.
                    m.d.sync += self.onehot_state[state_num].eq(
                        state_num == STATE_COUNT - 1
                    )
                with m.Else():
                    # Each bit must clear itself except for the lowest.
                    m.d.sync += self.onehot_state[state_num].eq(state_num == 0)
            with m.Elif(self.onehot_state[state_num] & ~self.hold):
                # The final state is sticky, so, don't implement wraparound
                # logic to advance out of it. We only leave that state through
                # the branch above, i.e. on from_the_top or end_of_halt.
                if state_num < STATE_COUNT - 1:
                    m.d.sync += [
                        self.onehot_state[state_num].eq(0),
                        self.onehot_state[state_num + 1].eq(1),
                    ]

        m.d.comb += self.halted.eq(self.onehot_state[STATE_COUNT - 1])
        return m
93 | 


--------------------------------------------------------------------------------
/hapenny/serial.py:
--------------------------------------------------------------------------------
  1 | from amaranth import *
  2 | from amaranth.lib.wiring import *
  3 | from amaranth.lib.enum import *
  4 | from amaranth.lib.coding import Encoder, Decoder
  5 | 
  6 | from hapenny import StreamSig, AlwaysReady, mux, oneof
  7 | from hapenny.bus import BusPort
  8 | 
  9 | class ReceiveCore(Component):
 10 |     rx: In(1)
 11 |     sample_clock: In(1)
 12 |     rdr: Out(8)
 13 |     empty: Out(1)
 14 |     read_strobe: In(1)
 15 | 
 16 |     def __init__(self, oversample = 16):
 17 |         super().__init__()
 18 | 
 19 |         self.oversample = oversample
 20 | 
 21 |     def elaborate(self, platform):
 22 |         m = Module()
 23 | 
 24 |         state = Signal(range(4))
 25 |         bits_left = Signal(range(8))
 26 |         timer = Signal(range(self.oversample))
 27 |         have_data = Signal(1)
 28 | 
 29 |         m.d.comb += [
 30 |             self.empty.eq(~have_data),
 31 |         ]
 32 | 
 33 |         m.d.sync += timer.eq(oneof([
 34 |             # Set to delay half a bit period from initial negative edge.
 35 |             (self.sample_clock & (state == 0), (self.oversample // 2) - 1),
 36 |             # Count down in all other states until we reach 0.
 37 |             (self.sample_clock & (state != 0) & (timer != 0), timer - 1),
 38 |             # Once we reach 0, reset to a full bit time.
 39 |             (self.sample_clock & (state != 0) & (timer == 0), self.oversample - 1),
 40 |         ], default = timer))
 41 | 
 42 |         m.d.sync += state.eq(oneof([
 43 |             # Leave state 0 if we see the falling edge.
 44 |             (self.sample_clock & (state == 0), ~self.rx),
 45 |             # If it's still low at the midpoint of the start bit, proceed.
 46 |             # Otherwise, treat it as a glitch and reset.
 47 |             (self.sample_clock & (state == 1) & (timer == 0), mux(~self.rx, 2, 0)),
 48 |             # Automatically advance when we've done all the bits in state 2.
 49 |             (self.sample_clock & (state == 2) & (timer == 0), mux(bits_left == 0, 3, 2)),
 50 |             # Automatically advance at the end of the stop bit.
 51 |             (self.sample_clock & (state == 3) & (timer == 0), 0),
 52 |         ], default = state))
 53 | 
 54 |         m.d.sync += bits_left.eq(oneof([
 55 |             # Configure for 7 bits after the first one.
 56 |             (self.sample_clock & (timer == 0), mux(state == 1, 7, bits_left - 1)),
 57 |         ], default = bits_left))
 58 | 
 59 |         m.d.sync += self.rdr.eq(oneof([
 60 |             (self.sample_clock & (state == 2) & (timer == 0), Cat(self.rdr[1:], self.rx)),
 61 |         ], default = self.rdr))
 62 | 
 63 |         m.d.sync += have_data.eq(oneof([
 64 |             # The way this is expressed, newly arriving data will override the
 65 |             # read strobe -- the two cases will OR if they occur
 66 |             # simultaneously, and the 0 loses.
 67 |             (self.sample_clock & (state == 3) & (timer == 0), self.rx),
 68 |             (self.read_strobe, 0),
 69 |         ], default = have_data))
 70 | 
 71 |         return m
 72 | 
 73 | 
 74 | class TransmitCore(Component):
 75 |     tx: Out(1)
 76 |     sample_clock: In(1)
 77 |     thr_write: In(AlwaysReady(8))
 78 |     busy: Out(1)
 79 | 
 80 |     def __init__(self, oversample = 16):
 81 |         super().__init__()
 82 | 
 83 |         self.oversample = oversample
 84 | 
 85 |     def elaborate(self, platform):
 86 |         m = Module()
 87 | 
 88 |         # We use this as a shift register containing: start bit, 8 data bits, 2
 89 |         # stop bits. Its LSB is our output state, so it's important that it
 90 |         # reset to 1; the other bits can reset to whatever value.
 91 |         thr = Signal(1 + 8, reset = 1)
 92 | 
 93 |         tx_bits_left = Signal(range(1 + 8 + 2))
 94 |         tx_timer = Signal(range(self.oversample))
 95 | 
 96 |         with m.If(self.sample_clock):
 97 |             with m.If(tx_bits_left != 0):
 98 |                 with m.If(tx_timer == 0):
 99 |                     m.d.sync += [
100 |                         thr.eq(Cat(thr[1:], 1)),
101 |                         tx_timer.eq(self.oversample - 1),
102 |                         tx_bits_left.eq(tx_bits_left - 1),
103 |                     ]
104 |                 with m.Else():
105 |                     m.d.sync += tx_timer.eq(tx_timer - 1)
106 | 
107 |         # Transmit output
108 |         m.d.comb += self.tx.eq(thr[0])
109 | 
110 |         # Control register interface.
111 |         m.d.comb += self.busy.eq(tx_bits_left != 0)
112 | 
113 |         with m.If(self.thr_write.valid):
114 |             m.d.sync += [
115 |                 # Load THR with the start bit.
116 |                 thr.eq(Cat(0, self.thr_write.payload)),
117 |                 tx_bits_left.eq(1 + 8 + 2),
118 |                 tx_timer.eq(self.oversample - 1),
119 |             ]
120 | 
121 |         return m
122 | 
123 | 
class OversampleClock(Component):
    """Generates the sampling pulse shared by the UART cores.

    The system clock is divided down to (approximately) baud_rate *
    oversample. The output is not a clock but a strobe: it is high for one
    system clock cycle per sample period.

    Parameters
    ----------
    baud_rate (integer): desired baud rate, 19200 by default.
    oversample (integer): sample pulses per bit period, 16 by default.
    clock_freq: frequency of the system clock. If omitted, the platform's
        default_clk_frequency is used.

    Attributes
    ----------
    out (output): the sample pulse.
    """
    out: Out(1)

    def __init__(self, baud_rate = 19200, oversample = 16, clock_freq = None):
        super().__init__()
        self.baud_rate = baud_rate
        self.oversample = oversample
        self.clock_freq = clock_freq

    def elaborate(self, platform):
        m = Module()

        # We divide the system clock to our baud rate * oversample and use that
        # clock for sampling. This is a compromise between low cost transmit
        # (where we could divide the clock all the way down to the baud rate
        # without issue) and accurate receive (where higher sampling rates are
        # better but cost more flops).
        clock_freq = self.clock_freq or platform.default_clk_frequency
        our_freq = self.baud_rate * self.oversample
        divisor = int(round(clock_freq / our_freq))
        print(f"UART configured for {self.baud_rate} from input clock {clock_freq}, divisor = {divisor}")
        # A system clock that is too slow rounds the divisor down to zero;
        # report that as the configuration error it is rather than letting
        # the division below fail.
        assert divisor >= 1, "Error: cannot achieve requested UART frequency"
        actual_freq = clock_freq / self.oversample / divisor
        freq_error = abs(actual_freq - self.baud_rate) / self.baud_rate
        print(f"Actual baud rate will be: {actual_freq} (error: {freq_error * 100:.3}%)")
        assert freq_error < 0.01, "Error: cannot achieve requested UART frequency"

        sample_counter = Signal(range(divisor))
        # Generate a pulse on every sample period for one (fast) clock cycle.
        m.d.comb += self.out.eq(sample_counter == 0)

        # Count down, reloading on the cycle where the pulse is emitted.
        m.d.sync += sample_counter.eq(mux(self.out, divisor - 1, sample_counter - 1))

        return m
158 | 
class TransmitOnlyUart(Component):
    """The world's crappiest UART!

    The low byte of any write goes into the transmit holding register and will
    be sent out promptly.

    Reads return a status register where bit 0 indicates BUSY.

    Parameters
    ----------
    baud_rate (integer): desired baud rate, 19200 by default.
    oversample (integer): sample pulses per bit period, 16 by default.
    clock_freq: frequency of the system clock. If omitted, the platform's
        default_clk_frequency is used.

    Attributes
    ----------
    bus (port): connection to the fabric.
    tx (output): the serial line.
    """
    bus: In(BusPort(addr = 0, data = 16))
    tx: Out(1)

    def __init__(self, baud_rate = 19200, oversample = 16, clock_freq = None):
        super().__init__()

        self.baud_rate = baud_rate
        self.oversample = oversample
        self.clock_freq = clock_freq

    def elaborate(self, platform):
        m = Module()
        m.submodules.clkdiv = clkdiv = OversampleClock(
            baud_rate = self.baud_rate,
            oversample = self.oversample,
            clock_freq = self.clock_freq,
        )

        m.submodules.txr = txr = TransmitCore(oversample = self.oversample)
        m.d.comb += [
            txr.sample_clock.eq(clkdiv.out),
            self.tx.eq(txr.tx),
            # The status register is always on the response path; only bit 0
            # is ever non-zero.
            self.bus.resp.eq(txr.busy),

            # Written data lives under the command half of the bus port.
            txr.thr_write.payload.eq(self.bus.cmd.payload.data[:8]),
            # Only a write that includes the low byte lane loads the
            # transmitter.
            txr.thr_write.valid.eq(
                self.bus.cmd.valid & self.bus.cmd.payload.lanes[0]
            ),
        ]

        return m
198 | 
class ReceiveOnlyUart(Component):
    """The world's other crappiest UART!

    This can receive a single frame and hold it in registers.

    On any read, this will return the frame in the low 8 bits, plus the
    receive core's ``empty`` flag in bit 15. This is intended to be used with
    LH to easily get the flag into the MSB where it can be tested with bltz.

    NOTE(review): earlier documentation described bit 15 as "set if there's
    actual data", but the logic below wires it to ``rxr.empty`` -- confirm the
    intended polarity against ReceiveCore.

    And, read sensitive, why not.

    Parameters
    ----------
    baud_rate: requested baud rate, forwarded to the clock divider.
    oversample: number of sample-clock pulses per bit, forwarded to both the
        clock divider and the receive core.
    clock_freq: input clock frequency in Hz. If left as None, the clock
        divider falls back to the platform's default clock frequency.
    """
    bus: In(BusPort(addr = 0, data = 16))
    rx: In(1)

    def __init__(self, baud_rate = 19200, oversample = 16, clock_freq = None):
        super().__init__()

        self.baud_rate = baud_rate
        # elaborate() reads this attribute, so it has to be stored here.
        self.oversample = oversample
        self.clock_freq = clock_freq

    def elaborate(self, platform):
        m = Module()

        # Divider that produces the sample-rate strobe for the receiver.
        m.submodules.clkdiv = clkdiv = OversampleClock(
            baud_rate = self.baud_rate,
            oversample = self.oversample,
            clock_freq = self.clock_freq,
        )

        m.submodules.rxr = rxr = ReceiveCore(oversample = self.oversample)
        m.d.comb += [
            rxr.rx.eq(self.rx),
            rxr.sample_clock.eq(clkdiv.out),
            # A bus command with no write lanes asserted is a read; tell the
            # receive core that its data register is being read.
            rxr.read_strobe.eq(self.bus.cmd.valid & ~self.bus.cmd.payload.lanes.any()),
        ]

        # The response is registered rather than combinational.
        m.d.sync += [
            self.bus.resp[:8].eq(rxr.rdr),
            self.bus.resp[15].eq(rxr.empty),
        ]

        return m
241 | 
class BidiUart(Component):
    """A slightly less crappy UART.

    This combines the transmit and receive logic using a shared clock divider,
    to save some space if you need both directions.

    Register Layout
    ---------------
    0x0000   RDR - data in low 8 bits, empty flag in bit 15, read-sensitive
    0x0002   THR - reads as 0 if TX is idle, writes send low 8 bits

    Parameters
    ----------
    baud_rate: requested baud rate, forwarded to the clock divider.
    oversample: number of sample-clock pulses per bit, shared by the clock
        divider and both cores.
    clock_freq: input clock frequency in Hz. If left as None, the clock
        divider falls back to the platform's default clock frequency.
    """
    bus: In(BusPort(addr = 1, data = 16))
    # tx is driven by this component (see elaborate), so it is an output,
    # matching TransmitOnlyUart.
    tx: Out(1)
    rx: In(1)

    def __init__(self, baud_rate = 19200, oversample = 16, clock_freq = None):
        super().__init__()

        self.baud_rate = baud_rate
        self.oversample = oversample
        self.clock_freq = clock_freq

    def elaborate(self, platform):
        m = Module()

        # Clock divider for sampling
        m.submodules.clkdiv = clkdiv = OversampleClock(
            baud_rate = self.baud_rate,
            oversample = self.oversample,
            clock_freq = self.clock_freq,
        )

        # Receive state machine.
        m.submodules.rxr = rxr = ReceiveCore(oversample = self.oversample)
        m.d.comb += [
            rxr.rx.eq(self.rx),
            rxr.sample_clock.eq(clkdiv.out),
        ]

        # Transmit machine.

        m.submodules.txr = txr = TransmitCore(oversample = self.oversample)
        m.d.comb += [
            txr.sample_clock.eq(clkdiv.out),
            self.tx.eq(txr.tx),
        ]

        # Bus read port. We register this so that state doesn't change by the
        # time the output is read. This is particularly a problem for the
        # read-sensitive RDR register.
        m.d.sync += [
            # Address bit 0 selects THR status (busy) over RDR data.
            self.bus.resp[:8].eq(mux(
                self.bus.cmd.payload.addr[0],
                txr.busy,
                rxr.rdr,
            )),
            # The empty flag is only reported when RDR is addressed.
            self.bus.resp[15].eq(
                ~self.bus.cmd.payload.addr[0] & rxr.empty
            ),
        ]

        # Read-sense logic for receive side: a command with no write lanes
        # asserted, addressed to RDR.
        m.d.comb += rxr.read_strobe.eq(
            self.bus.cmd.valid
            & ~self.bus.cmd.payload.lanes.any()
            & ~self.bus.cmd.payload.addr[0]
        )

        # Write logic for TX side.
        m.d.comb += txr.thr_write.payload.eq(self.bus.cmd.payload.data[:8])

        # Only a write touching the low byte lane of THR loads the holding
        # register.
        m.d.comb += txr.thr_write.valid.eq(
            self.bus.cmd.valid
            & self.bus.cmd.payload.lanes[0]
            & self.bus.cmd.payload.addr[0]
        )

        return m
320 | 
321 | 


--------------------------------------------------------------------------------
/icestick-chonk.py:
--------------------------------------------------------------------------------
  1 | # This is a demo for the "chonk" core, which is the hapenny v2
  2 | # microarchitecture modified to have a 32-bit datapath and lower cycle count.
  3 | # This provides a useful apples-to-apples comparison with the hapenny cores,
  4 | # since it's using similar microarchitectural techniques and running the same
  5 | # code.
  6 | 
  7 | import itertools
  8 | import argparse
  9 | import struct
 10 | from pathlib import Path
 11 | 
 12 | from amaranth import *
 13 | from amaranth.lib.wiring import *
 14 | from amaranth.build import ResourceError, Resource, Pins, Attrs
 15 | from amaranth_boards.test.blinky import Blinky
 16 | from amaranth_boards.icestick import ICEStickPlatform
 17 | import amaranth.lib.cdc
 18 | 
 19 | from hapenny import StreamSig
 20 | import hapenny.chonk.cpu
 21 | from hapenny.bus import BusPort, SimpleFabric, partial_decode
 22 | from hapenny.chonk.gpio32 import OutputPort32
 23 | from hapenny.chonk.mem32 import BasicMemory
 24 | 
 25 | bootloader = Path("smallest-toggle.bin").read_bytes()
 26 | boot_image = struct.unpack("<" + "I" * (len(bootloader) // 4), bootloader)
 27 | for i, word in enumerate(boot_image):
 28 |     print(f"{i*4:08x}  {word:08x}")
 29 | 
 30 | # the blinky program does not use RAM at all, so we can fit it in a single
 31 | # block RAM. We'll use half of one, wastefully, to preserve the memory map.
 32 | RAM_WORDS = 128 * 1
 33 | RAM_ADDR_BITS = (RAM_WORDS - 1).bit_length()
 34 | 
 35 | # Add an extra bit to the implemented bus so we can also address I/O.
 36 | BUS_ADDR_BITS = RAM_ADDR_BITS + 1
 37 | print(f"BUS_ADDR_BITS = {BUS_ADDR_BITS}")
 38 | 
 39 | class Test(Elaboratable):
 40 |     def elaborate(self, platform):
 41 |         m = Module()
 42 | 
 43 |         # Gotta do some clock gymnastics here. We want the PLL on so that we can
 44 |         # run faster than the Icestick's 12 MHz crystal can go. However, setting
 45 |         # up our own sync domain _silently disables_ the Amaranth
 46 |         # ICE40Platform's reset delay, which is necssary to work around an
 47 |         # undocumented erratum in the iCE40 BRAM that has been chasing me for at
 48 |         # least six years.
 49 |         #
 50 |         # So, we're going to reconstruct it manually.
 51 |         clk12 = platform.request("clk12", dir = "-")
 52 | 
 53 |         # 15us delay, 12 MHz clock: 180 cycles
 54 |         por_delay = int(15e-6 * 12e6)
 55 |         m.domains += ClockDomain("por", reset_less=True, local=True)
 56 |         por_timer = Signal(range(por_delay))
 57 |         por_ready = Signal()
 58 |         m.d.comb += ClockSignal("por").eq(clk12.io)
 59 |         with m.If(por_timer == por_delay):
 60 |             m.d.por += por_ready.eq(1)
 61 |         with m.Else():
 62 |             m.d.por += por_timer.eq(por_timer + 1)
 63 | 
 64 |         cd_sync = ClockDomain("sync")
 65 |         m.domains += cd_sync
 66 |         m.d.comb += ResetSignal("sync").eq(~por_ready)
 67 | 
 68 |         F = 55e6 # Hz
 69 |         pll_r, pll_f, pll_q, filter_range = 0, 72, 4, 1
 70 | 
 71 |         platform.add_clock_constraint(cd_sync.clk, F)
 72 |         print(f"Configuring SoC for {F/1000000:.03} MHz")
 73 | 
 74 |         # PLL settings below must generate F from 12MHz; use icepll to adjust.
 75 |         m.submodules.pll = Instance(
 76 |             "SB_PLL40_CORE",
 77 |             p_FEEDBACK_PATH = "SIMPLE",
 78 |             p_DIVR = pll_r,
 79 |             p_DIVF = pll_f,
 80 |             p_DIVQ = pll_q,
 81 |             p_FILTER_RANGE = filter_range,
 82 | 
 83 |             i_REFERENCECLK = clk12.io,
 84 |             i_RESETB = 1,
 85 |             o_PLLOUTGLOBAL = cd_sync.clk,
 86 |         )
 87 | 
 88 |         # Ok, back to the design.
 89 |         m.submodules.cpu = cpu = hapenny.chonk.cpu.Cpu(
 90 |             # +2 to adjust from bus word addressing to CPU byte
 91 |             # addressing.
 92 |             addr_width = BUS_ADDR_BITS + 2,
 93 |             # Program addresses only need to be able to address program
 94 |             # memory, so configure the PC and fetch port to be narrower.
 95 |             # (+2 because, again, our RAM is word addressed but this parameter
 96 |             # is in bytes.)
 97 |             prog_addr_width = RAM_ADDR_BITS + 2,
 98 |         )
 99 |         m.submodules.mem = mem = BasicMemory(depth = RAM_WORDS,
100 |                                              contents = boot_image)
101 |         # Make the simplest output port possible.
102 |         m.submodules.outport = outport = OutputPort32(1)
103 |         m.submodules.fabric = fabric = SimpleFabric([
104 |             mem.bus,
105 |             partial_decode(m, outport.bus, RAM_ADDR_BITS),
106 |         ])
107 | 
108 |         connect(m, cpu.bus, fabric.bus)
109 | 
110 |         for i in range(1):
111 |             led = platform.request("led", i)
112 |             m.d.comb += led.o.eq(outport.pins[i])
113 | 
114 |         return m
115 | 
# Command-line handling. No options are defined, so this only provides
# --help and rejects unexpected arguments.
parser = argparse.ArgumentParser(
    prog = "icestick-smallestbig",
    description = "Script for synthesizing smallest image for HX1K",
)
args = parser.parse_args()

# Build the design for the Icestick; do_program = True asks the platform to
# program the board once the build completes.
p = ICEStickPlatform()
p.build(Test(), do_program = True)
124 | 


--------------------------------------------------------------------------------
/icestick-smallest.py:
--------------------------------------------------------------------------------
# This is the smallest hapenny SoC model for the Icestick, to show off its size
  2 | # when configured with only a basic assembly program and single peripheral.
  3 | #
  4 | # This is not a spectacularly _useful_ configuration, and is smaller than the
  5 | # "small" configurations most other small RV32I SoCs include, so the numbers
  6 | # don't necessarily compare directly. It's essentially a 32-bit version of a
  7 | # tinyAVR.
  8 | #
  9 | # Mostly, I use this to keep an eye on the minimum size with a circuit that
 10 | # isn't so simple that it optimizes away in synthesis.
 11 | 
 12 | import itertools
 13 | from functools import reduce
 14 | import argparse
 15 | import struct
 16 | from pathlib import Path
 17 | 
 18 | from amaranth import *
 19 | from amaranth.lib.wiring import *
 20 | from amaranth.build import ResourceError, Resource, Pins, Attrs
 21 | from amaranth_boards.test.blinky import Blinky
 22 | from amaranth_boards.icestick import ICEStickPlatform
 23 | import amaranth.lib.cdc
 24 | 
 25 | from hapenny import StreamSig
 26 | import hapenny.cpu
 27 | from hapenny.bus import BusPort, SimpleFabric, partial_decode
 28 | from hapenny.gpio import OutputPort
 29 | from hapenny.mem import BasicMemory
 30 | 
 31 | bootloader = Path("smallest-toggle.bin").read_bytes()
 32 | boot_image = struct.unpack("<" + "H" * (len(bootloader) // 2), bootloader)
 33 | 
 34 | image_or = reduce(lambda a, b: a|b, boot_image)
 35 | image_and = reduce(lambda a, b: a&b, boot_image)
 36 | 
 37 | problems = set(b for b in range(16)
 38 |                  if image_or & (1 << b) == 0
 39 |                  or image_and & (1 << b) != 0)
 40 | 
 41 | if problems:
 42 |     print("WARNING: contents of boot ROM may cause logic to be optimized out.")
 43 |     print("Size estimates generated from such an image would be misleading.")
 44 |     print(f"The following bit positions are constant: {problems}")
 45 | 
 46 | # the blinky program does not use RAM at all, so we can fit it in a single block RAM.
 47 | RAM_WORDS = 256 * 1
 48 | RAM_ADDR_BITS = (RAM_WORDS - 1).bit_length()
 49 | 
 50 | # Add an extra bit to the implemented bus so we can also address I/O.
 51 | BUS_ADDR_BITS = RAM_ADDR_BITS + 1
 52 | 
 53 | class Test(Elaboratable):
 54 |     def elaborate(self, platform):
 55 |         m = Module()
 56 | 
 57 |         # Gotta do some clock gymnastics here. We want the PLL on so that we can
 58 |         # run faster than the Icestick's 12 MHz crystal can go. However, setting
 59 |         # up our own sync domain _silently disables_ the Amaranth
 60 |         # ICE40Platform's reset delay, which is necssary to work around an
 61 |         # undocumented erratum in the iCE40 BRAM that has been chasing me for at
 62 |         # least six years.
 63 |         #
 64 |         # So, we're going to reconstruct it manually.
 65 |         clk12 = platform.request("clk12", dir = "-")
 66 | 
 67 |         # 15us delay, 12 MHz clock: 180 cycles
 68 |         por_delay = int(20e-6 * 12e6)
 69 |         m.domains += ClockDomain("por", reset_less=True, local=True)
 70 |         por_timer = Signal(range(por_delay))
 71 |         por_ready = Signal()
 72 |         m.d.comb += ClockSignal("por").eq(clk12.io)
 73 |         with m.If(por_timer == por_delay):
 74 |             m.d.por += por_ready.eq(1)
 75 |         with m.Else():
 76 |             m.d.por += por_timer.eq(por_timer + 1)
 77 | 
 78 |         cd_sync = ClockDomain("sync")
 79 |         m.domains += cd_sync
 80 |         m.d.comb += ResetSignal("sync").eq(~por_ready)
 81 | 
 82 |         F = 70.5e6 # Hz
 83 |         pll_f, pll_q = 46, 3
 84 | 
 85 |         platform.add_clock_constraint(cd_sync.clk, F)
 86 |         print(f"Configuring SoC for {F/1000000:.03} MHz")
 87 | 
 88 |         # PLL settings below must generate F from 12MHz; use icepll to adjust.
 89 |         m.submodules.pll = Instance(
 90 |             "SB_PLL40_CORE",
 91 |             p_FEEDBACK_PATH = "SIMPLE",
 92 |             p_DIVR = 0,
 93 |             p_DIVF = pll_f,
 94 |             p_DIVQ = pll_q,
 95 |             p_FILTER_RANGE = 1,
 96 | 
 97 |             i_REFERENCECLK = clk12.io,
 98 |             i_RESETB = 1,
 99 |             o_PLLOUTGLOBAL = cd_sync.clk,
100 |         )
101 | 
102 |         # Ok, back to the design.
103 |         m.submodules.cpu = cpu = hapenny.cpu.Cpu(
104 |             # +1 to adjust from bus halfword addressing to CPU byte
105 |             # addressing.
106 |             addr_width = BUS_ADDR_BITS + 1,
107 |             # Program addresses only need to be able to address program
108 |             # memory, so configure the PC and fetch port to be narrower.
109 |             # (+1 because, again, our RAM is halfword addressed but this
110 |             # parameter is in bytes.)
111 |             prog_addr_width = RAM_ADDR_BITS + 1,
112 |         )
113 |         m.submodules.mem = mem = BasicMemory(depth = RAM_WORDS,
114 |                                              contents = boot_image)
115 |         # Make the simplest output port possible.
116 |         m.submodules.outport = outport = OutputPort(1, read_back = False)
117 |         m.submodules.fabric = fabric = SimpleFabric([
118 |             mem.bus,
119 |             partial_decode(m, outport.bus, RAM_ADDR_BITS),
120 |         ])
121 | 
122 |         connect(m, cpu.bus, fabric.bus)
123 | 
124 |         for i in range(1):
125 |             led = platform.request("led", i)
126 |             m.d.comb += led.o.eq(outport.pins[i])
127 | 
128 |         return m
129 | 
# Command-line handling. No options are defined, so this only provides
# --help and rejects unexpected arguments.
parser = argparse.ArgumentParser(
    prog = "icestick-smallest",
    description = "Script for synthesizing smallest image for HX1K",
)
args = parser.parse_args()

# Build the design for the Icestick; do_program = True asks the platform to
# program the board once the build completes.
p = ICEStickPlatform()
p.build(Test(), do_program = True)
138 | 


--------------------------------------------------------------------------------
/icesticktest.py:
--------------------------------------------------------------------------------
  1 | import itertools
  2 | import argparse
  3 | import struct
  4 | from pathlib import Path
  5 | 
  6 | from amaranth import *
  7 | from amaranth.lib.wiring import *
  8 | from amaranth.build import ResourceError, Resource, Pins, Attrs
  9 | from amaranth_boards.test.blinky import Blinky
 10 | from amaranth_boards.icestick import ICEStickPlatform
 11 | import amaranth.lib.cdc
 12 | 
 13 | from hapenny import StreamSig
 14 | from hapenny.cpu import Cpu
 15 | from hapenny.bus import BusPort, SimpleFabric, partial_decode
 16 | from hapenny.serial import BidiUart
 17 | from hapenny.mem import BasicMemory
 18 | 
 19 | bootloader = Path("tiny-bootloader.bin").read_bytes()
 20 | boot_image = struct.unpack("<" + "h" * (len(bootloader) // 2), bootloader)
 21 | 
 22 | # tiny-bootloader is written in a high-level language and needs to have a stack,
 23 | # so the minimum here is two 256-halfword (512-byte) RAMs.
 24 | RAM_WORDS = 256 * 2
 25 | RAM_ADDR_BITS = (RAM_WORDS - 1).bit_length()
 26 | 
 27 | BUS_ADDR_BITS = RAM_ADDR_BITS + 1
 28 | 
 29 | class Test(Elaboratable):
 30 |     def __init__(self):
 31 |         super().__init__()
 32 | 
 33 |     def elaborate(self, platform):
 34 |         m = Module()
 35 | 
 36 |         # Gotta do some clock gymnastics here. We want the PLL on so that we can
 37 |         # run faster than the Icestick's 12 MHz crystal can go. However, setting
 38 |         # up our own sync domain _silently disables_ the Amaranth
 39 |         # ICE40Platform's reset delay, which is necssary to work around an
 40 |         # undocumented erratum in the iCE40 BRAM that has been chasing me for at
 41 |         # least six years.
 42 |         #
 43 |         # So, we're going to reconstruct it manually.
 44 |         clk12 = platform.request("clk12", dir = "-")
 45 | 
 46 |         # 15us delay, 12 MHz clock: 180 cycles
 47 |         por_delay = int(15e-6 * 12e6)
 48 |         m.domains += ClockDomain("por", reset_less=True, local=True)
 49 |         por_timer = Signal(range(por_delay))
 50 |         por_ready = Signal()
 51 |         m.d.comb += ClockSignal("por").eq(clk12.io)
 52 |         with m.If(por_timer == por_delay):
 53 |             m.d.por += por_ready.eq(1)
 54 |         with m.Else():
 55 |             m.d.por += por_timer.eq(por_timer + 1)
 56 | 
 57 |         cd_sync = ClockDomain("sync")
 58 |         m.domains += cd_sync
 59 |         m.d.comb += ResetSignal("sync").eq(~por_ready)
 60 | 
 61 |         F = 66e6 # Hz
 62 |         platform.add_clock_constraint(cd_sync.clk, F)
 63 |         print(f"Configuring SoC for {F/1000000:.03} MHz")
 64 | 
 65 |         # PLL settings below must generate F from 12MHz; use icepll to adjust.
 66 |         m.submodules.pll = Instance(
 67 |             "SB_PLL40_CORE",
 68 |             p_FEEDBACK_PATH = "SIMPLE",
 69 |             p_DIVR = 0,
 70 |             p_DIVF = 87,
 71 |             p_DIVQ = 4,
 72 |             p_FILTER_RANGE = 1,
 73 | 
 74 |             i_REFERENCECLK = clk12.io,
 75 |             i_RESETB = 1,
 76 |             o_PLLOUTGLOBAL = cd_sync.clk,
 77 |         )
 78 | 
 79 |         # Ok, back to the design.
 80 |         m.submodules.cpu = cpu = Cpu(
 81 |             # +1 to adjust from bus halfword addressing to CPU byte addressing.
 82 |             addr_width = BUS_ADDR_BITS + 1,
 83 |             # Program addresses only need to be able to address program memory,
 84 |             # so configure the PC and fetch port to be narrower. (+1 because,
 85 |             # again, our RAM is halfword addressed but this parameter is in
 86 |             # bytes.)
 87 |             prog_addr_width = RAM_ADDR_BITS + 1,
 88 |         )
 89 |         m.submodules.mem = mem = BasicMemory(depth = RAM_WORDS,
 90 |                                              contents = boot_image)
 91 |         # Set the UART for 8x oversample instead of the default 16, to save some
 92 |         # logic.
 93 |         m.submodules.uart = uart = BidiUart(baud_rate = 115_200,
 94 |                                             oversample = 8,
 95 |                                             clock_freq = F)
 96 |         m.submodules.fabric = fabric = SimpleFabric([
 97 |             mem.bus,
 98 |             partial_decode(m, uart.bus, RAM_ADDR_BITS),
 99 |         ])
100 | 
101 |         connect(m, cpu.bus, fabric.bus)
102 | 
103 |         uartpins = platform.request("uart", 0)
104 |         rx_post_sync = Signal()
105 |         m.submodules.rxsync = amaranth.lib.cdc.FFSynchronizer(
106 |             i = uartpins.rx.i,
107 |             o = rx_post_sync,
108 |             o_domain = "sync",
109 |             reset = 1,
110 |             stages = 2,
111 |         )
112 |         m.d.comb += [
113 |             uartpins.tx.o.eq(uart.tx),
114 |             uart.rx.eq(rx_post_sync),
115 |         ]
116 | 
117 |         return m
118 | 
# Build the design for the Icestick; do_program = True asks the platform to
# program the board once the build completes.
p = ICEStickPlatform()
p.build(Test(), do_program = True)
121 | 


--------------------------------------------------------------------------------
/icoboard-large.py:
--------------------------------------------------------------------------------
  1 | # Icoboard example using the external SRAM.
  2 | 
  3 | import itertools
  4 | import argparse
  5 | import struct
  6 | from pathlib import Path
  7 | 
  8 | from amaranth import *
  9 | from amaranth.lib.wiring import *
 10 | from amaranth.build import ResourceError, Resource, Pins, Attrs
 11 | from boards.icoboard import IcoboardPlatform
 12 | 
 13 | from hapenny import StreamSig
 14 | from hapenny.cpu import Cpu
 15 | from hapenny.bus import BusPort, SimpleFabric, partial_decode
 16 | from hapenny.gpio import OutputPort, InputPort
 17 | from hapenny.serial import BidiUart
 18 | from hapenny.mem import BasicMemory, SpramMemory
 19 | from hapenny.extsram import ExternalSRAM
 20 | 
 21 | BOOT_ROM_WORDS = 256
 22 | BOOT_ROM_ADDR_BITS = (BOOT_ROM_WORDS - 1).bit_length()
 23 | 
 24 | bootloader = Path("icolarge-bootloader.bin").read_bytes()
 25 | boot_image = struct.unpack("<" + "H" * (len(bootloader) // 2), bootloader)
 26 | 
 27 | assert len(boot_image) <= BOOT_ROM_WORDS, \
 28 |         f"bootloader is {len(boot(image))} words long, too big for boot ROM"
 29 | 
 30 | class Test(Elaboratable):
 31 |     def elaborate(self, platform):
 32 |         m = Module()
 33 | 
 34 |         # The Icoboard's input clock is 100MHz, which seems ... optimistic.
 35 |         # Let's drop it to something we can use.
 36 | 
 37 |         # Gotta do some clock gymnastics here. We want the PLL on so that we can
 38 |         # run faster than the Icestick's 12 MHz crystal can go. However, setting
 39 |         # up our own sync domain _silently disables_ the Amaranth
 40 |         # ICE40Platform's reset delay, which is necssary to work around an
 41 |         # undocumented erratum in the iCE40 BRAM that has been chasing me for at
 42 |         # least six years.
 43 |         #
 44 |         # So, we're going to reconstruct it manually.
 45 |         clk100 = platform.request("clk100", dir = "-")
 46 | 
 47 |         # 15us delay, 100 MHz clock: 1500 cycles
 48 |         por_delay = int(15e-6 * 100e6)
 49 |         m.domains += ClockDomain("por", reset_less=True, local=True)
 50 |         por_timer = Signal(range(por_delay))
 51 |         por_ready = Signal()
 52 |         m.d.comb += ClockSignal("por").eq(clk100.io)
 53 |         with m.If(por_timer == por_delay):
 54 |             m.d.por += por_ready.eq(1)
 55 |         with m.Else():
 56 |             m.d.por += por_timer.eq(por_timer + 1)
 57 | 
 58 |         cd_sync = ClockDomain("sync")
 59 |         m.domains += cd_sync
 60 |         m.d.comb += ResetSignal("sync").eq(~por_ready)
 61 | 
 62 |         #F = 25.781e6 # Hz
 63 |         #pll_r, pll_f, pll_q, filter_range = 3, 0, 5, 2
 64 |         F = 50e6 # Hz
 65 |         pll_r, pll_f, pll_q, filter_range = 1, 0, 4, 4
 66 | 
 67 | 
 68 |         platform.add_clock_constraint(cd_sync.clk, F)
 69 |         print(f"Configuring SoC for {F/1000000:.05} MHz")
 70 | 
 71 |         clk_90 = Signal()
 72 | 
 73 |         # PLL settings below must generate F from 12MHz; use icepll to adjust.
 74 |         m.submodules.pll = Instance(
 75 |             "SB_PLL40_2F_CORE",
 76 |             p_FEEDBACK_PATH = "PHASE_AND_DELAY",
 77 |             p_PLLOUT_SELECT_PORTA = "SHIFTREG_0deg",
 78 |             p_PLLOUT_SELECT_PORTB = "SHIFTREG_90deg",
 79 |             p_SHIFTREG_DIV_MODE = 0,
 80 |             p_DIVR = pll_r,
 81 |             p_DIVF = pll_f,
 82 |             p_DIVQ = pll_q,
 83 |             p_FILTER_RANGE = filter_range,
 84 | 
 85 |             i_REFERENCECLK = clk100.io,
 86 |             i_RESETB = 1,
 87 |             o_PLLOUTGLOBALA = cd_sync.clk,
 88 |             o_PLLOUTCOREB = clk_90,
 89 |         )
 90 | 
 91 |         # Memory map should be:
 92 |         # 0000_0000     external SRAM (1 MiB)
 93 |         # 0010_0000     boot ROM
 94 |         # 0010_1000     UART
 95 |         # 0010_2000     output port
 96 |         # 0010_3000     input port
 97 | 
 98 |         m.submodules.cpu = cpu = Cpu(
 99 |             reset_vector = 0x10_0000,  # boot ROM
100 |             # We need 21-bit addressing to reach both all of the 1 MiB SRAM and
101 |             # our boot ROM. This also gives us about a MiB of peripheral space,
102 |             # which is great, so we set both program and load/store address
103 |             # widths to 21.
104 |             addr_width = 21,
105 |             # We'll turn on the counters because we've got the space, and this
106 |             # makes the output from Dhrystone a lot more interesting.
107 |             counters = True,
108 |         )
109 | 
110 |         # Create our RAMs.
111 |         m.submodules.extsram = extsram = ExternalSRAM(address_bits = 19)
112 |         m.submodules.bootmem = bootmem = BasicMemory(depth = BOOT_ROM_WORDS,
113 |                                                      contents = boot_image)
114 |         m.submodules.mem = mem = BasicMemory(depth = BOOT_ROM_WORDS)
115 | 
116 |         # Create a subfabric for the top mebibyte of the addressible space. This
117 |         # will include both our I/O devices and our boot ROM. We'll give each
118 |         # thing a 4096 byte (2048-halfword) region.
119 |         m.submodules.outport = outport = OutputPort(3)
120 |         m.submodules.inport = inport = InputPort(2)
121 |         m.submodules.uart = uart = BidiUart(baud_rate = 115200,
122 |                                             clock_freq = F)
123 |         m.submodules.iofabric = iofabric = SimpleFabric([
124 |             partial_decode(m, bootmem.bus, 11),     # 0x____0000
125 |             partial_decode(m, uart.bus, 11),        # 0x____1000
126 |             partial_decode(m, outport.bus, 11),     # 0x____2000
127 |             partial_decode(m, inport.bus, 11) ,     # 0x____3000
128 |         ])
129 | 
130 |         # Create the top-level fabric to unite memory and I/O.
131 |         m.submodules.fabric = fabric = SimpleFabric([
132 |             extsram.bus,                            # 0x0000_0000
133 |             partial_decode(m, iofabric.bus, 19),    # 0x0010_0000
134 |         ])
135 | 
136 |         connect(m, cpu.bus, fabric.bus)
137 | 
138 |         # Add some things describing how I've got PMODs connected.
139 |         # USB-serial is connected on top row of PMOD 1
140 |         platform.add_resources([
141 |             Resource("tx", 0, Pins("2", dir="o", conn=("pmod", 1))),
142 |             Resource("rx", 0, Pins("3", dir="i", conn=("pmod", 1))),
143 |         ])
144 | 
145 |         # UART wiring
146 |         tx = platform.request("tx", 0)
147 |         rx = platform.request("rx", 0)
148 |         m.d.comb += [
149 |             tx.o[0].eq(uart.tx),
150 |             uart.rx.eq(rx.i[0]),
151 |         ]
152 | 
153 |         # LED wiring
154 |         for i in range(3):
155 |             led = platform.request("led", i)
156 |             m.d.comb += led.o.eq(outport.pins[i])
157 | 
158 |         # Input port wiring
159 |         for i in range(2):
160 |             button = platform.request("button", i)
161 |             m.d.comb += inport.pins[i].eq(button.i)
162 | 
163 |         # SRAM wiring. NOTE: Amaranth models all the SRAM control signals as
164 |         # active-high and inverts at the pin. This means all of our signals
165 |         # here are active-high. This is potentially confusing, hence this
166 |         # comment.
167 |         sram = platform.request("sram", 0)
168 |         m.d.comb += [
169 |             # Our SRAM interface requires a 90-degree-shifted version of the
170 |             # clock.
171 |             extsram.clock_90.eq(clk_90),
172 | 
173 |             sram.cs.o.eq(1), # amaranth inverts this
174 |             sram.oe.o.eq(extsram.sram_oe),
175 |             sram.we.o.eq(extsram.sram_we),
176 |             sram.dm.o.eq(extsram.sram_lanes),
177 | 
178 |             sram.a.o.eq(extsram.addr_to_sram),
179 | 
180 |             sram.d.o.eq(extsram.data_to_sram),
181 |             sram.d.oe.eq(extsram.sram_we),
182 |             extsram.data_from_sram.eq(sram.d.i),
183 |         ]
184 | 
185 |         return m
186 | 
# Build the design for the Icoboard; do_program = True asks the platform to
# program the board once the build completes.
p = IcoboardPlatform()
p.build(Test(), do_program = True)
189 | 


--------------------------------------------------------------------------------
/icolarge-bootloader.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cbiffle/hapenny/10d6af538bb47feb26f5fa807f0c2ae6b64f2e9e/icolarge-bootloader.bin


--------------------------------------------------------------------------------
/montool/.gitignore:
--------------------------------------------------------------------------------
1 | /target
2 | 


--------------------------------------------------------------------------------
/montool/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "hapenny-montool"
 3 | version = "1.0.0"
 4 | edition = "2021"
 5 | license = "MPL-2"
 6 | 
 7 | [dependencies]
 8 | anyhow = "1.0.71"
 9 | clap = { version = "4.3.4", features = ["derive", "wrap_help"] }
10 | indicatif = "0.17.5"
11 | parse_int = "0.6.0"
12 | serialport = "4.2.1"
13 | 


--------------------------------------------------------------------------------
/montool/README.mkdn:
--------------------------------------------------------------------------------
1 | # hapenny montool
2 | 
3 | This is a very basic command line tool for interacting with the tinyboot serial
4 | monitor.
5 | 
6 | For instructions:
7 | 
8 | `cargo run -- --help`
9 | 


--------------------------------------------------------------------------------
/montool/src/main.rs:
--------------------------------------------------------------------------------
  1 | use std::{time::Duration, path::PathBuf, io::ErrorKind, io::Write};
  2 | 
  3 | use anyhow::{Context, Result, bail};
  4 | use indicatif::ProgressBar;
  5 | use serialport::SerialPort;
  6 | use clap::Parser;
  7 | 
  8 | /// A tool for interacting with the hapenny tinyboot serial monitor.
  9 | #[derive(Debug, Parser)]
 10 | #[clap(version)]
 11 | struct BootTool {
 12 |     /// Path to serial port on your machine, e.g. /dev/ttyUSB0 or COM1:
 13 |     port: String, // handed to serialport::new in main
 14 |     /// Baud rate of serial port.
 15 |     #[clap(long, short, global = true, default_value_t = 115_200)]
 16 |     baud_rate: u32, // 115200 unless overridden on the command line
 17 | 
 18 |     #[clap(subcommand)]
 19 |     cmd: SubCmd, // the single operation to run; dispatched by the match in main
 20 | }
 21 | 
 22 | #[derive(Debug, Parser)]
 23 | enum SubCmd {
 24 |     /// Perform a basic check to see if tinyboot appears to be running.
 25 |     Ping,
 26 |     /// Load a single 32-bit word from an address in the target.
 27 |     Peek {
 28 |         /// Address to read.
 29 |         #[clap(value_parser = parse_int::parse::<u32>)]
 30 |         address: u32,
 31 |     },
 32 |     /// Write a single 32-bit word into the target.
 33 |     Poke {
 34 |         /// Address to write.
 35 |         #[clap(value_parser = parse_int::parse::<u32>)]
 36 |         address: u32,
 37 |         /// Value to write.
 38 |         #[clap(value_parser = parse_int::parse::<u32>)]
 39 |         value: u32,
 40 |     },
 41 |     /// Write the contents of a file into the target. Useful for loading a
 42 |     /// program from a .bin file.
 43 |     Write {
 44 |         /// Address to begin writing.
 45 |         #[clap(value_parser = parse_int::parse::<u32>)]
 46 |         address: u32,
 47 |         /// File containing bytes to write; will be padded out to a multiple of
 48 |         /// 4.
 49 |         image_file: PathBuf,
 50 |     },
 51 |     /// Call into an address in the target.
 52 |     Call {
 53 |         /// Address to call.
 54 |         #[clap(value_parser = parse_int::parse::<u32>)]
 55 |         address: u32,
 56 |         /// If provided, the tool will immediately begin echoing back data
 57 |         /// received on the serial port until you kill it. This is useful for
 58 |         /// loading and running programs that are chatty, such as Dhrystone.
 59 |         #[clap(long)]
 60 |         then_echo: bool,
 61 |     },
 62 | }
 63 | 
 64 | fn main() -> Result<()> { // parse args, open the port, run exactly one subcommand
 65 |     let args = BootTool::parse();
 66 | 
 67 |     let mut port = serialport::new(&args.port, args.baud_rate)
 68 |         .timeout(Duration::from_millis(500))
 69 |         .open()
 70 |         .with_context(|| format!("opening serial port {}", args.port))?;
 71 | 
 72 |     drain(&mut port)?; // throw away bytes already pending before issuing commands
 73 | 
 74 |     match args.cmd {
 75 |         SubCmd::Ping => {
 76 |             do_cmd(&mut port, &[5]) // command byte 5 with no arguments; only the ack matters
 77 |                 .context("pinging")?;
 78 |         }
 79 |         SubCmd::Peek { address } => {
 80 |             // load addr register: command byte 3 followed by the little-endian address
 81 |             let mut cmd = [3, 0, 0, 0, 0];
 82 |             cmd[1..].copy_from_slice(&address.to_le_bytes());
 83 |             do_cmd(&mut port, &cmd)
 84 |                 .context("loading A")?;
 85 |             // load count register: command byte 4 followed by a little-endian count of 1
 86 |             let cmd = [4, 1, 0, 0, 0];
 87 |             do_cmd(&mut port, &cmd)
 88 |                 .context("loading C")?;
 89 |             // read out the data: command byte 2 (GET); 4 data bytes follow the ack
 90 |             let cmd = [2];
 91 |             do_cmd(&mut port, &cmd)
 92 |                 .context("sending GET")?;
 93 |             let mut data = [0; 4];
 94 |             port.read_exact(&mut data)
 95 |                 .context("waiting for data")?;
 96 |             println!("{:#x}", u32::from_le_bytes(data));
 97 |         }
 98 |         SubCmd::Poke { address, value } => {
 99 |             // load addr register: command byte 3 followed by the little-endian address
100 |             let mut cmd = [3, 0, 0, 0, 0];
101 |             cmd[1..].copy_from_slice(&address.to_le_bytes());
102 |             do_cmd(&mut port, &cmd)
103 |                 .context("loading A")?;
104 |             // load count register: command byte 4 followed by a little-endian count of 1
105 |             let cmd = [4, 1, 0, 0, 0];
106 |             do_cmd(&mut port, &cmd)
107 |                 .context("loading C")?;
108 |             // deposit the data: command byte 1 (PUT) followed by the little-endian value.
109 |             let mut cmd = [1, 0, 0, 0, 0];
110 |             cmd[1..].copy_from_slice(&value.to_le_bytes());
111 |             do_cmd(&mut port, &cmd)
112 |                 .context("sending PUT")?;
113 |         }
114 |         SubCmd::Write { address, image_file } => {
115 |             let mut image = std::fs::read(&image_file)?;
116 |             while image.len() % 4 != 0 { // zero-pad up to a multiple of 4 bytes
117 |                 image.push(0);
118 |             }
119 |             // load addr register once, before the first chunk is sent
120 |             let mut cmd = [3, 0, 0, 0, 0];
121 |             cmd[1..].copy_from_slice(&address.to_le_bytes());
122 |             do_cmd(&mut port, &cmd)
123 |                 .context("loading A")?;
124 |             let bar = ProgressBar::new(image.len() as u64);
125 |             for chunk in image.chunks(256) { // at most 256 bytes per PUT
126 |                 // load count register with this chunk's length divided by 4
127 |                 let word_count = u32::try_from(chunk.len() / 4)?;
128 |                 let mut cmd = [4, 0, 0, 0, 0];
129 |                 cmd[1..].copy_from_slice(&word_count.to_le_bytes());
130 |                 do_cmd(&mut port, &cmd)
131 |                     .context("loading C")?;
132 |                 let mut packet = vec![1];
133 |                 packet.extend_from_slice(chunk);
134 |                 // deposit the data: PUT byte followed by the raw chunk bytes.
135 |                 do_cmd(&mut port, &packet)
136 |                     .context("sending PUT")?;
137 |                 bar.inc(chunk.len() as u64);
138 |             }
139 |             bar.finish();
140 |         }
141 |         SubCmd::Call { address, then_echo } => {
142 |             // load addr register: command byte 3 followed by the little-endian address
143 |             let mut cmd = [3, 0, 0, 0, 0];
144 |             cmd[1..].copy_from_slice(&address.to_le_bytes());
145 |             do_cmd(&mut port, &cmd)
146 |                 .context("loading A")?;
147 |             // go! command byte 0 is CALL
148 |             do_cmd(&mut port, &[0])
149 |                 .context("sending CALL")?;
150 | 
151 |             if then_echo {
152 |                 let stdout = std::io::stdout();
153 |                 let mut stdout = stdout.lock();
154 |                 loop { // no exit condition: runs until an I/O error or the process is killed
155 |                     let mut b = [0];
156 |                     match port.read_exact(&mut b) {
157 |                         Ok(()) => {
158 |                             write!(stdout, "{}", b[0] as char)?; // byte printed as the char with that code point
159 |                             stdout.flush()?; // flush after every byte
160 |                         },
161 |                         Err(e) if e.kind() == ErrorKind::TimedOut => {
162 |                             // Nothing arrived within the port timeout; keep waiting.
163 |                         }
164 |                         other => other?, // any other I/O error is propagated out of main
165 |                     }
166 |                 }
167 |             }
168 |         }
169 |     }
170 | 
171 |     Ok(())
172 | }
173 | 
174 | /// Sends `cmd`, then reads the one-byte reply: 0xAA is success, 0xFF is a NACK.
175 | fn do_cmd(port: &mut Box<dyn SerialPort>, cmd: &[u8]) -> Result<()> {
176 |     port.write_all(cmd).context("writing command")?;
177 |     let mut reply = [0_u8];
178 |     port.read_exact(&mut reply).context("collecting response byte")?;
179 |     let [x] = reply;
180 |     if x == 0xAA {
181 |         return Ok(());
182 |     }
183 |     if x == 0xFF {
184 |         bail!("Received NACK");
185 |     }
186 |     bail!("Received unexpected response: {x:#x}");
187 | }
188 | 
189 | /// Reads and discards input already pending on `port`, using a temporary 1 ms
190 | /// timeout, so stale bytes cannot be read as replies to later commands.
191 | fn drain(port: &mut Box<dyn SerialPort>) -> Result<()> {
192 |     let saved_timeout = port.timeout();
193 | 
194 |     port.set_timeout(Duration::from_millis(1))
195 |         .context("reducing timeout for drain")?;
196 | 
197 |     let mut buffer = [0; 32];
198 |     let mut cruft = 0_usize;
199 |     loop {
200 |         match port.read(&mut buffer) {
201 |             Ok(0) => break, // zero-length read: stop instead of spinning forever
202 |             Ok(n) => cruft += n,
203 |             Err(e) if e.kind() == ErrorKind::TimedOut => break, // port is quiet
204 |             Err(e) => return Err(e).context("attempting to drain buffer"),
205 |         }
206 |     }
207 |     port.set_timeout(saved_timeout)
208 |         .context("restoring timeout after drain")?;
209 | 
210 |     if cruft > 0 {
211 |         eprintln!("note: {cruft} bytes of cruft drained from serial port");
212 |     }
213 | 
214 |     Ok(())
215 | }
216 | 
217 | 


--------------------------------------------------------------------------------
/notes/20231001.mkdn:
--------------------------------------------------------------------------------
 1 | # Small RV32 core using Amaranth
 2 | 
 3 | Looking into doing a tiny, probably-not-pipelined RV32 core for use on small
 4 | iCE40s. Likely approach:
 5 | 
 6 | - Use 2x BRAMs to provide a 1R1W register file.
 7 | - Since that necessitates most instructions taking >1 cycle, use as many cycles
 8 |   as seems appropriate.
 9 | 
10 | This is pretty similar to my Dinky5 core, which, for the record, used the
11 | following states:
12 | 
13 | ```
14 |     JustFetchState, // Reset, or second cycle of store.
15 |     Reg2State,      // Reading instruction, selecting rs2.
16 |     Reg1State,      // Latching x2, selecting rs1.
17 |     ExecuteState,   // Executing first instruction cycle.
18 |     LoadState,      // Second cycle for loads.
19 |     ShiftState,     // Processing a serial shift operation.
20 |     HaltState       // Something has gone wrong.
21 | ```
22 | 
23 | Pretty reasonable, I think. 3 cycles for most instructions since the final cycle
24 | issues a fetch. 4 cycles for stores. A great many cycles for shifts.
25 | 
26 | Lessee how things shape up in Amaranth.
27 | 
28 | ---
29 | 
30 | Going pretty well I think
31 | 
32 | Currently fetching rs1 followed by rs2. Dinky did it in the other order. I think
33 | there's some value to reversing it. While load and store both use rs1 as the
34 | base address, it's a question of when it becomes available.
35 | 
36 | Currently, load is able to skip a cycle compared to store because it only needs
37 | rs1. Store needs to wait for both registers before it can issue a bus
38 | transaction.
39 | 
40 | ---
41 | 
42 | Interesting observation.
43 | 
 44 | On a 4LUT part, putting 2:1 muxes on the inputs of an adder has equivalent resource
45 | cost to generating two adders and muxing between them. Assuming the muxes don't
46 | need to be switched separately.
47 | 
48 | Muxing adders has the advantage of loosening the timing constraint on the mux
49 | control signal. For whatever that's worth.
50 | 
51 | 
52 | 


--------------------------------------------------------------------------------
/notes/20231002.mkdn:
--------------------------------------------------------------------------------
 1 | Got most of RV32 implemented at this point.
 2 | 
 3 | Missing: FENCE and SYSTEM (and so the CSRs and whatnot)
 4 | 
 5 | Current design's synthesizing a lot larger than I'd like. Adding the byte and
 6 | halfword loads and stores enlarged things _significantly._ Removing
 7 | byte/halfword _store_ support saves ... not a lot. 4 LUTs. Removing
 8 | byte/halfword and extending _loads_ saves 105 LUTs.
 9 | 
10 | SLT and friends are usually the critical path, which has happened to me before.
11 | Making them as simple as possible is basically what I've got.
12 | 
13 | I think I might need to take a step back and try to simplify. For instance, I've
14 | currently got two ALUs, effectively -- one for register-immediate and one for
15 | register-register. They're never used on the same cycle.
16 | 
17 | ---
18 | 
19 | Okay, starting to simplify by more closely mirroring my work on Dinky5 2+ years
20 | ago. I've figured something out.
21 | 
22 | Partially implemented core costs:
23 | - 666 LCs with 32-bit PC, but
24 | - 396 LCs with 8-bit PC
25 | 
26 | So, that's a lot of the size I'm seeing. Dinky5 used a shrunken PC.
27 | 
28 | Why's PC so expensive?
29 | 
30 | - Well there's PC+4, PC+IMMU, PC+IMMJ, PC+IMMB...
31 | - And the relevant muxes for selecting among them
32 | 
33 | 
34 | 
35 | ```
36 | 1100011   Bxx         rs2
37 | 0110011   ALU r-r     rs2
38 | 0010011   ALU r-i     I-format immediate
39 | ```
40 | 


--------------------------------------------------------------------------------
/notes/20231003.mkdn:
--------------------------------------------------------------------------------
  1 | Okay, got a complete port of Dinky done, and ... it's considerably larger.
  2 | 
  3 | Like, 50% larger.
  4 | 
  5 | I have admittedly added some features to the design, so, let me parameterize
  6 | those so I can evaluate with and without them.
  7 | 
  8 | # Alignment checks
  9 | 
 10 | Present in Dinky5, but, easy enough to make conditional.
 11 | 
 12 | With: 925 LC / 47.88 MHz
 13 | Without: 912 LC / 44.75 MHz
 14 | 
 15 | # Wait states / bus ready lines
 16 | 
 17 | DinkyBus doesn't allow wait states. The icestick demo doesn't make much use of
 18 | them.
 19 | 
 20 | Removed: 907 LC / 44.29 MHz
 21 | 
 22 | # PC implemented width
 23 | 
 24 | Dinky5 demo on Icestick used an 8-bit PC. PC width doesn't automatically shrink
 25 | to fit memory bus implementation width, because PC is program visible in
 26 | situations like JAL.
 27 | 
 28 | Parameterizing it down to 8 bits: 830 LC / 47.54 MHz
 29 | 
 30 | Turning alignment checks back on: 846 LC
 31 | 
 32 | # Halting debug
 33 | 
 34 | While halting debug isn't wired up in the icestick demo, maybe I should disable
 35 | it conclusively. This is a matter of
 36 | 
 37 | 1. Converting all checks of the `halt_request` pin into constant 0s
 38 | 2. Removing the entire HALT case from the switch.
 39 | 
 40 | 838 LC / 43.78 MHz. So, some halt logic _was_ getting built in. But not very
 41 | much.
 42 | 
 43 | # Fixing PC on fetch
 44 | 
 45 | Turns out, any overlapped fetch was using the wrong PC value, failing to lop off
 46 | its bottom two bits. Bet this was producing some extra muxes!
 47 | 
 48 | 836 LC - so, uh, yeah, technically, but not a lot.
 49 | 
 50 | 
 51 | # Asserting `mem_in.ready` on loads
 52 | 
 53 | whoops
 54 | 
 55 | No impact on the icestick demo since the memory there ignores it.
 56 | 
 57 | 
 58 | # Instruction mix
 59 | 
 60 | Motherfucker.
 61 | 
 62 | The synthesis tools are being clever about the contents of RAM. Changing the
 63 | instruction mix affects synthesized size.
 64 | 
 65 | Using Dinky5's test program (under the assumption that this shit was happening
 66 | before) I get 764 LCs.
 67 | 
 68 | By filling RAM with random bits, we're up to 955 LCs with the parameterization
 69 | above.
 70 | 
 71 | Without it: 1217
 72 | 
 73 | Well, that makes all of these results really hard to compare then.
 74 | 
 75 | 
 76 | # Retesting with random data in memory
 77 | 
 78 | - Disabling alignment checks: -22 LUTs
 79 | - Disabling ready lines from memory: -6 LUTs (note: memory wasn't relying on
 80 |   them)
 81 | - Shrinking PC: -252 LUTs
 82 | - Disabling halting debug: -7 LUTs (note: interface was not connected)
 83 | 
 84 | Moving PC displacement immediate selection into decode step and sharing the PC
 85 | adder across various control flow instructions: made shit bigger. But also
 86 | faster.
 87 | 
 88 | ---
 89 | 
 90 | Ah, shit, the icestick eval demo wasn't wiring up writes to lanes 2 and 3 of
 91 | RAM. That likely explains some things. Doing so costs about 52 LUTs. :(
 92 | 
 93 | However, that also seems to stop the synthesis tools from being excessively
 94 | smart about RAM contents -- I now get the same results for a small program as I
 95 | do for random bits. Still gonna eval with random bits tho.
 96 | 
 97 | 
 98 | Noticed that the shift amount always comes from `comp_rhs`. Tried using the
 99 | bottom 5 bits of `comp_rhs` as the shift amount during shifting, decrementing
100 | it. This makes things bigger. If I need a decrementer anyway, registering the
101 | output is very cheap.
102 | 
103 | 
104 | ---
105 | 
106 | Okay, so. With some incremental improvements, I've got the CPU in the following
107 | configuration:
108 | 
109 | - 256-word memory (so 1024 byte, or 2 BRAMs).
110 | - PC width constrained to 10 (i.e. all of the 256-word memory space)
111 | - alignment checks, illegal instruction checks, and bus wait states ENABLED.
112 | - Full 32-register set.
113 | 
114 | ... fitting into 965 LCs, or 75% of an hx1k. (Turning off alignment checks: 934
115 | LCs.) Timing suggests a maximum clock rate of 46 MHz.
116 | 
117 | Mostly I've been pulling logic that was conditional on state values / opcode
118 | _up,_ into the top level, so that its LUTs lose their conditional inputs. This
119 | has helped with both size and speed, though the speed's been tremendously
120 | variable.
121 | 
122 | I think this is probably a more honest representation of the size of the core,
123 | whereas I think Dinky may have "optimized" itself.
124 | 
125 | PicoRV32 doesn't post iCE40 synthesis results, but people are quoting ~1500 LUTs
126 | and saying it won't fit on an hx1k on Reddit. So, my core may be smaller, though
127 | of course picoRV32 implements some features I haven't, like interrupts.
128 | 
129 | ---
130 | 
131 | Turns out some of the halt logic was being synthesized.
132 | 
133 | - Applying a basic fix to that gets us down to 958 LCs (-7)
134 | - Disabling halting debug on icestick (it's not wired up!) gets us to... 968?
135 |   +10? Really?
136 | 
137 | ---
138 | 
139 | Random thoughts on how to be smaller
140 | 
141 | - Could probably get away with a 16-bit datapath.
142 | - 
143 | 


--------------------------------------------------------------------------------
/notes/20231004.mkdn:
--------------------------------------------------------------------------------
  1 | Starting to play with some more invasive techniques for reducing size.
  2 | 
  3 | Saved 20 LUTs by converting basically all use of funct3 to one-hot. I think this
  4 | is mostly working by converting a bunch of muxes to ORs. Gonna see if I can
  5 | break down the savings.
  6 | 
  7 | Converting load alignment detect logic: no immediate savings
  8 | 
  9 | Also converting load result assembly logic: -8 LUTs
 10 | 
 11 | Converting store logic: -3 LUTs
 12 | 
 13 | Converting branch condition logic: -14 LUTs (!)
 14 | 
 15 | Converting ALU result logic: -2 LUTs
 16 | 
 17 | Switching PC immediate adder to use a muxed operand based on partially decoded
 18 | opcode bits: +58 LUTs! fuuuuuck that
 19 | 
 20 | Switching load and store EA computation and alignment checks to use common
 21 | logic: -6 LUTs
 22 | 
 23 | Centralizing load shifter to not be opcode sensitive: +3 LUTs
 24 | 
 25 | ---
 26 | 
 27 | Merely switching state encoding to one-hot: +15 LUTs
 28 | 
 29 | Honestly fewer than I expected
 30 | 
 31 | ---
 32 | 
 33 | Wrote a new RV32I implementation (not including ALU ops, atm) using entirely
 34 | structural one-hot primitives -- not an if or switch to be found -- and it's
 35 | already at 821 LCs. 281 FFs.
 36 | 
 37 | So, I think the "naive RV32" approach will tend to inherently approach 900-950
 38 | LCs on iCE40.
 39 | 
 40 | Wondering about alternative approaches I could use to synthesize something
 41 | lighter without going all the way to bitserial. SERV's numbers are a good
 42 | illustration of the cost of control logic: at 32+ more cycles per instruction,
 43 | it's only about 1/3 the size.
 44 | 
 45 | I keep thinking about a 16-bit datapath implementation, a la 68000. The iCE40's
 46 | internal RAMs are 16-bit, so
 47 | 
 48 | - Use one for a register file with separate entries for the high and low half of
 49 |   each general purpose register (so, 64/256 entries consumed)
 50 | - Use one as a 16-bit-wide RAM.
 51 | - Operate in halfwords.
 52 | 
 53 | Things like addition/subtraction/comparison would need carry flags to link one
 54 | operation to the next. Shifts seem more interesting; it might be worth having an
 55 | actual 32-bit shift register in an otherwise 16-bit implementation.
 56 | 
 57 | Sketched execution of some instructions:
 58 | 
 59 | ```
 60 | AUIPC x1, 0xAAAAA000
 61 | 
 62 | 1. Load low half of instruction from RAM.
 63 |     - Note: have at this point: opcode, rd, funct3, low bit of rs1, some
 64 |       immediate bits
 65 | 2. Load high half of instruction from RAM.
 66 |     - Now we've got rs1 and rs2
 67 | 3. Add low half of PC to low half of U-type immediate. Set carry flag
 68 |    appropriately. Write result to low half of rd.
 69 | 4. Add upper half of PC to upper half of U-type immediate and carry flag. Write
 70 |    result to upper half of rd.
 71 |    - In this cycle we could also begin instruction fetch at PC+4 and set it up
 72 |      to latch into PC at end of cycle
 73 | 
 74 | ADD x1, x2, x3
 75 | 
 76 | 1. Low fetch
 77 | 2. High fetch
 78 |     - Can start addressing registers here
 79 | 3. Low half of a register becomes available, start fetching low half of the
 80 |    other
 81 | 4. Low half of second register becomes available, start fetching high half of
 82 |    the first. Set up the ALU to add the low halves and set carry. Write to low
 83 |    half of destination.
 84 | 5. High half of first register becomes available, start fetching high half of
 85 |    other
 86 | 6. High half of both registers available, set up the ALU to add the high halves
 87 |    using saved carry. Write to high half of destination.
 88 | ```
 89 | 
 90 | So if I could arrange things into a general pattern that doesn't require fully
 91 | realized microcode, that'd rock. Candidate states:
 92 | 
 93 | - Low Fetch
 94 | - High Fetch and Register 2 Low Load
 95 | - Register 1 Low Load
 96 | - Low Operate, Register 2 High Load
 97 | - Register 1 High Load
 98 | - High Operate (normally overlapped with Low Fetch)
 99 | 
100 | For loads, one possible sequence is
101 | 
102 | - Register 1 Low Load
103 | - Add low to immediate, register 1 high load. Latch result as low bits of EA.
104 | - Add high to immediate. Use result as high bits of EA. Issue load for first
105 |   halfword.
106 | - Write result into rd low once it comes back. If the load is a word load, issue
107 |   load for second halfword.
108 | - For word loads, write result into rd high, otherwise set or reset it according
109 |   to sign extension and contents of low halfword.
110 | 
111 | Word stores might resemble
112 | 
113 | - Register 1 Low Load
114 | - Add low to immediate, register 1 high load. Latch result as low bits of EA.
115 | - Add high to immediate. Use result as high bits of EA. Register 2 low load.
116 | - Issue store for low halfword. Register 2 high load. EA increment.
117 | - Issue store for high halfword.
118 | 
119 | ...while byte and halfword stores could skip the final cycle, if desired.
120 | 
121 | The need to have the entire EA handy before making the low halfword store is a
122 | strong argument for loading rs1 before rs2, unlike in my 32-bit implementations
123 | that wind up being able to save time by doing it in the other order.
124 | 
125 | If I can limit the EA to < 32 bits, the argument gets even stronger. In the
126 | extreme case, a core with a 16-bit physical address space could avoid dealing
127 | with the top half of rs1 in loads/stores entirely.
128 | 
129 | 
130 | Operating on RV32 immediates in 16-bit units means, effectively, twice as many
131 | possible immediate inputs into an adder. Assuming the same adder serves both top
132 | and bottom halfwords. Because if it doesn't ... what exactly am I doing
133 | 
134 | 
135 | 


--------------------------------------------------------------------------------
/notes/20231005.mkdn:
--------------------------------------------------------------------------------
  1 | Ok. Loads.
  2 | 
  3 | Currently, by the time we get to OPERATE-HI, we have
  4 | - The LSBs of the EA in `mar_lo`.
  5 | - The MSBs of the EA on the adder output.
  6 | 
  7 | We can put that on the bus and goto LOAD.
  8 | 
  9 | If we're doing a halfword load,
 10 | 
 11 | - Write the loaded value into the low half of the destination register.
 12 | - Record the top bit somewhere (signed) or a zero (unsigned)
 13 | - Fill the top half of the register with that value and goto fetch.
 14 | 
 15 | If we're doing a byte load,
 16 | - Write either the low or high byte (depending on the bottom bit of EA) into the
 17 |   destination register's low half, clearing the top byte.
 18 | - Record the top bit somewhere (signed) or a zero (unsigned)
 19 | - Fill the top half of the register with that value and goto fetch.
 20 | 
 21 | If we're doing a word load, things are more interesting. We need to issue a
 22 | second memory transaction.
 23 | 
 24 | The new EA will be two higher than the last one. However, if we require loads to
 25 | be aligned, then all we have to do is bitwise-OR 2 into the EA. No adder
 26 | required, no carry chain involved.
 27 | 
 28 | So, we can go ahead and issue the load from `mar_lo` and the adder output, and
 29 | then transition to a load high state that stores the result.
 30 | 
 31 | ---
 32 | 
 33 | Muxes
 34 | 
 35 | ```
 36 | adder_rhs
 37 |     imm_u[15:0]
 38 |     imm_u[31:16]
 39 |     imm_j[15:0]
 40 |     imm_j[31:16]
 41 |     imm_i[15:0]
 42 |     imm_i[31:16]
 43 |     imm_s[15:0]
 44 |     imm_s[31:16]
 45 |     rf.read_resp
 46 |     rf.read_resp ^ 16{inst[30]} (add/sub instruction only?)
 47 |     ~rf.read_resp (compares)
 48 |     imm_b[15:0]
 49 |     imm_b[31:16]
 50 | 
 51 | accum next
 52 |     pc[15:0]
 53 |     rf.read_resp
 54 |     pc[31:16]
 55 |     accum[14:0], shift_lo[15]   (shift left)
 56 |     shift_fill, accum[15:1]     (shift right)
 57 |     16{load_result[15]} (for signed loads)
 58 |     zero (for unsigned loads)
 59 |     unchanged
 60 | 
 61 | adder_carry_in
 62 |     0   might be able to eliminate these constants by
 63 |     1    flipping saved_carry
 64 |     saved_carry
 65 |     inst[30]
 66 | 
 67 | saved carry next
 68 |     0
 69 |     adder carry out
 70 |     unchanged (used to carry contents between operate phases)
 71 | 
 72 | rf write payload
 73 |     zeroes by default but probably not really used
 74 |     immu[15:0]
 75 |     adder_result
 76 |     (pc+4)[15:0]
 77 |     (pc+4)[15:0]
 78 |     (pc+4)[31:16]
 79 |     (pc+4)[31:16]
 80 |     accum ^ reg
 81 |     accum | reg
 82 |     accum & reg
 83 |     accum ^ imm_i[15:0]
 84 |     accum | imm_i[15:0]
 85 |     accum & imm_i[15:0]
 86 |     accum^imm_i[31:16]
 87 |     accum|imm_i[31:16]
 88 |     accum&imm_i[31:16]
 89 |     immu[31:16]
 90 |     accum
 91 |     load_result
 92 |     shift_lo
 93 |     zeroes (halted, not real)
 94 | 
 95 | mem out payload addr
 96 |     zeroes (fake)
 97 |     pc[31:1]
 98 |     pc[31:1] + 1
 99 |     { adder_result, mar_lo[15:1] }
100 |     { adder_result, mar_lo[15:1] } | 1
101 | ```
102 | 
103 | states
104 | 
105 | ```
106 | lo              hi
107 | reset
108 | fetch_lo        fetch_hi
109 | inst_reg1_lo
110 |                 fetch_hi_wait
111 | reg2_lo         reg2_hi
112 | operate_lo      operate_hi
113 | halted
114 | branch_lo       branch_hi
115 | load            load_hi
116 |                 load_hi_wait
117 |                 fill_msbs
118 |                 store_hi
119 | finish_shift
120 | ```
121 | 
122 | ---
123 | 
124 | I feel like I could get this smaller by doing some more fixed-function stuff and
125 | possibly using more states/cycles.
126 | 
127 | Also, decompose the state into, say, a state variable and a "active halfword"
128 | bit that determines whether we're operating high or low.
129 | 
130 | ```
131 | ALU x1, x2, x3 (not shift)
132 | 
133 | fetch low halfword of instruction
134 | clear zero and carry flags
135 | 
136 | low halfword of instruction available
137 | fetch high halfword of instruction
138 | latch low halfword into bottom half of inst
139 | 
140 | high halfword of instruction available
141 | latch high halfword into top half of inst
142 | set up read of rs1.lo
143 | 
144 | READ_RS2/LO
145 | rs1.lo available
146 | set up read of rs2.lo
147 | latch rs1.lo into accum
148 | 
149 | OPERATE/LO
150 | rs2.lo available
151 | compute ALU result of (accum, rs2.lo)
152 | set up write of ALU result into rd.lo
153 | latch carry and zero output
154 | set up read of rs1.hi
155 | flip active to high
156 | 
157 | READ_RS2/HI
158 | rs1.hi available
159 | set up read of rs2.hi
160 | latch rs1.hi into accum
161 | 
162 | OPERATE/HI
163 | rs2.hi available
164 | compute ALU result of (accum, rs2.hi)
165 | set up write of ALU result into rd.hi
166 | latch carry and zero output
167 | flip active to low (toggle it?)
168 | 
169 | increment PC
170 | ```
171 | 


--------------------------------------------------------------------------------
/notes/20231006.mkdn:
--------------------------------------------------------------------------------
 1 | Alright, sprinting last night and this morning I have a revised state model
 2 | working. This explicitly reuses logic between low and high halfwords where
 3 | feasible, separating state into a (state, hi-halfword) pair. Otherwise it
 4 | applies no aggressive optimization -- no one-hot state, no explicitly parallel
 5 | logic.
 6 | 
 7 | Currently: 786 LCs (61%)
 8 | 47 MHz
 9 | 
10 | CP: regfile -> adder RHS -> adder -> accumulator?
11 | 
12 | Current instruction timings:
13 | 
14 | ```
15 | LUI     6
16 | AUIPC   6
17 | JAL     6
18 | JAL     6
19 | JALR    7
20 | Bxx     9 if taken
21 |         7 if not
22 | Lxx     9
23 | SW      8
24 | SB/SH   7
25 | 
26 | ALU     7
27 | 
28 | shift   9 + amount
29 | ```
30 | 
31 | So if we call 7 the average, we get 6.714 MIPS.
32 | 
33 | In many cases I can probably knock a cycle off instructions, because I'm not
34 | currently doing overlapped fetch. But, that's not my current priority. My
35 | current priority is size.
36 | 
37 | Currently SLT/SLTU are not implemented, need to fix that. They're a little
38 | tricky because the LSB of the result depends on the MSBs of the input. Might
39 | need an auxiliary state for them. But, those being missing will be throwing off
40 | both my timing and area reports.
41 | 
42 | ---
43 | 
44 | Okay. Got SLT/SLTU implemented, they wanted another state, works now.
45 | 
46 | We're bigger: 831 LCs. (this is all still with `addr_width = 8`.)
47 | 
48 | ---
49 | 
50 | Down to 825 with some simplifications in SLT.
51 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [project]
 2 | dynamic = ["version"]
 3 | 
 4 | name = "hapenny"
 5 | 
 6 | requires-python = "~=3.8"
 7 | dependencies = [
 8 |     "amaranth[builtin-yosys]@git+https://github.com/amaranth-lang/amaranth",
 9 |     "amaranth-boards@git+https://github.com/amaranth-lang/amaranth-boards",
10 |     "yowasp-yosys",
11 | ]
12 | 
13 | [project.optional-dependencies]
14 | debug = ["jtagtap"]
15 | 
16 | [build-system]
17 | requires = ["pdm-backend"]
18 | build-backend = "pdm.backend"
19 | 
20 | [tool.pdm.scripts]
21 | _.env_file = ".env.toolchain"
22 | test.composite = ["test-code"]
23 | test-code.cmd = "python -m unittest discover -t . -s tests -v"
24 | 


--------------------------------------------------------------------------------
/smallest-toggle.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cbiffle/hapenny/10d6af538bb47feb26f5fa807f0c2ae6b64f2e9e/smallest-toggle.bin


--------------------------------------------------------------------------------
/tiny-bootloader.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cbiffle/hapenny/10d6af538bb47feb26f5fa807f0c2ae6b64f2e9e/tiny-bootloader.bin


--------------------------------------------------------------------------------
/tinyboot-upduino-chonk.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cbiffle/hapenny/10d6af538bb47feb26f5fa807f0c2ae6b64f2e9e/tinyboot-upduino-chonk.bin


--------------------------------------------------------------------------------
/tinyboot/.cargo/config:
--------------------------------------------------------------------------------
 1 | [build]
 2 | target = "riscv32i-unknown-none-elf"
 3 | 
 4 | [target.rv32-hapenny]
 5 | rustflags = [
 6 |     "-C", "link-arg=-Tlink.x",
 7 | ]
 8 | 
 9 | [target.riscv32i-unknown-none-elf]
10 | rustflags = [
11 |     "-C", "relocation-model=pie",
12 |     "-C", "link-arg=-Tlink.x",
13 | ]
14 | 
15 | # For size comparison
16 | [target.thumbv6m-none-eabi]
17 | rustflags = [
18 |     "-C", "link-arg=-Tlink.x",
19 | ]
20 | [target.thumbv7em-none-eabi]
21 | rustflags = [
22 |     "-C", "link-arg=-Tlink.x",
23 | ]
24 | 


--------------------------------------------------------------------------------
/tinyboot/Cargo.lock:
--------------------------------------------------------------------------------
 1 | # This file is automatically @generated by Cargo.
 2 | # It is not intended for manual editing.
 3 | version = 3
 4 | 
 5 | [[package]]
 6 | name = "autocfg"
 7 | version = "1.1.0"
 8 | source = "registry+https://github.com/rust-lang/crates.io-index"
 9 | checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"
10 | 
11 | [[package]]
12 | name = "cfg-if"
13 | version = "1.0.0"
14 | source = "registry+https://github.com/rust-lang/crates.io-index"
15 | checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
16 | 
17 | [[package]]
18 | name = "hapenny-tinyboot"
19 | version = "0.1.0"
20 | dependencies = [
21 |  "cfg-if",
22 |  "parse_int",
23 | ]
24 | 
25 | [[package]]
26 | name = "num-traits"
27 | version = "0.2.17"
28 | source = "registry+https://github.com/rust-lang/crates.io-index"
29 | checksum = "39e3200413f237f41ab11ad6d161bc7239c84dcb631773ccd7de3dfe4b5c267c"
30 | dependencies = [
31 |  "autocfg",
32 | ]
33 | 
34 | [[package]]
35 | name = "parse_int"
36 | version = "0.6.0"
37 | source = "registry+https://github.com/rust-lang/crates.io-index"
38 | checksum = "2d695b79916a2c08bcff7be7647ab60d1402885265005a6658ffe6d763553c5a"
39 | dependencies = [
40 |  "num-traits",
41 | ]
42 | 


--------------------------------------------------------------------------------
/tinyboot/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "hapenny-tinyboot"
 3 | version = "0.1.0"
 4 | edition = "2021"
 5 | 
 6 | [dependencies]
 7 | cfg-if = "1.0.0"
 8 | 
 9 | [profile.release]
10 | opt-level = "s"
11 | debug = true
12 | 
13 | [[bin]]
14 | name = "tinyboot"
15 | path = "src/main.rs"
16 | test = false
17 | bench = false
18 | 
19 | [build-dependencies]
20 | parse_int = "0.6.0"
21 | 


--------------------------------------------------------------------------------
/tinyboot/README.mkdn:
--------------------------------------------------------------------------------
  1 | # hapenny tinyboot
  2 | 
  3 | This is a minimal boot ROM and monitor written in Rust. It interacts with a host
  4 | over a serial port and provides operations for reading, writing, and calling
  5 | memory.
  6 | 
  7 | This is directly inspired by Frank Sergeant's "3-Instruction Forth;" like that
  8 | system, this is not Forth.
  9 | 
 10 | ## Building
 11 | 
 12 | `cargo build --release`
 13 | 
 14 | This will generate an ELF file. To extract a binary file suitable for handing to
 15 | the SoC generator, run:
 16 | 
 17 | ```
 18 | riscv32-elf-objcopy -Obinary \
 19 |     target/riscv32i-unknown-none-elf/release/tinyboot \
 20 |     path-to-your-output-file.bin
 21 | ```
 22 | 
 23 | In practice, you probably want to override the UART address. You can do that
 24 | like this:
 25 | 
 26 | ```
 27 | TINYBOOT_UART_ADDR=0x01_0000 cargo build --release
 28 | ```
 29 | 
 30 | ## Serial protocol
 31 | 
 32 | All the default examples bring up the UART at 115,200 baud, though you can
 33 | change this if you like -- it's in the HDL, not the Rust code.
 34 | 
 35 | The protocol is a very simple command-response scheme implementing six
 36 | commands. It's a binary protocol; examples below will be shown in hex, but
 37 | typing hex digits into the serial port won't do what you want. See the `montool`
 38 | in this same repo for a portable command line tool.
 39 | 
 40 | ### Call (0x00)
 41 | 
 42 | Send: `00`
 43 | Response: `AA`
 44 | 
 45 | Calls the address in the A register. Loads the tinyboot setup routine's address
 46 | into `ra` during the call, so if the code you call returns, it'll hop right back
 47 | into tinyboot.
 48 | 
 49 | ### Write (0x01)
 50 | 
 51 | Send: `01 nn nn nn nn ...`
 52 | Response: `AA`
 53 | 
 54 | Writes words to memory starting at the address in the A register and continuing
 55 | for the count in the C register. Decrements the C register by 1 per word, and
 56 | increments the A register by 4 per word.
 57 | 
 58 | Words should be sent after the command byte in little-endian format.
 59 | 
 60 | The ACK byte will arrive after all words have been transmitted.
 61 | 
 62 | ### Read (0x02)
 63 | 
 64 | Send: `02`
 65 | Response: `AA nn nn nn nn ...`
 66 | 
 67 | Reads words from memory starting at the address in the A register and continuing
 68 | for the count in the C register. Decrements the C register by one per word, and
 69 | increments the A register by 4 per word.
 70 | 
 71 | The ACK response is sent first, followed by the requested number of words in
 72 | little-endian format.
 73 | 
 74 | ### Load A (0x03)
 75 | 
 76 | Send: `03 ww xx yy zz`
 77 | Response: `AA`
 78 | 
 79 | Loads a new value into the A register. The value must be sent after the command
 80 | byte in little-endian format (so in this example, the value loaded is
 81 | `0xzzyyxxww`).
 82 | 
 83 | ### Load C (0x04)
 84 | 
 85 | Send: `04 ww xx yy zz`
 86 | Response: `AA`
 87 | 
 88 | Loads a new value into the C register. The value must be sent after the command
 89 | byte in little-endian format (so in this example, the value loaded is
 90 | `0xzzyyxxww`).
 91 | 
 92 | Note that the count held in the C register is always measured in _words,_ not
 93 | bytes.
 94 | 
 95 | ### Ping (0x05)
 96 | 
 97 | Send: `05`
 98 | Response: `AA`
 99 | 
100 | Basic verification that the bootloader is responding.
101 | 
102 | ## Configuring for your board
103 | 
104 | The environment variable `TINYBOOT_UART_ADDR` determines the location of the
105 | UART in the address space, which is currently the only configurable part of the
106 | bootloader.
107 | 
108 | The binary itself is position-independent so the location of your boot ROM
109 | doesn't matter...except for the following.
110 | 
111 | **Note:** The linker script currently assumes that there is useful stack memory
112 | located _immediately below_ the location where tinyboot is loaded.
113 | 


--------------------------------------------------------------------------------
/tinyboot/build.rs:
--------------------------------------------------------------------------------
 1 | use std::{env::VarError, path::PathBuf};
 2 | use std::io::Write;
 3 | 
 4 | fn main() {
 5 |     println!("cargo:rerun-if-changed=link.x");
 6 |     println!("cargo:rerun-if-env-changed=TINYBOOT_UART_ADDR");
 7 | 
 8 |     let addr_input = match std::env::var("TINYBOOT_UART_ADDR") {
 9 |         // Ugh why is this not an Option
10 |         Err(VarError::NotPresent) => None,
11 |         Ok(result) => Some(result),
12 |         e => panic!("{:?}", e),
13 |     };
14 | 
15 |     let addr = match addr_input {
16 |         None => {
17 |             println!("cargo:warning=note: UART address not provided, defaulting to 0x200");
18 |             0x200
19 |         }
20 |         Some(text) => {
21 |             parse_int::parse::<u32>(&text).unwrap()
22 |         }
23 |     };
24 | 
25 |     let mut out = PathBuf::from(std::env::var_os("OUT_DIR").unwrap());
26 |     out.push("peripherals.rs");
27 | 
28 |     let mut f = std::fs::File::create(&out).unwrap();
29 |     writeln!(f, "pub const UART_ADDR: u32 = 0x{addr:x};").unwrap();
30 | }
31 | 


--------------------------------------------------------------------------------
/tinyboot/link.x:
--------------------------------------------------------------------------------
 1 | MEMORY {
 2 |     PROGMEM (rwx): ORIGIN = 0x8000, LENGTH = 512
 3 |     RAM (rw): ORIGIN = 0x0000, LENGTH = 32K
 4 | }
 5 | 
 6 | EXTERN(__start);
 7 | ENTRY(__start);
 8 | 
 9 | SECTIONS {
10 |     PROVIDE(__stack_start = ORIGIN(RAM) + LENGTH(RAM));
11 | 
12 |     PROVIDE(__stext = ORIGIN(PROGMEM));
13 | 
14 |     .text __stext : {
15 |         *(.start);
16 | 
17 |         *(.text .text.*);
18 | 
19 |         . = ALIGN(4);
20 |         __etext = .;
21 |     } > PROGMEM
22 | 
23 |     .rodata : ALIGN(4) {
24 |         . = ALIGN(4);
25 |         __srodata = .;
26 |         *(.rodata .rodata.*);
27 |         . = ALIGN(4);
28 |         __erodata = .;
29 |     } > PROGMEM
30 | 
31 |     /DISCARD/ : {
32 |         /* throw away RAM sections to get a link error if they're used. */
33 |         *(.bss);
34 |         *(.bss.*);
35 |         *(.data);
36 |         *(.data.*);
37 |         *(COMMON);
38 |         *(.ARM.exidx);
39 |         *(.ARM.exidx.*);
40 |         *(.ARM.extab.*);
41 |         *(.got);
42 |     }
43 | }
44 | 


--------------------------------------------------------------------------------
/tinyboot/rust-toolchain.toml:
--------------------------------------------------------------------------------
1 | [toolchain]
2 | channel = "1.73"
3 | targets = [ "riscv32i-unknown-none-elf"]
4 | profile = "minimal"
5 | components = [ "rustfmt", "rust-analyzer" ]
6 | 


--------------------------------------------------------------------------------
/tinyboot/src/main.rs:
--------------------------------------------------------------------------------
  1 | #![no_main]
  2 | #![no_std]
  3 | 
  4 | // World's cheapest RISC-V "runtime" - only works because we don't use non-stack
  5 | // RAM (as ensured by our linker script)
  6 | core::arch::global_asm! {
  7 |     "
  8 |     .pushsection .start,\"ax\",%progbits
  9 |     .globl __start
 10 |     __start:
 11 |         # initialize stack pointer
 12 | 1:      auipc sp, %pcrel_hi(__stack_start)
 13 |         addi sp, sp, %pcrel_lo(1b)
 14 |         # No need to fill in a return address, main won't return
 15 |         j main
 16 | 
 17 |     .popsection
 18 |     "
 19 | }
 20 | 
 21 | #[no_mangle]
 22 | pub extern "C" fn main() -> ! {
 23 |     let mut a: *mut u32 = core::ptr::null_mut();
 24 |     let mut c: u32 = 0;
 25 |     loop {
 26 |         match getc() {
 27 |             0 => unsafe {  // Call
 28 |                 ack();
 29 |                 core::arch::asm!(
 30 |                     "
 31 |                     # restart monitor if program returns.
 32 |                  1: auipc ra, %pcrel_hi(__start)
 33 |                     addi ra, ra, %pcrel_lo(1b)
 34 | 
 35 |                     jr a0               # activate routine
 36 |                     ",
 37 |                     in("a0") a,
 38 |                     options(noreturn),
 39 |                 );
 40 |             }
 41 |             1 => {  // Write
 42 |                 while c > 0 {
 43 |                     c -= 1;
 44 |                     let word = get32();
 45 |                     unsafe {
 46 |                         a.write_volatile(word);
 47 |                         a = a.add(1);
 48 |                     }
 49 |                 }
 50 |                 ack();
 51 |             }
 52 |             2 => {  // Read
 53 |                 ack();
 54 |                 while c > 0 {
 55 |                     c -= 1;
 56 |                     let word = unsafe { a.read_volatile() };
 57 |                     unsafe { a = a.add(1); }
 58 |                     put32(word);
 59 |                 }
 60 |             }
 61 |             3 => {  // Load A
 62 |                 a = get32() as _;
 63 |                 ack();
 64 |             }
 65 |             4 => {  // Load C
 66 |                 c = get32();
 67 |                 ack();
 68 |             }
 69 |             5 => { // Just ping
 70 |                 ack();
 71 |             }
 72 |             _ => {
 73 |                 putb(0xFF);
 74 |             }
 75 |         }
 76 |     }
 77 | }
 78 | 
 79 | #[inline(never)]
 80 | fn ack() {
 81 |     putb(0xAA);
 82 |     flush();
 83 | }
 84 | 
 85 | #[inline(never)]
 86 | fn get32() -> u32 {
 87 |     let mut word = u32::from(getc());
 88 |     word |= u32::from(getc()) << 8;
 89 |     word |= u32::from(getc()) << 16;
 90 |     word |= u32::from(getc()) << 24;
 91 |     word
 92 | }
 93 | 
 94 | //#[inline(never)]
 95 | fn put32(word: u32) {
 96 |     for b in word.to_le_bytes() {
 97 |         putb(b);
 98 |     }
 99 | }
100 | 
101 | const UARTRX: *mut i16 = generated::UART_ADDR as _;
102 | const UARTTX: *mut u16 = (generated::UART_ADDR + 2) as _;
103 | 
104 | fn txbusy() -> bool {
105 |     unsafe {
106 |         UARTTX.read_volatile() != 0
107 |     }
108 | }
109 | 
110 | fn flush() {
111 |     while txbusy() {
112 |         // spin
113 |     }
114 | }
115 | 
116 | fn putb(b: u8) {
117 |     flush();
118 |     unsafe {
119 |         UARTTX.write_volatile(u16::from(b));
120 |     }
121 | }
122 | 
123 | fn getc() -> u8 {
124 |     loop {
125 |         let status = unsafe { UARTRX.read_volatile() };
126 |         if status >= 0 {
127 |             return status as u8;
128 |         }
129 |     }
130 | }
131 | 
132 | extern "C" {
133 |     // This function is deliberately not implemented to cause a link error if we
134 |     // include a panic.
135 |     fn panic_handler_should_be_optimized_out() -> !;
136 | }
137 | 
138 | #[panic_handler]
139 | fn panic(_info: &core::panic::PanicInfo<'_>) -> ! {
140 |     unsafe {
141 |         panic_handler_should_be_optimized_out()
142 |     }
143 | }
144 | 
145 | mod generated {
146 |     include!(concat!(env!("OUT_DIR"), "/peripherals.rs"));
147 | }
148 | 


--------------------------------------------------------------------------------
/upduino-bootloader.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cbiffle/hapenny/10d6af538bb47feb26f5fa807f0c2ae6b64f2e9e/upduino-bootloader.bin


--------------------------------------------------------------------------------
/upduino-chonk.py:
--------------------------------------------------------------------------------
 1 | import itertools
 2 | import argparse
 3 | import struct
 4 | from pathlib import Path
 5 | 
 6 | from amaranth import *
 7 | from amaranth.lib.wiring import *
 8 | from amaranth.build import ResourceError, Resource, Pins, Attrs
 9 | from amaranth_boards.upduino_v3 import UpduinoV3Platform
10 | 
11 | from hapenny import StreamSig
12 | import hapenny.chonk.cpu
13 | from hapenny.bus import BusPort, SimpleFabric, partial_decode, narrow_addr
14 | from hapenny.chonk.gpio32 import OutputPort32
15 | from hapenny.chonk.serial32 import BidiUart
16 | from hapenny.chonk.mem32 import BasicMemory, SpramMemory
17 | 
18 | RAM_WORDS = 256 * 1
19 | RAM_ADDR_BITS = (RAM_WORDS - 1).bit_length()
20 | 
21 | print(f"boot memory will use {RAM_ADDR_BITS}-bit addressing")
22 | 
23 | bootloader = Path("tinyboot-upduino-chonk.bin").read_bytes()
24 | boot_image = struct.unpack("<" + "I" * (len(bootloader) // 4), bootloader)
25 | 
26 | class Test(Elaboratable):
27 |     def elaborate(self, platform):
28 |         m = Module()
29 | 
30 |         m.submodules.cpu = cpu = hapenny.chonk.cpu.Cpu(
31 |             reset_vector = 0x1_0000,
32 |             # 2 for the width of the fabric's port select
33 |             # 14 for the width of the devices attached to the ports
34 |             # +2 because the fabric is word-addressed but this is in bytes
35 |             addr_width = 2 + 14 + 2,
36 |             # Program addresses only need to be able to address RAM, not
37 |             # I/O, so configure the PC and fetch port to be narrower. (+2
38 |             # because, again, our RAM is word addressed but this
39 |             # parameter is in bytes.)
40 |             prog_addr_width = 1 + 14 + 2,
41 |             counters = True,
42 |         )
43 |         m.submodules.bootmem = bootmem = BasicMemory(depth = RAM_WORDS,
44 |                                                      contents = boot_image)
45 |         m.submodules.bulkmem0 = bulkmem0 = SpramMemory()
46 |         m.submodules.port = port = OutputPort32(1)
47 |         m.submodules.uart = uart = BidiUart(baud_rate = 115200, oversample=8)
48 |         m.submodules.fabric = fabric = SimpleFabric([
49 |             # Put all the potentially executable RAM in the bottom portion of
50 |             # the address space, to allow PC and fetch circuitry to be slightly
51 |             # narrower. This helps with timing.
52 |             bulkmem0.bus,                       # 0x0000_0000
53 |             partial_decode(m, bootmem.bus, 14), # 0x0001_0000
54 |             partial_decode(m, port.bus, 14),    # 0x0002_0000
55 |             partial_decode(m, uart.bus, 14),    # 0x0003_0000
56 |         ])
57 | 
58 |         connect(m, cpu.bus, fabric.bus)
59 |         platform.add_resources([
60 |             Resource("tx", 0, Pins("7", dir="o", conn=("j", 0))),
61 |             Resource("rx", 0, Pins("8", dir="i", conn=("j", 0))),
62 |         ])
63 | 
64 |         tx = platform.request("tx", 0)
65 |         rx = platform.request("rx", 0)
66 | 
67 |         rgb_led = platform.request("rgb_led", 0)
68 |         m.d.comb += [
69 |             rgb_led.r.o.eq(cpu.halted),
70 |             rgb_led.g.o.eq(port.pins[0]),
71 |             tx.o[0].eq(uart.tx),
72 |             uart.rx.eq(rx.i[0]),
73 |         ]
74 | 
75 |         return m
76 | 
77 | parser = argparse.ArgumentParser(
78 |     prog = "upduino-chonk",
79 |     description = "Script for synthesizing a larger UPduino SoC using chonk",
80 | )
81 | args = parser.parse_args()
82 | 
83 | 
84 | p = UpduinoV3Platform()
85 | p.hfosc_div = 2 # divide 48MHz by 2**2 = 12 MHz
86 | p.build(Test(), do_program = True)
87 | 


--------------------------------------------------------------------------------
/upduino-large.py:
--------------------------------------------------------------------------------
 1 | import itertools
 2 | import argparse
 3 | import struct
 4 | from pathlib import Path
 5 | 
 6 | from amaranth import *
 7 | from amaranth.lib.wiring import *
 8 | from amaranth.build import ResourceError, Resource, Pins, Attrs
 9 | from amaranth_boards.upduino_v3 import UpduinoV3Platform
10 | 
11 | from hapenny import StreamSig
12 | import hapenny.cpu
13 | from hapenny.bus import BusPort, SimpleFabric, partial_decode
14 | from hapenny.gpio import OutputPort
15 | from hapenny.serial import BidiUart
16 | from hapenny.mem import BasicMemory, SpramMemory
17 | 
18 | RAM_WORDS = 256 * 1
19 | RAM_ADDR_BITS = (RAM_WORDS - 1).bit_length()
20 | 
21 | print(f"boot memory will use {RAM_ADDR_BITS}-bit addressing")
22 | 
23 | bootloader = Path("upduino-bootloader.bin").read_bytes()
24 | boot_image = struct.unpack("<" + "h" * (len(bootloader) // 2), bootloader)
25 | 
26 | class Test(Elaboratable):
27 |     def elaborate(self, platform):
28 |         m = Module()
29 | 
30 |         m.submodules.cpu = cpu = hapenny.cpu.Cpu(
31 |             reset_vector = 0x0_8000,
32 |             # +1 to adjust from bus halfword addressing to CPU byte
33 |             # addressing.
34 |             addr_width = 2 + 14 + 1,
35 |             # Program addresses only need to be able to address RAM, not
36 |             # I/O, so configure the PC and fetch port to be narrower. (+1
37 |             # because, again, our RAM is halfword addressed but this
38 |             # parameter is in bytes.)
39 |             prog_addr_width = 1 + 14 + 1,
40 |             counters = True,
41 |         )
42 |         m.submodules.bootmem = bootmem = BasicMemory(depth = RAM_WORDS,
43 |                                                      contents = boot_image)
44 |         m.submodules.bulkmem0 = bulkmem0 = SpramMemory()
45 |         m.submodules.port = port = OutputPort(1)
46 |         m.submodules.uart = uart = BidiUart(baud_rate = 115200)
47 |         m.submodules.fabric = fabric = SimpleFabric([
48 |             # Put all the potentially executable RAM in the bottom portion of
49 |             # the address space, to allow PC and fetch circuitry to be slightly
50 |             # narrower. This helps with timing.
51 |             bulkmem0.bus,                       # 0x0000_0000
52 |             partial_decode(m, bootmem.bus, 14), # 0x0000_8000
53 |             partial_decode(m, port.bus, 14),    # 0x0001_0000
54 |             partial_decode(m, uart.bus, 14),    # 0x0001_8000
55 |         ])
56 | 
57 |         connect(m, cpu.bus, fabric.bus)
58 |         platform.add_resources([
59 |             Resource("tx", 0, Pins("7", dir="o", conn=("j", 0))),
60 |             Resource("rx", 0, Pins("8", dir="i", conn=("j", 0))),
61 |         ])
62 |         def get_all_resources(name):
63 |             resources = []
64 |             for number in itertools.count():
65 |                 try:
66 |                     resources.append(platform.request(name, number))
67 |                 except ResourceError:
68 |                     break
69 |             return resources
70 | 
71 |         tx = platform.request("tx", 0)
72 |         rx = platform.request("rx", 0)
73 | 
74 |         rgb_led = platform.request("rgb_led", 0)
75 |         m.d.comb += [
76 |             rgb_led.r.o.eq(cpu.halted),
77 |             rgb_led.g.o.eq(port.pins[0]),
78 |             tx.o[0].eq(uart.tx),
79 |             uart.rx.eq(rx.i[0]),
80 |         ]
81 | 
82 |         return m
83 | 
84 | parser = argparse.ArgumentParser(
85 |     prog = "upduino-large",
86 |     description = "Script for synthesizing a larger UPduino SoC",
87 | )
88 | args = parser.parse_args()
89 | 
90 | 
91 | p = UpduinoV3Platform()
92 | p.hfosc_div = 1 # divide 48MHz by 2**1 = 24 MHz
93 | p.build(Test(), do_program = True)
94 | 


--------------------------------------------------------------------------------