├── .env.toolchain ├── .gitignore ├── README.mkdn ├── boards ├── __init__.py └── icoboard.py ├── dhrystone ├── .gitignore ├── Makefile ├── README.mkdn ├── dhry.h ├── dhry_1.c ├── dhry_2.c ├── sections.lds ├── start.S └── stdlib.c ├── doc ├── chonk.mkdn └── instruction-cycle.svg ├── hapenny ├── __init__.py ├── bus.py ├── chonk │ ├── __init__.py │ ├── cpu.py │ ├── ewbox.py │ ├── fdbox.py │ ├── gpio32.py │ ├── mem32.py │ ├── regfile32.py │ ├── sbox.py │ └── serial32.py ├── cpu.py ├── decoder.py ├── ewbox.py ├── extsram.py ├── fdbox.py ├── gpio.py ├── mem.py ├── regfile16.py ├── rvfi.py ├── sbox.py └── serial.py ├── icestick-chonk.py ├── icestick-smallest.py ├── icesticktest.py ├── icoboard-large.py ├── icolarge-bootloader.bin ├── montool ├── .gitignore ├── Cargo.lock ├── Cargo.toml ├── README.mkdn └── src │ └── main.rs ├── notes ├── 20231001.mkdn ├── 20231002.mkdn ├── 20231003.mkdn ├── 20231004.mkdn ├── 20231005.mkdn └── 20231006.mkdn ├── pdm.lock ├── pyproject.toml ├── sim-chonk.py ├── sim-cpu.py ├── smallest-toggle.bin ├── tiny-bootloader.bin ├── tinyboot-upduino-chonk.bin ├── tinyboot ├── .cargo │ └── config ├── Cargo.lock ├── Cargo.toml ├── README.mkdn ├── build.rs ├── link.x ├── rust-toolchain.toml └── src │ └── main.rs ├── upduino-bootloader.bin ├── upduino-chonk.py └── upduino-large.py /.env.toolchain: -------------------------------------------------------------------------------- 1 | AMARANTH_USE_YOSYS=builtin 2 | YOSYS=yowasp-yosys 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | *.gtkw 3 | *.vcd 4 | .pdm-python 5 | build/ 6 | sim-cpu.v 7 | -------------------------------------------------------------------------------- /README.mkdn: -------------------------------------------------------------------------------- 1 | # `hapenny`: a half-width RISC-V 2 | 3 | `hapenny` is a 32-bit RISC-V CPU implementation that 
operates internally on 4 | 16-bit chunks. This means it takes longer to do things, but uses less space. 5 | 6 | This approach was inspired by the MC68000 (1979), which also implemented a 7 | 32-bit instruction set using a 16-bit datapath. (`hapenny` uses about half as 8 | many cycles per instruction as the MC68000, after optimization.) 9 | 10 | `hapenny` was written to evaluate the Amaranth HDL. 11 | 12 | (The current `hapenny` was formerly version 2; once it became mature enough I 13 | removed version 1.) 14 | 15 | ## Bullet points 16 | 17 | - Over 12M inst/sec on iCE40 HX1K, while occupying under 800 LCs, or less than 18 | 63% of the chip. (Throughput compares favorably to some 32-bit implementations 19 | occupying twice the area.) 20 | - Native 16-bit bus allows for simpler peripherals and external RAMs. (Can run 21 | out of external 16-bit SRAM with no penalty.) 22 | - Parameterized with knobs for trading off size vs capability. 23 | - Implements the RV32I unprivileged instruction set (currently missing FENCE and 24 | SYSTEM). 25 | - Optional interrupt support in the older core. (yet to come in the revised one) 26 | - Written in Python using Amaranth. 27 | 28 | ## But why 29 | 30 | There are a bazillion open-source RISC-V CPU implementations out there, which is 31 | what happens when you release a well-designed and free-to-implement instruction 32 | set spec -- nerds like me will crank out implementations. 33 | 34 | I wrote `hapenny` as an experiment to see if I could target the space between 35 | the PicoRV32 core and the SERV core, in terms of size and performance. I 36 | specifically wanted to produce a CPU with decent performance that could fit into 37 | an iCE40 HX1K part (like on the Icestick evaluation board) with enough space 38 | left over for useful logic. PicoRV32 doesn't quite fit on that chip; SERV fits 39 | but takes 32-64 cycles per instruction. 
40 | 41 | | Property | PicoRV32-small | `hapenny` | SERV | 42 | | ------------------------------ | -------------- | --------- | ---- | 43 | | Datapath width (bits) | 32 | **16** | 1 | 44 | | External data bus width | 32 | **16** | 32 | 45 | | Average cycles per instruction | 5.429 | **5.525** | 40-ish | 46 | | Minimal size on iCE40 (LCs) | 1500-ish | **796** | 200-ish | 47 | | Typical MHz on iCE40 | 40s? | **72+** | 40s? | 48 | 49 | (Cycles/instruction is measured on Dhrystone. Minimal size is the output 50 | produced by the `icestick-smallest.py` script. I would appreciate help getting 51 | apples-to-apples comparison numbers!) 52 | 53 | So, basically, 54 | 55 | - `hapenny` is significantly smaller than a similarly-configured PicoRV32 core 56 | for only 1.7% less performance per clock. (Of course, PicoRV32 is a far more 57 | general and well-tested processor, and in practice you'd configure it with 58 | performance-enhancing features like a dual-port register file and faster 59 | shifts.) 60 | 61 | - `hapenny` is much faster than SERV, but also about 4x larger. (SERV is also 62 | better tested than `hapenny`.) 63 | 64 | `hapenny` is easy to interface to 16-bit peripherals and external memory with no 65 | (additional) performance loss. This can result in smaller overall designs and 66 | simpler boards. For instance, `hapenny` can run at full rate out of the 16-bit 67 | SRAM on the Icoboard. 68 | 69 | Independent from the datapath width, I also did some fairly aggressive manual 70 | register retiming in the decoder and datapath, which means `hapenny` can often 71 | close timing at higher Fmax than other simple RV32 cores. (I miss automatic 72 | retiming from ASIC toolchains.) 73 | 74 | 75 | ## Details 76 | 77 | `hapenny` executes (most of) the RV32I instruction set in 16-bit pieces. It uses 78 | 16-bit memory, a 16-bit (single-ported) register file, and a 16-bit ALU. 
To 79 | perform 32-bit operations, it uses the same techniques a programmer might use in 80 | software on a 16-bit computer, e.g. "chaining" operations using preserved 81 | carry/zero bits. 82 | 83 | All memory interfaces in `hapenny` are synchronous, including the register file, 84 | which is another reason why operations take more cycles. The RV32I register file 85 | is comparatively large (at 1024 bits), and using a synchronous register file 86 | ensures that it can be mapped into an FPGA block RAM if desired. 87 | 88 | Here's what the CPU does during the timing of a typical instruction like `ADD`. 89 | I've color/brightness-coded three different executions that are in flight during 90 | this diagram. 91 | 92 | ![A timing diagram showing a typical instruction cycle.](doc/instruction-cycle.svg) 93 | 94 | - The "FD-Box" is responsible for fetch and decode, and is always working on the 95 | _next_ instruction. It requires three cycles to fetch both halfwords of an 96 | instruction, and then uses the `DECODE` cycle to do initial instruction 97 | decoding and start the read of rs1's low half. (It spends one cycle out of 98 | four essentially idle to make the state machines line up conveniently.) 99 | - The "EW-Box" is responsible for execute and writeback. It goes through at 100 | least four states in every instruction: 101 | - `R2L` starts the load of the low half of rs2 from the register file. 102 | - `OPL` operates on the low halves of rs1 and rs2 (or rs1 and an immediate), 103 | and also starts the load of the high half of rs1. 104 | - `R2H` and `OPH` do the same thing for the high half. 105 | 106 | Most instructions take four cycles, as shown in that diagram. Some take more if 107 | they need to do additional things (by adding states), or if they change control 108 | flow such that the FD-Box's speculative fetch was wrong. 
The CPU test bench 109 | (`sim-cpu.py`) measures the cycle timing for every instruction; here's where 110 | things currently stand: 111 | 112 | | Instruction | Cycles | Notes | 113 | | ------------ | ------ | ----- | 114 | | AUIPC | 4 | | 115 | | LUI | 4 | | 116 | | JAL | 8 | Includes four-cycle re-fetch penalty | 117 | | JALR | 8 | Includes four-cycle re-fetch penalty | 118 | | Branch | 5/10 | Not Taken / Taken | 119 | | Load | 6 | | 120 | | SW | 5 | | 121 | | SB/SH | 4 | | 122 | | SLT(I)(U) | 6 | | 123 | | Shift | 6 + N | N is number of bits shifted | 124 | | Other ALU op | 4 | | 125 | 126 | On the instruction mix in Dhrystone, this yields an average of 5.525 127 | cycles/instruction. 128 | 129 | ## Interfaces 130 | 131 | `hapenny` uses a very simple bus interface with up to 32-bit addressing. In 132 | practice, applications will wire up fewer than 32 address lines, which saves 133 | space. 134 | 135 | | Signal | Driver | Width | Description | 136 | | ---------- | ------ | -------- | ----------- | 137 | | `addr` | CPU | up to 31 | addresses a halfword, i.e. LSB missing | 138 | | `data_out` | CPU | 16 | carries data for a write | 139 | | `lanes` | CPU | 2 | signals a write of either or both bytes in a halfword; zero means a load | 140 | | `valid` | CPU | 1 | when high, indicates that the signals above are valid and starts a bus transaction. | 141 | | `response` | device | 16 | on the cycle after a load, carries back data from the addressed device. | 142 | 143 | The PC can be shrunk separately from the address bus if you know that all 144 | program memory appears in e.g. the bottom half of the address space. This 145 | further saves space. 146 | 147 | The bus interface does not support wait states, to reduce complexity. This makes 148 | it difficult to interface to things like XIP SPI Flash or SDRAM. `hapenny` is 149 | really intended for applications that don't rely on such things. 
150 | 151 | `hapenny` exposes a fairly flexible debug interface capable of inspecting 152 | processor state and reading and writing the register file. These features are 153 | only available when the processor is halted, which can be achieved by holding 154 | `halt_request` high until the processor confirms (at the next instruction 155 | boundary) by asserting `halted`. Release `halt_request` to resume. 156 | 157 | Finally, `hapenny` has an RVFI (RISC-V Formal Interface) trace port for 158 | generating a trace of instruction effects, though I haven't wired up the actual 159 | test suite. 160 | 161 | ## Interrupt options 162 | 163 | Currently, `hapenny` does not support interrupts, but I'm planning on changing 164 | this. (An earlier version did; support was removed when I rearchitected the core 165 | for v2.) 166 | 167 | ## Drawbacks 168 | 169 | - Written by someone who pretends to be an electrical engineer as a way to 170 | procrastinate finishing his slides for a talk. 171 | 172 | - Used for exactly one thing so far, so not exactly battle-hardened. 173 | 174 | - Less general than more mature implementations like PicoRV32 -- e.g. no support 175 | for wait states, hardware multiply, coprocessors, or (currently) interrupts. 176 | 177 | - 16-bit external data bus means that, currently, 32-bit reads/writes are not 178 | atomic -- a problem when interfacing with peripherals with 32-bit 179 | memory-mapped registers. (Peripherals with 16-bit memory-mapped registers work 180 | fine, however.) 181 | 182 | - Not exactly well factored/commented. 183 | 184 | - Written in Python, so chances are pretty good the code won't keep working 185 | across OS updates / minor runtime versions. 186 | 187 | ## What's with the name 188 | 189 | `hapenny` is implemented using about half the logic of other cheap RV32 cores. 190 | 191 | The half-penny, or "ha'penny," is a historical English coin worth (as the name 192 | implies) half a penny. 
So if the other cheap cores cost a penny, this is a 193 | ha'penny. 194 | -------------------------------------------------------------------------------- /boards/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbiffle/hapenny/10d6af538bb47feb26f5fa807f0c2ae6b64f2e9e/boards/__init__.py -------------------------------------------------------------------------------- /boards/icoboard.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | from pathlib import Path 4 | 5 | from amaranth.build import * 6 | from amaranth.vendor import * 7 | from amaranth_boards.resources import * 8 | 9 | 10 | __all__ = ["IcoboardPlatform"] 11 | 12 | 13 | class IcoboardPlatform(LatticeICE40Platform): 14 | device = "iCE40HX8K" 15 | package = "CT256" 16 | default_clk = "clk100" 17 | resources = [ 18 | Resource("clk100", 0, Pins("R9", dir="i"), 19 | Clock(100e6), Attrs(GLOBAL=True, IO_STANDARD="SB_LVCMOS")), 20 | 21 | *LEDResources(pins="C8 F7 K9", attrs=Attrs(IO_STANDARD="SB_LVCMOS")), 22 | 23 | *ButtonResources(pins="K11 P13", attrs=Attrs(IO_STANDARD="SB_LVCMOS")), 24 | 25 | SRAMResource(0, 26 | cs_n="M7", oe_n="L5", we_n="T7", 27 | a="N2 K5 J5 M5 P4 N5 P5 P7 M6 P6 T8 T1 P2 R1 N3 P1 M11 P10 P8", 28 | d="T2 R3 T3 R4 R5 T5 R6 T6 N4 M4 L6 M3 L4 L3 K4 K3", 29 | dm_n="J4 J3", 30 | attrs=Attrs(IO_STANDARD="SB_LVCMOS"), 31 | ), 32 | 33 | *SPIFlashResources(0, 34 | cs_n="R12", clk="R11", copi="P12", cipo="P11", 35 | attrs=Attrs(IO_STANDARD="SB_LVCMOS") 36 | ), 37 | ] 38 | connectors = [ 39 | Connector("pmod", 1, "D8 B9 B10 B11 - - B8 A9 A10 A11 - -"), 40 | Connector("pmod", 2, "A5 A2 C3 B4 - - B7 B6 B3 B5 - -"), 41 | Connector("pmod", 3, "L9 G5 L7 N6 - - N9 P9 M8 N7 - -"), 42 | Connector("pmod", 4, "T15 T14 T11 R10 - - R14 T13 T10 T9 - -"), 43 | ] 44 | 45 | def toolchain_program(self, products, name): 46 | icoprog = os.environ.get("ICOPROG", 
"icoprog") 47 | with products.extract("{}.bin".format(name)) as bitstream_filename: 48 | bitstream = Path(bitstream_filename).read_bytes() 49 | subprocess.run([icoprog, "-p"], input=bitstream, check=True) 50 | 51 | 52 | if __name__ == "__main__": 53 | from amaranth_boards.test.blinky import * 54 | IcoboardPlatform().build(Blinky(), do_program=True) 55 | -------------------------------------------------------------------------------- /dhrystone/.gitignore: -------------------------------------------------------------------------------- 1 | dhry.bin 2 | dhry.elf 3 | dhry.map 4 | *.d 5 | *.o 6 | -------------------------------------------------------------------------------- /dhrystone/Makefile: -------------------------------------------------------------------------------- 1 | UARTADDR ?= 0x18000 2 | RUNS ?= 1000 3 | STACK ?= 0x8000 4 | OBJS = dhry_1.o dhry_2.o stdlib.o start.o 5 | CFLAGS = -MD -O3 -mabi=ilp32 -march=rv32i -DTIME -DRISCV -g3 6 | TOOLCHAIN_PREFIX = riscv64-unknown-elf- 7 | 8 | CFLAGS += -DUSE_MYSTDLIB -ffreestanding -nostdlib -DUARTADDR=$(UARTADDR) -DSTACK=$(STACK) -DRUNS=$(RUNS) 9 | 10 | dhry.bin: dhry.elf 11 | $(TOOLCHAIN_PREFIX)objcopy -Obinary $^ $@ 12 | 13 | dhry.elf: $(OBJS) sections.lds 14 | $(TOOLCHAIN_PREFIX)gcc $(CFLAGS) -Wl,-Bstatic,-T,sections.lds,-Map,dhry.map,--strip-debug -o $@ $(OBJS) -lgcc 15 | chmod -x $@ 16 | 17 | %.o: %.c 18 | $(TOOLCHAIN_PREFIX)gcc -c $(CFLAGS) $< 19 | 20 | %.o: %.S 21 | $(TOOLCHAIN_PREFIX)gcc -c $(CFLAGS) $< 22 | 23 | dhry_1.o dhry_2.o: CFLAGS += -Wno-implicit-int -Wno-implicit-function-declaration 24 | 25 | clean: 26 | rm -rf *.o *.d dhry.elf dhry.map dhry.bin dhry.hex 27 | 28 | .PHONY: test clean 29 | 30 | -include *.d 31 | 32 | -------------------------------------------------------------------------------- /dhrystone/README.mkdn: -------------------------------------------------------------------------------- 1 | # Hacked up Dhrystone 2 | 3 | This is the classical Dhrystone benchmark, fitted with code to 
support a hapenny 4 | SoC. This code and the Makefile are derived from the PicoRV32 Dhrystone test 5 | bench, but further modified and simplified. 6 | 7 | The Dhrystone sources appear to be in the public domain. I've borrowed those 8 | bits and left the non-public-domain bits from PicoRV32 behind, as far as I know. 9 | 10 | By default, this builds an image compatible with the `upduino-large` example 11 | SoC. That's the only example currently in the repo that has enough RAM to run 12 | Dhrystone (you need at least 18 kiB). 13 | 14 | ## Current results 15 | 16 | For the `upduino-large` SoC example using the newer (`box`) CPU revision, with 17 | the integer overflows fixed in the C code for printing cycle counts (boo C!), we 18 | get: 19 | 20 | ``` 21 | Number_Of_Runs: 10000 22 | User_Time: 21440119 cycles, 3880025 insn 23 | Cycles_Per_Instruction: 5.525 24 | Dhrystones_Per_Second_Per_MHz: 466 25 | DMIPS_Per_MHz: 0.265 26 | ``` 27 | 28 | 29 | ## Building and running it 30 | 31 | First, make sure you have an SoC running `tinyboot` and a working serial cable. 32 | Your SoC should respond to the `ping` subcommand in `montool`. 33 | 34 | Build Dhrystone by running `make`. This will produce (among other things) a file 35 | called `dhry.bin`. 36 | 37 | Go into the `montool` directory and run 38 | 39 | ``` 40 | cargo run -q YOURPORT write 0 dhry.bin 41 | cargo run -q YOURPORT call 0 --then-echo 42 | ``` 43 | 44 | ...where `YOURPORT` should be the name of the serial port on your system (e.g. 45 | `/dev/ttyUSB0` or `COM1:`) and `0` is the address to load. If that address isn't 46 | right for your SoC, see the next section. 47 | 48 | The `call` subcommand will activate Dhrystone and print its output to your 49 | terminal. Once it says `DONE`, it will appear to hang; just abort the command at 50 | this point. If you'd like to run it more than once, just `call` again. 
51 | 52 | 53 | ## Adapting to your SoC 54 | 55 | The Makefile's behavior can be customized by passing two variables: 56 | 57 | - `UARTADDR` is the address of the UART (default: 0x18000). 58 | - `STACK` is the initial stack pointer (default: 0x8000). 59 | 60 | If your program RAM is not at address 0, you'll need to modify the linker 61 | script. 62 | 63 | -------------------------------------------------------------------------------- /dhrystone/dhry_1.c: -------------------------------------------------------------------------------- 1 | /* 2 | **************************************************************************** 3 | * 4 | * "DHRYSTONE" Benchmark Program 5 | * ----------------------------- 6 | * 7 | * Version: C, Version 2.1 8 | * 9 | * File: dhry_1.c (part 2 of 3) 10 | * 11 | * Date: May 25, 1988 12 | * 13 | * Author: Reinhold P. Weicker 14 | * 15 | **************************************************************************** 16 | */ 17 | 18 | #include "dhry.h" 19 | 20 | #ifdef USE_MYSTDLIB 21 | extern char *malloc (); 22 | #else 23 | # include 24 | # include 25 | #endif 26 | 27 | /* Global Variables: */ 28 | 29 | Rec_Pointer Ptr_Glob, 30 | Next_Ptr_Glob; 31 | int Int_Glob; 32 | Boolean Bool_Glob; 33 | char Ch_1_Glob, 34 | Ch_2_Glob; 35 | int Arr_1_Glob [50]; 36 | int Arr_2_Glob [50] [50]; 37 | 38 | Enumeration Func_1 (); 39 | /* forward declaration necessary since Enumeration may not simply be int */ 40 | 41 | #ifndef REG 42 | Boolean Reg = false; 43 | #define REG 44 | /* REG becomes defined as empty */ 45 | /* i.e. 
no register variables */ 46 | #else 47 | Boolean Reg = true; 48 | #endif 49 | 50 | /* variables for time measurement: */ 51 | 52 | #ifdef IGN_TIMES 53 | struct tms time_info; 54 | extern int times (); 55 | /* see library function "times" */ 56 | #define Too_Small_Time 120 57 | /* Measurements should last at least about 2 seconds */ 58 | #endif 59 | #ifdef TIME 60 | extern long time(); 61 | #ifdef RISCV 62 | extern long insn(); 63 | #endif 64 | /* see library function "time" */ 65 | #define Too_Small_Time 2 66 | /* Measurements should last at least 2 seconds */ 67 | #endif 68 | 69 | long Begin_Time, 70 | End_Time, 71 | User_Time; 72 | #ifdef RISCV 73 | long Begin_Insn, 74 | End_Insn, 75 | User_Insn; 76 | #endif 77 | float Microseconds, 78 | Dhrystones_Per_Second; 79 | 80 | /* end of variables for time measurement */ 81 | 82 | 83 | main () 84 | /*****/ 85 | 86 | /* main program, corresponds to procedures */ 87 | /* Main and Proc_0 in the Ada version */ 88 | { 89 | One_Fifty Int_1_Loc; 90 | REG One_Fifty Int_2_Loc; 91 | One_Fifty Int_3_Loc; 92 | REG char Ch_Index; 93 | Enumeration Enum_Loc; 94 | Str_30 Str_1_Loc; 95 | Str_30 Str_2_Loc; 96 | REG int Run_Index; 97 | REG int Number_Of_Runs; 98 | 99 | /* Initializations */ 100 | 101 | Next_Ptr_Glob = (Rec_Pointer) malloc (sizeof (Rec_Type)); 102 | Ptr_Glob = (Rec_Pointer) malloc (sizeof (Rec_Type)); 103 | 104 | Ptr_Glob->Ptr_Comp = Next_Ptr_Glob; 105 | Ptr_Glob->Discr = Ident_1; 106 | Ptr_Glob->variant.var_1.Enum_Comp = Ident_3; 107 | Ptr_Glob->variant.var_1.Int_Comp = 40; 108 | strcpy (Ptr_Glob->variant.var_1.Str_Comp, 109 | "DHRYSTONE PROGRAM, SOME STRING"); 110 | strcpy (Str_1_Loc, "DHRYSTONE PROGRAM, 1'ST STRING"); 111 | 112 | Arr_2_Glob [8][7] = 10; 113 | /* Was missing in published program. Without this statement, */ 114 | /* Arr_2_Glob [8][7] would have an undefined value. */ 115 | /* Warning: With 16-Bit processors and Number_Of_Runs > 32000, */ 116 | /* overflow may occur for this array element. 
*/ 117 | 118 | printf ("\n"); 119 | printf ("Dhrystone Benchmark, Version 2.1 (Language: C)\n"); 120 | printf ("\n"); 121 | if (Reg) 122 | { 123 | printf ("Program compiled with 'register' attribute\n"); 124 | printf ("\n"); 125 | } 126 | else 127 | { 128 | printf ("Program compiled without 'register' attribute\n"); 129 | printf ("\n"); 130 | } 131 | printf ("Please give the number of runs through the benchmark: "); 132 | { 133 | // int n; 134 | // scanf ("%d", &n); 135 | Number_Of_Runs = RUNS; 136 | } 137 | printf ("\n"); 138 | 139 | printf ("Execution starts, %d runs through Dhrystone\n", Number_Of_Runs); 140 | 141 | /***************/ 142 | /* Start timer */ 143 | /***************/ 144 | 145 | #ifdef IGN_TIMES 146 | times (&time_info); 147 | Begin_Time = (long) time_info.tms_utime; 148 | #endif 149 | #ifdef TIME 150 | Begin_Time = time ( (long *) 0); 151 | #ifdef RISCV 152 | Begin_Insn = insn ( (long *) 0); 153 | #endif 154 | #endif 155 | 156 | for (Run_Index = 1; Run_Index <= Number_Of_Runs; ++Run_Index) 157 | { 158 | 159 | Proc_5(); 160 | Proc_4(); 161 | /* Ch_1_Glob == 'A', Ch_2_Glob == 'B', Bool_Glob == true */ 162 | Int_1_Loc = 2; 163 | Int_2_Loc = 3; 164 | strcpy (Str_2_Loc, "DHRYSTONE PROGRAM, 2'ND STRING"); 165 | Enum_Loc = Ident_2; 166 | Bool_Glob = ! 
Func_2 (Str_1_Loc, Str_2_Loc); 167 | /* Bool_Glob == 1 */ 168 | while (Int_1_Loc < Int_2_Loc) /* loop body executed once */ 169 | { 170 | Int_3_Loc = 5 * Int_1_Loc - Int_2_Loc; 171 | /* Int_3_Loc == 7 */ 172 | Proc_7 (Int_1_Loc, Int_2_Loc, &Int_3_Loc); 173 | /* Int_3_Loc == 7 */ 174 | Int_1_Loc += 1; 175 | } /* while */ 176 | /* Int_1_Loc == 3, Int_2_Loc == 3, Int_3_Loc == 7 */ 177 | Proc_8 (Arr_1_Glob, Arr_2_Glob, Int_1_Loc, Int_3_Loc); 178 | /* Int_Glob == 5 */ 179 | Proc_1 (Ptr_Glob); 180 | for (Ch_Index = 'A'; Ch_Index <= Ch_2_Glob; ++Ch_Index) 181 | /* loop body executed twice */ 182 | { 183 | if (Enum_Loc == Func_1 (Ch_Index, 'C')) 184 | /* then, not executed */ 185 | { 186 | Proc_6 (Ident_1, &Enum_Loc); 187 | strcpy (Str_2_Loc, "DHRYSTONE PROGRAM, 3'RD STRING"); 188 | Int_2_Loc = Run_Index; 189 | Int_Glob = Run_Index; 190 | } 191 | } 192 | /* Int_1_Loc == 3, Int_2_Loc == 3, Int_3_Loc == 7 */ 193 | Int_2_Loc = Int_2_Loc * Int_1_Loc; 194 | Int_1_Loc = Int_2_Loc / Int_3_Loc; 195 | Int_2_Loc = 7 * (Int_2_Loc - Int_3_Loc) - Int_1_Loc; 196 | /* Int_1_Loc == 1, Int_2_Loc == 13, Int_3_Loc == 7 */ 197 | Proc_2 (&Int_1_Loc); 198 | /* Int_1_Loc == 5 */ 199 | 200 | } /* loop "for Run_Index" */ 201 | 202 | /**************/ 203 | /* Stop timer */ 204 | /**************/ 205 | 206 | #ifdef IGN_TIMES 207 | times (&time_info); 208 | End_Time = (long) time_info.tms_utime; 209 | #endif 210 | #ifdef TIME 211 | End_Time = time ( (long *) 0); 212 | #ifdef RISCV 213 | End_Insn = insn ( (long *) 0); 214 | #endif 215 | #endif 216 | 217 | printf ("Execution ends\n"); 218 | printf ("\n"); 219 | printf ("Final values of the variables used in the benchmark:\n"); 220 | printf ("\n"); 221 | printf ("Int_Glob: %d\n", Int_Glob); 222 | printf (" should be: %d\n", 5); 223 | printf ("Bool_Glob: %d\n", Bool_Glob); 224 | printf (" should be: %d\n", 1); 225 | printf ("Ch_1_Glob: %c\n", Ch_1_Glob); 226 | printf (" should be: %c\n", 'A'); 227 | printf ("Ch_2_Glob: %c\n", Ch_2_Glob); 228 | printf (" 
should be: %c\n", 'B'); 229 | printf ("Arr_1_Glob[8]: %d\n", Arr_1_Glob[8]); 230 | printf (" should be: %d\n", 7); 231 | printf ("Arr_2_Glob[8][7]: %d\n", Arr_2_Glob[8][7]); 232 | printf (" should be: Number_Of_Runs + 10\n"); 233 | printf ("Ptr_Glob->\n"); 234 | printf (" Ptr_Comp: %d\n", (int) Ptr_Glob->Ptr_Comp); 235 | printf (" should be: (implementation-dependent)\n"); 236 | printf (" Discr: %d\n", Ptr_Glob->Discr); 237 | printf (" should be: %d\n", 0); 238 | printf (" Enum_Comp: %d\n", Ptr_Glob->variant.var_1.Enum_Comp); 239 | printf (" should be: %d\n", 2); 240 | printf (" Int_Comp: %d\n", Ptr_Glob->variant.var_1.Int_Comp); 241 | printf (" should be: %d\n", 17); 242 | printf (" Str_Comp: %s\n", Ptr_Glob->variant.var_1.Str_Comp); 243 | printf (" should be: DHRYSTONE PROGRAM, SOME STRING\n"); 244 | printf ("Next_Ptr_Glob->\n"); 245 | printf (" Ptr_Comp: %d\n", (int) Next_Ptr_Glob->Ptr_Comp); 246 | printf (" should be: (implementation-dependent), same as above\n"); 247 | printf (" Discr: %d\n", Next_Ptr_Glob->Discr); 248 | printf (" should be: %d\n", 0); 249 | printf (" Enum_Comp: %d\n", Next_Ptr_Glob->variant.var_1.Enum_Comp); 250 | printf (" should be: %d\n", 1); 251 | printf (" Int_Comp: %d\n", Next_Ptr_Glob->variant.var_1.Int_Comp); 252 | printf (" should be: %d\n", 18); 253 | printf (" Str_Comp: %s\n", 254 | Next_Ptr_Glob->variant.var_1.Str_Comp); 255 | printf (" should be: DHRYSTONE PROGRAM, SOME STRING\n"); 256 | printf ("Int_1_Loc: %d\n", Int_1_Loc); 257 | printf (" should be: %d\n", 5); 258 | printf ("Int_2_Loc: %d\n", Int_2_Loc); 259 | printf (" should be: %d\n", 13); 260 | printf ("Int_3_Loc: %d\n", Int_3_Loc); 261 | printf (" should be: %d\n", 7); 262 | printf ("Enum_Loc: %d\n", Enum_Loc); 263 | printf (" should be: %d\n", 1); 264 | printf ("Str_1_Loc: %s\n", Str_1_Loc); 265 | printf (" should be: DHRYSTONE PROGRAM, 1'ST STRING\n"); 266 | printf ("Str_2_Loc: %s\n", Str_2_Loc); 267 | printf (" should be: DHRYSTONE PROGRAM, 2'ND STRING\n"); 268 | 
printf ("\n"); 269 | 270 | User_Time = End_Time - Begin_Time; 271 | 272 | #ifdef RISCV 273 | User_Insn = End_Insn - Begin_Insn; 274 | 275 | printf("Number_Of_Runs: %d\n", Number_Of_Runs); 276 | printf("User_Time: %d cycles, %d insn\n", User_Time, User_Insn); 277 | 278 | long long Cycles_Per_Instruction_x1000 = (1000 * (long long)User_Time) / User_Insn; 279 | printf("Cycles_Per_Instruction: %d.%d%d%d\n", (int) (Cycles_Per_Instruction_x1000 / 1000), 280 | (int) (Cycles_Per_Instruction_x1000 / 100) % 10, 281 | (int) (Cycles_Per_Instruction_x1000 / 10) % 10, 282 | (int) (Cycles_Per_Instruction_x1000 / 1) % 10); 283 | 284 | int Dhrystones_Per_Second_Per_MHz = ((long long) Number_Of_Runs * 1000000) / User_Time; 285 | printf("Dhrystones_Per_Second_Per_MHz: %d\n", (int) Dhrystones_Per_Second_Per_MHz); 286 | 287 | int DMIPS_Per_MHz_x1000 = (1000 * (long long) Dhrystones_Per_Second_Per_MHz) / 1757; 288 | printf("DMIPS_Per_MHz: %d.%d%d%d\n", (int) (DMIPS_Per_MHz_x1000 / 1000), 289 | (int) (DMIPS_Per_MHz_x1000 / 100) % 10, 290 | (int) (DMIPS_Per_MHz_x1000 / 10) % 10, 291 | (int) (DMIPS_Per_MHz_x1000 / 1) % 10); 292 | #else 293 | if (User_Time < Too_Small_Time) 294 | { 295 | printf ("Measured time too small to obtain meaningful results\n"); 296 | printf ("Please increase number of runs\n"); 297 | printf ("\n"); 298 | } 299 | else 300 | { 301 | #ifdef TIME 302 | Microseconds = (float) User_Time * Mic_secs_Per_Second 303 | / (float) Number_Of_Runs; 304 | Dhrystones_Per_Second = (float) Number_Of_Runs / (float) User_Time; 305 | #else 306 | Microseconds = (float) User_Time * Mic_secs_Per_Second 307 | / ((float) HZ * ((float) Number_Of_Runs)); 308 | Dhrystones_Per_Second = ((float) HZ * (float) Number_Of_Runs) 309 | / (float) User_Time; 310 | #endif 311 | printf ("Microseconds for one run through Dhrystone: "); 312 | printf ("%6.1f \n", Microseconds); 313 | printf ("Dhrystones per Second: "); 314 | printf ("%6.1f \n", Dhrystones_Per_Second); 315 | printf ("\n"); 316 | } 317 | #endif 
318 | 319 | } 320 | 321 | 322 | Proc_1 (Ptr_Val_Par) 323 | /******************/ 324 | 325 | REG Rec_Pointer Ptr_Val_Par; 326 | /* executed once */ 327 | { 328 | REG Rec_Pointer Next_Record = Ptr_Val_Par->Ptr_Comp; 329 | /* == Ptr_Glob_Next */ 330 | /* Local variable, initialized with Ptr_Val_Par->Ptr_Comp, */ 331 | /* corresponds to "rename" in Ada, "with" in Pascal */ 332 | 333 | structassign (*Ptr_Val_Par->Ptr_Comp, *Ptr_Glob); 334 | Ptr_Val_Par->variant.var_1.Int_Comp = 5; 335 | Next_Record->variant.var_1.Int_Comp 336 | = Ptr_Val_Par->variant.var_1.Int_Comp; 337 | Next_Record->Ptr_Comp = Ptr_Val_Par->Ptr_Comp; 338 | Proc_3 (&Next_Record->Ptr_Comp); 339 | /* Ptr_Val_Par->Ptr_Comp->Ptr_Comp 340 | == Ptr_Glob->Ptr_Comp */ 341 | if (Next_Record->Discr == Ident_1) 342 | /* then, executed */ 343 | { 344 | Next_Record->variant.var_1.Int_Comp = 6; 345 | Proc_6 (Ptr_Val_Par->variant.var_1.Enum_Comp, 346 | &Next_Record->variant.var_1.Enum_Comp); 347 | Next_Record->Ptr_Comp = Ptr_Glob->Ptr_Comp; 348 | Proc_7 (Next_Record->variant.var_1.Int_Comp, 10, 349 | &Next_Record->variant.var_1.Int_Comp); 350 | } 351 | else /* not executed */ 352 | structassign (*Ptr_Val_Par, *Ptr_Val_Par->Ptr_Comp); 353 | } /* Proc_1 */ 354 | 355 | 356 | Proc_2 (Int_Par_Ref) 357 | /******************/ 358 | /* executed once */ 359 | /* *Int_Par_Ref == 1, becomes 4 */ 360 | 361 | One_Fifty *Int_Par_Ref; 362 | { 363 | One_Fifty Int_Loc; 364 | Enumeration Enum_Loc; 365 | 366 | Int_Loc = *Int_Par_Ref + 10; 367 | do /* executed once */ 368 | if (Ch_1_Glob == 'A') 369 | /* then, executed */ 370 | { 371 | Int_Loc -= 1; 372 | *Int_Par_Ref = Int_Loc - Int_Glob; 373 | Enum_Loc = Ident_1; 374 | } /* if */ 375 | while (Enum_Loc != Ident_1); /* true */ 376 | } /* Proc_2 */ 377 | 378 | 379 | Proc_3 (Ptr_Ref_Par) 380 | /******************/ 381 | /* executed once */ 382 | /* Ptr_Ref_Par becomes Ptr_Glob */ 383 | 384 | Rec_Pointer *Ptr_Ref_Par; 385 | 386 | { 387 | if (Ptr_Glob != Null) 388 | /* then, executed */ 389 
| *Ptr_Ref_Par = Ptr_Glob->Ptr_Comp; 390 | Proc_7 (10, Int_Glob, &Ptr_Glob->variant.var_1.Int_Comp); 391 | } /* Proc_3 */ 392 | 393 | 394 | Proc_4 () /* without parameters */ 395 | /*******/ 396 | /* executed once */ 397 | { 398 | Boolean Bool_Loc; 399 | 400 | Bool_Loc = Ch_1_Glob == 'A'; 401 | Bool_Glob = Bool_Loc | Bool_Glob; 402 | Ch_2_Glob = 'B'; 403 | } /* Proc_4 */ 404 | 405 | 406 | Proc_5 () /* without parameters */ 407 | /*******/ 408 | /* executed once */ 409 | { 410 | Ch_1_Glob = 'A'; 411 | Bool_Glob = false; 412 | } /* Proc_5 */ 413 | 414 | 415 | /* Procedure for the assignment of structures, */ 416 | /* if the C compiler doesn't support this feature */ 417 | #ifdef NOSTRUCTASSIGN 418 | memcpy (d, s, l) 419 | register char *d; 420 | register char *s; 421 | register int l; 422 | { 423 | while (l--) *d++ = *s++; 424 | } 425 | #endif 426 | 427 | 428 | -------------------------------------------------------------------------------- /dhrystone/dhry_2.c: -------------------------------------------------------------------------------- 1 | /* 2 | **************************************************************************** 3 | * 4 | * "DHRYSTONE" Benchmark Program 5 | * ----------------------------- 6 | * 7 | * Version: C, Version 2.1 8 | * 9 | * File: dhry_2.c (part 3 of 3) 10 | * 11 | * Date: May 25, 1988 12 | * 13 | * Author: Reinhold P. Weicker 14 | * 15 | **************************************************************************** 16 | */ 17 | 18 | #include "dhry.h" 19 | 20 | #ifndef REG 21 | #define REG 22 | /* REG becomes defined as empty */ 23 | /* i.e. no register variables */ 24 | #endif 25 | 26 | extern int Int_Glob; 27 | extern char Ch_1_Glob; 28 | 29 | 30 | Proc_6 (Enum_Val_Par, Enum_Ref_Par) 31 | /*********************************/ 32 | /* executed once */ 33 | /* Enum_Val_Par == Ident_3, Enum_Ref_Par becomes Ident_2 */ 34 | 35 | Enumeration Enum_Val_Par; 36 | Enumeration *Enum_Ref_Par; 37 | { 38 | *Enum_Ref_Par = Enum_Val_Par; 39 | if (! 
Func_3 (Enum_Val_Par)) 40 | /* then, not executed */ 41 | *Enum_Ref_Par = Ident_4; 42 | switch (Enum_Val_Par) 43 | { 44 | case Ident_1: 45 | *Enum_Ref_Par = Ident_1; 46 | break; 47 | case Ident_2: 48 | if (Int_Glob > 100) 49 | /* then */ 50 | *Enum_Ref_Par = Ident_1; 51 | else *Enum_Ref_Par = Ident_4; 52 | break; 53 | case Ident_3: /* executed */ 54 | *Enum_Ref_Par = Ident_2; 55 | break; 56 | case Ident_4: break; 57 | case Ident_5: 58 | *Enum_Ref_Par = Ident_3; 59 | break; 60 | } /* switch */ 61 | } /* Proc_6 */ 62 | 63 | 64 | Proc_7 (Int_1_Par_Val, Int_2_Par_Val, Int_Par_Ref) 65 | /**********************************************/ 66 | /* executed three times */ 67 | /* first call: Int_1_Par_Val == 2, Int_2_Par_Val == 3, */ 68 | /* Int_Par_Ref becomes 7 */ 69 | /* second call: Int_1_Par_Val == 10, Int_2_Par_Val == 5, */ 70 | /* Int_Par_Ref becomes 17 */ 71 | /* third call: Int_1_Par_Val == 6, Int_2_Par_Val == 10, */ 72 | /* Int_Par_Ref becomes 18 */ 73 | One_Fifty Int_1_Par_Val; 74 | One_Fifty Int_2_Par_Val; 75 | One_Fifty *Int_Par_Ref; 76 | { 77 | One_Fifty Int_Loc; 78 | 79 | Int_Loc = Int_1_Par_Val + 2; 80 | *Int_Par_Ref = Int_2_Par_Val + Int_Loc; 81 | } /* Proc_7 */ 82 | 83 | 84 | Proc_8 (Arr_1_Par_Ref, Arr_2_Par_Ref, Int_1_Par_Val, Int_2_Par_Val) 85 | /*********************************************************************/ 86 | /* executed once */ 87 | /* Int_Par_Val_1 == 3 */ 88 | /* Int_Par_Val_2 == 7 */ 89 | Arr_1_Dim Arr_1_Par_Ref; 90 | Arr_2_Dim Arr_2_Par_Ref; 91 | int Int_1_Par_Val; 92 | int Int_2_Par_Val; 93 | { 94 | REG One_Fifty Int_Index; 95 | REG One_Fifty Int_Loc; 96 | 97 | Int_Loc = Int_1_Par_Val + 5; 98 | Arr_1_Par_Ref [Int_Loc] = Int_2_Par_Val; 99 | Arr_1_Par_Ref [Int_Loc+1] = Arr_1_Par_Ref [Int_Loc]; 100 | Arr_1_Par_Ref [Int_Loc+30] = Int_Loc; 101 | for (Int_Index = Int_Loc; Int_Index <= Int_Loc+1; ++Int_Index) 102 | Arr_2_Par_Ref [Int_Loc] [Int_Index] = Int_Loc; 103 | Arr_2_Par_Ref [Int_Loc] [Int_Loc-1] += 1; 104 | Arr_2_Par_Ref [Int_Loc+20] 
[Int_Loc] = Arr_1_Par_Ref [Int_Loc]; 105 | Int_Glob = 5; 106 | } /* Proc_8 */ 107 | 108 | 109 | Enumeration Func_1 (Ch_1_Par_Val, Ch_2_Par_Val) 110 | /*************************************************/ 111 | /* executed three times */ 112 | /* first call: Ch_1_Par_Val == 'H', Ch_2_Par_Val == 'R' */ 113 | /* second call: Ch_1_Par_Val == 'A', Ch_2_Par_Val == 'C' */ 114 | /* third call: Ch_1_Par_Val == 'B', Ch_2_Par_Val == 'C' */ 115 | 116 | Capital_Letter Ch_1_Par_Val; 117 | Capital_Letter Ch_2_Par_Val; 118 | { 119 | Capital_Letter Ch_1_Loc; 120 | Capital_Letter Ch_2_Loc; 121 | 122 | Ch_1_Loc = Ch_1_Par_Val; 123 | Ch_2_Loc = Ch_1_Loc; 124 | if (Ch_2_Loc != Ch_2_Par_Val) 125 | /* then, executed */ 126 | return (Ident_1); 127 | else /* not executed */ 128 | { 129 | Ch_1_Glob = Ch_1_Loc; 130 | return (Ident_2); 131 | } 132 | } /* Func_1 */ 133 | 134 | 135 | Boolean Func_2 (Str_1_Par_Ref, Str_2_Par_Ref) 136 | /*************************************************/ 137 | /* executed once */ 138 | /* Str_1_Par_Ref == "DHRYSTONE PROGRAM, 1'ST STRING" */ 139 | /* Str_2_Par_Ref == "DHRYSTONE PROGRAM, 2'ND STRING" */ 140 | 141 | Str_30 Str_1_Par_Ref; 142 | Str_30 Str_2_Par_Ref; 143 | { 144 | REG One_Thirty Int_Loc; 145 | Capital_Letter Ch_Loc; 146 | 147 | Int_Loc = 2; 148 | while (Int_Loc <= 2) /* loop body executed once */ 149 | if (Func_1 (Str_1_Par_Ref[Int_Loc], 150 | Str_2_Par_Ref[Int_Loc+1]) == Ident_1) 151 | /* then, executed */ 152 | { 153 | Ch_Loc = 'A'; 154 | Int_Loc += 1; 155 | } /* if, while */ 156 | if (Ch_Loc >= 'W' && Ch_Loc < 'Z') 157 | /* then, not executed */ 158 | Int_Loc = 7; 159 | if (Ch_Loc == 'R') 160 | /* then, not executed */ 161 | return (true); 162 | else /* executed */ 163 | { 164 | if (strcmp (Str_1_Par_Ref, Str_2_Par_Ref) > 0) 165 | /* then, not executed */ 166 | { 167 | Int_Loc += 7; 168 | Int_Glob = Int_Loc; 169 | return (true); 170 | } 171 | else /* executed */ 172 | return (false); 173 | } /* if Ch_Loc */ 174 | } /* Func_2 */ 175 | 176 | 177 | 
Boolean Func_3 (Enum_Par_Val) 178 | /***************************/ 179 | /* executed once */ 180 | /* Enum_Par_Val == Ident_3 */ 181 | Enumeration Enum_Par_Val; 182 | { 183 | Enumeration Enum_Loc; 184 | 185 | Enum_Loc = Enum_Par_Val; 186 | if (Enum_Loc == Ident_3) 187 | /* then, executed */ 188 | return (true); 189 | else /* not executed */ 190 | return (false); 191 | } /* Func_3 */ 192 | 193 | -------------------------------------------------------------------------------- /dhrystone/sections.lds: -------------------------------------------------------------------------------- 1 | /* 2 | This is free and unencumbered software released into the public domain. 3 | 4 | Anyone is free to copy, modify, publish, use, compile, sell, or 5 | distribute this software, either in source code form or as a compiled 6 | binary, for any purpose, commercial or non-commercial, and by any 7 | means. 8 | */ 9 | 10 | SECTIONS { 11 | .memory : { 12 | start*(.text); 13 | *(.text); 14 | *(*); 15 | end = .; 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /dhrystone/start.S: -------------------------------------------------------------------------------- 1 | .section .text 2 | .global start 3 | .global main 4 | 5 | start: 6 | /* print "START\r\n" */ 7 | li a0, UARTADDR 8 | li a1, 'S' 9 | li a2, 'T' 10 | li a3, 'A' 11 | li a4, 'R' 12 | li a5, '\r' 13 | li a6, '\n' 14 | sh a1, 2(a0) 15 | 1: lh t0, 2(a0) 16 | bnez t0, 1b 17 | 18 | sh a2, 2(a0) 19 | 1: lh t0, 2(a0) 20 | bnez t0, 1b 21 | 22 | sh a3, 2(a0) 23 | 1: lh t0, 2(a0) 24 | bnez t0, 1b 25 | 26 | sh a4, 2(a0) 27 | 1: lh t0, 2(a0) 28 | bnez t0, 1b 29 | 30 | sh a2, 2(a0) 31 | 1: lh t0, 2(a0) 32 | bnez t0, 1b 33 | 34 | sh a5, 2(a0) 35 | 1: lh t0, 2(a0) 36 | bnez t0, 1b 37 | 38 | sh a6, 2(a0) 39 | 40 | /* set stack pointer */ 41 | li sp, STACK 42 | 43 | # store return address 44 | addi sp, sp, -4 45 | sw ra, (sp) 46 | 47 | /* jump to main C code */ 48 | jal ra,main 49 | 50 | /* print 
"DONE\r\n" */ 51 | li a0,UARTADDR 52 | addi a1,zero,'D' 53 | addi a2,zero,'O' 54 | addi a3,zero,'N' 55 | addi a4,zero,'E' 56 | addi a5,zero,'\r' 57 | addi a6,zero,'\n' 58 | 59 | 1: lh t0, 2(a0) 60 | bnez t0, 1b 61 | 62 | sh a1, 2(a0) 63 | 1: lh t0, 2(a0) 64 | bnez t0, 1b 65 | 66 | sh a2, 2(a0) 67 | 1: lh t0, 2(a0) 68 | bnez t0, 1b 69 | 70 | sh a3, 2(a0) 71 | 1: lh t0, 2(a0) 72 | bnez t0, 1b 73 | 74 | sh a4, 2(a0) 75 | 1: lh t0, 2(a0) 76 | bnez t0, 1b 77 | 78 | sh a5, 2(a0) 79 | 1: lh t0, 2(a0) 80 | bnez t0, 1b 81 | 82 | sh a6, 2(a0) 83 | 84 | # return to monitor 85 | lw ra, (sp) 86 | ret 87 | 88 | -------------------------------------------------------------------------------- /dhrystone/stdlib.c: -------------------------------------------------------------------------------- 1 | // This is free and unencumbered software released into the public domain. 2 | // 3 | // Anyone is free to copy, modify, publish, use, compile, sell, or 4 | // distribute this software, either in source code form or as a compiled 5 | // binary, for any purpose, commercial or non-commercial, and by any 6 | // means. 
7 | 8 | #include 9 | #include 10 | 11 | extern long time(); 12 | extern long insn(); 13 | 14 | #ifdef USE_MYSTDLIB 15 | extern char *malloc(); 16 | extern int printf(const char *format, ...); 17 | 18 | extern void *memcpy(void *dest, const void *src, long n); 19 | extern char *strcpy(char *dest, const char *src); 20 | extern int strcmp(const char *s1, const char *s2); 21 | 22 | char heap_memory[1024]; 23 | int heap_memory_used = 0; 24 | #endif 25 | 26 | long time() 27 | { 28 | int cycles; 29 | asm volatile ("rdcycle %0" : "=r"(cycles)); 30 | //printf("[time() -> %d]\n", cycles); 31 | return cycles; 32 | } 33 | 34 | long insn() 35 | { 36 | int insns; 37 | asm volatile ("rdinstret %0" : "=r"(insns)); 38 | //printf("[insn() -> %d]\n", insns); 39 | return insns; 40 | } 41 | 42 | #ifdef USE_MYSTDLIB 43 | char *malloc(int size) 44 | { 45 | char *p = heap_memory + heap_memory_used; 46 | // printf("[malloc(%d) -> %d (%d..%d)]", size, (int)p, heap_memory_used, heap_memory_used + size); 47 | heap_memory_used += size; 48 | if (heap_memory_used > 1024) 49 | asm volatile ("ebreak"); 50 | return p; 51 | } 52 | 53 | static void printf_c(int c) 54 | { 55 | while (*((short volatile *)(UARTADDR + 2)) != 0) {} 56 | 57 | if (c == '\n') { 58 | // Add carriage returns for serial terminal compatibility. 59 | *((volatile short*)(UARTADDR + 2)) = '\r'; 60 | while (*((short volatile *)(UARTADDR + 2)) != 0) {} 61 | } 62 | 63 | *((volatile short*)(UARTADDR + 2)) = c; 64 | } 65 | 66 | static void printf_s(char *p) 67 | { 68 | while (*p) { 69 | printf_c(*(p++)); 70 | } 71 | } 72 | 73 | static void printf_d(int val) 74 | { 75 | char buffer[32]; 76 | char *p = buffer; 77 | if (val < 0) { 78 | printf_c('-'); 79 | val = -val; 80 | } 81 | while (val || p == buffer) { 82 | *(p++) = '0' + val % 10; 83 | val = val / 10; 84 | } 85 | while (p != buffer) 86 | printf_c(*(--p)); 87 | } 88 | 89 | int printf(const char *format, ...) 
90 | { 91 | int i; 92 | va_list ap; 93 | 94 | va_start(ap, format); 95 | 96 | for (i = 0; format[i]; i++) 97 | if (format[i] == '%') { 98 | while (format[++i]) { 99 | if (format[i] == 'c') { 100 | printf_c(va_arg(ap,int)); 101 | break; 102 | } 103 | if (format[i] == 's') { 104 | printf_s(va_arg(ap,char*)); 105 | break; 106 | } 107 | if (format[i] == 'd') { 108 | printf_d(va_arg(ap,int)); 109 | break; 110 | } 111 | } 112 | } else 113 | printf_c(format[i]); 114 | 115 | va_end(ap); 116 | } 117 | 118 | void *memcpy(void *aa, const void *bb, long n) 119 | { 120 | // printf("**MEMCPY**\n"); 121 | char *a = aa; 122 | const char *b = bb; 123 | while (n--) *(a++) = *(b++); 124 | return aa; 125 | } 126 | 127 | char *strcpy(char* dst, const char* src) 128 | { 129 | char *r = dst; 130 | 131 | while ((((uint32_t)dst | (uint32_t)src) & 3) != 0) 132 | { 133 | char c = *(src++); 134 | *(dst++) = c; 135 | if (!c) return r; 136 | } 137 | 138 | while (1) 139 | { 140 | uint32_t v = *(uint32_t*)src; 141 | 142 | if (__builtin_expect((((v) - 0x01010101UL) & ~(v) & 0x80808080UL), 0)) 143 | { 144 | dst[0] = v & 0xff; 145 | if ((v & 0xff) == 0) 146 | return r; 147 | v = v >> 8; 148 | 149 | dst[1] = v & 0xff; 150 | if ((v & 0xff) == 0) 151 | return r; 152 | v = v >> 8; 153 | 154 | dst[2] = v & 0xff; 155 | if ((v & 0xff) == 0) 156 | return r; 157 | v = v >> 8; 158 | 159 | dst[3] = v & 0xff; 160 | return r; 161 | } 162 | 163 | *(uint32_t*)dst = v; 164 | src += 4; 165 | dst += 4; 166 | } 167 | } 168 | 169 | int strcmp(const char *s1, const char *s2) 170 | { 171 | while ((((uint32_t)s1 | (uint32_t)s2) & 3) != 0) 172 | { 173 | char c1 = *(s1++); 174 | char c2 = *(s2++); 175 | 176 | if (c1 != c2) 177 | return c1 < c2 ? 
-1 : +1; 178 | else if (!c1) 179 | return 0; 180 | } 181 | 182 | while (1) 183 | { 184 | uint32_t v1 = *(uint32_t*)s1; 185 | uint32_t v2 = *(uint32_t*)s2; 186 | 187 | if (__builtin_expect(v1 != v2, 0)) 188 | { 189 | char c1, c2; 190 | 191 | c1 = v1 & 0xff, c2 = v2 & 0xff; 192 | if (c1 != c2) return c1 < c2 ? -1 : +1; 193 | if (!c1) return 0; 194 | v1 = v1 >> 8, v2 = v2 >> 8; 195 | 196 | c1 = v1 & 0xff, c2 = v2 & 0xff; 197 | if (c1 != c2) return c1 < c2 ? -1 : +1; 198 | if (!c1) return 0; 199 | v1 = v1 >> 8, v2 = v2 >> 8; 200 | 201 | c1 = v1 & 0xff, c2 = v2 & 0xff; 202 | if (c1 != c2) return c1 < c2 ? -1 : +1; 203 | if (!c1) return 0; 204 | v1 = v1 >> 8, v2 = v2 >> 8; 205 | 206 | c1 = v1 & 0xff, c2 = v2 & 0xff; 207 | if (c1 != c2) return c1 < c2 ? -1 : +1; 208 | return 0; 209 | } 210 | 211 | if (__builtin_expect((((v1) - 0x01010101UL) & ~(v1) & 0x80808080UL), 0)) 212 | return 0; 213 | 214 | s1 += 4; 215 | s2 += 4; 216 | } 217 | } 218 | #endif 219 | 220 | -------------------------------------------------------------------------------- /doc/chonk.mkdn: -------------------------------------------------------------------------------- 1 | # Cost-Benefit of Half-Width Datapath 2 | 3 | Part of my goal with `hapenny` was to try and determine how the speed and size 4 | of an RV32 CPU changes when it uses a half-width datapath. Comparisons against 5 | other RV32 CPUs are a good start, but it's hard to do apples-to-apples 6 | comparisons, because the different CPUs have different goals, different 7 | microarchitectures, and different bus interfaces. 8 | 9 | To more accurately compare the datapath widths, I've modified the `hapenny` v2 10 | microarchitecture to produce a similarly-designed CPU with a 32-bit datapath, 11 | and I've given it the obvious name. 12 | 13 | ## `chonk`: Oh Lawd, He Comin' 14 | 15 | `chonk` is a copy-paste-edit of the `hapenny` v2 core. 
The diffs between the 16 | cores are fairly compact, and much logic is shared: 17 | 18 | - The main datapath, including the adder, is now 32 bits wide. 19 | - The register file can still only read one register per cycle, though the 20 | reads and writes are now 32 bits wide as well. 21 | - The data bus is now 32 bits wide. 22 | - Most decoding logic is shared and the state machine is similar, but with fewer 23 | states. 24 | 25 | You can load `chonk` onto an Icestick eval board using the `icestick-chonk.py` 26 | script. 27 | 28 | ## Ways this comparison isn't great 29 | 30 | `chonk` uses a 32-bit data bus, which means 32-bit-wide RAM and peripherals. 31 | This makes peripherals slightly more expensive, and doubles the resource usage 32 | of the smallest possible RAM. While `hapenny` can happily run at full speed out 33 | of a single 16-bit block RAM, `chonk` needs at least two. 34 | 35 | Of course, on FPGAs with wider 32-36 bit block RAMs, this is fine. 36 | 37 | `chonk` uses a register file with doubled bandwidth: still only one read port, 38 | but only one read is required to get the full contents of any 32-bit register. 39 | This is largely responsible for its lower clocks-per-instruction. 40 | 41 | ## Effects of widening the datapath 42 | 43 | | Parameter | `hapenny` 2 | `chonk` | Change | 44 | | ---------------------- | ----------- | ------- | ------ | 45 | | LCs on iCE40 | 796 | 971 | +22% | 46 | | Fmax (MHz) | 72 | 62 | -14% | 47 | | Cycles/instruction | 5.525 | 2.925 | -47% | 48 | | Instructions/second (millions) | 13.032 | 21.197 | +63% | 49 | 50 | (Comparison of the output of `icestick-smallest` vs `icestick-chonk`. 51 | Cycles/instruction numbers are from Dhrystone and will vary depending on 52 | instruction mix.) 53 | 54 | Observations: 55 | 56 | - `chonk` tends to have a lower Fmax than `hapenny` because of longer carry 57 | chains in additions and comparisons. `hapenny` is far more amenable to having 58 | its critical path rearranged. 
59 | 60 | - `chonk` is only about 22% larger, rather than twice as large, because much of 61 | the control logic is unchanged, and a lot of datapath control logic is removed 62 | compared to the 16-bit version. 63 | 64 | - Even with the lower Fmax, `chonk` gets significantly higher performance in 65 | terms of RV32 instructions executed per second. 66 | 67 | Instruction timing: 68 | 69 | | Instruction | `hapenny` 2 | `chonk` | Change | 70 | | ------------- | ----------- | ------- | ------- | 71 | | AUIPC | 4 | 2 | -50% | 72 | | LUI | 4 | 2 | -50% | 73 | | JAL | 8 | 4 | -50% | 74 | | JALR | 8 | 4 | -50% | 75 | | Branch | 5/10 | 3/5 | -40/50% | 76 | | Load | 6 | 3 | -50% | 77 | | SW | 5 | 2 | -60% | 78 | | SB/SH | 4 | 2 | -50% | 79 | | SLT(I)(U) | 6 | 3 | -50% | 80 | | Shift | 6 + N | 3 + N | -8-50% | 81 | | Other ALU op | 4 | 2 | -50% | 82 | | division test | 956 | 519 | -46% | 83 | 84 | As you can see from this table, most instructions on `chonk` take half as 85 | many cycles as on `hapenny`, because both cores are fundamentally restricted by register 86 | file bandwidth. There are some instructions that don't show that degree of 87 | improvement, which is why the average instructions per clock on Dhrystone isn't 88 | exactly 2x: 89 | 90 | - Not-taken branches are only 40% faster. 91 | - Shifts still take one cycle per bit moved, on either core, so the 50% 92 | advantage when shifting by zero bits drops to an 8% advantage at 31. 93 | 94 | On both Dhrystone and the division test case from the testbenches (which is an 95 | extract of libgcc and a hot path in Dhrystone), we see about a 46% reduction in 96 | cycles required to execute a given workload. 97 | 98 | 99 | ## Conclusions 100 | 101 | In FPGAs, where adders are relatively inexpensive and flops plentiful, cutting 102 | the datapath of an RV32 implementation in half doesn't save quite as much area 103 | as you might expect -- about 17% (972 LCs down to 813). 
It causes RV32 104 | instructions to execute at roughly half the speed, since two steps are required 105 | for any 32-bit operation. (Excluding shifts -- these cores implement shifts 106 | naively.) 107 | 108 | This leaves us with three main benefits to the approach: 109 | 110 | 1. A fully-realized SoC built out of 16-bit memories and peripherals will tend 111 | to use less of an FPGA -- in other words, the area advantage grows with 112 | system complexity. 113 | 114 | 2. The 16-bit version can often close timing at higher frequencies, due in large 115 | part to the shorter carry chains. (The 16-bit design is basically equivalent 116 | to a 32-bit design with a register in the middle of the adder -- only smaller 117 | and more complex.) 118 | 119 | 3. The ability to use 16-bit memory without further performance penalty has its 120 | own advantages, such as the ability to run out of 16-bit external SRAM. On 121 | FPGAs with 16-bit (or 18-bit) block RAMs, a 16-bit implementation can use 122 | fewer of them, leaving others available for other things. 123 | -------------------------------------------------------------------------------- /hapenny/__init__.py: -------------------------------------------------------------------------------- 1 | from amaranth import * 2 | from amaranth.lib import data, enum, wiring 3 | from amaranth.lib.enum import Enum 4 | from amaranth.lib.wiring import In, Out 5 | from amaranth.lib.data import Struct 6 | 7 | from functools import reduce 8 | 9 | class StreamSig(wiring.Signature): 10 | def __init__(self, payload_shape): 11 | super().__init__({ 12 | 'payload': Out(payload_shape), 13 | 'valid': Out(1), 14 | 'ready': In(1), 15 | }) 16 | 17 | class AlwaysReady(wiring.Signature): 18 | def __init__(self, payload_shape): 19 | super().__init__({ 20 | 'payload': Out(payload_shape), 21 | 'valid': Out(1), 22 | }) 23 | 24 | # Builds a mux but out of AND and OR, which often generates cheaper logic on 25 | # 4LUT devices. 
def mux(select, one, zero):
    """Select between two values using AND/OR gating instead of a mux.

    Parameters
    ----------
    select: condition value. It is collapsed to a single bit with ``.any()``,
        so any nonzero value picks ``one``.
    one: value produced when ``select`` is nonzero.
    zero: value produced when ``select`` is zero.

    Both data operands go through the same coercion: an ``Enum`` member is
    replaced by its ``.value``, an ``int`` is wrapped in ``Const``, and a
    ``Struct`` is converted with ``Value.cast``.

    Returns
    -------
    The expression ``(sel & one) | (~sel & zero)``, where ``sel`` is the
    one-bit select replicated to the wider of the two operand widths.
    """
    def _as_value(operand):
        # Order matters: an Enum's value may itself be an int that then needs
        # wrapping in Const.
        if isinstance(operand, Enum):
            operand = operand.value
        if isinstance(operand, int):
            operand = Const(operand)
        if isinstance(operand, Struct):
            operand = Value.cast(operand)
        return operand

    one = _as_value(one)
    zero = _as_value(zero)

    width = max(one.shape().width, zero.shape().width)
    # Force the select to 1 bit, then fan it out across the full data width so
    # it can be used as a bitwise mask.
    mask = select.any().replicate(width)
    return (mask & one) | (~mask & zero)

# Builds an output net that chooses between options based on a onehot control
# signal.
#
# onehot_sig should be a signal of N bits, and options should be a dict with at
# most N entries. Each key in the dict is a bit number in onehot_sig, or a
# tuple of bit numbers, and the corresponding value will be produced as output
# when the indicated bit(s) are set in the state.
#
# If a default is provided, it will be used if none of the explicit conditions
# in the options map fires. By default, the default is zero.
#
# This assumes all bits in the onehot_sig are mutually exclusive, and combines
# each path using a bitwise OR instead of muxes, which is often cheaper on 4LUT
# devices. However, this means if the onehot invariant is violated, you'll get
# nonsense output. If that concerns you, see oneof instead.
def onehot_choice(onehot_sig, options, default = None):
    """Pick a value according to which bit of a one-hot signal is set.

    Parameters
    ----------
    onehot_sig: indexable signal; ``onehot_sig[n]`` is the strobe for bit n.
    options (dict): maps a bit number (or an ``Enum`` member whose ``.value``
        is used, or a list/tuple of bit numbers) to the value produced when
        any of those bits is set. ``Enum`` results are replaced by their
        ``.value`` and ``int`` results are wrapped in ``Const``. Must not be
        empty.
    default: optional value produced when none of the listed bits is set;
        coerced the same way as the option values. When omitted, nothing is
        OR'd in for the no-match case.

    Returns
    -------
    The bitwise OR of every option value masked by its strobe (plus the
    masked default, if given). Because the paths are OR'd rather than muxed,
    the output is only meaningful if at most one strobe is active.
    """
    assert len(options) > 0

    def _or_all(terms):
        # Left-to-right OR chain over a non-empty sequence.
        return reduce(lambda acc, term: acc | term, terms)

    def _coerce(item):
        if isinstance(item, Enum):
            item = item.value
        if isinstance(item, int):
            item = Const(item)
        return item

    strobes = []
    gated = []
    for bits, value in options.items():
        if isinstance(bits, Enum):
            bits = bits.value
        if not isinstance(bits, (list, tuple)):
            # Force the key to be a sequence so single bits and groups of
            # bits share one code path.
            bits = [bits]
        value = _coerce(value)

        strobe = _or_all([onehot_sig[bit] for bit in bits])
        strobes.append(strobe)

        # Mask the value with its strobe replicated to the value's width.
        gated.append(strobe.replicate(value.shape().width) & value)

    if default is not None:
        default = _coerce(default)
        nothing_selected = ~_or_all(strobes)
        gated.append(
            nothing_selected.replicate(default.shape().width) & default
        )

    return _or_all(gated)

# Builds a chained mux that selects between a set of options, which must be
# mutually exclusive.
#
# 'options' is a list of pairs. The first element in each pair is evaluated as a
# boolean condition. If 1, the second element is OR'd into the result.
#
# This means if more than one condition is true simultaneously, the result will
# bitwise OR the results together. It is up to you to ensure that all
# conditions are mutually exclusive.
#
# If a default is provided, it will be used when no other conditions match.
# Otherwise, the default is zero.
#
# If you've got a onehot control signal instead of a bunch of separate condition
# strobes, see onehot_choice.
def oneof(options, default = None):
    """OR together the results whose conditions are true.

    Parameters
    ----------
    options: non-empty sequence of ``(condition, result)`` pairs. An ``int``
        condition is wrapped in ``Const``; every condition is reduced to one
        bit with ``.any()``. ``Enum`` results are replaced by their
        ``.value`` and ``int`` results are wrapped in ``Const``.
    default: optional value contributed when no condition is true; coerced
        like the results. When omitted, nothing is OR'd in for that case.

    Returns
    -------
    The bitwise OR of each result masked by its condition (plus the masked
    default, if given). Conditions are expected to be mutually exclusive; if
    several are true at once their results are OR'd together.
    """
    assert len(options) > 0

    def _or_all(terms):
        # Left-to-right OR chain over a non-empty sequence.
        return reduce(lambda acc, term: acc | term, terms)

    strobes = []
    gated = []
    for condition, result in options:
        if isinstance(condition, int):
            condition = Const(condition)
        if isinstance(result, Enum):
            result = result.value
        if isinstance(result, int):
            result = Const(result)

        # One-bit "this option fired" strobe, reused as the result's mask.
        hit = condition.any()
        strobes.append(hit)
        gated.append(hit.replicate(result.shape().width) & result)

    if default is not None:
        if isinstance(default, Enum):
            default = default.value
        if isinstance(default, int):
            default = Const(default)
        nothing_hit = ~_or_all(strobes)
        gated.append(nothing_hit.replicate(default.shape().width) & default)

    return _or_all(gated)

def hihalf(signal):
    """Return everything from index 16 upward of ``signal`` (the high half
    of a 32-bit value)."""
    return signal[16:]

def lohalf(signal):
    """Return the first 16 elements of ``signal`` (the low half of a 32-bit
    value)."""
    return signal[:16]

def choosehalf(hi, signal):
    """Select one halfword of a (32-bit) signal.

    Produces the high half when ``hi`` is 1, otherwise the low half, using
    ``mux`` for the selection.
    """
    upper = hihalf(signal)
    lower = lohalf(signal)
    return mux(hi, upper, lower)

# Combines a list of signals using binary function 'fun', organizing them into
# a balanced binary tree instead of a linked list like reduce/foldl would.
151 | def treeduce(fun, items): 152 | if len(items) == 1: 153 | return items[0] 154 | 155 | partition = len(items) // 2 156 | left = items[:partition] 157 | right = items[partition:] 158 | return fun(treeduce(fun, left), treeduce(fun, right)) 159 | 160 | -------------------------------------------------------------------------------- /hapenny/bus.py: -------------------------------------------------------------------------------- 1 | from functools import reduce 2 | 3 | from amaranth import * 4 | from amaranth.lib.wiring import * 5 | from amaranth.lib.enum import * 6 | from amaranth.lib.coding import Encoder, Decoder 7 | 8 | from hapenny import StreamSig, AlwaysReady, treeduce 9 | 10 | class BusCmd(Signature): 11 | def __init__(self, *, addr, data): 12 | if isinstance(data, int): 13 | lanes = (data + 7) // 8 14 | else: 15 | lanes = (data.width + 7) // 8 16 | super().__init__({ 17 | 'addr': Out(addr), 18 | 'lanes': Out(lanes), 19 | 'data': Out(data) 20 | }) 21 | 22 | class BusPort(Signature): 23 | def __init__(self, *, addr, data): 24 | super().__init__({ 25 | 'cmd': Out(AlwaysReady(BusCmd(addr=addr, data=data))), 26 | 'resp': In(data), 27 | }) 28 | 29 | def partial_decode(m, bus, width): 30 | assert width >= bus.cmd.payload.addr.shape().width, \ 31 | "can't use partial_decode to make a bus narrower" 32 | port = BusPort(addr = width, data = bus.cmd.payload.data.shape()).flip().create() 33 | m.d.comb += [ 34 | bus.cmd.payload.addr.eq(port.cmd.payload.addr), 35 | bus.cmd.payload.data.eq(port.cmd.payload.data), 36 | bus.cmd.payload.lanes.eq(port.cmd.payload.lanes), 37 | bus.cmd.valid.eq(port.cmd.valid), 38 | 39 | port.resp.eq(bus.resp), 40 | ] 41 | return port 42 | 43 | def narrow_addr(m, bus, width): 44 | assert width <= bus.cmd.payload.addr.shape().width, \ 45 | "can't use narrow_addr to make a bus wider" 46 | port = BusPort(addr = width, data = bus.cmd.payload.data.shape()).flip().create() 47 | m.d.comb += [ 48 | bus.cmd.payload.addr.eq(port.cmd.payload.addr), 49 | 
bus.cmd.payload.data.eq(port.cmd.payload.data), 50 | bus.cmd.payload.lanes.eq(port.cmd.payload.lanes), 51 | bus.cmd.valid.eq(port.cmd.valid), 52 | 53 | port.resp.eq(bus.resp), 54 | ] 55 | return port 56 | 57 | class SimpleFabric(Elaboratable): 58 | def __init__(self, devices): 59 | assert len(devices) > 0 60 | data_bits = max(p.cmd.payload.data.shape().width for p in devices) 61 | addr_bits = max(p.cmd.payload.addr.shape().width for p in devices) 62 | sig = BusPort(addr = addr_bits, data = data_bits).flip() 63 | print(f"fabric configured for {addr_bits} addr bits, {data_bits} data bits") 64 | for i, d in enumerate(devices): 65 | assert sig.is_compliant(d), \ 66 | f"device #{i} does not have {addr_bits} addr bits: {d.cmd.payload.addr.shape()}" 67 | self.devices = devices 68 | self.extra_bits = (len(devices) - 1).bit_length() 69 | self.addr_bits = addr_bits 70 | self.data_bits = data_bits 71 | 72 | self.bus = BusPort(addr = addr_bits + self.extra_bits, data = 73 | data_bits).flip().create() 74 | 75 | def elaborate(self, platform): 76 | m = Module() 77 | 78 | # index of the currently selected device. 79 | devid = Signal(self.extra_bits) 80 | m.d.comb += devid.eq(self.bus.cmd.payload.addr[self.addr_bits:]) 81 | 82 | # index of the last selected device (registered). 83 | last_id = Signal(self.extra_bits) 84 | # Since the setting of the response mux is ignored if the CPU isn't 85 | # expecting data back, we can just capture the address lines on every 86 | # cycle whether it's valid or not. 87 | m.d.sync += last_id.eq(devid) 88 | 89 | for (i, d) in enumerate(self.devices): 90 | # Fan out the incoming address, data, and lanes to every device. 91 | m.d.comb += [ 92 | d.cmd.payload.addr.eq(self.bus.cmd.payload.addr), 93 | d.cmd.payload.data.eq(self.bus.cmd.payload.data), 94 | d.cmd.payload.lanes.eq(self.bus.cmd.payload.lanes), 95 | ] 96 | # Only propagate cmd valid to the specific addressed device. 
97 | dv = Signal(1, name = f"valid_{i}") 98 | m.d.comb += [ 99 | dv.eq(self.bus.cmd.valid & (devid == i)), 100 | d.cmd.valid.eq(dv), 101 | ] 102 | 103 | # Fan the response data in based on who we're listening to. 104 | response_data = [] 105 | for (i, d) in enumerate(self.devices): 106 | data = d.resp & (last_id == i).replicate(self.data_bits) 107 | response_data.append(data) 108 | 109 | m.d.comb += self.bus.resp.eq(treeduce(lambda a, b: a | b, response_data)) 110 | 111 | return m 112 | -------------------------------------------------------------------------------- /hapenny/chonk/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbiffle/hapenny/10d6af538bb47feb26f5fa807f0c2ae6b64f2e9e/hapenny/chonk/__init__.py -------------------------------------------------------------------------------- /hapenny/chonk/cpu.py: -------------------------------------------------------------------------------- 1 | # A baseline implementation of an RV32 processor for comparison, 2 | # sharing microarchitectural details with hapenny. 3 | 4 | from amaranth import * 5 | from amaranth.lib.wiring import * 6 | from amaranth.lib.enum import * 7 | import amaranth.lib.coding 8 | 9 | from hapenny import StreamSig, AlwaysReady, mux, oneof, onehot_choice 10 | from hapenny.decoder import ImmediateDecoder, Decoder, DecodeSignals 11 | from hapenny.chonk.regfile32 import RegFile, RegWrite 12 | from hapenny.bus import BusPort, BusCmd 13 | from hapenny.chonk.sbox import SBox, STATE_COUNT 14 | from hapenny.chonk.fdbox import FDBox 15 | from hapenny.chonk.ewbox import EWBox 16 | from hapenny.rvfi import Rvfi, Mode, Ixl 17 | 18 | # Note: all debug port signals are directional from the perspective of the DEBUG 19 | # PROBE, not the CPU. 20 | DebugPort = Signature({ 21 | # Register read port. The CPU asserts READY on this port when it is halted 22 | # and the register file is available for inspection. 
Debug probes should 23 | # place a register number on the payload signals and assert VALID; the 24 | # response will come on reg_value on the next cycle. 25 | 'reg_read': Out(StreamSig(5)), 26 | # Value that was read from the reg_read port above. 27 | 'reg_value': In(32), 28 | # Register write command. Works roughly like reg_read, e.g. only READY when 29 | # the CPU is halted. 30 | 'reg_write': Out(StreamSig(RegWrite(5))), 31 | # PC output from CPU. This is always valid. If the CPU's PC is narrower 32 | # than 32 bits (the prog_addr_width parameter) then its value is 33 | # zero-extended on this port. 34 | 'pc': In(32), 35 | # PC override signal. Becomes READY when the CPU is halted; assert a new 36 | # value with VALID here to change the next instruction that will be 37 | # fetched. If the PC is narrower than 32 bits (the prog_addr_width 38 | # parameter) then the higher bits in this path are ignored. 39 | 'pc_write': Out(StreamSig(32)), 40 | # State output from CPU. This is a one-hot encoding of the CPU's internal 41 | # execution state, mostly intended for testbenches. 42 | 'state': In(STATE_COUNT), 43 | }) 44 | 45 | class Cpu(Component): 46 | """A basic RV32I core. 47 | 48 | Parameters 49 | ---------- 50 | addr_width (int): number of low-order bits that are significant in memory 51 | addresses. The default is 32; if this is reduced, memory and I/O 52 | devices will appear to repeat at higher addresses because the top bits 53 | won't be decoded. Note that this parameter is in terms of byte 54 | addresses (the numbers RV32I software deals with); the actual bus port 55 | has addr_width-2 address lines because it addresses words. 56 | prog_addr_width (int): number of low-order bits that are significant in 57 | instruction addresses. This determines the width of the PC register(s) 58 | and fetch path. 
If program storage is in the lower section of the 59 | address range, and I/O devices higher, you can set this parameter to 60 | smaller than addr_width to save some area. If not explicitly 61 | overridden, this is the same as addr_width. addr_width. 62 | 63 | Attributes 64 | ---------- 65 | bus (both): connection to the bus, 32 bit data path and `addr_width - 2` 66 | address bits. 67 | debug (both): debug port for testing or development. 68 | halt_request (in): when asserted (1), requests that the CPU stop at the 69 | next instruction boundary. Release (0) to resume. 70 | halted (out): raised when the CPU has halted. 71 | rvfi (out): RISC-V Formal Interface trace port. 72 | """ 73 | halt_request: In(1) 74 | halted: Out(1) 75 | 76 | debug: In(DebugPort) 77 | rvfi: Out(AlwaysReady(Rvfi())) 78 | 79 | def __init__(self, *, 80 | reset_vector = 0, 81 | addr_width = 32, 82 | counters = False, 83 | prog_addr_width = None): 84 | super().__init__() 85 | 86 | # Capture and derive parameter values 87 | self.addr_width = addr_width 88 | self.prog_addr_width = prog_addr_width or addr_width 89 | 90 | # Create our parameterized ports and modules 91 | self.bus = BusPort(addr = addr_width - 2, data = 32).create() 92 | 93 | self.s = SBox() 94 | self.rf = RegFile() 95 | self.fd = FDBox( 96 | prog_addr_width = self.prog_addr_width, 97 | ) 98 | self.ew = EWBox( 99 | reset_vector = reset_vector, 100 | addr_width = addr_width, 101 | prog_addr_width = self.prog_addr_width, 102 | counters = counters, 103 | ) 104 | 105 | def elaborate(self, platform): 106 | m = Module() 107 | 108 | # Make the elaborator aware of all our submodules, and wire them up. 
109 | m.submodules.regfile = rf = self.rf 110 | m.submodules.s = s = self.s 111 | m.submodules.fd = fd = self.fd 112 | m.submodules.ew = ew = self.ew 113 | 114 | m.d.comb += [ 115 | fd.onehot_state.eq(s.onehot_state), 116 | fd.pc.eq(ew.pc_next), 117 | fd.from_the_top.eq(ew.from_the_top), 118 | 119 | ew.onehot_state.eq(s.onehot_state), 120 | ew.inst_next.eq(fd.inst_next), 121 | ew.debug_pc_write.valid.eq(self.debug.pc_write.valid), 122 | # Drop the bottom two bits of any incoming PC before feeding to EW. 123 | ew.debug_pc_write.payload.eq(self.debug.pc_write.payload[2:]), 124 | 125 | s.from_the_top.eq(ew.from_the_top), 126 | s.halt_request.eq(self.halt_request), 127 | s.not_a_bubble.eq(ew.full), 128 | s.hold.eq(ew.hold), 129 | 130 | self.halted.eq(s.halted), 131 | 132 | self.debug.reg_value.eq(rf.read_resp), 133 | self.debug.state.eq(s.onehot_state), 134 | # Internal PCs never have bits 0/1, but the debug port deals in 135 | # 32-bit addresses, so add LSBs when exposing the PC: 136 | self.debug.pc.eq(Cat(0, 0, ew.pc)), 137 | self.debug.pc_write.ready.eq(ew.debug_pc_write.ready), 138 | ] 139 | 140 | # Combine the register file write ports from EW (primary) and the debug 141 | # interface (secondary). We use an actual mux here instead of OR-ing to 142 | # keep the debug port from disrupting execution. 143 | m.d.comb += [ 144 | rf.write_cmd.valid.eq( 145 | mux( 146 | s.halted, 147 | self.debug.reg_write.valid, 148 | ew.rf_write_cmd.valid, 149 | ), 150 | ), 151 | rf.write_cmd.payload.reg.eq( 152 | mux( 153 | s.halted, 154 | self.debug.reg_write.payload.reg, 155 | ew.rf_write_cmd.payload.reg, 156 | ), 157 | ), 158 | rf.write_cmd.payload.value.eq( 159 | mux( 160 | s.halted, 161 | self.debug.reg_write.payload.value, 162 | ew.rf_write_cmd.payload.value, 163 | ), 164 | ), 165 | self.debug.reg_write.ready.eq(s.halted), 166 | ] 167 | 168 | # Combine the register file read ports from EW, FD, and debug. 
We OR 169 | # the EW/FD ports together because those modules are well behaved, but 170 | # explicitly gate signals from the debug port to only work when we're 171 | # halted. 172 | m.d.comb += [ 173 | rf.read_cmd.valid.eq( 174 | fd.rf_cmd.valid | ew.rf_read_cmd.valid 175 | | (self.debug.reg_read.valid & s.halted) 176 | ), 177 | rf.read_cmd.payload.eq( 178 | fd.rf_cmd.payload | ew.rf_read_cmd.payload 179 | | oneof([(s.halted, self.debug.reg_read.payload)]) 180 | ), 181 | ew.rf_resp.eq(rf.read_resp), 182 | self.debug.reg_read.ready.eq(s.halted), 183 | ] 184 | # Combine the bus access ports. The debug port can't drive our bus, so 185 | # this is simpler. 186 | m.d.comb += [ 187 | self.bus.cmd.valid.eq( 188 | fd.bus.cmd.valid | ew.bus.cmd.valid 189 | ), 190 | # Note that this will implicitly zero-extend the FD address if it's 191 | # shorter than the full bus (because prog_addr_width is dialed 192 | # back). 193 | self.bus.cmd.payload.addr.eq( 194 | fd.bus.cmd.payload.addr | ew.bus.cmd.payload.addr 195 | ), 196 | self.bus.cmd.payload.data.eq( 197 | fd.bus.cmd.payload.data | ew.bus.cmd.payload.data 198 | ), 199 | self.bus.cmd.payload.lanes.eq( 200 | fd.bus.cmd.payload.lanes | ew.bus.cmd.payload.lanes 201 | ), 202 | 203 | fd.bus.resp.eq(self.bus.resp), 204 | ew.bus.resp.eq(self.bus.resp), 205 | ] 206 | 207 | # Trace port 208 | m.submodules.rvfi_adapter = rvfi = RvfiPort() 209 | m.d.comb += [ 210 | rvfi.state.eq(s.onehot_state), 211 | rvfi.full.eq(ew.full), 212 | rvfi.end_of_instruction.eq(ew.from_the_top), 213 | rvfi.pc.eq(Cat(0, 0, ew.pc)), 214 | rvfi.pc_next.eq(Cat(0, 0, ew.pc_next)), 215 | rvfi.insn.eq(ew.debug_inst), 216 | rvfi.rf_read_resp_snoop.eq(rf.read_resp), 217 | 218 | rvfi.rf_read_snoop.valid.eq(rf.read_cmd.valid), 219 | rvfi.rf_read_snoop.payload.eq(rf.read_cmd.payload), 220 | 221 | rvfi.rf_write_snoop.valid.eq(rf.write_cmd.valid), 222 | rvfi.rf_write_snoop.payload.reg.eq(rf.write_cmd.payload.reg), 223 | 
rvfi.rf_write_snoop.payload.value.eq(rf.write_cmd.payload.value), 224 | 225 | rvfi.bus_snoop.valid.eq(self.bus.cmd.valid), 226 | rvfi.bus_snoop.payload.addr.eq(self.bus.cmd.payload.addr), 227 | rvfi.bus_snoop.payload.data.eq(self.bus.cmd.payload.data), 228 | rvfi.bus_snoop.payload.lanes.eq(self.bus.cmd.payload.lanes), 229 | rvfi.bus_resp_snoop.eq(self.bus.resp), 230 | ] 231 | connect(m, rvfi.rvfi_out, flipped(self.rvfi)) 232 | 233 | return m 234 | 235 | class RvfiPort(Component): 236 | state: In(STATE_COUNT) 237 | full: In(1) 238 | end_of_instruction: In(1) 239 | pc: In(32) 240 | pc_next: In(32) 241 | insn: In(32) 242 | 243 | rf_read_snoop: In(AlwaysReady(5)) 244 | rf_read_resp_snoop: In(32) 245 | 246 | rf_write_snoop: In(AlwaysReady(RegWrite(5))) 247 | 248 | bus_snoop: In(AlwaysReady(BusCmd(addr = 30, data = 32))) 249 | bus_resp_snoop: In(32) 250 | 251 | rvfi_out: Out(AlwaysReady(Rvfi())) 252 | 253 | def elaborate(self, platform): 254 | m = Module() 255 | 256 | m.d.comb += [ 257 | self.rvfi_out.payload.ixl.eq(Ixl._32), 258 | self.rvfi_out.payload.mode.eq(Mode.M), 259 | ] 260 | 261 | load_expected = Signal(1) 262 | after_end = Signal() 263 | rs1_addr_d = Signal(5) 264 | 265 | m.d.sync += after_end.eq(self.end_of_instruction) 266 | 267 | with m.If(self.end_of_instruction): 268 | m.d.sync += [ 269 | self.rvfi_out.valid.eq(self.full), 270 | self.rvfi_out.payload.order.eq(self.rvfi_out.payload.order + 1), 271 | self.rvfi_out.payload.pc_wdata.eq(self.pc_next), 272 | 273 | rs1_addr_d.eq(self.rf_read_snoop.payload), 274 | ] 275 | with m.Else(): 276 | m.d.sync += self.rvfi_out.valid.eq(0) 277 | 278 | with m.If(after_end): 279 | m.d.sync += [ 280 | # Clear the things that accumulate 281 | self.rvfi_out.payload.halt.eq(0), 282 | self.rvfi_out.payload.mem_wmask.eq(0), 283 | self.rvfi_out.payload.mem_wdata.eq(0), 284 | self.rvfi_out.payload.mem_rmask.eq(0), 285 | self.rvfi_out.payload.mem_rdata.eq(0), 286 | self.rvfi_out.payload.rd_addr.eq(0), 287 | 
self.rvfi_out.payload.rd_wdata.eq(0), 288 | 289 | self.rvfi_out.payload.rs1_addr.eq(rs1_addr_d), 290 | ] 291 | 292 | with m.If(self.full): 293 | with m.If(self.state[0]): 294 | m.d.sync += [ 295 | self.rvfi_out.payload.rs1_rdata.eq(self.rf_read_resp_snoop), 296 | self.rvfi_out.payload.rs2_addr.eq(self.rf_read_snoop.payload), 297 | 298 | self.rvfi_out.payload.pc_rdata.eq(self.pc), 299 | 300 | self.rvfi_out.payload.insn.eq(self.insn), 301 | ] 302 | 303 | with m.If(self.state[1]): 304 | m.d.sync += [ 305 | self.rvfi_out.payload.rs2_rdata.eq(self.rf_read_resp_snoop), 306 | ] 307 | 308 | with m.If(self.rf_write_snoop.valid): 309 | m.d.sync += [ 310 | self.rvfi_out.payload.rd_wdata.eq(self.rf_write_snoop.payload.value), 311 | self.rvfi_out.payload.rd_addr.eq(self.rf_write_snoop.payload.reg), 312 | ] 313 | 314 | with m.If(load_expected): 315 | m.d.sync += load_expected.eq(0) 316 | m.d.sync += self.rvfi_out.payload.mem_rdata.eq( 317 | self.bus_resp_snoop 318 | ) 319 | 320 | # Ignore bus activity in state 0 as RVFI doesn't consider fetch 321 | # traffic. 322 | with m.If(self.bus_snoop.valid & ~self.state[0]): 323 | m.d.sync += [ 324 | # Present addresses word-aligned 325 | self.rvfi_out.payload.mem_addr.eq(Cat(0, 0, self.bus_snoop.payload.addr)), 326 | # Set masks. 
327 | self.rvfi_out.payload.mem_wmask.eq(self.bus_snoop.payload.lanes), 328 | self.rvfi_out.payload.mem_rmask.eq((~self.bus_snoop.payload.lanes.any()).replicate(4)), 329 | ] 330 | with m.If(self.bus_snoop.payload.lanes[0]): 331 | m.d.sync += self.rvfi_out.payload.mem_wdata[:8].eq( 332 | self.bus_snoop.payload.data[:8] 333 | ) 334 | with m.If(self.bus_snoop.payload.lanes[1]): 335 | m.d.sync += self.rvfi_out.payload.mem_wdata[8:16].eq( 336 | self.bus_snoop.payload.data[8:16] 337 | ) 338 | with m.If(self.bus_snoop.payload.lanes[2]): 339 | m.d.sync += self.rvfi_out.payload.mem_wdata[16:24].eq( 340 | self.bus_snoop.payload.data[16:24] 341 | ) 342 | with m.If(self.bus_snoop.payload.lanes[3]): 343 | m.d.sync += self.rvfi_out.payload.mem_wdata[24:].eq( 344 | self.bus_snoop.payload.data[24:] 345 | ) 346 | with m.If(self.bus_snoop.payload.lanes == 0): 347 | m.d.sync += load_expected.eq(1) 348 | 349 | return m 350 | -------------------------------------------------------------------------------- /hapenny/chonk/fdbox.py: -------------------------------------------------------------------------------- 1 | # The FD-Box, responsible for fetch and decode during execution. 2 | 3 | from amaranth import * 4 | from amaranth.lib.wiring import * 5 | from amaranth.lib.enum import * 6 | from amaranth.lib.coding import Encoder, Decoder 7 | 8 | from hapenny import StreamSig, AlwaysReady, onehot_choice, mux, oneof 9 | from hapenny.chonk.sbox import STATE_COUNT 10 | from hapenny.bus import BusPort 11 | 12 | class FDBox(Component): 13 | """The FD-Box fetches and decodes instructions. 14 | 15 | Based on a PC (provided by the EW-box) the FD-box generates bus 16 | transactions to collect both halfwords of an instruction, and then provides 17 | it on an output signal to the EW-box. 18 | 19 | Parameters 20 | ---------- 21 | prog_addr_width (integer): number of bits in a program address, 32 by default 22 | but can be shrunk to save logic. 
23 | 24 | Attributes 25 | ---------- 26 | onehot_state (input): state input from the S-Box 27 | pc (input): program counter from EW-box. 28 | rf_cmd (output): read command to the register file, intended to be OR'd. 29 | inst_next (output): instruction word for EW to use next time we restart 30 | from the top. 31 | bus (port): our connection to the memory fabric. 32 | from_the_top (input): signal from EW indicating that this is the final 33 | cycle of the instruction. We use this to gate register reads. 34 | """ 35 | onehot_state: In(STATE_COUNT) 36 | rf_cmd: Out(AlwaysReady(5)) 37 | inst_next: Out(32) 38 | from_the_top: In(1) 39 | 40 | def __init__(self, *, 41 | prog_addr_width = 32, 42 | ): 43 | super().__init__() 44 | 45 | # Create a bus port of sufficient width to fetch instructions only. 46 | # (Width is -2 because we're addressing words.) 47 | self.bus = BusPort(addr = prog_addr_width - 2, data = 32).create() 48 | 49 | # The PC width is -2 because it's addressing words. 50 | self.pc = Signal(prog_addr_width - 2) 51 | 52 | self.inst = Signal(32) 53 | 54 | def elaborate(self, platform): 55 | m = Module() 56 | 57 | # State 0: we start the fetch. 58 | # State 1: we receive the instruction word and begin a register read. 59 | # State 2+: we don't do anything. 60 | 61 | m.d.comb += [ 62 | # We issue bus transactions in state 0 only. 63 | self.bus.cmd.valid.eq(self.onehot_state[0]), 64 | # In that state we put the PC on the bus. 65 | self.bus.cmd.payload.addr.eq(onehot_choice(self.onehot_state, { 66 | 0: self.pc, 67 | })), 68 | 69 | # We access the register file only in the last cycle. 70 | self.rf_cmd.valid.eq(self.from_the_top), 71 | # If the last cycle is state 1, our fetch is still completing, so 72 | # we need to forward the bus response to the register file. If it 73 | # isn't state 1, we can serve out of our inst register. 74 | # (It's important to send zeros in other states instead of 75 | # hardwiring this so that we can OR.) 
76 | self.rf_cmd.payload.eq(oneof([ 77 | (self.from_the_top & self.onehot_state[1], self.bus.resp[15:20]), 78 | (self.from_the_top & ~self.onehot_state[1], self.inst[15:20]), 79 | ])), 80 | 81 | # Forward the instruction through so it's valid in states 1+. In 82 | # other states, serve up the contents of our registers. EW's not 83 | # supposed to look at this in state 0. 84 | self.inst_next.eq(mux( 85 | self.onehot_state[1], 86 | self.bus.resp, 87 | self.inst, 88 | )), 89 | ] 90 | 91 | m.d.sync += [ 92 | # Latch the bottom half of the instruction at the end of state 1. 93 | self.inst.eq(mux( 94 | self.onehot_state[1], 95 | self.bus.resp, 96 | self.inst, 97 | )), 98 | ] 99 | 100 | return m 101 | -------------------------------------------------------------------------------- /hapenny/chonk/gpio32.py: -------------------------------------------------------------------------------- 1 | from amaranth import * 2 | from amaranth.lib.wiring import * 3 | from amaranth.lib.enum import * 4 | from amaranth.lib.coding import Encoder, Decoder 5 | 6 | from hapenny.bus import BusPort 7 | 8 | class OutputPort32(Component): 9 | bus: In(BusPort(addr = 0, data = 32)) 10 | 11 | def __init__(self, pins): 12 | super().__init__() 13 | self.pins = Signal(pins) 14 | 15 | def elaborate(self, platform): 16 | m = Module() 17 | 18 | with m.If(self.bus.cmd.valid & self.bus.cmd.payload.lanes[0]): 19 | m.d.sync += self.pins[:8].eq(self.bus.cmd.payload.data[:8]) 20 | with m.If(self.bus.cmd.valid & self.bus.cmd.payload.lanes[1]): 21 | m.d.sync += self.pins[8:16].eq(self.bus.cmd.payload.data[8:16]) 22 | with m.If(self.bus.cmd.valid & self.bus.cmd.payload.lanes[2]): 23 | m.d.sync += self.pins[16:24].eq(self.bus.cmd.payload.data[16:24]) 24 | with m.If(self.bus.cmd.valid & self.bus.cmd.payload.lanes[3]): 25 | m.d.sync += self.pins[24:].eq(self.bus.cmd.payload.data[24:]) 26 | 27 | return m 28 | -------------------------------------------------------------------------------- /hapenny/chonk/mem32.py: 
# --------------------------------------------------------------------------------
# Reusable memory with our bus interface.

from amaranth import *
from amaranth.lib.wiring import *
from amaranth.lib.enum import *
from amaranth.lib.coding import Encoder, Decoder

from hapenny import StreamSig, AlwaysReady, mux
from hapenny.bus import BusPort

import hapenny.mem # for stitching together 16-bit primitives

class BasicMemory(Elaboratable):
    """A dead-simple 32-bit-wide memory with the Hapenny bus interface.

    This uses an Amaranth generic memory internally, which relies on inference
    in the synthesis tools to map to a specific type of resource such as block
    RAM. In practice it won't map to uninitialized RAM (like the iCE40UP5K's
    SPRAM) because Amaranth insists on generating it with an initializer; for
    that you'll need another module.

    Parameters
    ----------
    depth (integer): number of 32-bit words in the memory. If omitted,
        contents must be provided, and depth is inferred from len(contents).
    contents (list of integer): initialization contents of the memory. If
        omitted, depth must be provided, and the RAM is implicitly zeroed.
    read_only (boolean): if overridden to True, the memory will not respond to
        write strobes (no write port is generated at all). This is useful for
        using an initialized block RAM as a program ROM.

    Attributes
    ----------
    bus: a BusPort with the minimum number of addr bits required to address
        'depth' words, and a 32-bit data path.
    """

    def __init__(self, *,
                 depth = None,
                 contents = None,
                 read_only = False):
        super().__init__()

        # Avoid sharing one mutable default list between instances. Omitting
        # `contents` still means "no initializer", as it always did.
        if contents is None:
            contents = []

        if depth is None:
            assert len(contents) > 0, "either depth or contents must be provided"
            depth = len(contents)

        # Smallest address width that can reach word `depth - 1`.
        addr_bits = (depth - 1).bit_length()

        self.bus = BusPort(addr = addr_bits, data = 32).flip().create()

        self.m = Memory(
            width = 32,
            depth = depth,
            name = "basicram",
            init = contents,
        )

        # Honor the caller's choice. (This used to be hardwired to False,
        # which silently made every "ROM" writable.)
        self.read_only = read_only

    def elaborate(self, platform):
        """Build the memory, its read port, and (unless read_only) a
        byte-granular write port."""
        m = Module()

        m.submodules.m = self.m

        rp = self.m.read_port(transparent = False)

        m.d.comb += [
            rp.addr.eq(self.bus.cmd.payload.addr),
            # A command with no byte lanes selected is a read.
            rp.en.eq(self.bus.cmd.valid & (self.bus.cmd.payload.lanes == 0)),
            self.bus.resp.eq(rp.data),
        ]

        if not self.read_only:
            wp = self.m.write_port(granularity = 8)
            m.d.comb += [
                wp.addr.eq(self.bus.cmd.payload.addr),
                wp.data.eq(self.bus.cmd.payload.data),
            ]
            # One write enable per byte lane, gated by command validity.
            for i, lane in enumerate(self.bus.cmd.payload.lanes):
                m.d.comb += wp.en[i].eq(self.bus.cmd.valid & lane)

        return m

class SpramMemory(Component):
    """A pair of 256 Kibit (32 KiB) SPRAMs on the UP5K, joined to make a
    32-bit wide memory.

    This module exists because getting Amaranth to generate a memory that Yosys
    is willing to map to SPRAM is currently hard.

    SPRAMs are uninitialized at reset and can retain content across both design
    and device resets. As a result, this module doesn't support a read_only
    mode, because its contents would be indeterminate (yet not random enough to
    be interesting).

    Attributes
    ----------
    bus: bus interface with 14 address bits.
    """
    bus: In(BusPort(addr = 14, data = 32))

    def elaborate(self, platform):
        """Instantiate two 16-bit SPRAM wrappers and split the 32-bit bus
        between them: `lo` serves bits 0..15, `hi` serves bits 16..31."""
        m = Module()

        m.submodules.lo = lo = hapenny.mem.SpramMemory()
        m.submodules.hi = hi = hapenny.mem.SpramMemory()

        m.d.comb += [
            # Both halves see every command at the same word address; each
            # only receives its own half of the data and byte lanes.
            lo.bus.cmd.valid.eq(self.bus.cmd.valid),
            lo.bus.cmd.payload.addr.eq(self.bus.cmd.payload.addr),
            lo.bus.cmd.payload.data.eq(self.bus.cmd.payload.data[:16]),
            lo.bus.cmd.payload.lanes.eq(self.bus.cmd.payload.lanes[:2]),

            hi.bus.cmd.valid.eq(self.bus.cmd.valid),
            hi.bus.cmd.payload.addr.eq(self.bus.cmd.payload.addr),
            hi.bus.cmd.payload.data.eq(self.bus.cmd.payload.data[16:]),
            hi.bus.cmd.payload.lanes.eq(self.bus.cmd.payload.lanes[2:]),

            self.bus.resp[:16].eq(lo.bus.resp),
            self.bus.resp[16:].eq(hi.bus.resp),
        ]

        return m

# --------------------------------------------------------------------------------
# /hapenny/chonk/regfile32.py:
# --------------------------------------------------------------------------------
# 32-bit x 32 register file for a full-width RV32 implementation.

from amaranth import *
from amaranth.lib.wiring import *
from amaranth.lib.enum import *

from hapenny import StreamSig, AlwaysReady

def RegWrite(addrbits = 5):
    """Return the signature of a register write command.

    Parameters
    ----------
    addrbits (int): width of the register selector, 5 by default (x0..x31).
        Banked register files pass a wider selector.
    """
    return Signature({
        'reg': Out(addrbits),
        'value': Out(32),
    })

class RegFile(Component):
    """A 32-bit wide register file with one read port and one write port.

    Parameters
    ----------
    banks (int): number of 32-register banks, 1 by default. Bank-select bits
        sit above the 5 register-number bits in the read/write selectors.

    Attributes
    ----------
    read_cmd (input): register selector to read.
    read_resp (output): data from the read port.
    write_cmd (input): register selector and value to write. Writes to x0
        are discarded.
    """
    read_resp: Out(32)

    def __init__(self, *,
                 banks = 1):
        super().__init__()

        self.banks = banks

        # 5 bits for x0..x31, then bank bits
        select_bits = 5 + (banks - 1).bit_length()

        self.read_cmd = AlwaysReady(select_bits).flip().create()
        self.write_cmd = AlwaysReady(RegWrite(select_bits)).flip().create()

    def elaborate(self, platform):
        """Build the backing memory and wire up its read and write ports."""
        m = Module()

        nregs = 32 * self.banks
        # Recognizable debug pattern for spotting reads of never-written
        # registers. Only takes effect if the `init` line below is re-enabled.
        # Laid out bank-major to match the selector (register number in the
        # low 5 bits, bank above), with x0 zero in every bank.
        contents = [
            0 if n == 0 else (0xDEAD_0000 | n | (b << 8))
            for b in range(self.banks)
            for n in range(32)
        ]

        m.submodules.mem = mem = Memory(
            width = 32,
            depth = nregs,
            name = "regfile",
            #init = contents,
        )

        # The 32-bit core can read a register at the same time that it's
        # writing it, so we have to make this transparent to bypass.
        rp = mem.read_port(transparent = True)
        wp = mem.write_port()

        m.d.comb += [
            rp.addr.eq(self.read_cmd.payload),
            rp.en.eq(self.read_cmd.valid),

            self.read_resp.eq(rp.data),

            wp.addr.eq(self.write_cmd.payload.reg),
            wp.data.eq(self.write_cmd.payload.value),
            # Block writes to x0 in all banks: only the low 5 bits hold the
            # register number, so the bank bits must not take part in the
            # comparison. (With a single bank this is the same as reg != 0.)
            wp.en.eq(
                (self.write_cmd.payload.reg[:5] != 0) & self.write_cmd.valid
            ),
        ]

        return m

# --------------------------------------------------------------------------------
# /hapenny/chonk/sbox.py:
# --------------------------------------------------------------------------------
# The S-Box, responsible for state sequencing of other boxes.

from amaranth import *
from amaranth.lib.wiring import *
from amaranth.lib.enum import *
from amaranth.lib.coding import Encoder, Decoder

from hapenny import StreamSig, AlwaysReady
from hapenny.bus import BusPort

# Maximum number of (unique) states needed by any instruction, plus one
# additional for halt. (Note that repeated states when e.g. shifting do not
# count as unique states.)
STATE_COUNT = 3 + 1

class SBox(Component):
    """The S-Box sequences the other components.

    The S-Box implements a state counter that counts up through the maximum
    number of unique states required by any instruction. The count can be reset,
    signaling the end of one instruction and the beginning of the next, by
    asserting the from_the_top input.

    The state counter, and output, are both one-hot.

    Attributes
    ----------
    from_the_top (input): restarts the count for the next instruction.
    hold (input): input from EW-box to keep doing this same state. Only safe for
        use after state 3 to avoid weird side effects.
    halt_request (input): when high, redirects the next from_the_top assertion
        to go to the halted state instead.
    not_a_bubble (input): indicates that the CPU is doing useful work and not
        just fetching. Used to gate transitions to halt state to ensure forward
        progress during single-stepping.
    onehot_state (output): one bit per possible state.
    halted(output): a handy synonym for the last onehot_state bit.
    """
    from_the_top: In(1)
    hold: In(1)
    halt_request: In(1)
    not_a_bubble: In(1)

    onehot_state: Out(STATE_COUNT)
    halted: Out(1)

    def __init__(self):
        super().__init__()

        # Come out of reset in state 0 (only the lowest bit set).
        self.onehot_state.reset = 1

    def elaborate(self, platform):
        """Build the hand-rolled one-hot state counter."""
        m = Module()

        # This module is doing a lot of things by hand, because as far as I can
        # tell, Amaranth doesn't really know anything about one-hot encoding.
        # Like, there's no way to indicate that the bits are exclusive. So in an
        # attempt to get this managed like a one-hot FSM rather than a
        # STATE_COUNT-wide base-2 FSM, I'm rolling circuits by hand.

        # Inexpensive way to detect that we're leaving a halt request without
        # requiring more registers:
        end_of_halt = Signal(1)
        m.d.comb += end_of_halt.eq(
            self.onehot_state[STATE_COUNT - 1] & ~self.halt_request
        )

        # Generate one-hot counter transition circuit. In each state we clear
        # one bit and set another to advance. This can be overridden if we get
        # the signal to start again from the top.
        for state_num in range(STATE_COUNT):
            with m.If(self.from_the_top | end_of_halt):
                with m.If(self.halt_request & self.not_a_bubble):
                    # Each bit must clear itself except for the highest.
                    m.d.sync += self.onehot_state[state_num].eq(
                        state_num == STATE_COUNT - 1
                    )
                with m.Else():
                    # Each bit must clear itself except for the lowest.
                    m.d.sync += self.onehot_state[state_num].eq(state_num == 0)
            with m.Elif(self.onehot_state[state_num] & ~self.hold):
                # The final state is sticky, so, don't implement wraparound
                # logic to advance out of it. We only leave that state if we
                # receive from_the_top.
                if state_num < STATE_COUNT - 1:
                    m.d.sync += [
                        self.onehot_state[state_num].eq(0),
                        self.onehot_state[state_num + 1].eq(1),
                    ]

        m.d.comb += self.halted.eq(self.onehot_state[STATE_COUNT - 1])
        return m

# --------------------------------------------------------------------------------
# /hapenny/chonk/serial32.py:
# --------------------------------------------------------------------------------
from amaranth import *
from amaranth.lib.wiring import *
from amaranth.lib.enum import *
from amaranth.lib.coding import Encoder, Decoder

from hapenny import StreamSig, AlwaysReady, mux, oneof
from hapenny.bus import BusPort

class ReceiveCore(Component):
    """Serial receive state machine holding a single received frame.

    Parameters
    ----------
    oversample (int): number of sample_clock pulses per bit time, 16 by
        default.

    Attributes
    ----------
    rx (input): serial data line.
    sample_clock (input): one-cycle pulse at the oversampling rate.
    rdr (output): the 8-bit receive data register.
    empty (output): high while no unread frame is held.
    read_strobe (input): pulse to mark the held frame as consumed.
    """
    rx: In(1)
    sample_clock: In(1)
    rdr: Out(8)
    empty: Out(1)
    read_strobe: In(1)

    def __init__(self, oversample = 16):
        super().__init__()

        self.oversample = oversample

    def elaborate(self, platform):
        m = Module()

        # state: 0 = idle, 1 = start bit, 2 = data bits, 3 = stop bit.
        state = Signal(range(4))
        bits_left = Signal(range(8))
        timer = Signal(range(self.oversample))
        have_data = Signal(1)

        m.d.comb += [
            self.empty.eq(~have_data),
        ]

        m.d.sync += timer.eq(oneof([
            # Set to delay half a bit period from initial negative edge.
            (self.sample_clock & (state == 0), (self.oversample // 2) - 1),
            # Count down in all other states until we reach 0.
            (self.sample_clock & (state != 0) & (timer != 0), timer - 1),
            # Once we reach 0, reset to a full bit time.
            (self.sample_clock & (state != 0) & (timer == 0), self.oversample - 1),
        ], default = timer))

        m.d.sync += state.eq(oneof([
            # Leave state 0 if we see the falling edge.
            (self.sample_clock & (state == 0), ~self.rx),
            # If it's still low at the midpoint of the start bit, proceed.
            # Otherwise, treat it as a glitch and reset.
            (self.sample_clock & (state == 1) & (timer == 0), mux(~self.rx, 2, 0)),
            # Automatically advance when we've done all the bits in state 2.
            (self.sample_clock & (state == 2) & (timer == 0), mux(bits_left == 0, 3, 2)),
            # Automatically advance at the end of the stop bit.
            (self.sample_clock & (state == 3) & (timer == 0), 0),
        ], default = state))

        m.d.sync += bits_left.eq(oneof([
            # Configure for 7 bits after the first one.
            (self.sample_clock & (timer == 0), mux(state == 1, 7, bits_left - 1)),
        ], default = bits_left))

        # Shift each sampled bit in at the top, so the first bit received
        # ends up in the LSB after eight shifts.
        m.d.sync += self.rdr.eq(oneof([
            (self.sample_clock & (state == 2) & (timer == 0), Cat(self.rdr[1:], self.rx)),
        ], default = self.rdr))

        m.d.sync += have_data.eq(oneof([
            # The way this is expressed, newly arriving data will override the
            # read strobe -- the two cases will OR if they occur
            # simultaneously, and the 0 loses.
            (self.sample_clock & (state == 3) & (timer == 0), self.rx),
            (self.read_strobe, 0),
        ], default = have_data))

        return m


class TransmitCore(Component):
    """Serial transmit shift register.

    Parameters
    ----------
    oversample (int): number of sample_clock pulses per bit time, 16 by
        default.

    Attributes
    ----------
    tx (output): serial data line; idles high.
    sample_clock (input): one-cycle pulse at the oversampling rate.
    thr_write (input): byte to send; asserting valid starts a frame.
    busy (output): high while a frame is still being shifted out.
    """
    tx: Out(1)
    sample_clock: In(1)
    thr_write: In(AlwaysReady(8))
    busy: Out(1)

    def __init__(self, oversample = 16):
        super().__init__()

        self.oversample = oversample

    def elaborate(self, platform):
        m = Module()

        # We use this as a shift register containing: start bit, 8 data bits, 2
        # stop bits. Its LSB is our output state, so it's important that it
        # reset to 1; the other bits can reset to whatever value.
        # (The register itself only holds start + data; the stop bits are the
        # 1s shifted in at the top.)
        thr = Signal(1 + 8, reset = 1)

        tx_bits_left = Signal(range(1 + 8 + 2))
        tx_timer = Signal(range(self.oversample))

        with m.If(self.sample_clock):
            with m.If(tx_bits_left != 0):
                with m.If(tx_timer == 0):
                    m.d.sync += [
                        thr.eq(Cat(thr[1:], 1)),
                        tx_timer.eq(self.oversample - 1),
                        tx_bits_left.eq(tx_bits_left - 1),
                    ]
                with m.Else():
                    m.d.sync += tx_timer.eq(tx_timer - 1)

        # Transmit output
        m.d.comb += self.tx.eq(thr[0])

        # Control register interface.
        m.d.comb += self.busy.eq(tx_bits_left != 0)

        # This block comes after the shifting logic above, so a write takes
        # priority over a shift that would happen in the same cycle.
        with m.If(self.thr_write.valid):
            m.d.sync += [
                # Load THR with the start bit.
                thr.eq(Cat(0, self.thr_write.payload)),
                tx_bits_left.eq(1 + 8 + 2),
                tx_timer.eq(self.oversample - 1),
            ]

        return m


class OversampleClock(Component):
    """Divides the system clock down to baud_rate * oversample.

    Parameters
    ----------
    baud_rate (int): target baud rate, 19200 by default.
    oversample (int): samples per bit, 16 by default.
    clock_freq: input clock frequency in Hz. If omitted, the platform's
        default_clk_frequency is used at elaboration time.

    Attributes
    ----------
    out (output): high for one system clock cycle per sample period.
    """
    out: Out(1)

    def __init__(self, baud_rate = 19200, oversample = 16, clock_freq = None):
        super().__init__()
        self.baud_rate = baud_rate
        self.oversample = oversample
        self.clock_freq = clock_freq

    def elaborate(self, platform):
        m = Module()

        # We divide the system clock to our baud rate * oversample and use that
        # clock for sampling. This is a compromise between low cost transmit
        # (where we could divide the clock all the way down to the baud rate
        # without issue) and accurate receive (where higher sampling rates are
        # better but cost more flops).
        clock_freq = self.clock_freq or platform.default_clk_frequency
        our_freq = self.baud_rate * self.oversample
        divisor = int(round(clock_freq / our_freq))
        print(f"UART configured for {self.baud_rate} from input clock {clock_freq}, divisor = {divisor}")
        actual_freq = clock_freq / self.oversample / divisor
        freq_error = abs(actual_freq - self.baud_rate) / self.baud_rate
        print(f"Actual baud rate will be: {actual_freq} (error: {freq_error * 100:.3}%)")
        assert freq_error < 0.01, "Error: cannot achieve requested UART frequency"

        sample_counter = Signal(range(divisor))
        # Generate a pulse on every sample period for one (fast) clock cycle.
        m.d.comb += self.out.eq(sample_counter == 0)

        # Reload when the pulse fires, otherwise count down toward zero.
        m.d.sync += sample_counter.eq(mux(self.out, divisor - 1, sample_counter - 1))

        return m

class TransmitOnlyUart(Component):
    """The world's crappiest UART!

    The low byte of any write goes into the transmit holding register and will
    be sent out promptly.

    Reads return a status register where bit 0 indicates BUSY.
    """
    bus: In(BusPort(addr = 0, data = 32))
    tx: Out(1)

    def __init__(self, baud_rate = 19200, oversample = 16, clock_freq = None):
        super().__init__()

        self.baud_rate = baud_rate
        self.oversample = oversample
        self.clock_freq = clock_freq

    def elaborate(self, platform):
        m = Module()
        m.submodules.clkdiv = clkdiv = OversampleClock(
            baud_rate = self.baud_rate,
            oversample = self.oversample,
            clock_freq = self.clock_freq,
        )

        m.submodules.txr = txr = TransmitCore(oversample = self.oversample)
        m.d.comb += [
            txr.sample_clock.eq(clkdiv.out),
            self.tx.eq(txr.tx),
            self.bus.resp.eq(txr.busy),

            # The write data lives under bus.cmd.payload, like every other
            # command field.
            txr.thr_write.payload.eq(self.bus.cmd.payload.data[:8]),
        ]

        # Any write touching the low byte lane starts a transmission.
        with m.If(self.bus.cmd.valid & self.bus.cmd.payload.lanes[0]):
            m.d.comb += txr.thr_write.valid.eq(1)

        return m

class ReceiveOnlyUart(Component):
    """The world's other crappiest UART!

    This can receive a single frame and hold it in registers.

    On any read, this will return the frame in the low 8 bits, plus the top
    bit of the response (bit 31 of this 32-bit port) set while the receiver is
    EMPTY. Loading the full word puts that flag in the sign bit, where it can
    be tested with bltz.

    And, read sensitive, why not.
    """
    bus: In(BusPort(addr = 0, data = 32))
    rx: In(1)

    def __init__(self, baud_rate = 19200, oversample = 16, clock_freq = None):
        super().__init__()

        self.baud_rate = baud_rate
        # elaborate() needs this for both the clock divider and the receiver.
        self.oversample = oversample
        self.clock_freq = clock_freq

    def elaborate(self, platform):
        m = Module()

        m.submodules.clkdiv = clkdiv = OversampleClock(
            baud_rate = self.baud_rate,
            oversample = self.oversample,
            clock_freq = self.clock_freq,
        )

        m.submodules.rxr = rxr = ReceiveCore(oversample = self.oversample)
        m.d.comb += [
            rxr.rx.eq(self.rx),
            rxr.sample_clock.eq(clkdiv.out),
            # A valid command with no byte lanes set is a read; reading
            # consumes the held frame.
            rxr.read_strobe.eq(self.bus.cmd.valid & ~self.bus.cmd.payload.lanes.any()),
        ]

        # Registered so the response reflects the state at the time of the
        # command, before the read strobe takes effect.
        m.d.sync += [
            self.bus.resp[:8].eq(rxr.rdr),
            self.bus.resp[-1].eq(rxr.empty),
        ]

        return m

class BidiUart(Component):
    """A slightly less crappy UART.

    This combines the transmit and receive logic using a shared clock divider,
    to save some space if you need both directions.

    Register Layout
    ---------------
    0x0000  RDR - data in low 8 bits, empty flag in the top bit (bit 31),
                  read-sensitive
    0x0004  THR - reads as 0 if TX is idle (bit 0 is BUSY), writes send low
                  8 bits
    """
    bus: In(BusPort(addr = 1, data = 32))
    # Driven by this component, so it is an output (matching
    # TransmitOnlyUart).
    tx: Out(1)
    rx: In(1)

    def __init__(self, baud_rate = 19200, oversample = 16, clock_freq = None):
        super().__init__()

        self.baud_rate = baud_rate
        self.oversample = oversample
        self.clock_freq = clock_freq

    def elaborate(self, platform):
        m = Module()

        # Clock divider for sampling
        m.submodules.clkdiv = clkdiv = OversampleClock(
            baud_rate = self.baud_rate,
            oversample = self.oversample,
            clock_freq = self.clock_freq,
        )

        # Receive state machine.
        m.submodules.rxr = rxr = ReceiveCore(oversample = self.oversample)
        m.d.comb += [
            rxr.rx.eq(self.rx),
            rxr.sample_clock.eq(clkdiv.out),
        ]

        # Transmit machine.

        m.submodules.txr = txr = TransmitCore(oversample = self.oversample)
        m.d.comb += [
            txr.sample_clock.eq(clkdiv.out),
            self.tx.eq(txr.tx),
        ]

        # Bus read port. We register this so that state doesn't change by the
        # time the output is read. This is particularly a problem for the
        # read-sensitive RDR register.
        m.d.sync += [
            self.bus.resp[:8].eq(mux(
                self.bus.cmd.payload.addr[0],
                txr.busy,
                rxr.rdr,
            )),
            # The empty flag is only reported when reading RDR (address 0).
            self.bus.resp[-1].eq(
                ~self.bus.cmd.payload.addr[0] & rxr.empty
            ),
        ]

        # Read-sense logic for receive side.
        m.d.comb += rxr.read_strobe.eq(
            self.bus.cmd.valid
            & ~self.bus.cmd.payload.lanes.any()
            & ~self.bus.cmd.payload.addr[0]
        )

        # Write logic for TX side.
        m.d.comb += txr.thr_write.payload.eq(self.bus.cmd.payload.data[:8])

        m.d.comb += txr.thr_write.valid.eq(
            self.bus.cmd.valid
            & self.bus.cmd.payload.lanes[0]
            & self.bus.cmd.payload.addr[0]
        )

        return m

# --------------------------------------------------------------------------------
# /hapenny/decoder.py:
# --------------------------------------------------------------------------------
# Combinational decode logic.
2 | 3 | from amaranth import * 4 | from amaranth.lib.wiring import * 5 | from amaranth.lib.enum import * 6 | from amaranth.lib.data import * 7 | import amaranth.lib.coding 8 | 9 | class Opcode(Enum): 10 | LUI = 0b01101 11 | AUIPC = 0b00101 12 | JAL = 0b11011 13 | JALR = 0b11001 14 | Bxx = 0b11000 15 | Lxx = 0b00000 16 | Sxx = 0b01000 17 | ALUIMM = 0b00100 18 | ALUREG = 0b01100 19 | SYSTEM = 0b11100 20 | CUSTOM0 = 0b00001 21 | 22 | class DecodeSignals(Struct): 23 | inst: unsigned(32) 24 | 25 | opcode: unsigned(5) 26 | funct3: unsigned(3) 27 | rs1: unsigned(5) 28 | rs2: unsigned(5) 29 | rd: unsigned(5) 30 | 31 | is_auipc: unsigned(1) 32 | is_lui: unsigned(1) 33 | is_jal: unsigned(1) 34 | is_jalr: unsigned(1) 35 | is_b: unsigned(1) 36 | is_load: unsigned(1) 37 | is_store: unsigned(1) 38 | is_alu: unsigned(1) 39 | is_alu_rr: unsigned(1) 40 | is_alu_ri: unsigned(1) 41 | is_system: unsigned(1) 42 | is_custom0: unsigned(1) 43 | 44 | # derived signals to make it easier to move the functions before a register. 45 | is_auipc_or_lui: unsigned(1) 46 | is_auipc_or_jal: unsigned(1) 47 | is_auipc_or_lui_or_jal: unsigned(1) 48 | is_jal_or_jalr: unsigned(1) 49 | is_load_or_jalr: unsigned(1) 50 | is_csr: unsigned(1) 51 | writes_rd_normally: unsigned(1) 52 | is_imm_i: unsigned(1) 53 | is_neg_imm_i: unsigned(1) 54 | is_any_imm_i: unsigned(1) 55 | is_neg_reg_to_adder: unsigned(1) 56 | is_reg_to_adder: unsigned(1) 57 | is_any_reg_to_adder: unsigned(1) 58 | is_shift: unsigned(1) 59 | is_slt: unsigned(1) 60 | is_sw: unsigned(1) 61 | is_adder_rhs_complemented: unsigned(1) 62 | writes_adder_to_reg: unsigned(1) 63 | 64 | # one-hot decode of funct3 65 | funct3_is: unsigned(8) 66 | 67 | class Decoder(Component): 68 | """The Decoder is a circuit that breaks an instruction into the various 69 | control signals. It's used by the larger components. 70 | 71 | Attributes 72 | ---------- 73 | inst (input): instruction word. 74 | out (output): group of decode signals, see DecodeSignals struct. 
75 | """ 76 | inst: In(32) 77 | 78 | out: Out(DecodeSignals) 79 | 80 | def __init__(self, 81 | ): 82 | super().__init__() 83 | 84 | def elaborate(self, platform): 85 | m = Module() 86 | 87 | m.submodules.funct3_decode = f3d = amaranth.lib.coding.Decoder(8) 88 | 89 | m.d.comb += f3d.i.eq(self.inst[12:15]) 90 | 91 | opcode = Signal(5) 92 | m.d.comb += opcode.eq(self.inst[2:7]) 93 | 94 | m.d.comb += [ 95 | self.out.inst.eq(self.inst), 96 | self.out.opcode.eq(opcode), 97 | self.out.funct3.eq(self.inst[12:15]), 98 | self.out.rs1.eq(self.inst[15:20]), 99 | self.out.rs2.eq(self.inst[20:25]), 100 | self.out.rd.eq(self.inst[7:12]), 101 | self.out.is_auipc.eq(opcode == Opcode.AUIPC), 102 | self.out.is_lui.eq(opcode == Opcode.LUI), 103 | self.out.is_jal.eq(opcode == Opcode.JAL), 104 | self.out.is_jalr.eq(opcode == Opcode.JALR), 105 | self.out.is_b.eq(opcode == Opcode.Bxx), 106 | self.out.is_load.eq(opcode == Opcode.Lxx), 107 | self.out.is_store.eq(opcode == Opcode.Sxx), 108 | self.out.is_alu_rr.eq(opcode == Opcode.ALUREG), 109 | self.out.is_alu_ri.eq(opcode == Opcode.ALUIMM), 110 | self.out.is_system.eq(opcode == Opcode.SYSTEM), 111 | self.out.is_custom0.eq(opcode == Opcode.CUSTOM0), 112 | self.out.funct3_is.eq(f3d.o), 113 | ] 114 | 115 | # derived signals 116 | m.d.comb += [ 117 | self.out.is_alu.eq( 118 | self.out.is_alu_rr | self.out.is_alu_ri 119 | ), 120 | self.out.is_auipc_or_lui.eq( 121 | self.out.is_auipc | self.out.is_lui 122 | ), 123 | self.out.is_auipc_or_jal.eq( 124 | self.out.is_auipc | self.out.is_jal 125 | ), 126 | self.out.is_auipc_or_lui_or_jal.eq( 127 | self.out.is_auipc | self.out.is_lui | self.out.is_jal 128 | ), 129 | self.out.is_jal_or_jalr.eq( 130 | self.out.is_jal | self.out.is_jalr 131 | ), 132 | self.out.is_load_or_jalr.eq( 133 | self.out.is_load | self.out.is_jalr 134 | ), 135 | self.out.is_csr.eq((opcode == Opcode.SYSTEM) & ~self.out.funct3_is[0b000]), 136 | 137 | self.out.writes_rd_normally.eq( 138 | self.out.is_jal 139 | | self.out.is_jalr 140 | 
| self.out.is_lui 141 | | self.out.is_auipc 142 | | self.out.is_alu 143 | ), 144 | self.out.is_imm_i.eq( 145 | (opcode == Opcode.Lxx) | (opcode == Opcode.JALR) 146 | | ((opcode == Opcode.ALUIMM) & ~(self.out.funct3_is[0b010] | 147 | self.out.funct3_is[0b011])) 148 | | (opcode == Opcode.CUSTOM0) 149 | ), 150 | self.out.is_neg_imm_i.eq( 151 | (opcode == Opcode.ALUIMM) & (self.out.funct3_is[0b010] | 152 | self.out.funct3_is[0b011]) 153 | ), 154 | self.out.is_any_imm_i.eq( 155 | (opcode == Opcode.Lxx) | (opcode == Opcode.JALR) 156 | | (opcode == Opcode.ALUIMM) 157 | | (opcode == Opcode.CUSTOM0) 158 | ), 159 | self.out.is_neg_reg_to_adder.eq( 160 | (opcode == Opcode.Bxx) 161 | | ((opcode == Opcode.ALUREG) & (self.out.funct3_is[0b010] | 162 | self.out.funct3_is[0b011])) 163 | ), 164 | self.out.is_reg_to_adder.eq( 165 | ((opcode == Opcode.ALUREG) & ~(self.out.funct3_is[0b010] | 166 | self.out.funct3_is[0b011])) 167 | ), 168 | self.out.is_any_reg_to_adder.eq( 169 | (opcode == Opcode.Bxx) 170 | | (opcode == Opcode.ALUREG) 171 | ), 172 | self.out.is_shift.eq( 173 | self.out.is_alu & (self.out.funct3_is[0b001] | 174 | self.out.funct3_is[0b101]) 175 | ), 176 | self.out.is_slt.eq( 177 | self.out.is_alu & (self.out.funct3_is[0b010] | 178 | self.out.funct3_is[0b011]) 179 | ), 180 | self.out.is_sw.eq( 181 | self.out.is_store & self.out.funct3_is[0b010] 182 | ), 183 | self.out.is_adder_rhs_complemented.eq( 184 | self.out.is_neg_reg_to_adder 185 | | self.out.is_neg_imm_i 186 | | (self.out.is_reg_to_adder & self.out.inst[30]) 187 | ), 188 | self.out.writes_adder_to_reg.eq( 189 | self.out.is_auipc_or_lui | (self.out.is_alu & 190 | self.out.funct3_is[0b000]) 191 | ), 192 | ] 193 | 194 | return m 195 | 196 | class ImmediateDecoder(Component): 197 | """The ImmediateDecoder decodes an instruction word into its various 198 | immediate formats. It's used by the larger components. 199 | 200 | Attributes 201 | ---------- 202 | inst (input): instruction word. 
203 | imm_i (output): I-format immediate. 204 | imm_s (output): S-format immediate. 205 | imm_b (output): B-format immediate. 206 | imm_u (output): U-format immediate. 207 | imm_j (output): J-format immediate. 208 | """ 209 | inst: In(32) 210 | 211 | i: Out(32) 212 | s: Out(32) 213 | b: Out(32) 214 | u: Out(32) 215 | j: Out(32) 216 | 217 | def elaborate(self, platform): 218 | m = Module() 219 | 220 | m.d.comb += [ 221 | self.i.eq(Cat(self.inst[20:31], self.inst[31].replicate(21))), 222 | self.s.eq(Cat(self.inst[7:12], self.inst[25:31], 223 | self.inst[31].replicate(21))), 224 | self.b.eq(Cat(0, self.inst[8:12], self.inst[25:31], self.inst[7], 225 | self.inst[31].replicate(20))), 226 | self.u.eq(self.inst & 0xFFFFF000), 227 | self.j.eq(Cat(0, self.inst[21:31], self.inst[20], 228 | self.inst[12:20], self.inst[31].replicate(12))), 229 | ] 230 | 231 | return m 232 | 233 | -------------------------------------------------------------------------------- /hapenny/extsram.py: -------------------------------------------------------------------------------- 1 | from amaranth import * 2 | from amaranth.lib.wiring import * 3 | from amaranth.lib.enum import * 4 | from amaranth.lib.coding import Encoder, Decoder 5 | 6 | from hapenny import StreamSig, AlwaysReady 7 | from hapenny.bus import BusPort 8 | 9 | class ExternalSRAM(Component): 10 | """An interface to 16-bit-wide external asynchronous SRAM. 11 | 12 | Parameters 13 | ---------- 14 | address_bits (integer): number of implemented address bits at the physical 15 | interface -- so, not including address bit 0 since the memory is 16 16 | bits wide. 17 | 18 | Attributes 19 | ---------- 20 | sram_oe (out): output enable to SRAM, active high. Enables the SRAM's 21 | output drivers during a read cycle. 22 | sram_we (out): write enable to SRAM, active high. 23 | sram_lanes (out): byte select lines to SRAM, a 1 during a write means the 24 | corresponding byte is written, a 0 leaves it untouched. 
25 | addr_to_sram (out): address, width determined by the 'address_bits' 26 | parameter. 27 | data_to_sram (out): 16-bit data path to SRAM. Unidirectional because FPGAs 28 | like that, becomes bidirectional at the I/O pin. 29 | data_from_sram (in): 16-bit data path from SRAM. 30 | 31 | bus (port): connection to the SoC bus. 32 | """ 33 | clock_90: In(1) 34 | 35 | sram_oe: Out(1) 36 | sram_we: Out(1) 37 | sram_lanes: Out(2) 38 | data_to_sram: In(16) 39 | data_from_sram: Out(16) 40 | 41 | def __init__(self, *, address_bits): 42 | super().__init__() 43 | self.bus = BusPort(addr = address_bits, data = 16).flip().create() 44 | 45 | self.addr_to_sram = Signal(address_bits) 46 | 47 | self.address_bits = address_bits 48 | 49 | def elaborate(self, platform): 50 | m = Module() 51 | 52 | # Register the bus inputs when we're selected, so that we can maintain 53 | # stable outputs. Note that we register "lanes" separately from the 54 | # "write" signal because the SRAM requires lanes to be asserted to 55 | # read! 56 | r_addr = Signal(self.address_bits) 57 | r_data_to_sram = Signal(16) 58 | r_lanes = Signal(2) 59 | r_write = Signal() 60 | r_read = Signal() 61 | 62 | # Automatically clear any write request on the cycle after it occurs, so 63 | # that we don't sit there repeatedly writing from this interface while 64 | # the CPU is off doing other things. 65 | with m.If(r_write): 66 | m.d.sync += [ 67 | r_write.eq(0), 68 | r_read.eq(0), 69 | ] 70 | 71 | # Copy any bus transaction into the registers. This will override the 72 | # clearing above. 73 | with m.If(self.bus.cmd.valid): 74 | m.d.sync += [ 75 | r_addr.eq(self.bus.cmd.payload.addr), 76 | r_data_to_sram.eq(self.bus.cmd.payload.data), 77 | r_write.eq(self.bus.cmd.payload.lanes.any()), 78 | r_read.eq(~self.bus.cmd.payload.lanes.any()), 79 | # Our bus doesn't use lane signals on read. The external bus 80 | # does. Convert. 
81 | r_lanes.eq(self.bus.cmd.payload.lanes 82 | | (~self.bus.cmd.payload.lanes.any()).replicate(2)), 83 | ] 84 | 85 | # Present transactions from our registers on the bus output. 86 | m.d.comb += [ 87 | self.addr_to_sram.eq(r_addr), 88 | self.data_to_sram.eq(r_data_to_sram), 89 | self.sram_lanes.eq(r_lanes), 90 | 91 | # Assert the (active high) output enable line whenever we're not 92 | # writing. Conversely, deassert it on write cycles. The phase 93 | # offset of our write-enable output gives the drivers time to turn 94 | # off. 95 | self.sram_oe.eq(r_read), 96 | # Combine our write enable (active high) with the incoming 97 | # phase-shifted clock to generate a write pulse in the center of 98 | # each write cycle. 99 | self.sram_we.eq(r_write & self.clock_90), 100 | ] 101 | # Responses come back combinationally, in what is likely to be the slow 102 | # path. 103 | m.d.comb += self.bus.resp.eq(self.data_from_sram) 104 | 105 | return m 106 | -------------------------------------------------------------------------------- /hapenny/fdbox.py: -------------------------------------------------------------------------------- 1 | # The FD-Box, responsible for fetch and decode during execution. 2 | 3 | from amaranth import * 4 | from amaranth.lib.wiring import * 5 | from amaranth.lib.enum import * 6 | from amaranth.lib.coding import Encoder, Decoder 7 | 8 | from hapenny import StreamSig, AlwaysReady, onehot_choice, mux, oneof 9 | from hapenny.sbox import STATE_COUNT 10 | from hapenny.bus import BusPort 11 | 12 | class FDBox(Component): 13 | """The FD-Box fetches and decodes instructions. 14 | 15 | Based on a PC (provided by the EW-box) the FD-box generates bus 16 | transactions to collect both halfwords of an instruction, and then provides 17 | it on an output signal to the EW-box. 18 | 19 | Parameters 20 | ---------- 21 | prog_addr_width (integer): number of bits in a program address, 32 by default 22 | but can be shrunk to save logic. 
23 | 24 | Attributes 25 | ---------- 26 | onehot_state (input): state input from the S-Box 27 | pc (input): program counter from EW-box; includes a 'valid' signal that 28 | determines whether a fetch happens, to avoid wild fetches of nonsense 29 | addresses. 30 | rf_cmd (output): read command to the register file, intended to be OR'd. 31 | inst_next (output): instruction word for EW to use next time we restart 32 | from the top. 33 | bus (port): our connection to the memory fabric. 34 | from_the_top (input): signal from EW indicating that this is the final 35 | cycle of the instruction. We use this to gate register reads. 36 | """ 37 | onehot_state: In(STATE_COUNT) 38 | rf_cmd: Out(AlwaysReady(6)) 39 | inst_next: Out(32) 40 | from_the_top: In(1) 41 | 42 | def __init__(self, *, 43 | prog_addr_width = 32, 44 | ): 45 | super().__init__() 46 | 47 | # Create a bus port of sufficient width to fetch instructions only. 48 | # (Width is -1 because we're addressing halfwords.) 49 | self.bus = BusPort(addr = prog_addr_width - 1, data = 16).create() 50 | 51 | # The PC width is -2 because it's addressing words. 52 | self.pc = AlwaysReady(prog_addr_width - 2).flip().create() 53 | 54 | self.inst = Signal(32) 55 | 56 | def elaborate(self, platform): 57 | m = Module() 58 | 59 | # State 0: we don't really do anything. 60 | # State 1: we start the low half fetch. 61 | # State 2: we receive the low half of the instruction word and issue 62 | # the high half fetch. 63 | # State 3: we receive the high half fetch and begin a register read. 64 | # State 4+: we don't do anything. 65 | 66 | m.d.comb += [ 67 | # We issue bus transactions in states 1 and 2 only. 68 | self.bus.cmd.valid.eq( 69 | self.pc.valid & (self.onehot_state[1] | self.onehot_state[2]) 70 | ), 71 | # In those states we select the bottom and top halves of the 72 | # instruction, respectively. 
73 | self.bus.cmd.payload.addr.eq(onehot_choice(self.onehot_state, { 74 | 1: Cat(0, self.pc.payload), 75 | 2: Cat(1, self.pc.payload), 76 | })), 77 | 78 | # We access the register file only in the last cycle. 79 | self.rf_cmd.valid.eq(self.from_the_top), 80 | # If the last cycle is state 3, our fetch is still completing, so 81 | # we need to forward the bus response to the register file. If it 82 | # isn't state 3, we can serve out of our inst register. 83 | # (It's important to send zeros in other states instead of 84 | # hardwiring this so that we can OR.) 85 | self.rf_cmd.payload.eq(oneof([ 86 | (self.from_the_top & self.onehot_state[3], 87 | Cat(self.inst[15], self.bus.resp[0:4], 0)), 88 | (self.from_the_top & ~self.onehot_state[3], 89 | Cat(self.inst[15:20], 0)), 90 | ])), 91 | 92 | # Forward the instruction through so it's valid in states 3+. In 93 | # state 3 specifically, forward the top half from the bus. In other 94 | # states, serve up the contents of our registers. EW's not supposed 95 | # to look at this in states 0-2. 96 | self.inst_next[:16].eq(self.inst[:16]), 97 | self.inst_next[16:].eq(mux( 98 | self.onehot_state[3], 99 | self.bus.resp, 100 | self.inst[16:], 101 | )), 102 | ] 103 | 104 | m.d.sync += [ 105 | # Latch the bottom half of the instruction at the end of state 2. 106 | self.inst[:16].eq(mux( 107 | self.onehot_state[2], 108 | self.bus.resp, 109 | self.inst[:16], 110 | )), 111 | # Latch the top half at the end of state 3. 
112 | self.inst[16:].eq(mux( 113 | self.onehot_state[3], 114 | self.bus.resp, 115 | self.inst[16:], 116 | )), 117 | ] 118 | 119 | return m 120 | -------------------------------------------------------------------------------- /hapenny/gpio.py: -------------------------------------------------------------------------------- 1 | from amaranth import * 2 | from amaranth.lib.wiring import * 3 | from amaranth.lib.enum import * 4 | from amaranth.lib.coding import Encoder, Decoder 5 | 6 | from hapenny import StreamSig, AlwaysReady, mux, oneof 7 | from hapenny.bus import BusPort 8 | 9 | class MinimalOutputPort(Component): 10 | """An absolutely dead-simple output port. Pipes any data written through to 11 | pins. Does not support reading back the state of the pins, or any fancy 12 | manipulation. 13 | 14 | Use this when space is at a premium; otherwise, see OutputPort. Also, 15 | measure the actual area requirement of the port -- in many cases, OutputPort 16 | is the same cost for more functionality. 17 | 18 | Memory Map 19 | ---------- 20 | +00: pins (byte write supported) 21 | 22 | Parameters 23 | ---------- 24 | pins (integer): number of pins to implement, 0-16. 25 | 26 | Attributes 27 | ---------- 28 | bus (port): connection to the fabric. 29 | pins (signal array): output to pins. 30 | """ 31 | bus: In(BusPort(addr = 0, data = 16)) 32 | 33 | def __init__(self, pins): 34 | super().__init__() 35 | self.pins = Signal(pins) 36 | 37 | def elaborate(self, platform): 38 | m = Module() 39 | 40 | m.d.sync += self.pins[:8].eq(mux( 41 | self.bus.cmd.valid & self.bus.cmd.payload.lanes[0], 42 | self.bus.cmd.payload.data[:8], 43 | self.pins[:8], 44 | )) 45 | m.d.sync += self.pins[8:].eq(mux( 46 | self.bus.cmd.valid & self.bus.cmd.payload.lanes[1], 47 | self.bus.cmd.payload.data[8:], 48 | self.pins[8:], 49 | )) 50 | 51 | return m 52 | 53 | class OutputPort(Component): 54 | """A block of general-purpose outputs that can be changed simultaneously. 
55 | 56 | Memory Map 57 | ---------- 58 | +0 sets pins when written 59 | +2 ORs value with current pin state 60 | +4 ANDs the complement of the value written with the current pin state. 61 | +6 XORs value with the current pin state 62 | 63 | All registers support byte writes to affect only half the pins. 64 | 65 | Parameters 66 | ---------- 67 | pins (integer): number of pins to implement (1-16) 68 | read_back (boolean): when True (default), the state of the pins can be read 69 | back. When False, reads always return zero. Turning off read-back can 70 | save some space. 71 | 72 | Attributes 73 | ---------- 74 | bus (port): connection to bus fabric 75 | pins (signal array): the output pins 76 | """ 77 | bus: In(BusPort(addr = 2, data = 16)) 78 | 79 | def __init__(self, pins, read_back = True): 80 | super().__init__() 81 | self.pins = Signal(pins) 82 | self.read_back = read_back 83 | 84 | def elaborate(self, platform): 85 | m = Module() 86 | 87 | a = self.bus.cmd.payload.addr 88 | d = self.bus.cmd.payload.data 89 | 90 | m.d.sync += self.pins[:8].eq(mux( 91 | self.bus.cmd.valid & self.bus.cmd.payload.lanes[0], 92 | oneof([ 93 | (a == 1, self.pins[:8] | d[:8]), 94 | (a == 2, self.pins[:8] & ~d[:8]), 95 | (a == 3, self.pins[:8] ^ d[:8]), 96 | ], default = d[:8]), 97 | self.pins, 98 | )) 99 | m.d.sync += self.pins[8:].eq(mux( 100 | self.bus.cmd.valid & self.bus.cmd.payload.lanes[1], 101 | oneof([ 102 | (a == 1, self.pins[8:] | d[8:]), 103 | (a == 2, self.pins[8:] & ~d[8:]), 104 | (a == 3, self.pins[8:] ^ d[8:]), 105 | ], default = d[8:]), 106 | self.pins, 107 | )) 108 | 109 | if self.read_back: 110 | # We can service reads trivially by just permanently connecting our 111 | # register to the bus. 112 | m.d.comb += self.bus.resp.eq(self.pins) 113 | 114 | return m 115 | 116 | class InputPort(Component): 117 | """A simple input port peripheral. Can read the state of pins. 
118 | 119 | Memory Map 120 | ---------- 121 | +00: pins (read only, writes ignored) 122 | 123 | Parameters 124 | ---------- 125 | pins (integer): number of pins to implement, 0-16. 126 | 127 | Attributes 128 | ---------- 129 | bus (port): connection to the fabric. 130 | pins (signal array): input from pins. 131 | """ 132 | bus: In(BusPort(addr = 0, data = 16)) 133 | 134 | def __init__(self, pins): 135 | super().__init__() 136 | self.pins = Signal(pins) 137 | 138 | def elaborate(self, platform): 139 | m = Module() 140 | 141 | # Am I the simplest peripheral? I think so! 142 | 143 | # Register inputs to cut the path from pins, and also to avoid leaking 144 | # metastability. 145 | pins_r = Signal(self.pins.shape().width) 146 | m.d.sync += pins_r.eq(self.pins) 147 | 148 | # Always output the state from the last cycle onto the bus. This has the 149 | # nice side effect of returning the state of the pins when the read was 150 | # _issued_ rather than when it completed. 151 | m.d.comb += self.bus.resp.eq(pins_r) 152 | 153 | return m 154 | -------------------------------------------------------------------------------- /hapenny/mem.py: -------------------------------------------------------------------------------- 1 | # Reusable memory with our bus interface. 2 | 3 | from amaranth import * 4 | from amaranth.lib.wiring import * 5 | from amaranth.lib.enum import * 6 | from amaranth.lib.coding import Encoder, Decoder 7 | 8 | from hapenny import StreamSig, AlwaysReady, mux 9 | from hapenny.bus import BusPort 10 | 11 | class BasicMemory(Elaboratable): 12 | """A dead-simple 16-bit-wide memory with the Hapenny bus interface. 13 | 14 | This uses an Amaranth generic memory internally, which relies on inference 15 | in the synthesis tools to map to a specific type of resource such as block 16 | RAM. In practice it won't map to uninitialized RAM (like the iCE40UP5K's 17 | SPRAM) because Amaranth insists on generating it with an initializer; for 18 | that you'll need another module. 
19 | 20 | Parameters 21 | ---------- 22 | depth (integer): number of 16-bit halfwords in the memory. If omitted, 23 | contents must be provided, and depth is inferred from len(contents). 24 | contents (list of integer): initialization contents of the memory. If 25 | omitted, depth must be provided, and the RAM is implicitly zeroed. 26 | read_only (boolean): if overridden to True, the memory will not respond to 27 | write strobes. This is useful for using an initialized block RAM as a 28 | program ROM. 29 | 30 | Attributes 31 | ---------- 32 | bus: a BusPort with the minimum number of addr bits required to address 33 | 'depth' words, and a 16-bit data path. 34 | """ 35 | 36 | def __init__(self, *, 37 | depth = None, 38 | contents = [], 39 | read_only = False): 40 | super().__init__() 41 | 42 | if depth is None: 43 | assert len(contents) > 0, "either depth or contents must be provided" 44 | depth = len(contents) 45 | 46 | addr_bits = (depth - 1).bit_length() 47 | 48 | self.bus = BusPort(addr = addr_bits, data = 16).flip().create() 49 | 50 | self.m = Memory( 51 | width = 16, 52 | depth = depth, 53 | name = "basicram", 54 | init = contents, 55 | ) 56 | 57 | self.read_only = False 58 | 59 | def elaborate(self, platform): 60 | m = Module() 61 | 62 | m.submodules.m = self.m 63 | 64 | rp = self.m.read_port(transparent = False) 65 | 66 | 67 | m.d.comb += [ 68 | rp.addr.eq(self.bus.cmd.payload.addr), 69 | rp.en.eq(self.bus.cmd.valid & (self.bus.cmd.payload.lanes == 0)), 70 | self.bus.resp.eq(rp.data), 71 | ] 72 | 73 | if not self.read_only: 74 | wp = self.m.write_port(granularity = 8) 75 | m.d.comb += [ 76 | wp.addr.eq(self.bus.cmd.payload.addr), 77 | wp.data.eq(self.bus.cmd.payload.data), 78 | wp.en[0].eq(self.bus.cmd.valid & self.bus.cmd.payload.lanes[0]), 79 | wp.en[1].eq(self.bus.cmd.valid & self.bus.cmd.payload.lanes[1]), 80 | ] 81 | 82 | return m 83 | 84 | class SpramMemory(Component): 85 | """A single 256 kiB / 32 kiB SPRAM on the UP5K. 
86 | 87 | This module exists because getting Amaranth to generate a memory that Yosys 88 | is willing to map to SPRAM is currently hard. 89 | 90 | SPRAMs are uninitialized at reset and can retain content across both design 91 | and device resets. As a result, this module doesn't support a read_only 92 | mode, because its contents would be indeterminate (yet not random enough to 93 | be interesting). 94 | 95 | Attributes 96 | ---------- 97 | bus: bus interface with 14 address bits. 98 | """ 99 | bus: In(BusPort(addr = 14, data = 16)) 100 | 101 | def elaborate(self, platform): 102 | m = Module() 103 | 104 | m.submodules.spram = Instance( 105 | "SB_SPRAM256KA", 106 | i_CLOCK = ClockSignal("sync"), 107 | i_ADDRESS = self.bus.cmd.payload.addr, 108 | i_DATAIN = self.bus.cmd.payload.data, 109 | # Weirdly, write enables are at the nibble level. 110 | i_MASKWREN = Cat( 111 | self.bus.cmd.payload.lanes[0], 112 | self.bus.cmd.payload.lanes[0], 113 | self.bus.cmd.payload.lanes[1], 114 | self.bus.cmd.payload.lanes[1], 115 | ), 116 | i_WREN = self.bus.cmd.payload.lanes != 0, 117 | i_CHIPSELECT = self.bus.cmd.valid, 118 | i_POWEROFF = 1, # active fucking low 119 | i_STANDBY = 0, 120 | i_SLEEP = 0, 121 | o_DATAOUT = self.bus.resp, 122 | ) 123 | 124 | return m 125 | -------------------------------------------------------------------------------- /hapenny/regfile16.py: -------------------------------------------------------------------------------- 1 | # 16-bit x 64 register file for narrow datapath RV32 implementation. 
2 | 3 | from amaranth import * 4 | from amaranth.lib.wiring import * 5 | from amaranth.lib.enum import * 6 | 7 | from hapenny import StreamSig, AlwaysReady 8 | 9 | def RegWrite(addrbits = 5): 10 | return Signature({ 11 | 'reg': Out(addrbits), 12 | 'value': Out(16), 13 | }) 14 | 15 | class RegFile16(Component): 16 | read_resp: Out(16) 17 | 18 | def __init__(self, *, 19 | banks = 1): 20 | super().__init__() 21 | 22 | self.banks = banks 23 | 24 | # 5 bits for x0..x31, 1 bit for top vs bottom half, then bank bits 25 | select_bits = 5 + 1 + (banks - 1).bit_length() 26 | 27 | self.read_cmd = AlwaysReady(select_bits).flip().create() 28 | self.write_cmd = AlwaysReady(RegWrite(select_bits)).flip().create() 29 | 30 | def elaborate(self, platform): 31 | m = Module() 32 | 33 | nregs = 32 * self.banks 34 | 35 | m.submodules.mem = mem = Memory( 36 | width = 16, 37 | depth = 2 * nregs, 38 | name = "regfile", 39 | attrs = { 40 | 'ram_style': 'block', 41 | }, 42 | ) 43 | 44 | rp = mem.read_port(transparent = False) 45 | wp = mem.write_port() 46 | 47 | m.d.comb += [ 48 | rp.addr.eq(self.read_cmd.payload), 49 | rp.en.eq(self.read_cmd.valid), 50 | 51 | self.read_resp.eq(rp.data), 52 | 53 | wp.addr.eq(self.write_cmd.payload.reg), 54 | wp.data.eq(self.write_cmd.payload.value), 55 | # Block writes to both halves of x0 in all banks. 
56 | wp.en.eq((self.write_cmd.payload.reg[:5] != 0) & self.write_cmd.valid), 57 | ] 58 | 59 | return m 60 | -------------------------------------------------------------------------------- /hapenny/rvfi.py: -------------------------------------------------------------------------------- 1 | from functools import reduce 2 | 3 | from amaranth import * 4 | from amaranth.lib.wiring import * 5 | from amaranth.lib.enum import * 6 | from amaranth.lib.coding import Encoder, Decoder 7 | 8 | from hapenny import StreamSig, AlwaysReady 9 | 10 | class Mode(Enum, shape = unsigned(2)): 11 | U = 0 12 | S = 1 13 | # 2 is reserved 14 | M = 3 15 | 16 | class Ixl(Enum, shape = unsigned(2)): 17 | _32 = 1 18 | _64 = 2 19 | 20 | def Rvfi(ilen = 32, xlen = 32): 21 | return Signature({ 22 | # instruction index, unique per instruction retired, no gaps 23 | 'order': Out(64), 24 | 'insn': Out(ilen), 25 | 'trap': Out(1), 26 | 'halt': Out(1), 27 | 'intr': Out(1), 28 | 'mode': Out(Mode), 29 | 'ixl': Out(Ixl, reset = Ixl._32), 30 | 31 | 'rs1_addr': Out(5), 32 | 'rs2_addr': Out(5), 33 | 'rs1_rdata': Out(xlen), 34 | 'rs2_rdata': Out(xlen), 35 | 36 | 'rd_addr': Out(5), 37 | 'rd_wdata': Out(xlen), 38 | 39 | 'pc_rdata': Out(xlen), 40 | 'pc_wdata': Out(xlen), 41 | 42 | 'mem_addr': Out(xlen), 43 | 'mem_rmask': Out(xlen // 8), 44 | 'mem_wmask': Out(xlen // 8), 45 | 'mem_rdata': Out(xlen), 46 | 'mem_wdata': Out(xlen), 47 | }) 48 | -------------------------------------------------------------------------------- /hapenny/sbox.py: -------------------------------------------------------------------------------- 1 | # The S-Box, responsible for state sequencing of other boxes. 
from amaranth import *
from amaranth.lib.wiring import *
from amaranth.lib.enum import *
from amaranth.lib.coding import Encoder, Decoder

from hapenny import StreamSig, AlwaysReady
from hapenny.bus import BusPort

# Maximum number of (unique) states needed by any instruction, plus one
# additional for halt. (Note that repeated states when e.g. shifting do not
# count as unique states.)
STATE_COUNT = 6 + 1

class SBox(Component):
    """The S-Box sequences the other components.

    The S-Box implements a state counter that counts up through the maximum
    number of unique states required by any instruction. The count can be reset,
    signaling the end of one instruction and the beginning of the next, by
    asserting the from_the_top input.

    The state counter, and output, are both one-hot.

    Attributes
    ----------
    from_the_top (input): restarts the count for the next instruction.
    hold (input): input from EW-box to keep doing this same state. Only safe for
        use after state 3 to avoid weird side effects.
    halt_request (input): when high, redirects the next from_the_top assertion
        to go to the halted state instead.
    not_a_bubble (input): indicates that the CPU is doing useful work and not
        just fetching. Used to gate transitions to halt state to ensure forward
        progress during single-stepping.
    onehot_state (output): one bit per possible state.
    halted (output): a handy synonym for the last onehot_state bit.
    """
    from_the_top: In(1)
    hold: In(1)
    halt_request: In(1)
    not_a_bubble: In(1)

    onehot_state: Out(STATE_COUNT)
    halted: Out(1)

    def __init__(self):
        super().__init__()

        # Come out of reset with only bit 0 set, i.e. in state 0.
        self.onehot_state.reset = 1

    def elaborate(self, platform):
        m = Module()

        # This module is doing a lot of things by hand, because as far as I can
        # tell, Amaranth doesn't really know anything about one-hot encoding.
        # Like, there's no way to indicate that the bits are exclusive. So in an
        # attempt to get this managed like a one-hot FSM rather than a
        # STATE_COUNT-wide base-2 FSM, I'm rolling circuits by hand.

        # Inexpensive way to detect that we're leaving a halt request without
        # requiring more registers:
        end_of_halt = Signal(1)
        m.d.comb += end_of_halt.eq(
            self.onehot_state[STATE_COUNT - 1] & ~self.halt_request
        )

        # Generate one-hot counter transition circuit. In each state we clear
        # one bit and set another to advance. This can be overridden if we get
        # the signal to start again from the top.
        for state_num in range(STATE_COUNT):
            with m.If(self.from_the_top | end_of_halt):
                # Restarting: every bit is rewritten, so exactly one ends up
                # set -- the halt bit if a halt is pending and allowed,
                # otherwise bit 0.
                with m.If(self.halt_request & self.not_a_bubble):
                    # Each bit must clear itself except for the highest.
                    m.d.sync += self.onehot_state[state_num].eq(
                        state_num == STATE_COUNT - 1
                    )
                with m.Else():
                    # Each bit must clear itself except for the lowest.
                    m.d.sync += self.onehot_state[state_num].eq(state_num == 0)
            with m.Elif(self.onehot_state[state_num] & ~self.hold):
                # The final state is sticky, so, don't implement wraparound
                # logic to advance out of it. We only leave that state if we
                # receive from_the_top.
                if state_num < STATE_COUNT - 1:
                    m.d.sync += [
                        self.onehot_state[state_num].eq(0),
                        self.onehot_state[state_num + 1].eq(1),
                    ]

        m.d.comb += self.halted.eq(self.onehot_state[STATE_COUNT - 1])
        return m

--------------------------------------------------------------------------------
/hapenny/serial.py:
--------------------------------------------------------------------------------
from amaranth import *
from amaranth.lib.wiring import *
from amaranth.lib.enum import *
from amaranth.lib.coding import Encoder, Decoder

from hapenny import StreamSig, AlwaysReady, mux, oneof
from hapenny.bus import BusPort

class ReceiveCore(Component):
    """Receive state machine for one serial frame (start, 8 data, stop).

    All state advances only on cycles where sample_clock is high. An internal
    state register steps through four values:

    0: idle, watching rx for a low level.
    1: waiting half a bit time, then re-checking that rx is still low.
    2: shifting in eight data bits, one per bit time, first bit received
       ending up in rdr bit 0.
    3: waiting out the stop bit, then latching whether it was high.

    Parameters
    ----------
    oversample (integer): number of sample_clock pulses per bit time.

    Attributes
    ----------
    rx (input): serial input.
    sample_clock (input): enable strobe that paces the state machine.
    rdr (output): the shift register holding the received byte.
    empty (output): high whenever no received frame is waiting to be read.
    read_strobe (input): clears the "frame waiting" flag, unless a new frame
        completes in the same cycle.
    """
    rx: In(1)
    sample_clock: In(1)
    rdr: Out(8)
    empty: Out(1)
    read_strobe: In(1)

    def __init__(self, oversample = 16):
        super().__init__()

        self.oversample = oversample

    def elaborate(self, platform):
        m = Module()

        state = Signal(range(4))
        bits_left = Signal(range(8))
        timer = Signal(range(self.oversample))
        have_data = Signal(1)

        m.d.comb += [
            self.empty.eq(~have_data),
        ]

        m.d.sync += timer.eq(oneof([
            # Set to delay half a bit period from initial negative edge.
            (self.sample_clock & (state == 0), (self.oversample // 2) - 1),
            # Count down in all other states until we reach 0.
            (self.sample_clock & (state != 0) & (timer != 0), timer - 1),
            # Once we reach 0, reset to a full bit time.
            (self.sample_clock & (state != 0) & (timer == 0), self.oversample - 1),
        ], default = timer))

        m.d.sync += state.eq(oneof([
            # Leave state 0 if we see the falling edge.
            (self.sample_clock & (state == 0), ~self.rx),
            # If it's still low at the midpoint of the start bit, proceed.
            # Otherwise, treat it as a glitch and reset.
            (self.sample_clock & (state == 1) & (timer == 0), mux(~self.rx, 2, 0)),
            # Automatically advance when we've done all the bits in state 2.
            (self.sample_clock & (state == 2) & (timer == 0), mux(bits_left == 0, 3, 2)),
            # Automatically advance at the end of the stop bit.
            (self.sample_clock & (state == 3) & (timer == 0), 0),
        ], default = state))

        m.d.sync += bits_left.eq(oneof([
            # Configure for 7 bits after the first one.
            (self.sample_clock & (timer == 0), mux(state == 1, 7, bits_left - 1)),
        ], default = bits_left))

        # Shift right, inserting the newly sampled bit at the top.
        m.d.sync += self.rdr.eq(oneof([
            (self.sample_clock & (state == 2) & (timer == 0), Cat(self.rdr[1:], self.rx)),
        ], default = self.rdr))

        m.d.sync += have_data.eq(oneof([
            # The way this is expressed, newly arriving data will override the
            # read strobe -- the two cases will OR if they occur
            # simultaneously, and the 0 loses.
            (self.sample_clock & (state == 3) & (timer == 0), self.rx),
            (self.read_strobe, 0),
        ], default = have_data))

        return m


class TransmitCore(Component):
    tx: Out(1)
    sample_clock: In(1)
    thr_write: In(AlwaysReady(8))
    busy: Out(1)

    def __init__(self, oversample = 16):
        super().__init__()

        self.oversample = oversample

    def elaborate(self, platform):
        m = Module()

        # We use this as a shift register containing: start bit, 8 data bits, 2
        # stop bits. Its LSB is our output state, so it's important that it
        # reset to 1; the other bits can reset to whatever value.
91 | thr = Signal(1 + 8, reset = 1) 92 | 93 | tx_bits_left = Signal(range(1 + 8 + 2)) 94 | tx_timer = Signal(range(self.oversample)) 95 | 96 | with m.If(self.sample_clock): 97 | with m.If(tx_bits_left != 0): 98 | with m.If(tx_timer == 0): 99 | m.d.sync += [ 100 | thr.eq(Cat(thr[1:], 1)), 101 | tx_timer.eq(self.oversample - 1), 102 | tx_bits_left.eq(tx_bits_left - 1), 103 | ] 104 | with m.Else(): 105 | m.d.sync += tx_timer.eq(tx_timer - 1) 106 | 107 | # Transmit output 108 | m.d.comb += self.tx.eq(thr[0]) 109 | 110 | # Control register interface. 111 | m.d.comb += self.busy.eq(tx_bits_left != 0) 112 | 113 | with m.If(self.thr_write.valid): 114 | m.d.sync += [ 115 | # Load THR with the start bit. 116 | thr.eq(Cat(0, self.thr_write.payload)), 117 | tx_bits_left.eq(1 + 8 + 2), 118 | tx_timer.eq(self.oversample - 1), 119 | ] 120 | 121 | return m 122 | 123 | 124 | class OversampleClock(Component): 125 | out: Out(1) 126 | 127 | def __init__(self, baud_rate = 19200, oversample = 16, clock_freq = None): 128 | super().__init__() 129 | self.baud_rate = baud_rate 130 | self.oversample = oversample 131 | self.clock_freq = clock_freq 132 | 133 | def elaborate(self, platform): 134 | m = Module() 135 | 136 | # We divide the system clock to our baud rate * oversample and use that 137 | # clock for sampling. This is a compromise between low cost transmit 138 | # (where we could divide the clock all the way down to the baud rate 139 | # without issue) and accurate receive (where higher sampling rates are 140 | # better but cost more flops). 
141 | clock_freq = self.clock_freq or platform.default_clk_frequency 142 | our_freq = self.baud_rate * self.oversample 143 | divisor = int(round(clock_freq / our_freq)) 144 | print(f"UART configured for {self.baud_rate} from input clock {clock_freq}, divisor = {divisor}") 145 | actual_freq = clock_freq / self.oversample / divisor 146 | freq_error = abs(actual_freq - self.baud_rate) / self.baud_rate 147 | print(f"Actual baud rate will be: {actual_freq} (error: {freq_error * 100:.3}%)") 148 | assert freq_error< 0.01, "Error: cannot achieve requested UART frequency" 149 | 150 | sample_clock = Signal(1) 151 | sample_counter = Signal(range(divisor)) 152 | # Generate a pulse on every sample period for one (fast) clock cycle. 153 | m.d.comb += self.out.eq(sample_counter == 0) 154 | 155 | m.d.sync += sample_counter.eq(mux(self.out, divisor - 1, sample_counter - 1)) 156 | 157 | return m 158 | 159 | class TransmitOnlyUart(Component): 160 | """The world's crappiest UART! 161 | 162 | The low byte of any write goes into the transmit holding register and will 163 | be sent out promptly. 164 | 165 | Reads return a status register where bit 0 indicates BUSY. 
166 | """ 167 | bus: In(BusPort(addr = 0, data = 16)) 168 | tx: Out(1) 169 | 170 | def __init__(self, baud_rate = 19200, oversample = 16, clock_freq = None): 171 | super().__init__() 172 | 173 | self.baud_rate = baud_rate 174 | self.oversample = oversample 175 | self.clock_freq = clock_freq 176 | 177 | def elaborate(self, platform): 178 | m = Module() 179 | m.submodules.clkdiv = clkdiv = OversampleClock( 180 | baud_rate = self.baud_rate, 181 | oversample = self.oversample, 182 | clock_freq = self.clock_freq, 183 | ) 184 | 185 | m.submodules.txr = txr = TransmitCore(oversample = self.oversample) 186 | m.d.comb += [ 187 | txr.sample_clock.eq(clkdiv.out), 188 | self.tx.eq(txr.tx), 189 | self.bus.resp.eq(txr.busy), 190 | 191 | txr.thr_write.payload.eq(self.bus.payload.data[:8]), 192 | ] 193 | 194 | with m.If(self.bus.cmd.valid & self.bus.cmd.payload.lanes[0]): 195 | m.d.comb += txr.thr_write.valid.eq(1) 196 | 197 | return m 198 | 199 | class ReceiveOnlyUart(Component): 200 | """The world's other crappiest UART! 201 | 202 | This can receive a single frame and hold it in registers. 203 | 204 | On any read, this will return the frame in the low 8 bits, plus bit 15 set 205 | if there's actual data. This is intended to be used with LH to easily get 206 | the "data full" flag into the MSB where it can be tested with bltz. 207 | 208 | And, read sensitive, why not. 
209 | """ 210 | bus: In(BusPort(addr = 0, data = 16)) 211 | rx: In(1) 212 | 213 | def __init__(self, baud_rate = 19200, oversample = 16, clock_freq = None): 214 | super().__init__() 215 | 216 | self.baud_rate = baud_rate 217 | self.clock_freq = clock_freq 218 | 219 | def elaborate(self, platform): 220 | m = Module() 221 | 222 | m.submodules.clkdiv = clkdiv = OversampleClock( 223 | baud_rate = self.baud_rate, 224 | oversample = self.oversample, 225 | clock_freq = self.clock_freq, 226 | ) 227 | 228 | m.submodules.rxr = rxr = ReceiveCore(oversample = self.oversample) 229 | m.d.comb += [ 230 | rxr.rx.eq(self.rx), 231 | rxr.sample_clock.eq(clkdiv.out), 232 | rxr.read_strobe.eq(self.bus.cmd.valid & ~self.bus.cmd.payload.lanes.any()), 233 | ] 234 | 235 | m.d.sync += [ 236 | self.bus.resp[:8].eq(rxr.rdr), 237 | self.bus.resp[15].eq(rxr.empty), 238 | ] 239 | 240 | return m 241 | 242 | class BidiUart(Component): 243 | """A slightly less crappy UART. 244 | 245 | This combines the transmit and receive logic using a shared clock divider, 246 | to save some space if you need both directions. 247 | 248 | Register Layout 249 | --------------- 250 | 0x0000 RDR - data in low 8 bits, empty flag in bit 15, read-sensitive 251 | 0x0002 THR - reads as 0 if TX is idle, writes send low 8 bits 252 | """ 253 | bus: In(BusPort(addr = 1, data = 16)) 254 | tx: In(1) 255 | rx: In(1) 256 | 257 | def __init__(self, baud_rate = 19200, oversample = 16, clock_freq = None): 258 | super().__init__() 259 | 260 | self.baud_rate = baud_rate 261 | self.oversample = oversample 262 | self.clock_freq = clock_freq 263 | 264 | def elaborate(self, platform): 265 | m = Module() 266 | 267 | # Clock divider for sampling 268 | m.submodules.clkdiv = clkdiv = OversampleClock( 269 | baud_rate = self.baud_rate, 270 | oversample = self.oversample, 271 | clock_freq = self.clock_freq, 272 | ) 273 | 274 | # Receive state machine. 
275 | m.submodules.rxr = rxr = ReceiveCore(oversample = self.oversample) 276 | m.d.comb += [ 277 | rxr.rx.eq(self.rx), 278 | rxr.sample_clock.eq(clkdiv.out), 279 | ] 280 | 281 | # Transmit machine. 282 | 283 | m.submodules.txr = txr = TransmitCore(oversample = self.oversample) 284 | m.d.comb += [ 285 | txr.sample_clock.eq(clkdiv.out), 286 | self.tx.eq(txr.tx), 287 | ] 288 | 289 | # Bus read port. We register this so that state doesn't change by the 290 | # time the output is read. This is particularly a problem for the 291 | # read-sensitive RDR register. 292 | m.d.sync += [ 293 | self.bus.resp[:8].eq(mux( 294 | self.bus.cmd.payload.addr[0], 295 | txr.busy, 296 | rxr.rdr, 297 | )), 298 | self.bus.resp[15].eq( 299 | ~self.bus.cmd.payload.addr[0] & rxr.empty 300 | ), 301 | ] 302 | 303 | # Read-sense logic for receive side. 304 | m.d.comb += rxr.read_strobe.eq( 305 | self.bus.cmd.valid 306 | & ~self.bus.cmd.payload.lanes.any() 307 | & ~self.bus.cmd.payload.addr[0] 308 | ) 309 | 310 | # Write logic for TX side. 311 | m.d.comb += txr.thr_write.payload.eq(self.bus.cmd.payload.data[:8]) 312 | 313 | m.d.comb += txr.thr_write.valid.eq( 314 | self.bus.cmd.valid 315 | & self.bus.cmd.payload.lanes[0] 316 | & self.bus.cmd.payload.addr[0] 317 | ) 318 | 319 | return m 320 | 321 | -------------------------------------------------------------------------------- /icestick-chonk.py: -------------------------------------------------------------------------------- 1 | # This is a demo for the "chonk" core, which is the hapenny v2 2 | # microarchitecture modified to have a 32-bit datapath and lower cycle count. 3 | # This provides a useful apples-to-apples comparison with the hapenny cores, 4 | # since it's using similar microarchitectural techniques and running the same 5 | # code. 
6 | 7 | import itertools 8 | import argparse 9 | import struct 10 | from pathlib import Path 11 | 12 | from amaranth import * 13 | from amaranth.lib.wiring import * 14 | from amaranth.build import ResourceError, Resource, Pins, Attrs 15 | from amaranth_boards.test.blinky import Blinky 16 | from amaranth_boards.icestick import ICEStickPlatform 17 | import amaranth.lib.cdc 18 | 19 | from hapenny import StreamSig 20 | import hapenny.chonk.cpu 21 | from hapenny.bus import BusPort, SimpleFabric, partial_decode 22 | from hapenny.chonk.gpio32 import OutputPort32 23 | from hapenny.chonk.mem32 import BasicMemory 24 | 25 | bootloader = Path("smallest-toggle.bin").read_bytes() 26 | boot_image = struct.unpack("<" + "I" * (len(bootloader) // 4), bootloader) 27 | for i, word in enumerate(boot_image): 28 | print(f"{i*4:08x} {word:08x}") 29 | 30 | # the blinky program does not use RAM at all, so we can fit it in a single 31 | # block RAM. We'll use half of one, wastefully, to preserve the memory map. 32 | RAM_WORDS = 128 * 1 33 | RAM_ADDR_BITS = (RAM_WORDS - 1).bit_length() 34 | 35 | # Add an extra bit to the implemented bus so we can also address I/O. 36 | BUS_ADDR_BITS = RAM_ADDR_BITS + 1 37 | print(f"BUS_ADDR_BITS = {BUS_ADDR_BITS}") 38 | 39 | class Test(Elaboratable): 40 | def elaborate(self, platform): 41 | m = Module() 42 | 43 | # Gotta do some clock gymnastics here. We want the PLL on so that we can 44 | # run faster than the Icestick's 12 MHz crystal can go. However, setting 45 | # up our own sync domain _silently disables_ the Amaranth 46 | # ICE40Platform's reset delay, which is necssary to work around an 47 | # undocumented erratum in the iCE40 BRAM that has been chasing me for at 48 | # least six years. 49 | # 50 | # So, we're going to reconstruct it manually. 
51 | clk12 = platform.request("clk12", dir = "-") 52 | 53 | # 15us delay, 12 MHz clock: 180 cycles 54 | por_delay = int(15e-6 * 12e6) 55 | m.domains += ClockDomain("por", reset_less=True, local=True) 56 | por_timer = Signal(range(por_delay)) 57 | por_ready = Signal() 58 | m.d.comb += ClockSignal("por").eq(clk12.io) 59 | with m.If(por_timer == por_delay): 60 | m.d.por += por_ready.eq(1) 61 | with m.Else(): 62 | m.d.por += por_timer.eq(por_timer + 1) 63 | 64 | cd_sync = ClockDomain("sync") 65 | m.domains += cd_sync 66 | m.d.comb += ResetSignal("sync").eq(~por_ready) 67 | 68 | F = 55e6 # Hz 69 | pll_r, pll_f, pll_q, filter_range = 0, 72, 4, 1 70 | 71 | platform.add_clock_constraint(cd_sync.clk, F) 72 | print(f"Configuring SoC for {F/1000000:.03} MHz") 73 | 74 | # PLL settings below must generate F from 12MHz; use icepll to adjust. 75 | m.submodules.pll = Instance( 76 | "SB_PLL40_CORE", 77 | p_FEEDBACK_PATH = "SIMPLE", 78 | p_DIVR = pll_r, 79 | p_DIVF = pll_f, 80 | p_DIVQ = pll_q, 81 | p_FILTER_RANGE = filter_range, 82 | 83 | i_REFERENCECLK = clk12.io, 84 | i_RESETB = 1, 85 | o_PLLOUTGLOBAL = cd_sync.clk, 86 | ) 87 | 88 | # Ok, back to the design. 89 | m.submodules.cpu = cpu = hapenny.chonk.cpu.Cpu( 90 | # +2 to adjust from bus word addressing to CPU byte 91 | # addressing. 92 | addr_width = BUS_ADDR_BITS + 2, 93 | # Program addresses only need to be able to address program 94 | # memory, so configure the PC and fetch port to be narrower. 95 | # (+2 because, again, our RAM is word addressed but this parameter 96 | # is in bytes.) 97 | prog_addr_width = RAM_ADDR_BITS + 2, 98 | ) 99 | m.submodules.mem = mem = BasicMemory(depth = RAM_WORDS, 100 | contents = boot_image) 101 | # Make the simplest output port possible. 
102 | m.submodules.outport = outport = OutputPort32(1) 103 | m.submodules.fabric = fabric = SimpleFabric([ 104 | mem.bus, 105 | partial_decode(m, outport.bus, RAM_ADDR_BITS), 106 | ]) 107 | 108 | connect(m, cpu.bus, fabric.bus) 109 | 110 | for i in range(1): 111 | led = platform.request("led", i) 112 | m.d.comb += led.o.eq(outport.pins[i]) 113 | 114 | return m 115 | 116 | parser = argparse.ArgumentParser( 117 | prog = "icestick-smallestbig", 118 | description = "Script for synthesizing smallest image for HX1K", 119 | ) 120 | args = parser.parse_args() 121 | 122 | p = ICEStickPlatform() 123 | p.build(Test(), do_program = True) 124 | -------------------------------------------------------------------------------- /icestick-smallest.py: -------------------------------------------------------------------------------- 1 | # This is the smallest happeny SoC model for the Icestick, to show off its size 2 | # when configured with only a basic assembly program and single peripheral. 3 | # 4 | # This is not a spectacularly _useful_ configuration, and is smaller than the 5 | # "small" configurations most other small RV32I SoCs include, so the numbers 6 | # don't necessarily compare directly. It's essentially a 32-bit version of a 7 | # tinyAVR. 8 | # 9 | # Mostly, I use this to keep an eye on the minimum size with a circuit that 10 | # isn't so simple that it optimizes away in synthesis. 
11 | 12 | import itertools 13 | from functools import reduce 14 | import argparse 15 | import struct 16 | from pathlib import Path 17 | 18 | from amaranth import * 19 | from amaranth.lib.wiring import * 20 | from amaranth.build import ResourceError, Resource, Pins, Attrs 21 | from amaranth_boards.test.blinky import Blinky 22 | from amaranth_boards.icestick import ICEStickPlatform 23 | import amaranth.lib.cdc 24 | 25 | from hapenny import StreamSig 26 | import hapenny.cpu 27 | from hapenny.bus import BusPort, SimpleFabric, partial_decode 28 | from hapenny.gpio import OutputPort 29 | from hapenny.mem import BasicMemory 30 | 31 | bootloader = Path("smallest-toggle.bin").read_bytes() 32 | boot_image = struct.unpack("<" + "H" * (len(bootloader) // 2), bootloader) 33 | 34 | image_or = reduce(lambda a, b: a|b, boot_image) 35 | image_and = reduce(lambda a, b: a&b, boot_image) 36 | 37 | problems = set(b for b in range(16) 38 | if image_or & (1 << b) == 0 39 | or image_and & (1 << b) != 0) 40 | 41 | if problems: 42 | print("WARNING: contents of boot ROM may cause logic to be optimized out.") 43 | print("Size estimates generated from such an image would be misleading.") 44 | print(f"The following bit positions are constant: {problems}") 45 | 46 | # the blinky program does not use RAM at all, so we can fit it in a single block RAM. 47 | RAM_WORDS = 256 * 1 48 | RAM_ADDR_BITS = (RAM_WORDS - 1).bit_length() 49 | 50 | # Add an extra bit to the implemented bus so we can also address I/O. 51 | BUS_ADDR_BITS = RAM_ADDR_BITS + 1 52 | 53 | class Test(Elaboratable): 54 | def elaborate(self, platform): 55 | m = Module() 56 | 57 | # Gotta do some clock gymnastics here. We want the PLL on so that we can 58 | # run faster than the Icestick's 12 MHz crystal can go. 
However, setting 59 | # up our own sync domain _silently disables_ the Amaranth 60 | # ICE40Platform's reset delay, which is necssary to work around an 61 | # undocumented erratum in the iCE40 BRAM that has been chasing me for at 62 | # least six years. 63 | # 64 | # So, we're going to reconstruct it manually. 65 | clk12 = platform.request("clk12", dir = "-") 66 | 67 | # 15us delay, 12 MHz clock: 180 cycles 68 | por_delay = int(20e-6 * 12e6) 69 | m.domains += ClockDomain("por", reset_less=True, local=True) 70 | por_timer = Signal(range(por_delay)) 71 | por_ready = Signal() 72 | m.d.comb += ClockSignal("por").eq(clk12.io) 73 | with m.If(por_timer == por_delay): 74 | m.d.por += por_ready.eq(1) 75 | with m.Else(): 76 | m.d.por += por_timer.eq(por_timer + 1) 77 | 78 | cd_sync = ClockDomain("sync") 79 | m.domains += cd_sync 80 | m.d.comb += ResetSignal("sync").eq(~por_ready) 81 | 82 | F = 70.5e6 # Hz 83 | pll_f, pll_q = 46, 3 84 | 85 | platform.add_clock_constraint(cd_sync.clk, F) 86 | print(f"Configuring SoC for {F/1000000:.03} MHz") 87 | 88 | # PLL settings below must generate F from 12MHz; use icepll to adjust. 89 | m.submodules.pll = Instance( 90 | "SB_PLL40_CORE", 91 | p_FEEDBACK_PATH = "SIMPLE", 92 | p_DIVR = 0, 93 | p_DIVF = pll_f, 94 | p_DIVQ = pll_q, 95 | p_FILTER_RANGE = 1, 96 | 97 | i_REFERENCECLK = clk12.io, 98 | i_RESETB = 1, 99 | o_PLLOUTGLOBAL = cd_sync.clk, 100 | ) 101 | 102 | # Ok, back to the design. 103 | m.submodules.cpu = cpu = hapenny.cpu.Cpu( 104 | # +1 to adjust from bus halfword addressing to CPU byte 105 | # addressing. 106 | addr_width = BUS_ADDR_BITS + 1, 107 | # Program addresses only need to be able to address program 108 | # memory, so configure the PC and fetch port to be narrower. 109 | # (+1 because, again, our RAM is halfword addressed but this 110 | # parameter is in bytes.) 
111 | prog_addr_width = RAM_ADDR_BITS + 1, 112 | ) 113 | m.submodules.mem = mem = BasicMemory(depth = RAM_WORDS, 114 | contents = boot_image) 115 | # Make the simplest output port possible. 116 | m.submodules.outport = outport = OutputPort(1, read_back = False) 117 | m.submodules.fabric = fabric = SimpleFabric([ 118 | mem.bus, 119 | partial_decode(m, outport.bus, RAM_ADDR_BITS), 120 | ]) 121 | 122 | connect(m, cpu.bus, fabric.bus) 123 | 124 | for i in range(1): 125 | led = platform.request("led", i) 126 | m.d.comb += led.o.eq(outport.pins[i]) 127 | 128 | return m 129 | 130 | parser = argparse.ArgumentParser( 131 | prog = "icestick-smallest", 132 | description = "Script for synthesizing smallest image for HX1K", 133 | ) 134 | args = parser.parse_args() 135 | 136 | p = ICEStickPlatform() 137 | p.build(Test(), do_program = True) 138 | -------------------------------------------------------------------------------- /icesticktest.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import argparse 3 | import struct 4 | from pathlib import Path 5 | 6 | from amaranth import * 7 | from amaranth.lib.wiring import * 8 | from amaranth.build import ResourceError, Resource, Pins, Attrs 9 | from amaranth_boards.test.blinky import Blinky 10 | from amaranth_boards.icestick import ICEStickPlatform 11 | import amaranth.lib.cdc 12 | 13 | from hapenny import StreamSig 14 | from hapenny.cpu import Cpu 15 | from hapenny.bus import BusPort, SimpleFabric, partial_decode 16 | from hapenny.serial import BidiUart 17 | from hapenny.mem import BasicMemory 18 | 19 | bootloader = Path("tiny-bootloader.bin").read_bytes() 20 | boot_image = struct.unpack("<" + "h" * (len(bootloader) // 2), bootloader) 21 | 22 | # tiny-bootloader is written in a high-level language and needs to have a stack, 23 | # so the minimum here is two 256-halfword (512-byte) RAMs. 
24 | RAM_WORDS = 256 * 2 25 | RAM_ADDR_BITS = (RAM_WORDS - 1).bit_length() 26 | 27 | BUS_ADDR_BITS = RAM_ADDR_BITS + 1 28 | 29 | class Test(Elaboratable): 30 | def __init__(self): 31 | super().__init__() 32 | 33 | def elaborate(self, platform): 34 | m = Module() 35 | 36 | # Gotta do some clock gymnastics here. We want the PLL on so that we can 37 | # run faster than the Icestick's 12 MHz crystal can go. However, setting 38 | # up our own sync domain _silently disables_ the Amaranth 39 | # ICE40Platform's reset delay, which is necssary to work around an 40 | # undocumented erratum in the iCE40 BRAM that has been chasing me for at 41 | # least six years. 42 | # 43 | # So, we're going to reconstruct it manually. 44 | clk12 = platform.request("clk12", dir = "-") 45 | 46 | # 15us delay, 12 MHz clock: 180 cycles 47 | por_delay = int(15e-6 * 12e6) 48 | m.domains += ClockDomain("por", reset_less=True, local=True) 49 | por_timer = Signal(range(por_delay)) 50 | por_ready = Signal() 51 | m.d.comb += ClockSignal("por").eq(clk12.io) 52 | with m.If(por_timer == por_delay): 53 | m.d.por += por_ready.eq(1) 54 | with m.Else(): 55 | m.d.por += por_timer.eq(por_timer + 1) 56 | 57 | cd_sync = ClockDomain("sync") 58 | m.domains += cd_sync 59 | m.d.comb += ResetSignal("sync").eq(~por_ready) 60 | 61 | F = 66e6 # Hz 62 | platform.add_clock_constraint(cd_sync.clk, F) 63 | print(f"Configuring SoC for {F/1000000:.03} MHz") 64 | 65 | # PLL settings below must generate F from 12MHz; use icepll to adjust. 66 | m.submodules.pll = Instance( 67 | "SB_PLL40_CORE", 68 | p_FEEDBACK_PATH = "SIMPLE", 69 | p_DIVR = 0, 70 | p_DIVF = 87, 71 | p_DIVQ = 4, 72 | p_FILTER_RANGE = 1, 73 | 74 | i_REFERENCECLK = clk12.io, 75 | i_RESETB = 1, 76 | o_PLLOUTGLOBAL = cd_sync.clk, 77 | ) 78 | 79 | # Ok, back to the design. 80 | m.submodules.cpu = cpu = Cpu( 81 | # +1 to adjust from bus halfword addressing to CPU byte addressing. 
82 | addr_width = BUS_ADDR_BITS + 1, 83 | # Program addresses only need to be able to address program memory, 84 | # so configure the PC and fetch port to be narrower. (+1 because, 85 | # again, our RAM is halfword addressed but this parameter is in 86 | # bytes.) 87 | prog_addr_width = RAM_ADDR_BITS + 1, 88 | ) 89 | m.submodules.mem = mem = BasicMemory(depth = RAM_WORDS, 90 | contents = boot_image) 91 | # Set the UART for 8x oversample instead of the default 16, to save some 92 | # logic. 93 | m.submodules.uart = uart = BidiUart(baud_rate = 115_200, 94 | oversample = 8, 95 | clock_freq = F) 96 | m.submodules.fabric = fabric = SimpleFabric([ 97 | mem.bus, 98 | partial_decode(m, uart.bus, RAM_ADDR_BITS), 99 | ]) 100 | 101 | connect(m, cpu.bus, fabric.bus) 102 | 103 | uartpins = platform.request("uart", 0) 104 | rx_post_sync = Signal() 105 | m.submodules.rxsync = amaranth.lib.cdc.FFSynchronizer( 106 | i = uartpins.rx.i, 107 | o = rx_post_sync, 108 | o_domain = "sync", 109 | reset = 1, 110 | stages = 2, 111 | ) 112 | m.d.comb += [ 113 | uartpins.tx.o.eq(uart.tx), 114 | uart.rx.eq(rx_post_sync), 115 | ] 116 | 117 | return m 118 | 119 | p = ICEStickPlatform() 120 | p.build(Test(), do_program = True) 121 | -------------------------------------------------------------------------------- /icoboard-large.py: -------------------------------------------------------------------------------- 1 | # Icoboard example using the external SRAM. 
2 | 3 | import itertools 4 | import argparse 5 | import struct 6 | from pathlib import Path 7 | 8 | from amaranth import * 9 | from amaranth.lib.wiring import * 10 | from amaranth.build import ResourceError, Resource, Pins, Attrs 11 | from boards.icoboard import IcoboardPlatform 12 | 13 | from hapenny import StreamSig 14 | from hapenny.cpu import Cpu 15 | from hapenny.bus import BusPort, SimpleFabric, partial_decode 16 | from hapenny.gpio import OutputPort, InputPort 17 | from hapenny.serial import BidiUart 18 | from hapenny.mem import BasicMemory, SpramMemory 19 | from hapenny.extsram import ExternalSRAM 20 | 21 | BOOT_ROM_WORDS = 256 22 | BOOT_ROM_ADDR_BITS = (BOOT_ROM_WORDS - 1).bit_length() 23 | 24 | bootloader = Path("icolarge-bootloader.bin").read_bytes() 25 | boot_image = struct.unpack("<" + "H" * (len(bootloader) // 2), bootloader) 26 | 27 | assert len(boot_image) <= BOOT_ROM_WORDS, \ 28 | f"bootloader is {len(boot(image))} words long, too big for boot ROM" 29 | 30 | class Test(Elaboratable): 31 | def elaborate(self, platform): 32 | m = Module() 33 | 34 | # The Icoboard's input clock is 100MHz, which seems ... optimistic. 35 | # Let's drop it to something we can use. 36 | 37 | # Gotta do some clock gymnastics here. We want the PLL on so that we can 38 | # run faster than the Icestick's 12 MHz crystal can go. However, setting 39 | # up our own sync domain _silently disables_ the Amaranth 40 | # ICE40Platform's reset delay, which is necssary to work around an 41 | # undocumented erratum in the iCE40 BRAM that has been chasing me for at 42 | # least six years. 43 | # 44 | # So, we're going to reconstruct it manually. 
45 | clk100 = platform.request("clk100", dir = "-") 46 | 47 | # 15us delay, 100 MHz clock: 1500 cycles 48 | por_delay = int(15e-6 * 100e6) 49 | m.domains += ClockDomain("por", reset_less=True, local=True) 50 | por_timer = Signal(range(por_delay)) 51 | por_ready = Signal() 52 | m.d.comb += ClockSignal("por").eq(clk100.io) 53 | with m.If(por_timer == por_delay): 54 | m.d.por += por_ready.eq(1) 55 | with m.Else(): 56 | m.d.por += por_timer.eq(por_timer + 1) 57 | 58 | cd_sync = ClockDomain("sync") 59 | m.domains += cd_sync 60 | m.d.comb += ResetSignal("sync").eq(~por_ready) 61 | 62 | #F = 25.781e6 # Hz 63 | #pll_r, pll_f, pll_q, filter_range = 3, 0, 5, 2 64 | F = 50e6 # Hz 65 | pll_r, pll_f, pll_q, filter_range = 1, 0, 4, 4 66 | 67 | 68 | platform.add_clock_constraint(cd_sync.clk, F) 69 | print(f"Configuring SoC for {F/1000000:.05} MHz") 70 | 71 | clk_90 = Signal() 72 | 73 | # PLL settings below must generate F from 12MHz; use icepll to adjust. 74 | m.submodules.pll = Instance( 75 | "SB_PLL40_2F_CORE", 76 | p_FEEDBACK_PATH = "PHASE_AND_DELAY", 77 | p_PLLOUT_SELECT_PORTA = "SHIFTREG_0deg", 78 | p_PLLOUT_SELECT_PORTB = "SHIFTREG_90deg", 79 | p_SHIFTREG_DIV_MODE = 0, 80 | p_DIVR = pll_r, 81 | p_DIVF = pll_f, 82 | p_DIVQ = pll_q, 83 | p_FILTER_RANGE = filter_range, 84 | 85 | i_REFERENCECLK = clk100.io, 86 | i_RESETB = 1, 87 | o_PLLOUTGLOBALA = cd_sync.clk, 88 | o_PLLOUTCOREB = clk_90, 89 | ) 90 | 91 | # Memory map should be: 92 | # 0000_0000 external SRAM (1 MiB) 93 | # 0010_0000 boot ROM 94 | # 0010_1000 UART 95 | # 0010_2000 output port 96 | # 0010_3000 input port 97 | 98 | m.submodules.cpu = cpu = Cpu( 99 | reset_vector = 0x10_0000, # boot ROM 100 | # We need 21-bit addressing to reach both all of the 1 MiB SRAM and 101 | # our boot ROM. This also gives us about a MiB of peripheral space, 102 | # which is great, so we set both program and load/store address 103 | # widths to 21. 
104 | addr_width = 21, 105 | # We'll turn on the counters because we've got the space, and this 106 | # makes the output from Dhrystone a lot more interesting. 107 | counters = True, 108 | ) 109 | 110 | # Create our RAMs. 111 | m.submodules.extsram = extsram = ExternalSRAM(address_bits = 19) 112 | m.submodules.bootmem = bootmem = BasicMemory(depth = BOOT_ROM_WORDS, 113 | contents = boot_image) 114 | m.submodules.mem = mem = BasicMemory(depth = BOOT_ROM_WORDS) 115 | 116 | # Create a subfabric for the top mebibyte of the addressible space. This 117 | # will include both our I/O devices and our boot ROM. We'll give each 118 | # thing a 4096 byte (2048-halfword) region. 119 | m.submodules.outport = outport = OutputPort(3) 120 | m.submodules.inport = inport = InputPort(2) 121 | m.submodules.uart = uart = BidiUart(baud_rate = 115200, 122 | clock_freq = F) 123 | m.submodules.iofabric = iofabric = SimpleFabric([ 124 | partial_decode(m, bootmem.bus, 11), # 0x____0000 125 | partial_decode(m, uart.bus, 11), # 0x____1000 126 | partial_decode(m, outport.bus, 11), # 0x____2000 127 | partial_decode(m, inport.bus, 11) , # 0x____3000 128 | ]) 129 | 130 | # Create the top-level fabric to unite memory and I/O. 131 | m.submodules.fabric = fabric = SimpleFabric([ 132 | extsram.bus, # 0x0000_0000 133 | partial_decode(m, iofabric.bus, 19), # 0x0010_0000 134 | ]) 135 | 136 | connect(m, cpu.bus, fabric.bus) 137 | 138 | # Add some things describing how I've got PMODs connected. 
139 | # USB-serial is connected on top row of PMOD 1 140 | platform.add_resources([ 141 | Resource("tx", 0, Pins("2", dir="o", conn=("pmod", 1))), 142 | Resource("rx", 0, Pins("3", dir="i", conn=("pmod", 1))), 143 | ]) 144 | 145 | # UART wiring 146 | tx = platform.request("tx", 0) 147 | rx = platform.request("rx", 0) 148 | m.d.comb += [ 149 | tx.o[0].eq(uart.tx), 150 | uart.rx.eq(rx.i[0]), 151 | ] 152 | 153 | # LED wiring 154 | for i in range(3): 155 | led = platform.request("led", i) 156 | m.d.comb += led.o.eq(outport.pins[i]) 157 | 158 | # Input port wiring 159 | for i in range(2): 160 | button = platform.request("button", i) 161 | m.d.comb += inport.pins[i].eq(button.i) 162 | 163 | # SRAM wiring. NOTE: Amaranth models all the SRAM control signals as 164 | # active-high and inverts at the pin. This means all of our signals 165 | # here are active-high. This is potentially confusing, hence this 166 | # comment. 167 | sram = platform.request("sram", 0) 168 | m.d.comb += [ 169 | # Our SRAM interface requires a 90-degree-shifted version of the 170 | # clock. 
171 | extsram.clock_90.eq(clk_90), 172 | 173 | sram.cs.o.eq(1), # amaranth inverts this 174 | sram.oe.o.eq(extsram.sram_oe), 175 | sram.we.o.eq(extsram.sram_we), 176 | sram.dm.o.eq(extsram.sram_lanes), 177 | 178 | sram.a.o.eq(extsram.addr_to_sram), 179 | 180 | sram.d.o.eq(extsram.data_to_sram), 181 | sram.d.oe.eq(extsram.sram_we), 182 | extsram.data_from_sram.eq(sram.d.i), 183 | ] 184 | 185 | return m 186 | 187 | p = IcoboardPlatform() 188 | p.build(Test(), do_program = True) 189 | -------------------------------------------------------------------------------- /icolarge-bootloader.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbiffle/hapenny/10d6af538bb47feb26f5fa807f0c2ae6b64f2e9e/icolarge-bootloader.bin -------------------------------------------------------------------------------- /montool/.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | -------------------------------------------------------------------------------- /montool/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "hapenny-montool" 3 | version = "1.0.0" 4 | edition = "2021" 5 | license = "MPL-2" 6 | 7 | [dependencies] 8 | anyhow = "1.0.71" 9 | clap = { version = "4.3.4", features = ["derive", "wrap_help"] } 10 | indicatif = "0.17.5" 11 | parse_int = "0.6.0" 12 | serialport = "4.2.1" 13 | -------------------------------------------------------------------------------- /montool/README.mkdn: -------------------------------------------------------------------------------- 1 | # hapenny montool 2 | 3 | This is a very basic command line tool for interacting with the tinyboot serial 4 | monitor. 
5 | 6 | For instructions: 7 | 8 | `cargo run -- --help` 9 | -------------------------------------------------------------------------------- /montool/src/main.rs: -------------------------------------------------------------------------------- 1 | use std::{time::Duration, path::PathBuf, io::ErrorKind, io::Write}; 2 | 3 | use anyhow::{Context, Result, bail}; 4 | use indicatif::ProgressBar; 5 | use serialport::SerialPort; 6 | use clap::Parser; 7 | 8 | /// A tool for interacting with the hapenny tinyboot serial monitor. 9 | #[derive(Debug, Parser)] 10 | #[clap(version)] 11 | struct BootTool { 12 | /// Path to serial port on your machine, e.g. /dev/ttyUSB0 or COM1: 13 | port: String, 14 | /// Baud rate of serial port. 15 | #[clap(long, short, global = true, default_value_t = 115_200)] 16 | baud_rate: u32, 17 | 18 | #[clap(subcommand)] 19 | cmd: SubCmd, 20 | } 21 | 22 | #[derive(Debug, Parser)] 23 | enum SubCmd { 24 | /// Perform a basic check to see if tinyboot appears to be running. 25 | Ping, 26 | /// Load a single 32-bit word from an address in the target. 27 | Peek { 28 | /// Address to read. 29 | #[clap(value_parser = parse_int::parse::)] 30 | address: u32, 31 | }, 32 | /// Write a single 32-bit word into the taget. 33 | Poke { 34 | /// Address to write. 35 | #[clap(value_parser = parse_int::parse::)] 36 | address: u32, 37 | /// Value to write. 38 | #[clap(value_parser = parse_int::parse::)] 39 | value: u32, 40 | }, 41 | /// Write the contents of a file into the target. Useful for loading a 42 | /// program from a .bin file. 43 | Write { 44 | /// Address to begin writing. 45 | #[clap(value_parser = parse_int::parse::)] 46 | address: u32, 47 | /// File containing bytes to write; will be padded out to a multiple of 48 | /// 4. 49 | image_file: PathBuf, 50 | }, 51 | /// Call into an address in the target. 52 | Call { 53 | /// Address to call. 
54 | #[clap(value_parser = parse_int::parse::)] 55 | address: u32, 56 | /// If provided, the tool will immediately begin echoing back data 57 | /// received on the serial report until you kill it. This is useful for 58 | /// loading and running programs that are chatty, such as Dhrystone. 59 | #[clap(long)] 60 | then_echo: bool, 61 | }, 62 | } 63 | 64 | fn main() -> Result<()> { 65 | let args = BootTool::parse(); 66 | 67 | let mut port = serialport::new(&args.port, args.baud_rate) 68 | .timeout(Duration::from_millis(500)) 69 | .open() 70 | .with_context(|| format!("opening serial port {}", args.port))?; 71 | 72 | drain(&mut port)?; 73 | 74 | match args.cmd { 75 | SubCmd::Ping => { 76 | do_cmd(&mut port, &[5]) 77 | .context("pinging")?; 78 | } 79 | SubCmd::Peek { address } => { 80 | // load addr register 81 | let mut cmd = [3, 0, 0, 0, 0]; 82 | cmd[1..].copy_from_slice(&address.to_le_bytes()); 83 | do_cmd(&mut port, &cmd) 84 | .context("loading A")?; 85 | // load count register 86 | let cmd = [4, 1, 0, 0, 0]; 87 | do_cmd(&mut port, &cmd) 88 | .context("loading C")?; 89 | // read out the data 90 | let cmd = [2]; 91 | do_cmd(&mut port, &cmd) 92 | .context("sending GET")?; 93 | let mut data = [0; 4]; 94 | port.read_exact(&mut data) 95 | .context("waiting for data")?; 96 | println!("{:#x}", u32::from_le_bytes(data)); 97 | } 98 | SubCmd::Poke { address, value } => { 99 | // load addr register 100 | let mut cmd = [3, 0, 0, 0, 0]; 101 | cmd[1..].copy_from_slice(&address.to_le_bytes()); 102 | do_cmd(&mut port, &cmd) 103 | .context("loading A")?; 104 | // load count register 105 | let cmd = [4, 1, 0, 0, 0]; 106 | do_cmd(&mut port, &cmd) 107 | .context("loading C")?; 108 | // deposit the data. 
109 | let mut cmd = [1, 0, 0, 0, 0]; 110 | cmd[1..].copy_from_slice(&value.to_le_bytes()); 111 | do_cmd(&mut port, &cmd) 112 | .context("sending PUT")?; 113 | } 114 | SubCmd::Write { address, image_file } => { 115 | let mut image = std::fs::read(&image_file)?; 116 | while image.len() % 4 != 0 { 117 | image.push(0); 118 | } 119 | // load addr register 120 | let mut cmd = [3, 0, 0, 0, 0]; 121 | cmd[1..].copy_from_slice(&address.to_le_bytes()); 122 | do_cmd(&mut port, &cmd) 123 | .context("loading A")?; 124 | let bar = ProgressBar::new(image.len() as u64); 125 | for chunk in image.chunks(256) { 126 | // load count register 127 | let word_count = u32::try_from(chunk.len() / 4)?; 128 | let mut cmd = [4, 0, 0, 0, 0]; 129 | cmd[1..].copy_from_slice(&word_count.to_le_bytes()); 130 | do_cmd(&mut port, &cmd) 131 | .context("loading C")?; 132 | let mut packet = vec![1]; 133 | packet.extend_from_slice(chunk); 134 | // deposit the data. 135 | do_cmd(&mut port, &packet) 136 | .context("sending PUT")?; 137 | bar.inc(chunk.len() as u64); 138 | } 139 | bar.finish(); 140 | } 141 | SubCmd::Call { address, then_echo } => { 142 | // load addr register 143 | let mut cmd = [3, 0, 0, 0, 0]; 144 | cmd[1..].copy_from_slice(&address.to_le_bytes()); 145 | do_cmd(&mut port, &cmd) 146 | .context("loading A")?; 147 | // go! 
148 | do_cmd(&mut port, &[0]) 149 | .context("sending CALL")?; 150 | 151 | if then_echo { 152 | let stdout = std::io::stdout(); 153 | let mut stdout = stdout.lock(); 154 | loop { 155 | let mut b = [0]; 156 | match port.read_exact(&mut b) { 157 | Ok(()) => { 158 | write!(stdout, "{}", b[0] as char)?; 159 | stdout.flush()?; 160 | }, 161 | Err(e) if e.kind() == ErrorKind::TimedOut => { 162 | // meh 163 | } 164 | other => other?, 165 | } 166 | } 167 | } 168 | } 169 | } 170 | 171 | Ok(()) 172 | } 173 | 174 | fn do_cmd(port: &mut Box, cmd: &[u8]) -> Result<()> { 175 | port.write_all(&cmd).context("writing command")?; 176 | let mut response = [0; 1]; 177 | port.read_exact(&mut response).context("collecting response byte")?; 178 | match response[0] { 179 | 0xAA => Ok(()), 180 | 0xFF => { 181 | bail!("Received NACK"); 182 | } 183 | x => { 184 | bail!("Received unexpected response: {x:#x}"); 185 | } 186 | } 187 | } 188 | 189 | fn drain(port: &mut Box) -> Result<()> { 190 | let saved_timeout = port.timeout(); 191 | 192 | port.set_timeout(Duration::from_millis(1)) 193 | .context("reducing timeout for drain")?; 194 | 195 | let mut buffer = [0; 32]; 196 | let mut cruft = 0_usize; 197 | loop { 198 | match port.read(&mut buffer) { 199 | Ok(n) => cruft += n, 200 | Err(e) if e.kind() == ErrorKind::TimedOut => { 201 | break; 202 | } 203 | Err(e) => return Err(e) 204 | .context("attempting to drain buffer"), 205 | } 206 | } 207 | port.set_timeout(saved_timeout) 208 | .context("restoring timeout after drain")?; 209 | 210 | if cruft > 0 { 211 | println!("note: {cruft} bytes of cruft drained from serial port"); 212 | } 213 | 214 | Ok(()) 215 | } 216 | 217 | -------------------------------------------------------------------------------- /notes/20231001.mkdn: -------------------------------------------------------------------------------- 1 | # Small RV32 core using Amaranth 2 | 3 | Looking into doing a tiny, probably-not-pipelined RV32 core for use on small 4 | iCE40s. 
Likely approach: 5 | 6 | - Use 2x BRAMs to provide a 1R1W register file. 7 | - Since that necessitates most instructions taking >1 cycle, use as many cycles 8 | as seems appropriate. 9 | 10 | This is pretty similar to my Dinky5 core, which, for the record, used the 11 | following states: 12 | 13 | ``` 14 | JustFetchState, // Reset, or second cycle of store. 15 | Reg2State, // Reading instruction, selecting rs2. 16 | Reg1State, // Latching x2, selecting rs1. 17 | ExecuteState, // Executing first instruction cycle. 18 | LoadState, // Second cycle for loads. 19 | ShiftState, // Processing a serial shift operation. 20 | HaltState // Something has gone wrong. 21 | ``` 22 | 23 | Pretty reasonable, I think. 3 cycles for most instructions since the final cycle 24 | issues a fetch. 4 cycles for stores. A great many cycles for shifts. 25 | 26 | Lessee how things shape up in Amaranth. 27 | 28 | --- 29 | 30 | Going pretty well I think 31 | 32 | Currently fetching rs1 followed by rs2. Dinky did it in the other order. I think 33 | there's some value to reversing it. While load and store both use rs1 as the 34 | base address, it's a question of when it becomes available. 35 | 36 | Currently, load is able to skip a cycle compared to store because it only needs 37 | rs1. Store needs to wait for both registers before it can issue a bus 38 | transaction. 39 | 40 | --- 41 | 42 | Interesting observation. 43 | 44 | On a 4LUT part, putting 2muxes on the inputs of an adder has equivalent resource 45 | cost to generating two adders and muxing between them. Assuming the muxes don't 46 | need to be switched separately. 47 | 48 | Muxing adders has the advantage of loosening the timing constraint on the mux 49 | control signal. For whatever that's worth. 
50 | 51 | 52 | -------------------------------------------------------------------------------- /notes/20231002.mkdn: -------------------------------------------------------------------------------- 1 | Got most of RV32 implemented at this point. 2 | 3 | Missing: FENCE and SYSTEM (and so the CSRs and whatnot) 4 | 5 | Current design's synthesizing a lot larger than I'd like. Adding the byte and 6 | halfword loads and stores enlarged things _significantly._ Removing 7 | byte/halfword _store_ support saves ... not a lot. 4 LUTs. Removing 8 | byte/halfword and extending _loads_ saves 105 LUTs. 9 | 10 | SLT and friends are usually the critical path, which has happened to me before. 11 | Making them as simple as possible is basically what I've got. 12 | 13 | I think I might need to take a step back and try to simplify. For instance, I've 14 | currently got two ALUs, effectively -- one for register-immediate and one for 15 | register-register. They're never used on the same cycle. 16 | 17 | --- 18 | 19 | Okay, starting to simplify by more closely mirroring my work on Dinky5 2+ years 20 | ago. I've figured something out. 21 | 22 | Partially implemented core costs: 23 | - 666 LCs with 32-bit PC, but 24 | - 396 LCs with 8-bit PC 25 | 26 | So, that's a lot of the size I'm seeing. Dinky5 used a shrunken PC. 27 | 28 | Why's PC so expensive? 29 | 30 | - Well there's PC+4, PC+IMMU, PC+IMMJ, PC+IMMB... 31 | - And the relevant muxes for selecting among them 32 | 33 | 34 | 35 | ``` 36 | 1100011 Bxx rs2 37 | 0110011 ALU r-r rs2 38 | 0010011 ALU r-i I-format immediate 39 | ``` 40 | -------------------------------------------------------------------------------- /notes/20231003.mkdn: -------------------------------------------------------------------------------- 1 | Okay, got a complete port of Dinky done, and ... it's considerably larger. 2 | 3 | Like, 50% larger. 
4 | 5 | I have admittedly added some features to the design, so, let me parameterize 6 | those so I can evaluate with and without them. 7 | 8 | # Alignment checks 9 | 10 | Present in Dinky5, but, easy enough to make conditional. 11 | 12 | With: 925 LC / 47.88 MHz 13 | Without: 912 LC / 44.75 MHz 14 | 15 | # Wait states / bus ready lines 16 | 17 | DinkyBus doesn't allow wait states. The icestick demo doesn't make much use of 18 | them. 19 | 20 | Removed: 907 LC / 44.29 MHz 21 | 22 | # PC implemented width 23 | 24 | Dinky5 demo on Icestick used an 8-bit PC. PC width doesn't automatically shrink 25 | to fit memory bus implementation width, because PC is program visible in 26 | situations like JAL. 27 | 28 | Parameterizing it down to 8 bits: 830 LC / 47.54 MHz 29 | 30 | Turning alignment checks back on: 846 LC 31 | 32 | # Halting debug 33 | 34 | While halting debug isn't wired up in the icestick demo, maybe I should disable 35 | it conclusively. This is a matter of 36 | 37 | 1. Converting all checks of the `halt_request` pin into constant 0s 38 | 2. Removing the entire HALT case from the switch. 39 | 40 | 838 LC / 43.78 MHz. So, some halt logic _was_ getting built in. But not very 41 | much. 42 | 43 | # Fixing PC on fetch 44 | 45 | Turns out, any overlapped fetch was using the wrong PC value, failing to lop off 46 | its bottom two bits. Bet this was producing some extra muxes! 47 | 48 | 836 LC - so, uh, yeah, technically, but not a lot. 49 | 50 | 51 | # Asserting `mem_in.ready` on loads 52 | 53 | whoops 54 | 55 | No impact on the icestick demo since the memory there ignores it. 56 | 57 | 58 | # Instruction mix 59 | 60 | Motherfucker. 61 | 62 | The synthesis tools are being clever about the contents of RAM. Changing the 63 | instruction mix affects synthesized size. 64 | 65 | Using Dinky5's test program (under the assumption that this shit was happening 66 | before) I get 764 LCs. 
67 | 68 | By filling RAM with random bits, we're up to 955 LCs with the parameterization 69 | above. 70 | 71 | Without it: 1217 72 | 73 | Well, that makes all of these results really hard to compare then. 74 | 75 | 76 | # Retesting with random data in memory 77 | 78 | - Disabling alignment checks: -22 LUTs 79 | - Disabling ready lines from memory: -6 LUTs (note: memory wasn't relying on 80 | them) 81 | - Shrinking PC: -252 LUTs 82 | - Disabling halting debug: -7 LUTs (note: interface was not connected) 83 | 84 | Moving PC displacement immediate selection into decode step and sharing the PC 85 | adder across various control flow instructions: made shit bigger. But also 86 | faster. 87 | 88 | --- 89 | 90 | Ah, shit, the icestick eval demo wasn't wiring up writes to lanes 2 and 3 of 91 | RAM. That likely explains some things. Doing so costs about 52 LUTs. :( 92 | 93 | However, that also seems to stop the synthesis tools from being excessively 94 | smart about RAM contents -- I now get the same results for a small program as I 95 | do for random bits. Still gonna eval with random bits tho. 96 | 97 | 98 | Noticed that the shift amount always comes from `comp_rhs`. Tried using the 99 | bottom 5 bits of `comp_rhs` as the shift amount during shifting, decrementing 100 | it. This makes things bigger. If I need a decrementer anyway, registering the 101 | output is very cheap. 102 | 103 | 104 | --- 105 | 106 | Okay, so. With some incremental improvements, I've got the CPU in the following 107 | configuration: 108 | 109 | - 256-word memory (so 1024 byte, or 2 BRAMs). 110 | - PC width constrained to 10 (i.e. all of the 256-word memory space) 111 | - alignment checks, illegal instruction checks, and bus wait states ENABLED. 112 | - Full 32-register set. 113 | 114 | ... fitting into 965 LCs, or 75% of an hx1k. (Turning off alignment checks: 934 115 | LCs.) Timing suggests a maximum clock rate of 46 MHz. 
116 | 117 | Mostly I've been pulling logic that was conditional on state values / opcode 118 | _up,_ into the top level, so that its LUTs lose their conditional inputs. This 119 | has helped with both size and speed, though the speed's been tremendously 120 | variable. 121 | 122 | I think this is probably a more honest representation of the size of the core, 123 | whereas I think Dinky may have "optimized" itself. 124 | 125 | PicoRV32 doesn't post iCE40 synthesis results, but people are quoting ~1500 LUTs 126 | and saying it won't fit on an hx1k on Reddit. So, my core may be smaller, though 127 | of course picoRV32 implements some features I haven't, like interrupts. 128 | 129 | --- 130 | 131 | Turns out some of the halt logic was being synthesized. 132 | 133 | - Applying a basic fix to that gets us down to 958 LCs (-7) 134 | - Disabling halting debug on icestick (it's not wired up!) gets us to... 968? 135 | +10? Really? 136 | 137 | --- 138 | 139 | Random thoughts on how to be smaller 140 | 141 | - Could probably get away with a 16-bit datapath. 142 | - 143 | -------------------------------------------------------------------------------- /notes/20231004.mkdn: -------------------------------------------------------------------------------- 1 | Starting to play with some more invasive techniques for reducing size. 2 | 3 | Saved 20 LUTs by converting basically all use of funct3 to one-hot. I think this 4 | is mostly working by converting a bunch of muxes to ORs. Gonna see if I can 5 | break down the savings. 6 | 7 | Converting load alignment detect logic: no immediate savings 8 | 9 | Also converting load result assembly logic: -8 LUTs 10 | 11 | Converting store logic: -3 LUTs 12 | 13 | Converting branch condition logic: -14 LUTs (!) 14 | 15 | Converting ALU result logic: -2 LUTs 16 | 17 | Switching PC immediate adder to use a muxed operand based on partially decoded 18 | opcode bits: +58 LUTs! 
fuuuuuck that 19 | 20 | Switching load and store EA computation and alignment checks to use common 21 | logic: -6 LUTs 22 | 23 | Centralizing load shifter to not be opcode sensitive: +3 LUTs 24 | 25 | --- 26 | 27 | Merely switching state encoding to one-hot: +15 LUTs 28 | 29 | Honestly fewer than I expected 30 | 31 | --- 32 | 33 | Wrote a new RV32I implementation (not including ALU ops, atm) using entirely 34 | structural one-hot primitives -- not an if or switch to be found -- and it's 35 | already at 821 LCs. 281 FFs. 36 | 37 | So, I think the "naive RV32" approach will tend to inherently approach 900-950 38 | LCs on iCE40. 39 | 40 | Wondering about alternative approaches I could use to synthesize something 41 | lighter without going all the way to bitserial. SERV's numbers are a good 42 | illustration of the cost of control logic: at 32+ more cycles per instruction, 43 | it's only about 1/3 the size. 44 | 45 | I keep thinking about a 16-bit datapath implementation, a la 68000. The iCE40's 46 | internal RAMs are 16-bit, so 47 | 48 | - Use one for a register file with separate entries for the high and low half of 49 | each general purpose register (so, 64/256 entries consumed) 50 | - Use one as a 16-bit-wide RAM. 51 | - Operate in halfwords. 52 | 53 | Things like addition/subtraction/comparison would need carry flags to link one 54 | operation to the next. Shifts seem more interesting; it might be worth having an 55 | actual 32-bit shift register in an otherwise 16-bit implementation. 56 | 57 | Sketched execution of some instructions: 58 | 59 | ``` 60 | AUIPC x1, 0xAAAAA000 61 | 62 | 1. Load low half of instruction from RAM. 63 | - Note: have at this point: opcode, rd, funct3, low bit of rs1, some 64 | immediate bits 65 | 2. Load high half of instruction from RAM. 66 | - Now we've got rs1 and rs2 67 | 3. Add low half of PC to low half of U-type immediate. Set carry flag 68 | appropriately. Write result to low half of rd. 69 | 4. 
Add upper half of PC to upper half of U-type immediate and carry flag. Write 70 | result to upper half of rd. 71 | - In this cycle we could also begin instruction fetch at PC+4 and set it up 72 | to latch into PC at end of cycle 73 | 74 | ADD x1, x2, x3 75 | 76 | 1. Low fetch 77 | 2. High fetch 78 | - Can start addressing registers here 79 | 3. Low half of a register becomes available, start fetching low half of the 80 | other 81 | 4. Low half of second register becomes available, start fetching high half of 82 | the first. Set up the ALU to add the low halves and set carry. Write to low 83 | half of destination. 84 | 5. High half of first register becomes available, start fetching high half of 85 | other 86 | 6. High half of both registers available, set up the ALU to add the high halves 87 | using saved carry. Write to high half of destination. 88 | ``` 89 | 90 | So if I could arrange things into a general pattern that doesn't require fully 91 | realized microcode, that'd rock. Candidate states: 92 | 93 | - Low Fetch 94 | - High Fetch and Register 2 Low Load 95 | - Register 1 Low Load 96 | - Low Operate, Register 2 High Load 97 | - Register 1 High Load 98 | - High Operate (normally overlapped with Low Fetch) 99 | 100 | For loads, one possible sequence is 101 | 102 | - Register 1 Low Load 103 | - Add low to immediate, register 1 high load. Latch result as low bits of EA. 104 | - Add high to immediate. Use result as high bits of EA. Issue load for first 105 | halfword. 106 | - Write result into rd low once it comes back. If the load is a word load, issue 107 | load for second halfword. 108 | - For word loads, write result into rd high, otherwise set or reset it according 109 | to sign extension and contents of low halfword. 110 | 111 | Word stores might resemble 112 | 113 | - Register 1 Low Load 114 | - Add low to immediate, register 1 high load. Latch result as low bits of EA. 115 | - Add high to immediate. Use result as high bits of EA. Register 2 low load. 
116 | - Issue store for low halfword. Register 2 high load. EA increment. 117 | - Issue store for high halfword. 118 | 119 | ...while byte and halfword stores could skip the final cycle, if desired. 120 | 121 | The need to have the entire EA handy before making the low halfword store is a 122 | strong argument for loading rs1 before rs2, unlike in my 32-bit implementations 123 | that wind up being able to save time by doing it in the other order. 124 | 125 | If I can limit the EA to < 32 bits, the argument gets even stronger. In the 126 | extreme case, a core with a 16-bit physical address space could avoid dealing 127 | with the top half of rs1 in loads/stores entirely. 128 | 129 | 130 | Operating on RV32 immediates in 16-bit units means, effectively, twice as many 131 | possible immediate inputs into an adder. Assuming the same adder serves both top 132 | and bottom halfwords. Because if it doesn't ... what exactly am I doing 133 | 134 | 135 | -------------------------------------------------------------------------------- /notes/20231005.mkdn: -------------------------------------------------------------------------------- 1 | Ok. Loads. 2 | 3 | Currently, by the time we get to OPERATE-HI, we have 4 | - The LSBs of the EA in `mar_lo`. 5 | - The MSBs of the EA on the adder output. 6 | 7 | We can put that on the bus and goto LOAD. 8 | 9 | If we're doing a halfword load, 10 | 11 | - Write the loaded value into the low half of the destination register. 12 | - Record the top bit somewhere (signed) or a zero (unsigned) 13 | - Fill the top half of the register with that value and goto fetch. 14 | 15 | If we're doing a byte load, 16 | - Write either the low or high byte (depending on the bottom bit of EA) into the 17 | destination register's low half, clearing the top byte. 18 | - Record the top bit somewhere (signed) or a zero (unsigned) 19 | - Fill the top half of the register with that value and goto fetch. 
20 | 21 | If we're doing a word load, things are more interesting. We need to issue a 22 | second memory transaction. 23 | 24 | The new EA will be two higher than the last one. However, if we require loads to 25 | be aligned, then all we have to do is bitwise-OR 2 into the EA. No adder 26 | required, no carry chain involved. 27 | 28 | So, we can go ahead and issue the load from `mar_lo` and the adder output, and 29 | then transition to a load high state that stores the result. 30 | 31 | --- 32 | 33 | Muxes 34 | 35 | ``` 36 | adder_rhs 37 | imm_u[15:0] 38 | imm_u[31:16] 39 | imm_j[15:0] 40 | imm_j[31:16] 41 | imm_i[15:0] 42 | imm_i[31:16] 43 | imm_s[15:0] 44 | imm_s[31:16] 45 | rf.read_resp 46 | rf.read_resp ^ 16{inst[30]} (add/sub instruction only?) 47 | ~rf.read_resp (compares) 48 | imm_b[15:0] 49 | imm_b[31:16] 50 | 51 | accum next 52 | pc[15:0] 53 | rf.read_resp 54 | pc[31:16] 55 | accum[14:0], shift_lo[15] (shift left) 56 | shift_fill, accum[15:1] (shift right) 57 | 16{load_result[15]} (for signed loads) 58 | zero (for unsigned loads) 59 | unchanged 60 | 61 | adder_carry_in 62 | 0 might be able to eliminate these constants by 63 | 1 flipping saved_carry 64 | saved_carry 65 | inst[30] 66 | 67 | saved carry next 68 | 0 69 | adder carry out 70 | unchanged (used to carry contents between operate phases) 71 | 72 | rf write payload 73 | zeroes by default but probably not really used 74 | immu[15:0] 75 | adder_result 76 | (pc+4)[15:0] 77 | (pc+4)[15:0] 78 | (pc+4)[31:16] 79 | (pc+4)[31:16] 80 | accum ^ reg 81 | accum | reg 82 | accum & reg 83 | accum ^ imm_i[15:0] 84 | accum | imm_i[15:0] 85 | accum & imm_i[15:0] 86 | accum^imm_i[31:16] 87 | accum|imm_i[31:16] 88 | accum&imm_i[31:16] 89 | immu[31:16] 90 | accum 91 | load_result 92 | shift_lo 93 | zeroes (halted, not real) 94 | 95 | mem out payload addr 96 | zeroes (fake) 97 | pc[31:1] 98 | pc[31:1] + 1 99 | { adder_result, mar_lo[15:1] } 100 | { adder_result, mar_lo[15:1] } | 1 101 | ``` 102 | 103 | states 104 | 105 | 
``` 106 | lo hi 107 | reset 108 | fetch_lo fetch_hi 109 | inst_reg1_lo 110 | fetch_hi_wait 111 | reg2_lo reg2_hi 112 | operate_lo operate_hi 113 | halted 114 | branch_lo branch_hi 115 | load load_hi 116 | load_hi_wait 117 | fill_msbs 118 | store_hi 119 | finish_shift 120 | ``` 121 | 122 | --- 123 | 124 | I feel like I could get this smaller by doing some more fixed-function stuff and 125 | possibly using more states/cycles. 126 | 127 | Also, decompose the state into, say, a state variable and an "active halfword" 128 | bit that determines whether we're operating high or low. 129 | 130 | ``` 131 | ALU x1, x2, x3 (not shift) 132 | 133 | fetch low halfword of instruction 134 | clear zero and carry flags 135 | 136 | low halfword of instruction available 137 | fetch high halfword of instruction 138 | latch low halfword into bottom half of inst 139 | 140 | high halfword of instruction available 141 | latch high halfword into top half of inst 142 | set up read of rs1.lo 143 | 144 | READ_RS2/LO 145 | rs1.lo available 146 | set up read of rs2.lo 147 | latch rs1.lo into accum 148 | 149 | OPERATE/LO 150 | rs2.lo available 151 | compute ALU result of (accum, rs2.lo) 152 | set up write of ALU result into rd.lo 153 | latch carry and zero output 154 | set up read of rs1.hi 155 | flip active to high 156 | 157 | READ_RS2/HI 158 | rs1.hi available 159 | set up read of rs2.hi 160 | latch rs1.hi into accum 161 | 162 | OPERATE/HI 163 | rs2.hi available 164 | compute ALU result of (accum, rs2.hi) 165 | set up write of ALU result into rd.hi 166 | latch carry and zero output 167 | flip active to low (toggle it?) 168 | 169 | increment PC 170 | ``` 171 | -------------------------------------------------------------------------------- /notes/20231006.mkdn: -------------------------------------------------------------------------------- 1 | Alright, sprinting last night and this morning I have a revised state model 2 | working.
This explicitly reuses logic between low and high halfwords where 3 | feasible, separating state into a (state, hi-halfword) pair. Otherwise it 4 | applies no aggressive optimization -- no one-hot state, no explicitly parallel 5 | logic. 6 | 7 | Currently: 786 LCs (61%) 8 | 47 MHz 9 | 10 | CP: regfile -> adder RHS -> adder -> accumulator? 11 | 12 | Current instruction timings: 13 | 14 | ``` 15 | LUI 6 16 | AUIPC 6 17 | JAL 6 18 | JAL 6 19 | JALR 7 20 | Bxx 9 if taken 21 | 7 if not 22 | Lxx 9 23 | SW 8 24 | SB/SH 7 25 | 26 | ALU 7 27 | 28 | shift 9 + amount 29 | ``` 30 | 31 | So if we call 7 the average, we get 6.714 MIPS. 32 | 33 | In many cases I can probably knock a cycle off instructions, because I'm not 34 | currently doing overlapped fetch. But, that's not my current priority. My 35 | current priority is size. 36 | 37 | Currently SLT/SLTU are not implemented, need to fix that. They're a little 38 | tricky because the LSB of the result depends on the MSBs of the input. Might 39 | need an auxiliary state for them. But, those being missing will be throwing off 40 | both my timing and area reports. 41 | 42 | --- 43 | 44 | Okay. Got SLT/SLTU implemented, they wanted another state, works now. 45 | 46 | We're bigger: 831 LCs. (this is all still with `addr_width = 8`.) 47 | 48 | --- 49 | 50 | Down to 825 with some simplifications in SLT. 
51 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | dynamic = ["version"] 3 | 4 | name = "hapenny" 5 | 6 | requires-python = "~=3.8" 7 | dependencies = [ 8 | "amaranth[builtin-yosys]@git+https://github.com/amaranth-lang/amaranth", 9 | "amaranth-boards@git+https://github.com/amaranth-lang/amaranth-boards", 10 | "yowasp-yosys", 11 | ] 12 | 13 | [project.optional-dependencies] 14 | debug = ["jtagtap"] 15 | 16 | [build-system] 17 | requires = ["pdm-backend"] 18 | build-backend = "pdm.backend" 19 | 20 | [tool.pdm.scripts] 21 | _.env_file = ".env.toolchain" 22 | test.composite = ["test-code"] 23 | test-code.cmd = "python -m unittest discover -t . -s tests -v" 24 | -------------------------------------------------------------------------------- /smallest-toggle.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbiffle/hapenny/10d6af538bb47feb26f5fa807f0c2ae6b64f2e9e/smallest-toggle.bin -------------------------------------------------------------------------------- /tiny-bootloader.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbiffle/hapenny/10d6af538bb47feb26f5fa807f0c2ae6b64f2e9e/tiny-bootloader.bin -------------------------------------------------------------------------------- /tinyboot-upduino-chonk.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbiffle/hapenny/10d6af538bb47feb26f5fa807f0c2ae6b64f2e9e/tinyboot-upduino-chonk.bin -------------------------------------------------------------------------------- /tinyboot/.cargo/config: -------------------------------------------------------------------------------- 1 | [build] 2 | target = "riscv32i-unknown-none-elf" 3 | 4 | [target.rv32-hapenny] 5 | rustflags 
= [ 6 | "-C", "link-arg=-Tlink.x", 7 | ] 8 | 9 | [target.riscv32i-unknown-none-elf] 10 | rustflags = [ 11 | "-C", "relocation-model=pie", 12 | "-C", "link-arg=-Tlink.x", 13 | ] 14 | 15 | # For size comparison 16 | [target.thumbv6m-none-eabi] 17 | rustflags = [ 18 | "-C", "link-arg=-Tlink.x", 19 | ] 20 | [target.thumbv7em-none-eabi] 21 | rustflags = [ 22 | "-C", "link-arg=-Tlink.x", 23 | ] 24 | -------------------------------------------------------------------------------- /tinyboot/Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 3 | version = 3 4 | 5 | [[package]] 6 | name = "autocfg" 7 | version = "1.1.0" 8 | source = "registry+https://github.com/rust-lang/crates.io-index" 9 | checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" 10 | 11 | [[package]] 12 | name = "cfg-if" 13 | version = "1.0.0" 14 | source = "registry+https://github.com/rust-lang/crates.io-index" 15 | checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" 16 | 17 | [[package]] 18 | name = "hapenny-tinyboot" 19 | version = "0.1.0" 20 | dependencies = [ 21 | "cfg-if", 22 | "parse_int", 23 | ] 24 | 25 | [[package]] 26 | name = "num-traits" 27 | version = "0.2.17" 28 | source = "registry+https://github.com/rust-lang/crates.io-index" 29 | checksum = "39e3200413f237f41ab11ad6d161bc7239c84dcb631773ccd7de3dfe4b5c267c" 30 | dependencies = [ 31 | "autocfg", 32 | ] 33 | 34 | [[package]] 35 | name = "parse_int" 36 | version = "0.6.0" 37 | source = "registry+https://github.com/rust-lang/crates.io-index" 38 | checksum = "2d695b79916a2c08bcff7be7647ab60d1402885265005a6658ffe6d763553c5a" 39 | dependencies = [ 40 | "num-traits", 41 | ] 42 | -------------------------------------------------------------------------------- /tinyboot/Cargo.toml: -------------------------------------------------------------------------------- 1 | 
[package] 2 | name = "hapenny-tinyboot" 3 | version = "0.1.0" 4 | edition = "2021" 5 | 6 | [dependencies] 7 | cfg-if = "1.0.0" 8 | 9 | [profile.release] 10 | opt-level = "s" 11 | debug = true 12 | 13 | [[bin]] 14 | name = "tinyboot" 15 | path = "src/main.rs" 16 | test = false 17 | bench = false 18 | 19 | [build-dependencies] 20 | parse_int = "0.6.0" 21 | -------------------------------------------------------------------------------- /tinyboot/README.mkdn: -------------------------------------------------------------------------------- 1 | # hapenny tinyboot 2 | 3 | This is a minimal boot ROM and monitor written in Rust. It interacts with a host 4 | over a serial port and provides operations for reading, writing, and calling 5 | memory. 6 | 7 | This is directly inspired by Frank Sergeant's "3-Instruction Forth;" like that 8 | system, this is not Forth. 9 | 10 | ## Building 11 | 12 | `cargo build --release` 13 | 14 | This will generate an ELF file. Extracting a binary file suitable for handing to 15 | the SoC generator: 16 | 17 | ``` 18 | riscv32-elf-objcopy -Obinary \ 19 | target/riscv32i-unknown-none-elf/release/tinyboot \ 20 | path-to-your-output-file.bin 21 | ``` 22 | 23 | In practice, you probably want to override the UART address. You can do that 24 | like this: 25 | 26 | ``` 27 | TINYBOOT_UART_ADDR=0x01_0000 cargo build --release 28 | ``` 29 | 30 | ## Serial protocol 31 | 32 | All the default examples bring up the UART at 115,200 baud, though you can 33 | change this if you like -- it's in the HDL, not the Rust code. 34 | 35 | The protocol is a very simple command-response scheme implementing five 36 | commands. It's a binary protocol; examples below will be shown in hex, but 37 | typing hex digits into the serial port won't do what you want. See the `montool` 38 | in this same repo for a portable command line tool. 39 | 40 | ### Call (0x00) 41 | 42 | Send: `00` 43 | Response: `AA` 44 | 45 | Calls the address in the A register. 
Loads the tinyboot setup routine's address 46 | into `ra` during the call, so if the code you call returns, it'll hop right back 47 | into tinyboot. 48 | 49 | ### Write (0x01) 50 | 51 | Send: `01 nn nn nn nn ...` 52 | Response: `AA` 53 | 54 | Writes words to memory starting at the address in the A register and continuing 55 | for the count in the C register. Decrements the C register by 1 per word, and 56 | increments the A register by 4 per word. 57 | 58 | Words should be sent after the command byte in little-endian format. 59 | 60 | The ACK byte will arrive after all words have been transmitted. 61 | 62 | ### Read (0x02) 63 | 64 | Send: `02` 65 | Response: `AA nn nn nn nn ...` 66 | 67 | Reads words from memory starting at the address in the A register and continuing 68 | for the count in the C register. Decrements the C register by one per word, and 69 | increments the A register by 4 per word. 70 | 71 | The ACK response is sent first, followed by the requested number of words in 72 | little-endian format. 73 | 74 | ### Load A (0x03) 75 | 76 | Send: `03 ww xx yy zz` 77 | Response: `AA` 78 | 79 | Loads a new value into the A register. The value must be sent after the command 80 | byte in little-endian format (so in this example, the value loaded is 81 | `0xzzyyxxww`). 82 | 83 | ### Load C (0x04) 84 | 85 | Send: `04 ww xx yy zz` 86 | Response: `AA` 87 | 88 | Loads a new value into the C register. The value must be sent after the command 89 | byte in little-endian format (so in this example, the value loaded is 90 | `0xzzyyxxww`). 91 | 92 | Note that the count held in the C register is always measured in _words,_ not 93 | bytes. 94 | 95 | ### Ping (0x05) 96 | 97 | Send: `05` 98 | Response: `AA` 99 | 100 | Basic verification that the bootloader is responding. 
101 | 102 | ## Configuring for your board 103 | 104 | The environment variable `TINYBOOT_UART_ADDR` determines the location of the 105 | UART in the address space, which is currently the only configurable part of the 106 | bootloader. 107 | 108 | The binary itself is position-independent so the location of your boot ROM 109 | doesn't matter...except for the following. 110 | 111 | **Note:** The linker script currently assumes that there is useful stack memory 112 | located _immediately below_ the location where tinyboot is loaded. 113 | -------------------------------------------------------------------------------- /tinyboot/build.rs: -------------------------------------------------------------------------------- 1 | use std::{env::VarError, path::PathBuf}; 2 | use std::io::Write; 3 | 4 | fn main() { 5 | println!("cargo:rerun-if-changed=link.x"); 6 | println!("cargo:rerun-if-env-changed=TINYBOOT_UART_ADDR"); 7 | 8 | let addr_input = match std::env::var("TINYBOOT_UART_ADDR") { 9 | // Ugh why is this not an Option 10 | Err(VarError::NotPresent) => None, 11 | Ok(result) => Some(result), 12 | e => panic!("{:?}", e), 13 | }; 14 | 15 | let addr = match addr_input { 16 | None => { 17 | println!("cargo:warning=note: UART address not provided, defaulting to 0x200"); 18 | 0x200 19 | } 20 | Some(text) => { 21 | parse_int::parse::(&text).unwrap() 22 | } 23 | }; 24 | 25 | let mut out = PathBuf::from(std::env::var_os("OUT_DIR").unwrap()); 26 | out.push("peripherals.rs"); 27 | 28 | let mut f = std::fs::File::create(&out).unwrap(); 29 | writeln!(f, "pub const UART_ADDR: u32 = 0x{addr:x};").unwrap(); 30 | } 31 | -------------------------------------------------------------------------------- /tinyboot/link.x: -------------------------------------------------------------------------------- 1 | MEMORY { 2 | PROGMEM (rwx): ORIGIN = 0x8000, LENGTH = 512 3 | RAM (rw): ORIGIN = 0x0000, LENGTH = 32K 4 | } 5 | 6 | EXTERN(__start); 7 | ENTRY(__start); 8 | 9 | SECTIONS { 10 | 
PROVIDE(__stack_start = ORIGIN(RAM) + LENGTH(RAM)); 11 | 12 | PROVIDE(__stext = ORIGIN(PROGMEM)); 13 | 14 | .text __stext : { 15 | *(.start); 16 | 17 | *(.text .text.*); 18 | 19 | . = ALIGN(4); 20 | __etext = .; 21 | } > PROGMEM 22 | 23 | .rodata : ALIGN(4) { 24 | . = ALIGN(4); 25 | __srodata = .; 26 | *(.rodata .rodata.*); 27 | . = ALIGN(4); 28 | __erodata = .; 29 | } > PROGMEM 30 | 31 | /DISCARD/ : { 32 | /* throw away RAM sections to get a link error if they're used. */ 33 | *(.bss); 34 | *(.bss.*); 35 | *(.data); 36 | *(.data.*); 37 | *(COMMON); 38 | *(.ARM.exidx); 39 | *(.ARM.exidx.*); 40 | *(.ARM.extab.*); 41 | *(.got); 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /tinyboot/rust-toolchain.toml: -------------------------------------------------------------------------------- 1 | [toolchain] 2 | channel = "1.73" 3 | targets = [ "riscv32i-unknown-none-elf"] 4 | profile = "minimal" 5 | components = [ "rustfmt", "rust-analyzer" ] 6 | -------------------------------------------------------------------------------- /tinyboot/src/main.rs: -------------------------------------------------------------------------------- 1 | #![no_main] 2 | #![no_std] 3 | 4 | // World's cheapest RISC-V "runtime" - only works because we don't use non-stack 5 | // RAM (as ensured by our linker script) 6 | core::arch::global_asm! { 7 | " 8 | .pushsection .start,\"ax\",%progbits 9 | .globl __start 10 | __start: 11 | # initialize stack pointer 12 | 1: auipc sp, %pcrel_hi(__stack_start) 13 | addi sp, sp, %pcrel_lo(1b) 14 | # No need to fill in a return address, main won't return 15 | j main 16 | 17 | .popsection 18 | " 19 | } 20 | 21 | #[no_mangle] 22 | pub extern "C" fn main() -> ! { 23 | let mut a: *mut u32 = core::ptr::null_mut(); 24 | let mut c: u32 = 0; 25 | loop { 26 | match getc() { 27 | 0 => unsafe { // Call 28 | ack(); 29 | core::arch::asm!( 30 | " 31 | # restart monitor if program returns. 
32 | 1: auipc ra, %pcrel_hi(__start) 33 | addi ra, ra, %pcrel_lo(1b) 34 | 35 | jr a0 # activate routine 36 | ", 37 | in("a0") a, 38 | options(noreturn), 39 | ); 40 | } 41 | 1 => { // Write 42 | while c > 0 { 43 | c -= 1; 44 | let word = get32(); 45 | unsafe { 46 | a.write_volatile(word); 47 | a = a.add(1); 48 | } 49 | } 50 | ack(); 51 | } 52 | 2 => { // Read 53 | ack(); 54 | while c > 0 { 55 | c -= 1; 56 | let word = unsafe { a.read_volatile() }; 57 | unsafe { a = a.add(1); } 58 | put32(word); 59 | } 60 | } 61 | 3 => { // Load A 62 | a = get32() as _; 63 | ack(); 64 | } 65 | 4 => { // Load C 66 | c = get32(); 67 | ack(); 68 | } 69 | 5 => { // Just ping 70 | ack(); 71 | } 72 | _ => { 73 | putb(0xFF); 74 | } 75 | } 76 | } 77 | } 78 | 79 | #[inline(never)] 80 | fn ack() { 81 | putb(0xAA); 82 | flush(); 83 | } 84 | 85 | #[inline(never)] 86 | fn get32() -> u32 { 87 | let mut word = u32::from(getc()); 88 | word |= u32::from(getc()) << 8; 89 | word |= u32::from(getc()) << 16; 90 | word |= u32::from(getc()) << 24; 91 | word 92 | } 93 | 94 | //#[inline(never)] 95 | fn put32(word: u32) { 96 | for b in word.to_le_bytes() { 97 | putb(b); 98 | } 99 | } 100 | 101 | const UARTRX: *mut i16 = generated::UART_ADDR as _; 102 | const UARTTX: *mut u16 = (generated::UART_ADDR + 2) as _; 103 | 104 | fn txbusy() -> bool { 105 | unsafe { 106 | UARTTX.read_volatile() != 0 107 | } 108 | } 109 | 110 | fn flush() { 111 | while txbusy() { 112 | // spin 113 | } 114 | } 115 | 116 | fn putb(b: u8) { 117 | flush(); 118 | unsafe { 119 | UARTTX.write_volatile(u16::from(b)); 120 | } 121 | } 122 | 123 | fn getc() -> u8 { 124 | loop { 125 | let status = unsafe { UARTRX.read_volatile() }; 126 | if status >= 0 { 127 | return status as u8; 128 | } 129 | } 130 | } 131 | 132 | extern "C" { 133 | // This function is deliberately not implemented to cause a link error if we 134 | // include a panic. 
135 | fn panic_handler_should_be_optimized_out() -> !; 136 | } 137 | 138 | #[panic_handler] 139 | fn panic(_info: &core::panic::PanicInfo<'_>) -> ! { 140 | unsafe { 141 | panic_handler_should_be_optimized_out() 142 | } 143 | } 144 | 145 | mod generated { 146 | include!(concat!(env!("OUT_DIR"), "/peripherals.rs")); 147 | } 148 | -------------------------------------------------------------------------------- /upduino-bootloader.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbiffle/hapenny/10d6af538bb47feb26f5fa807f0c2ae6b64f2e9e/upduino-bootloader.bin -------------------------------------------------------------------------------- /upduino-chonk.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import argparse 3 | import struct 4 | from pathlib import Path 5 | 6 | from amaranth import * 7 | from amaranth.lib.wiring import * 8 | from amaranth.build import ResourceError, Resource, Pins, Attrs 9 | from amaranth_boards.upduino_v3 import UpduinoV3Platform 10 | 11 | from hapenny import StreamSig 12 | import hapenny.chonk.cpu 13 | from hapenny.bus import BusPort, SimpleFabric, partial_decode, narrow_addr 14 | from hapenny.chonk.gpio32 import OutputPort32 15 | from hapenny.chonk.serial32 import BidiUart 16 | from hapenny.chonk.mem32 import BasicMemory, SpramMemory 17 | 18 | RAM_WORDS = 256 * 1 19 | RAM_ADDR_BITS = (RAM_WORDS - 1).bit_length() 20 | 21 | print(f"boot memory will use {RAM_ADDR_BITS}-bit addressing") 22 | 23 | bootloader = Path("tinyboot-upduino-chonk.bin").read_bytes() 24 | boot_image = struct.unpack("<" + "I" * (len(bootloader) // 4), bootloader) 25 | 26 | class Test(Elaboratable): 27 | def elaborate(self, platform): 28 | m = Module() 29 | 30 | m.submodules.cpu = cpu = hapenny.chonk.cpu.Cpu( 31 | reset_vector = 0x1_0000, 32 | # 2 for the width of the fabric's port select 33 | # 14 for the width of the devices attached to the 
ports 34 | # +2 because the fabric is word-addressed but this is in bytes 35 | addr_width = 2 + 14 + 2, 36 | # Program addresses only need to be able to address RAM, not 37 | # I/O, so configure the PC and fetch port to be narrower. (+2 38 | # because, again, our RAM is word addressed but this 39 | # parameter is in bytes.) 40 | prog_addr_width = 1 + 14 + 2, 41 | counters = True, 42 | ) 43 | m.submodules.bootmem = bootmem = BasicMemory(depth = RAM_WORDS, 44 | contents = boot_image) 45 | m.submodules.bulkmem0 = bulkmem0 = SpramMemory() 46 | m.submodules.port = port = OutputPort32(1) 47 | m.submodules.uart = uart = BidiUart(baud_rate = 115200, oversample=8) 48 | m.submodules.fabric = fabric = SimpleFabric([ 49 | # Put all the potentially executable RAM in the bottom portion of 50 | # the address space, to allow PC and fetch circuitry to be slightly 51 | # narrower. This helps with timing. 52 | bulkmem0.bus, # 0x0000_0000 53 | partial_decode(m, bootmem.bus, 14), # 0x0001_0000 54 | partial_decode(m, port.bus, 14), # 0x0002_0000 55 | partial_decode(m, uart.bus, 14), # 0x0003_0000 56 | ]) 57 | 58 | connect(m, cpu.bus, fabric.bus) 59 | platform.add_resources([ 60 | Resource("tx", 0, Pins("7", dir="o", conn=("j", 0))), 61 | Resource("rx", 0, Pins("8", dir="i", conn=("j", 0))), 62 | ]) 63 | 64 | tx = platform.request("tx", 0) 65 | rx = platform.request("rx", 0) 66 | 67 | rgb_led = platform.request("rgb_led", 0) 68 | m.d.comb += [ 69 | rgb_led.r.o.eq(cpu.halted), 70 | rgb_led.g.o.eq(port.pins[0]), 71 | tx.o[0].eq(uart.tx), 72 | uart.rx.eq(rx.i[0]), 73 | ] 74 | 75 | return m 76 | 77 | parser = argparse.ArgumentParser( 78 | prog = "upduino-chonk", 79 | description = "Script for synthesizing a larger UPduino SoC using chonk", 80 | ) 81 | args = parser.parse_args() 82 | 83 | 84 | p = UpduinoV3Platform() 85 | p.hfosc_div = 2 # divide 48MHz by 2**2 = 12 MHz 86 | p.build(Test(), do_program = True) 87 |
-------------------------------------------------------------------------------- /upduino-large.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import argparse 3 | import struct 4 | from pathlib import Path 5 | 6 | from amaranth import * 7 | from amaranth.lib.wiring import * 8 | from amaranth.build import ResourceError, Resource, Pins, Attrs 9 | from amaranth_boards.upduino_v3 import UpduinoV3Platform 10 | 11 | from hapenny import StreamSig 12 | import hapenny.cpu 13 | from hapenny.bus import BusPort, SimpleFabric, partial_decode 14 | from hapenny.gpio import OutputPort 15 | from hapenny.serial import BidiUart 16 | from hapenny.mem import BasicMemory, SpramMemory 17 | 18 | RAM_WORDS = 256 * 1 19 | RAM_ADDR_BITS = (RAM_WORDS - 1).bit_length() 20 | 21 | print(f"boot memory will use {RAM_ADDR_BITS}-bit addressing") 22 | 23 | bootloader = Path("upduino-bootloader.bin").read_bytes() 24 | boot_image = struct.unpack("<" + "h" * (len(bootloader) // 2), bootloader) 25 | 26 | class Test(Elaboratable): 27 | def elaborate(self, platform): 28 | m = Module() 29 | 30 | m.submodules.cpu = cpu = hapenny.cpu.Cpu( 31 | reset_vector = 0x0_8000, 32 | # +1 to adjust from bus halfword addressing to CPU byte 33 | # addressing. 34 | addr_width = 2 + 14 + 1, 35 | # Program addresses only need to be able to address RAM, not 36 | # I/O, so configure the PC and fetch port to be narrower. (+1 37 | # because, again, our RAM is halfword addressed but this 38 | # parameter is in bytes.) 
39 | prog_addr_width = 1 + 14 + 1, 40 | counters = True, 41 | ) 42 | m.submodules.bootmem = bootmem = BasicMemory(depth = RAM_WORDS, 43 | contents = boot_image) 44 | m.submodules.bulkmem0 = bulkmem0 = SpramMemory() 45 | m.submodules.port = port = OutputPort(1) 46 | m.submodules.uart = uart = BidiUart(baud_rate = 115200) 47 | m.submodules.fabric = fabric = SimpleFabric([ 48 | # Put all the potentially executable RAM in the bottom portion of 49 | # the address space, to allow PC and fetch circuitry to be slightly 50 | # narrower. This helps with timing. 51 | bulkmem0.bus, # 0x0000_0000 52 | partial_decode(m, bootmem.bus, 14), # 0x0000_8000 53 | partial_decode(m, port.bus, 14), # 0x0001_0000 54 | partial_decode(m, uart.bus, 14), # 0x0001_8000 55 | ]) 56 | 57 | connect(m, cpu.bus, fabric.bus) 58 | platform.add_resources([ 59 | Resource("tx", 0, Pins("7", dir="o", conn=("j", 0))), 60 | Resource("rx", 0, Pins("8", dir="i", conn=("j", 0))), 61 | ]) 62 | def get_all_resources(name): 63 | resources = [] 64 | for number in itertools.count(): 65 | try: 66 | resources.append(platform.request(name, number)) 67 | except ResourceError: 68 | break 69 | return resources 70 | 71 | tx = platform.request("tx", 0) 72 | rx = platform.request("rx", 0) 73 | 74 | rgb_led = platform.request("rgb_led", 0) 75 | m.d.comb += [ 76 | rgb_led.r.o.eq(cpu.halted), 77 | rgb_led.g.o.eq(port.pins[0]), 78 | tx.o[0].eq(uart.tx), 79 | uart.rx.eq(rx.i[0]), 80 | ] 81 | 82 | return m 83 | 84 | parser = argparse.ArgumentParser( 85 | prog = "upduino-large", 86 | description = "Script for synthesizing a larger UPduino SoC", 87 | ) 88 | args = parser.parse_args() 89 | 90 | 91 | p = UpduinoV3Platform() 92 | p.hfosc_div = 1 # divide 48MHz by 2**1 = 24 MHz 93 | p.build(Test(), do_program = True) 94 | --------------------------------------------------------------------------------