├── .gitignore ├── LICENSE.md ├── README.md ├── RISC-V.hw └── RISC-V.lpr ├── RISC-V.ip_user_files └── README.txt ├── RISC-V.srcs └── asynchronous │ ├── acib │ └── README.md │ ├── cpu │ ├── ALU │ │ ├── I │ │ │ ├── and.vhdl │ │ │ └── shift.vhdl │ │ ├── README.md │ │ ├── alu.vhdl │ │ └── insn │ │ │ └── insn_two_register.vhdl │ ├── README.md │ ├── adders │ │ ├── adder.vhdl │ │ ├── adders.md │ │ └── shcadder.vhdl │ ├── async_component_block_diagram.png │ ├── pipelines │ │ ├── README.md │ │ ├── ooe-pipeline.md │ │ ├── simple-pipeline.md │ │ └── simple-pipeline.vhdl │ ├── roadmap.md │ └── shifters │ │ ├── barrel_shifter.vhdl │ │ └── barrel_shifter_no_signex.vhdl │ └── infrastructure │ ├── handshake │ ├── README.md │ ├── handshake.vhdl │ ├── ncl_async_logical_not.png │ ├── ncl_async_register.png │ └── register.vhdl │ ├── ncl │ ├── README.md │ └── ncl.vhdl │ └── transceiver │ ├── transceiver_async_to_sync.vhdl │ └── transceiver_sync_to_async.vhdl └── RISC-V.xpr /.gitignore: -------------------------------------------------------------------------------- 1 | .~ 2 | *.cache/ 3 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | # Derivatives of Other Works 2 | 3 | Works herein derived from other works with incompatible licenses are subject 4 | to the terms of those respective license. 
5 | 6 | # Original Works in this Repository 7 | 8 | The MIT License 9 | 10 | Copyright (c) 2020 Moonset Technologies, LLC 11 | 12 | Permission is hereby granted, free of charge, to any person obtaining a copy 13 | of this software and associated documentation files (the "Software"), to deal 14 | in the Software without restriction, including without limitation the rights 15 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 16 | copies of the Software, and to permit persons to whom the Software is 17 | furnished to do so, subject to the following conditions: 18 | 19 | The above copyright notice and this permission notice shall be included in 20 | all copies or substantial portions of the Software. 21 | 22 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 23 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 24 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 25 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 26 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 27 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 28 | THE SOFTWARE. 29 | 30 | # Expiration 31 | 32 | All works in this repository legally covered by the above license shall remain 33 | so for a term of seven (7) years after initial publication. Updates to such 34 | works shall begin their own separate licensing term at the time of their own 35 | publication. Nothing in this license shall be construed to extend the 36 | licensing term on any version of any work covered herein. 37 | 38 | Upon expiration of the licensing term, all in this repository legally covered 39 | by the above license shall immediately become licensed under the Creative 40 | Commons "CC0" license and shall simultaneously be declared as in the public 41 | domain by the common law meanin in the United States as understood in 2020. 
42 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | RISC-V Soft CPU 2 | =============== 3 | 4 | The RISC-V Soft CPU provides various CPU packages. 5 | 6 | # RV32/64IM 7 | 8 | This is a small-footprint, embedded processor conformant to the following 9 | RISC-V standards: 10 | 11 | * RV32/64I 2.1 12 | * M 2.0 13 | * Uses FPGA multipliers 14 | * Paravartya integer division implementation 15 | 16 | The FPGA implementation of RISC-V is likely unaffected by registers, as LUT 17 | registers are almost never a resource constraint and BRAM is often plentiful. 18 | 19 | 64-bit extension instructions add 15 RV64I and 5 RV64M instructions. 20 | 21 | This implements only the M machine mode privileged system, and has the 22 | following MISA flags available: 23 | 24 | * E 25 | * I 26 | * M 27 | 28 | This core is suitable for embedded environments, notably for the Retro-1 BIOS 29 | implementation. UEFI always runs in M mode on the BIOS embedded CPU. This 30 | core implements no pipelines, simple adders, and synchronous operations to 31 | minimize size. 32 | 33 | # RV32/64IM-Counters-Zicsr-Zifencei 34 | 35 | This extends the RV32/64IM with the following: 36 | 37 | * Counters 2.0 Draft 38 | * Draft for counters 39 | * Cycle counter uses adder loop when non-retired instructions in pipeline: adder increments counter CSR 40 | * Zifencei 41 | 42 | This core also implements the M, S, and U privilege levels, and so implements 43 | MISA flags: 44 | 45 | * E 46 | * I 47 | * M 48 | * S 49 | * U 50 | 51 | This core is suitable for running Linux or Minix operating systems. 52 | 53 | This core implements simple pipelines, Han-Carlson adders, and NULL Convention 54 | Logic for asynchronous execution. It eschews floating point due to large area 55 | usage. 
56 | 57 | # RV32/64IMAFDQC-Counters-Zicsr-Zifencei-Hypervisor 58 | 59 | This extends the RV32/64IM core with floating point and hypervisor support. 60 | This is a *large* core implementing as much logic as possible as NCL. 61 | 62 | This core does not exclude simultaneous multithreading (SMT), out-of-order 63 | execution (OOE), speculative execution, runahead, and so forth. It includes 64 | custom counters to determine which facilities stall the most (e.g. contention 65 | for adders, multipliers, registers in register renaming) to guide customized 66 | implementation. 67 | 68 | 69 | -------------------------------------------------------------------------------- /RISC-V.hw/RISC-V.lpr: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | -------------------------------------------------------------------------------- /RISC-V.ip_user_files/README.txt: -------------------------------------------------------------------------------- 1 | The files in this directory structure are automatically generated and managed by Vivado. Editing these files is not recommended. 2 | -------------------------------------------------------------------------------- /RISC-V.srcs/asynchronous/acib/README.md: -------------------------------------------------------------------------------- 1 | Asynchronous Component Interface Bus 2 | ==================================== 3 | 4 | This describes an Asynchronous Component Interface Bus (ACIB), a bus to 5 | connect components with an asynchronous communications protocol. 6 | 7 | This is an extremely rough draft. 8 | 9 | # Asynchronous Differential Null Convention Coding 10 | 11 | ACIB uses Asynchronous Differential Null Convention Coding (ADNCC) to transmit 12 | data. 13 | 14 | ADNCC uses a serial differential pair to transmit a NULL Convention Logic (NCL) 15 | signal from a sender to a receiver. Unlike NCL communications internal to a 16 | given IC, ACIB is serial and self-timing. 
Like all component buses, it uses 17 | signal negotiation and data error detection and correction to preserve data 18 | integrity. This can create more latency, but high throughput, which is more 19 | appropriate for communications between components. 20 | 21 | ## Differential Pairs 22 | 23 | ADNCC communicates via differential pairs. Each pair has a fixed positive `p` 24 | and a differential negative `n` rail carrying an NCL bit of `[p n]`. The 25 | rails are considered equivalent with a NULL value of `[0 0]` when within a 26 | defined voltage of one another; otherwise, the more-negative rail is the 27 | `1` bit. 28 | 29 | Given a base voltage of 0 on both rails and a 50mV standard signal 30 | differential, the signal would read as follows: 31 | 32 | | `p` | `n` | NCL | Value 33 | | ---:| -----:| -------:| ----: 34 | | 0 | +50mV | `[1 0]` | `0` 35 | | 0 | 0mV | `[0 0]` | `NULL` 36 | | 0 | -50mV | `[0 1]` | `1` 37 | 38 | To transmit multiple `0` or `1` bits in series, the sender must transition to 39 | `NULL` between each bit; whereas a transition directly between `0` and `1` 40 | is always accepted for several reasons: 41 | 42 | * The `p` rail is constant, so there's no chance of a glitch from propagation 43 | delay on `p`; 44 | * A transition between `0` and `1` on the `n` line must necessarily pass 45 | through `NULL`, which would only signal that the next non-`NULL` value is 46 | an intended data bit, and so is implicit; and 47 | * If the `n` line can spuriously transition between `0` and `1`, the circuit 48 | can spuriously transition between `NULL` and non-`NULL`, and no data integrity 49 | is possible. 50 | 51 | This together means there is no value in requiring a `NULL` between valid and 52 | distinct `0` and `1` signals; rather a transition between *any* of the three 53 | states is valid, and the `NULL` state is just not data and not recorded. 54 | 55 | ## Voltage Characteristics 56 | 57 | ADNCC uses a variable transmission voltage. 
Any voltage differential between 58 | 50mV and 300mV is acceptable, negotiated between the two endpoints. `NULL` is 59 | always *sent* as `+0mV`, and the threshold for transition to `NULL` is 1/3 the 60 | voltage differential, while transition to not-`NULL` is 2/3 the voltage. 61 | 62 | (FIXME: is that reasonable thresholding?) 63 | 64 | Implementations are not required to support all voltages. Implementations 65 | *must* support each of `p`+/-50mV, `p`+/-150mV, and `p`+/-300mV. 66 | 67 | ## Signal Negotiation 68 | 69 | Bus protocols over ADNCC must use packet error detection and correction, as 70 | is the case with most modern bus protocols. Bus protocols using ADNCC may 71 | respond to error rate by: 72 | 73 | * Implementing error-correcting coding; 74 | * Negotiating a different voltage differential; or 75 | * Clocking the sender. 76 | 77 | If a voltage differential of 300mV does not result in a low-error connection, 78 | the sender may physically delay each transition to align with a clock signal, 79 | varying this clock and the voltage differential to achieve optimal transmission 80 | rate. The receiver doesn't concern itself with the error rate. 81 | 82 | ## Signal Transmission Rate 83 | 84 | ADNCC provides asynchronous transmission of digital signals. Bus protocols 85 | using ADNCC must negotiate packet size and manage error over this coding. 86 | 87 | ADNCC will operate at higher or lower baud rate based on cable length, 88 | temperature, encoding and decoding hardware, and other characteristics. It 89 | is delay-insensitive, but requires readable signal. 90 | 91 | Data transmission may, in some cases, overwhelm the capabilities of the 92 | receiver in buffering and processing the data. This causes errors unrelated 93 | to the transmission protocol, but rather to the sheer volume of data received. 
94 | Bus protocols using ADNCC must handle these errors either by negotiating 95 | packet size and transmission rate or by slowing down the data transmission as 96 | in any other error condition. 97 | 98 | # Asynchronous Component Interface Bus 99 | 100 | Asynchronous Component Interface Bus (ACIB) uses ADNCC to provide an 101 | asynchronous data bus. 102 | 103 | ## Electrical Characteristics 104 | 105 | ACIB uses two types of connectors: a 20-pin interface and ... 106 | 107 | ### 20-pin connector 108 | 109 | The 20-pin connector is pin-compatible with Type-C, including a maximum 100W 110 | power delivery and four differential pairs. The differential pairs must 111 | operate as ADNCC in ACIB mode. 112 | 113 | ### X-pin connector 114 | 115 | TBD: Number of pairs, power characteristics. 116 | 117 | ## Bus protocol 118 | 119 | XXX: Bus protocol 120 | 121 | Packets have specific connection ID attached to them. 122 | 123 | DMA is negotiated to specific memory areas via a memory controller. 124 | 125 | ### Error Correction 126 | 127 | ACIB uses a fast, variable Reed-Solomon coding to correct for errors, as well 128 | as variation of the ADNCC physical layer. 129 | 130 | TBD: Specific RS Coding, fast hardware implementation. 131 | 132 | ## Implementation considerations 133 | 134 | ACIB transceivers may support multiple devices and simultaneous communication 135 | with the host. Such devices may use multiplexers in a one-to-many or 136 | many-to-many configuration to allow simultaneous communication. 137 | 138 | ACIB controls the communication between two ACIB devices. ADNCC does not 139 | negotiate asynchronous data flow, but only uses delay-insensitive data 140 | transmission. ACIB can delay data flow by delaying an acknowledgement of 141 | readiness for a packet. 142 | 143 | An ACIB transceiver may interface asynchronously with the host device via a 144 | handshake protocol, notably when the ACIB transceiver is integrated into a 145 | SoC. 
This automatically manages behavior related to data transfer and 146 | processing capability: if the ACIB transciever can buffer all data packets 147 | it requests or accepts, then it can wait to acknowledge pending requests or 148 | make new requests simply by waiting until its internal buffers are flushed. 149 | If this happens over an asynchronous handshake protocol, then the transceiver 150 | waits precisely until the host receives and acknowledges its receipt of the 151 | buffered data. 152 | -------------------------------------------------------------------------------- /RISC-V.srcs/asynchronous/cpu/ALU/I/and.vhdl: -------------------------------------------------------------------------------- 1 | -- vim: sw=4 ts=4 et 2 | -- Bitwise AND/OR/XOR: register-register (R-type), or register with a sign-extended 12-bit immediate (I-type) 3 | library IEEE; 4 | use IEEE.std_logic_1164.all; 5 | library async_ncl; 6 | use async_ncl.ncl.all; 7 | use work.e_ncl_logic_register; 8 | use work.e_riscv_insn_async_2reg_infra; 9 | 10 | entity e_riscv_insn_async_2reg is 11 | generic ( XLEN : positive ); 12 | port ( 13 | -- Receiver port and handshake 14 | rs1 : in ncl_logic_vector(XLEN-1 downto 0); 15 | rs2 : in ncl_logic_vector(XLEN-1 downto 0); 16 | insn : in ncl_logic_vector(31 downto 0); 17 | Rr : out std_logic; 18 | Wr : in std_logic; 19 | -- Sender port and handshake 20 | Dout : out ncl_logic_vector(XLEN-1 downto 0); 21 | Rs : in std_logic; 22 | Ws : out std_logic 23 | ); 24 | end e_riscv_insn_async_2reg; 25 | 26 | architecture riscv_i_async_bitmask of e_riscv_insn_async_2reg is 27 | signal Din : ncl_logic_vector( (rs1'LENGTH 28 | + rs2'LENGTH 29 | + insn'LENGTH)-1 downto 0); 30 | -- Buffered into a delay-insensitive register 31 | signal in_buffer : ncl_logic_vector( ( rs1'LENGTH 32 | + rs2'LENGTH 33 | + insn'LENGTH)-1 downto 0); 34 | 35 | signal r_rs1 : ncl_logic_vector(rs1'RANGE); 36 | signal r_rs2 : ncl_logic_vector(rs2'RANGE); 37 | signal r_insn : ncl_logic_vector(insn'RANGE); 38 | 39 | -- Data extracted from the buffered 
instruction 40 | alias opcode : ncl_logic_vector(6 downto 0) is r_insn(6 downto 0); 41 | -- I-type immediate value 42 | alias imm : ncl_logic_vector(11 downto 0) is r_insn(31 downto 20); 43 | -- R-type 44 | alias funct7 : ncl_logic_vector(6 downto 0) is r_insn(31 downto 25); 45 | alias funct3 : ncl_logic_vector(2 downto 0) is r_insn(14 downto 12); 46 | -- opcode is 0010011 if I-type, 0110011 if R-type 47 | alias rtype : ncl_logic is r_insn(5); 48 | begin 49 | 50 | -- DI registered buffer 51 | r_infra: entity e_riscv_insn_async_2reg_infra(riscv_insn_async_2reg_infra) 52 | generic map (XLEN => XLEN) 53 | port map 54 | (rs1 => rs1, 55 | rs2 => rs2, 56 | insn => insn, 57 | Rr => Rr, 58 | Wr => Wr, 59 | Rs => Rs, 60 | Ws => Ws, 61 | -- Buffered registers 62 | rs1b => r_rs1, 63 | rs2b => r_rs2, 64 | insnb => r_insn, 65 | -- send output to 2reg infrastructure 66 | rdl => Dout 67 | ); 68 | 69 | -- TODO: 70 | -- instantiate an ncl_logic_register of length 71 | -- (rs1'LENGTH + rs2'LENGTH + insn'LENGTH) 72 | -- and handshake to store input into that register. 73 | -- 74 | -- Rewrite slices above to use this register 75 | 76 | bitmask : process(all) is 77 | begin 78 | -- FIXME: Handshake. We need the handshake or this WILL fail. 
79 | if ( rtype = ncl_encode('1') ) then 80 | -- R-type opcode: all operands come from the buffered registers 81 | if ((funct3(2) AND funct3(1) AND funct3(0)) = ncl_encode('1')) then 82 | -- funct3 = 111 is AND 83 | Dout <= r_rs1 AND r_rs2; 84 | elsif ( ((funct3(2) AND funct3(1)) = ncl_encode('1')) 85 | AND (funct3(0) = ncl_encode('0'))) then 86 | -- funct3 = 110 = or 87 | Dout <= r_rs1 OR r_rs2; 88 | elsif ( (funct3(2) = ncl_encode('1')) 89 | AND ((funct3(1) OR funct3(0)) = ncl_encode('0'))) then 90 | -- funct3 = 100 = xor 91 | Dout <= r_rs1 XOR r_rs2; 92 | else 93 | -- NULL output 94 | Dout <= (others => (others => '0')); 95 | end if; 96 | elsif ( rtype = ncl_encode('0') ) then 97 | -- I-type opcode: low 12 bits use imm, upper bits use the sign bit imm(11) 98 | if ((funct3(2) AND funct3(1) AND funct3(0)) = ncl_encode('1')) then 99 | -- funct3 = 111 is AND 100 | Dout(11 downto 0) <= r_rs1(11 downto 0) AND imm; 101 | -- Sign extend 102 | for i in Dout'HIGH downto 12 loop 103 | Dout(i) <= r_rs1(i) AND imm(11); 104 | end loop; 105 | elsif ( ((funct3(2) AND funct3(1)) = ncl_encode('1')) 106 | AND (funct3(0) = ncl_encode('0'))) then 107 | -- funct3 = 110 = or 108 | Dout(11 downto 0) <= r_rs1(11 downto 0) OR imm; 109 | -- Sign extend 110 | for i in Dout'HIGH downto 12 loop 111 | Dout(i) <= r_rs1(i) OR imm(11); 112 | end loop; 113 | elsif ( (funct3(2) = ncl_encode('1')) 114 | AND ((funct3(1) OR funct3(0)) = ncl_encode('0'))) then 115 | -- funct3 = 100 = xor 116 | Dout(11 downto 0) <= r_rs1(11 downto 0) XOR imm; 117 | -- Sign extend 118 | for i in Dout'HIGH downto 12 loop 119 | Dout(i) <= r_rs1(i) XOR imm(11); 120 | end loop; 121 | else 122 | -- NULL output 123 | Dout <= (others => (others => '0')); 124 | end if; 125 | else 126 | -- NULL output 127 | Dout <= (others => (others => '0')); 128 | end if; 129 | end process bitmask; 130 | end riscv_i_async_bitmask; -------------------------------------------------------------------------------- /RISC-V.srcs/asynchronous/cpu/ALU/I/shift.vhdl: -------------------------------------------------------------------------------- 1 | -- vim: sw=4 
ts=4 et 2 | -- Shift instructions, including: 3 | -- 4 | -- RV32I 5 | -- SLLI Shift Left Logical Immediate (32) 6 | -- SRLI Shift Right Logical Immediate (32) 7 | -- SRAI Shift Right Arithmetic Immediate (32) 8 | -- SLL Shift Left Logical (32) 9 | -- SRL Shift Right Logical (32) 10 | -- SRA Shift Right Arithmetic (32) 11 | -- 12 | -- RV64I 13 | -- SLLI Shift Left Logical Immediate (64) 14 | -- SRLI Shift Right Logical Immediate (64) 15 | -- SRAI Shift Right Arithmetic Immediate (64) 16 | -- SLL Shift Left Logical (64) 17 | -- SRL Shift Right Logical (64) 18 | -- SRA Shift Right Arithmetic (64) 19 | -- SLLIW SLLI (32) 20 | -- SRLIW SRLI (32) 21 | -- SRAIW SRAI (32) 22 | -- SLLW SLL (32) 23 | -- SRLW SRL (32) 24 | -- SRAW SRA (32) 25 | -- 26 | -- RV128I 27 | -- TBA 28 | 29 | library IEEE; 30 | use IEEE.std_logic_1164.all; 31 | use work.ncl.all; 32 | 33 | architecture riscv_insn_shift of riscv_insn is 34 | begin 35 | -- XLEN will be 32, 64, or 128, and will instantiate a shifter 36 | -- that many bits wide. 37 | -- 38 | -- The barrel shifter can place bail-out circuits at each halving 39 | -- of the bit width, e.g. with XLEN=128 and BitWidths=3, the 40 | -- shifter can direct to output at 128, 64, or 32 bits. 41 | barrel_shifter: entity e_barrel_shifter_ncl(a_barrel_shifter_ncl) 42 | generic map (n => XLEN, 43 | BitWidths => BitWidthCount ); 44 | 45 | -- TODO: send current bit width mode to barrel_shifter 46 | 47 | -- TODO: barrel_shifter above has only a generic map; its port map is still missing 48 | 49 | end architecture; 50 | -------------------------------------------------------------------------------- /RISC-V.srcs/asynchronous/cpu/ALU/README.md: -------------------------------------------------------------------------------- 1 | Arithmetic Logic Unit 2 | ===================== 3 | 4 | The ALUs here implement RV32I and RV64I instructions. Various configurations 5 | may enable multiple copies of particular facilities (adders, multipliers, 6 | incrementers), multi-port ALUs (for SMT or OOE), and other features. 
7 | 8 | ALUs execute instructions in the order and with the data they are given. 9 | Out-of-order and speculative execution are carried out before sending 10 | instructions to the ALU. 11 | -------------------------------------------------------------------------------- /RISC-V.srcs/asynchronous/cpu/ALU/alu.vhdl: -------------------------------------------------------------------------------- 1 | -- vim: sw=4 ts=4 et 2 | -- 3 | -- Highly-conceptual rough-in, still incomplete 4 | library IEEE; 5 | use IEEE.std_logic_1164.all; 6 | use work.ncl.all; 7 | 8 | -- Circuit to 9 | entity insn_output is 10 | generic ( XLEN : positive ); 11 | port ( 12 | -- The content to stick into rd 13 | rd : ncl_logic_vector(XLEN-1 downto 0); 14 | -- The instruction itself, which contains 15 | -- rd at [11:7], along with all information 16 | -- about read and write targets 17 | insn : ncl_logic_vector(31 downto 0) 18 | ); 19 | end insn_output; 20 | library IEEE; use IEEE.std_logic_1164.all; use work.ncl.all; -- context clauses apply per design unit 21 | entity insn_riscv_execution is 22 | generic ( XLEN : positive ); 23 | port ( 24 | rs1 : in ncl_logic_vector(XLEN-1 downto 0); 25 | rs2 : in ncl_logic_vector(XLEN-1 downto 0); 26 | insn : in ncl_logic_vector(31 downto 0); 27 | -- FIXME: Need all the machine registers passed in 28 | -- some readable manner so instructions can react to 29 | -- the machine's mode. 
30 | -- 31 | -- MISA lets us at least check 32 | misa_r : in ncl_logic_vector(XLEN-1 downto 0); 33 | Rt, Wr : in std_logic; 34 | -- rd is the actual output data 35 | rd : out ncl_logic_vector(XLEN-1 downto 0); 36 | Rr, Wt : out std_logic 37 | ); 38 | end insn_riscv_execution; 39 | library IEEE; use IEEE.std_logic_1164.all; use work.ncl.all; -- context clauses apply per design unit 40 | entity insn_riscv_decoder is 41 | generic ( XLEN : positive ); 42 | port ( 43 | insn : in ncl_logic_vector(31 downto 0); 44 | Rr, Wr : in std_logic; 45 | pc : in ncl_logic_vector(XLEN-1 downto 0); 46 | misa_r : in ncl_logic_vector(XLEN-1 downto 0); 47 | -- Change this to actual not-crap 48 | regfile : in ncl_logic_vector(15 downto 0); 49 | -- rd is the actual output data 50 | rd : out ncl_logic_vector(XLEN-1 downto 0); 51 | Rt, Wt : out std_logic 52 | ); 53 | end insn_riscv_decoder; 54 | 55 | architecture a_insn_riscv_decoder of insn_riscv_decoder is 56 | signal data_rs1, data_rs2 : ncl_logic_vector(XLEN-1 downto 0); 57 | -- TODO: signal Rinsn (declaration was left unfinished; type undecided) 58 | begin 59 | 60 | andi_insn : entity work.insn_riscv_execution(insn_riscv_andi) 61 | generic map ( XLEN => XLEN) 62 | port map ( 63 | rs1 => data_rs1, 64 | rs2 => data_rs2, 65 | insn => insn, 66 | misa_r => misa_r 67 | -- TODO: connect Rt, Wr, rd, Rr, Wt 68 | ); 69 | 70 | 71 | end a_insn_riscv_decoder; 72 | -------------------------------------------------------------------------------- /RISC-V.srcs/asynchronous/cpu/ALU/insn/insn_two_register.vhdl: -------------------------------------------------------------------------------- 1 | -- vim: sw=4 ts=4 et 2 | -- AND a sign-extended 12-bit immediate register 3 | library IEEE; 4 | use IEEE.std_logic_1164.all; 5 | library async_ncl; 6 | use async_ncl.ncl.all; 7 | use work.e_ncl_logic_register; 8 | use work.e_ncl_handshake_receiver; 9 | 10 | entity e_riscv_insn_async_2reg_infra is 11 | generic ( XLEN : positive ); 12 | port ( 13 | -- Receiver port and handshake 14 | rs1 : in ncl_logic_vector(XLEN-1 downto 0); 15 | rs2 : in ncl_logic_vector(XLEN-1 downto 0); 16 | insn : in ncl_logic_vector(31 downto 0); 17 | Rr : out std_logic; 18 | Wr : in std_logic; 19 | -- Sender 
handshake 20 | Rs : in std_logic; 21 | Ws : out std_logic; 22 | -- Logic circuit: buffered rs1, rs2, insn 23 | rs1b : out ncl_logic_vector(XLEN-1 downto 0); 24 | rs2b : out ncl_logic_vector(XLEN-1 downto 0); 25 | insnb: out ncl_logic_vector(31 downto 0); 26 | -- result from the logic circuit 27 | rdl : in ncl_logic_vector(XLEN-1 downto 0) 28 | ); 29 | end e_riscv_insn_async_2reg_infra; 30 | 31 | architecture riscv_insn_async_2reg_infra of e_riscv_insn_async_2reg_infra is 32 | signal Din : ncl_logic_vector( (rs1'LENGTH 33 | + rs2'LENGTH 34 | + insn'LENGTH)-1 downto 0); 35 | -- Buffered into a delay-insensitive register 36 | signal in_buffer : ncl_logic_vector( ( rs1'LENGTH 37 | + rs2'LENGTH 38 | + insn'LENGTH)-1 downto 0); 39 | 40 | alias r_rs1 : ncl_logic_vector((rs1'LENGTH)-1 downto 0) is 41 | in_buffer((rs1'LENGTH)-1 downto 0); 42 | 43 | alias r_rs2 : ncl_logic_vector( (rs2'LENGTH)-1 downto 0) is 44 | in_buffer( (rs1'LENGTH 45 | + rs2'LENGTH)-1 downto (rs1'LENGTH)); 46 | 47 | alias r_insn : ncl_logic_vector( (insn'LENGTH)-1 downto 0) is 48 | in_buffer( (rs1'LENGTH 49 | + rs2'LENGTH 50 | + insn'LENGTH)-1 downto (rs1'LENGTH 51 | + rs2'LENGTH)); 52 | signal r_Enable : std_logic; 53 | signal r_Clear : std_logic; 54 | signal r_Stored : std_logic; 55 | -- Receiver handshake 56 | signal r_hs_Enable : std_logic; 57 | 58 | begin 59 | 60 | -- DI registered buffer 61 | r_buffer: entity e_ncl_logic_register(ncl_logic_register) 62 | generic map (n => rs1'LENGTH + rs2'LENGTH + insn'LENGTH) 63 | port map 64 | (D => Din, 65 | Q => in_buffer, 66 | En => r_Enable, 67 | CLR => r_Clear, 68 | W => Wr, 69 | Stored => r_Stored 70 | ); 71 | 72 | -- Handshake to receive input data 73 | hs_receiver: entity e_ncl_handshake_receiver(ncl_handshake_receiver) 74 | port map ( 75 | Ready => Rr, 76 | -- Enable when nothing stored 77 | En => NOT r_Stored, -- FIXME: Needs to come from the ICT component (yellow) 78 | Waiting => Wr, 79 | EnOut => r_Enable 80 | ); 81 | 82 | -- TODO: Input completion 
test component 83 | -- TODO: Sender handshake component 84 | -- TODO: Flush signal 85 | 86 | -- TODO: Setup receiver handshake enable 87 | 88 | -- Set up r_buffer input signal 89 | Din((rs1'LENGTH)-1 downto 0) <= rs1; 90 | Din( (rs1'LENGTH 91 | + rs2'LENGTH)-1 downto (rs1'LENGTH)) <= rs2; 92 | Din( (rs1'LENGTH 93 | + rs2'LENGTH 94 | + insn'LENGTH)-1 downto (rs1'LENGTH 95 | + rs2'LENGTH)) <= insn; 96 | 97 | -- Drive the buffered operand outputs from the register slices 98 | rs1b <= r_rs1; 99 | rs2b <= r_rs2; 100 | insnb <= r_insn; 101 | 102 | end riscv_insn_async_2reg_infra; -------------------------------------------------------------------------------- /RISC-V.srcs/asynchronous/cpu/README.md: -------------------------------------------------------------------------------- 1 | Asynchronous CPU Components 2 | =========================== 3 | 4 | These CPU components are asynchronous. They include adders, dividers, 5 | pipelines, and other features. 6 | 7 | # Major Architecture 8 | 9 | Ultimately, this RISC-V implementation will use an entirely asynchronous 10 | architecture. This consumes significant area, largely due to routing; 11 | however, routing is between directly-attached components generally, and 12 | should not be a problem in and of itself. 13 | 14 | In general, an Asynchronous CPU operates in a synchronous system as below: 15 | ``` 16 | __________________________________________________________ 17 | | _______________ ________________________ | 18 | CLK-|--| Transceiver |=Handshake=| Asynchronous circuitry | | 19 | D0..n=|==| |=D[0]0..n==| | | 20 | | | |=D[1]0..n==| | | 21 | | |_______________| |________________________| | 22 | |__________________________________________________________| 23 | ``` 24 | Above, a transceiver operates as a clocked (synchronous) component and an 25 | unclocked (asynchronous) component. The asynchronous side experiences delay 26 | controlled by the clock, but uses the asynchronous protocol. 
27 | 28 | Marcos Luiggi Lemos Sartori of the Pontifical Catholic University of Rio 29 | Grande do Sul once [wrote](https://www.inf.pucrs.br/~calazans/publications/2017_MarcosSartori_EoTW.pdf): 30 | 31 | > As far as the Author knows, this is both the first asynchronous RISC-V 32 | > implementation and the first use of Go as a hardware description language. 33 | 34 | The [ARV implementation](https://github.com/marlls1989/arv) appears to be a 35 | RISC-V emulator written in Go, although the author notes: 36 | 37 | > A smart asynchronous synthesis tool can extract the intended behaviour 38 | > from the high level model and implement it in any such template. 39 | 40 | So far as we are aware, the VHDL implementation presented here is the first 41 | asynchronous RISC-V CPU hardware implementation, and the first using unbroken 42 | NULL Convention Logic to implement delay-insensitive components. Unlike 43 | [previous work by Christensen, Jensen, Jorger, and Sparsø](https://backend.orbit.dtu.dk/ws/portalfiles/portal/4361393/Christensen.pdf), which implemented 44 | an asynchronous TinyRISC™ TR41401 via delay elements, the RISC-V implementation 45 | here uses NULL Convention Logic (NCL) and delay-insensitive registers to 46 | overcome timing issues. 47 | 48 | This CPU requires transceivers at every memory access point, including to 49 | access any BRAM used as cache, DRAM used as main memory, or internal DSP and 50 | multiplier facilities. It provides a full VHDL implementation of all 51 | facilities except internal cache to facilitate synthesization as an ASIC; 52 | configurable support for internal use of FPGA facilities is included. 53 | 54 | # Handshake 55 | 56 | A completion-detection handshake allows for delay-insensitive components. 
Such components are attached as follows: 57 | ``` 58 | Sender Receiver 59 | _______________ _______________ 60 | | Ready (in) |-| Ready (out) | 61 | | Waiting (out) |-| Waiting (in) | 62 | | d[0..x] (out) |=| d[0..x] (in) | 63 | |_______________| |_______________| 64 | ``` 65 | A strict handshake protocol ensures transitions on each side follow a state 66 | machine in which data must be acknowledged seen, then not seen; sent, then 67 | not sent; and so forth. This protocol ensures each sender holds the data 68 | lines stable until the recipient acknowledges it has a stable copy of the data, 69 | and only sends data when a recipient *is* ready to receive data. 70 | 71 | # NULL Convention Logic 72 | 73 | Asynchronous components use a form of one-hot logic called NULL Convention 74 | Logic. Each bit has one of the following states: 75 | 76 | ``` 77 | High Low Value 78 | 0 0 NULL 79 | 1 0 0 80 | 0 1 1 81 | ``` 82 | 83 | The `[1 1]` signal is invalid. Completion detection circuits wait for all 84 | bits to see `High XOR Low = 1` before signaling the completion of some action. 85 | 86 | # Asynchronous Process 87 | 88 | The asynchronous process relies on both the handshake and NCL to function. 89 | 90 | Consider the below: 91 | ``` 92 | Sender Adder Consumer 93 | _______________ ______________________________ ______________ 94 | | Ready (in) |-| Ready (out) Ready (in)|-| Ready (out) | 95 | | Waiting (out) |-| Waiting (in) Waiting (out)|-| Waiting (in) | 96 | | d[0..x] (out) |=| d[0..x] (in) d[0..x] (out)|=| d[0..x] (in) | 97 | |_______________| |______________________________| |______________| 98 | ``` 99 | Above, the **Sender** sends a computation to the **Adder**, which sends the 100 | result to the **Consumer**. 
101 | 102 | Overall, an asynchronous component has the below general block diagram: 103 | 104 | ![Asynchronous component block diagram](async_component_block_diagram.png) 105 | 106 | Think of the fancy parallel prefix adder as below: 107 | ``` 108 | [Input] (in) Waiting, (Out) Ready 109 | | | | 110 | [Register] 111 | | | | * Completion: input 112 | G G G 113 | | /| /| 114 | G G | 115 | | /| | 116 | G | | 117 | / | | | * Completion: output 118 | [ Output ] (Out) Waiting, (In) Ready 119 | 120 | ``` 121 | The component needs its data input to remain in place until its data output 122 | is complete and no longer needed by the receiver of this output. That means 123 | all circuits must complete before this can propagate down. 124 | 125 | The asynchronous register stores the data in a delay-insensitive manner (see 126 | [the handshake components](handshake/)), allowing the handshake to immediately 127 | finish while the component processes the data. The component becomes ready 128 | for new data as soon as the next component has likewise stored the output 129 | and signaled it has done so (by clearing `Ready`). 130 | 131 | This coordination is necessary to ensure asynchronous components do not get 132 | out of sync and produce bad data. Clocked circuits assume every component 133 | does its part in one clock cycle, while asynchronous circuits move data as 134 | soon as the sender is ready to send and the receiver is ready to receive. 135 | This can vary with electrical characteristics, temperature, and which 136 | component is in use—parallel adders, slow multipliers, fast incrementers, 137 | all with different amounts of delay, and all operating at full speed rather 138 | than at the speed of the slowest, even when those speeds change. 
139 | -------------------------------------------------------------------------------- /RISC-V.srcs/asynchronous/cpu/adders/adder.vhdl: -------------------------------------------------------------------------------- 1 | -- adder components 2 | -- 3 | -- These are parts of adders: two entity declarations and a full-adder architecture 4 | library IEEE; 5 | use IEEE.std_logic_1164.all; 6 | library async_ncl; 7 | use async_ncl.ncl.all; 8 | 9 | -- Binary adder 10 | -- 11 | -- Ripple-Carry: 12 | -- 13 | -- A B 14 | -- | | 15 | -- ------- 16 | -- Cout-| Adder |-Cin 17 | -- ------- 18 | -- | 19 | -- S 20 | -- 21 | -- Parallel prefix: 22 | -- 23 | -- A B 24 | -- | | 25 | -- ------- 26 | -- Cout (G)-| Adder |-Cin (G[n-1]) 27 | -- ------- 28 | -- | 29 | -- S (P) 30 | -- 31 | -- Parallel prefix adder sends P to an XOR gate along with Cin 32 | -- (final output from last stage), so it has the same interface. 33 | -- In architecture, G would be sent to Cout, P sent to MUX. 34 | entity binary_adder_ncl_entity is 35 | port( 36 | A : in ncl_logic; -- addend bit 37 | B : in ncl_logic; -- addend bit 38 | Cin : in ncl_logic; -- carry in 39 | Cout : out ncl_logic; -- carry out (G in the parallel-prefix form above) 40 | S : out ncl_logic -- sum bit (P in the parallel-prefix form above) 41 | ); 42 | end binary_adder_ncl_entity; 43 | 44 | library IEEE; 45 | use IEEE.std_logic_1164.all; 46 | library async_ncl; 47 | use async_ncl.ncl.all; 48 | -- There are two forms of this. All but the last for a given 49 | -- bit are as follows: 50 | -- 51 | -- G Gin P Pin 52 | -- | | | | 53 | -- | AND-| | 54 | -- | | AND 55 | -- XOR | 56 | -- | | 57 | -- Gout Pout 58 | -- 59 | -- The last stage is as follows: 60 | -- 61 | -- G Gin P 62 | -- | | | 63 | -- | AND- 64 | -- | | 65 | -- XOR 66 | -- | 67 | -- Gout 68 | entity binary_adder_pg_mux_ncl_entity is 69 | port ( 70 | P : in ncl_logic; -- this bit's propagate 71 | G : in ncl_logic; -- this bit's generate 72 | Pin : in ncl_logic; -- incoming propagate 73 | Gin : in ncl_logic; -- incoming generate 74 | Pout : out ncl_logic; -- per the diagram above: P AND Pin 75 | Gout : out ncl_logic -- per the diagram above: G XOR (Gin AND P) 76 | ); 77 | end binary_adder_pg_mux_ncl_entity; -- NOTE(review): no architecture for this entity appears in this file 78 | 79 | -- A simple full adder.
80 | -- 81 | -- A-------A 82 | -- | N----- 83 | -- | B-----D | 84 | -- | | | 85 | -- XOR | 86 | -- |-------A | 87 | -- | N--OR 88 | -- | CIN---D | 89 | -- | | | 90 | -- XOR | 91 | -- | | 92 | -- S Cout 93 | -- 94 | -- All computations require NCL-complete input signals and pass 95 | -- NULL if any signal is incomplete. This prevents invalid output. 96 | architecture binary_adder_ncl_fulladder_arch of binary_adder_ncl_entity is 97 | begin 98 | -- S bit is A XOR B XOR Cin; output NULL if A or B is null (presumably via the ncl_logic operator overloads in async_ncl.ncl; confirm there) 99 | S <= A XOR B XOR Cin; 100 | -- Cout is (A AND B) OR ((A XOR B) AND Cin), Boolean-equivalent to the majority of A, B and Cin; output NULL if null 101 | Cout <= (A AND B) OR ((A XOR B) AND Cin); 102 | end binary_adder_ncl_fulladder_arch; 103 | -------------------------------------------------------------------------------- /RISC-V.srcs/asynchronous/cpu/adders/adders.md: -------------------------------------------------------------------------------- 1 | Adders 2 | ====== 3 | 4 | Various adders are available, using various amount of space and operating 5 | at various speeds. 6 | 7 | # Speculative Adders 8 | 9 | Speculative adders take up additional space, but operate at higher frequencies. 10 | They can run at higher fmax in synchronous circuits, and in less time in 11 | asynchronous circuits. 12 | 13 | Asynchronous adders require additional space, but have enormous advantages in 14 | asynchronous circuits. 15 | 16 | In synchronous circuits, if the fmax of the adder is higher than the fmax of 17 | the CPU in general, the adder can be clocked higher and latch its output to 18 | provide the addition in one CPU clock cycle instead of two when speculation 19 | produces error. Speculative adders have an error probability on the order of 20 | 10^-5, so this rarely happens and is not worth the additional space. 21 | 22 | In a CPU with an asynchronous pipeline, a clocked speculative adder can run at 23 | high speed to the same benefit, with a clock rate independent of the CPU.
An 24 | asynchronous speculative adder can return a result immediately upon completion, 25 | with negligible additional delay when speculation fails. Synchronous 26 | speculative adders with lower delay but higher error probability can require 27 | several clock cycles to recover; while asynchronous highly-speculative adders 28 | can take advantage of early completion. 29 | 30 | ## Han-Carlson 31 | 32 | The Han-Carlson Speculative Adder shortens the critical path by one stage. It 33 | detects and corrects for error in the rare case of an error. This adder 34 | consumes minimal area and has a high fmax. 35 | -------------------------------------------------------------------------------- /RISC-V.srcs/asynchronous/cpu/adders/shcadder.vhdl: -------------------------------------------------------------------------------- 1 | -- Speculative Han-Carlson adder 2 | -- 3 | -- A one-bit full adder looks as below: 4 | -- 5 | -- S <= A XOR B XOR Cin 6 | -- Cout <= (A AND B) OR (B AND Cin) OR (Cin AND A) 7 | -- 8 | -- A different adder uses three circuits. 9 | -- 10 | -- Adder: 11 | -- 12 | -- A : (in) 13 | -- B : (in) 14 | -- G : (out) 15 | -- P : (out) 16 | -- G <= A AND B 17 | -- P <= A XOR B 18 | -- 19 | -- Propagate: 20 | -- 21 | -- Gin : (in) 22 | -- Pin : (in) 23 | -- Cin : (in) 24 | -- PCin : (in) 25 | -- Gout <= (Pin AND Cin) XOR Gin 26 | -- Pout <= Pin AND PCin 27 | -- 28 | -- Sum bit: 29 | -- 30 | -- Pin : (in) 31 | -- Cin : (in) 32 | -- S : (out) 33 | -- S <= Pin XOR Cin 34 | -- 35 | -- P from the Adder goes to the Sum bit. Gout from the Adder goes to 36 | -- Cin on the NEXT Propagator. The final propagated Gout goes to Cin on 37 | -- the NEXT Sum bit. 38 | -- 39 | -- These propagate forward a bunch, creating a complex mess. Han-Carlson 40 | -- simply shortcuts some of this: 41 | -- 42 | -- For every even bit, Gout and Pout from the final Propagate cycle begin 43 | -- forwarding to PCin in the next stage at each power of two. 
That is: 44 | -- Bit 0 sends its (G,P) from Input to Stage 1 of bit 1, which sends its 45 | -- (G,P) from Stage 1 to Stage 2 of Bit 3, which sends its (G,P) from 46 | -- Stage 2 to Stage 3 of Bit 7, and so forth. In the final stage, the 47 | -- odd bits propagate their (P,G) to the outputs. 48 | -- 49 | -- Each bit has to propagate to each other bit. At Stage 1, Bit 1 50 | -- propagates to Stage 2 of Bit 3; at Stage 2, Bit 1 propagates to Stage 51 | -- 3 of Bit 5. This is because Stage 3 of Bit 3 propagates to Stage 4 of 52 | -- Bit 7, and so Bit 5 carries no information about Bit 1! Notably, Bit 2 53 | -- propagates to 3, then 5, but this propagation does not bring any 54 | -- information about Bit 1. The final stage propagates Bit 1 to Bit 2, 55 | -- which is the first time Bit 2 receives information about Bit 1. 56 | -- 57 | -- Speculative Han-Carlson skips the propagation stage before the last. 58 | -- For a 16-bit adder, Bit 7 Stage 3 never propagates to Bit 15 Stage 4; 59 | -- rather it directly propagates to Bit 8 output. 60 | -- 61 | -- Just before the output stage, speculative Han-Carlson tests all the 62 | -- odd-numbered bits: 63 | -- 64 | -- D : (in) [15 downto 0] 65 | -- Error : (out) 66 | -- Error <= ((D[1] AND D[9]) XOR (D[3] AND D[11])) XOR 67 | -- ((D[5] AND D[13]) XOR (D[7] AND D[15])) 68 | -- 69 | -- When an error is detected, the last stage is computed. Errors are 70 | -- fairly infrequent, so the fast path usually occurs. The adder also 71 | -- reduces the amount of space needed. 72 | -- 73 | -- Each component can also use a two-way state signal rather than the 74 | -- adder running on a clock. This essentially propagates a "done" 75 | -- signal. Such an adder can begin computing new addition before prior 76 | -- signals have fully propagated and effectively pipeline additions.
77 | 78 | -------------------------------------------------------------------------------- /RISC-V.srcs/asynchronous/cpu/async_component_block_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jrmoserbaltimore/risc-v-cpu-asynchronous/3c0864c1b023da5a7b3475c484f0aca1b9310e09/RISC-V.srcs/asynchronous/cpu/async_component_block_diagram.png -------------------------------------------------------------------------------- /RISC-V.srcs/asynchronous/cpu/pipelines/README.md: -------------------------------------------------------------------------------- 1 | Pipelines 2 | ========= 3 | 4 | These pipelines provide various facilities, such as a simple pipeline; 5 | out-of-order execution; or speculative execution. 6 | -------------------------------------------------------------------------------- /RISC-V.srcs/asynchronous/cpu/pipelines/ooe-pipeline.md: -------------------------------------------------------------------------------- 1 | Simple Out-of-Order Execution Pipeline 2 | ====================================== 3 | 4 | This pipeline extends the Simple Pipeline to include out-of-order execution. 5 | 6 | # Pipeline staging 7 | ``` 8 | [Fetch] 9 | [LRW] 10 | [Load] 11 | [ULR] 12 | [Decode] 13 | [Execute] 14 | [Store] 15 | [Retire] 16 | ``` 17 | In this pipeline, each instruction takes both read and write locks. As in 18 | the simple pipeline, locks are taken before Load; however, both read and 19 | write lock counts are tracked. 20 | 21 | Speculative execution and branch prediction are unsupported by this pipeline. 22 | As the `Fetch` stage must use and update `pc`, `Fetch` sends the current `pc` 23 | with the fetched instruction. The `Fetch` stage occurs in order. 24 | 25 | The `LRW` stage takes Read and Write locks. `LRW` stalls any instructions 26 | reading or writing data under write lock; the stalled instruction is placed 27 | into a buffer, and the next instruction goes into `LRW`. 
The next instruction 28 | stalls both by normal locks and by having a locking contention with the 29 | buffered instruction. If the next instruction stalls, `LRW` stalls entirely 30 | until the buffer is free; otherwise the instruction continues as normal. 31 | 32 | This process allows simple out-of-order instruction execution for most RV32I 33 | and RV64I instructions. RISC-V instructions generally don't have 34 | side-effects, such as setting status flag registers, so their order is 35 | generally unimportant. 36 | 37 | -------------------------------------------------------------------------------- /RISC-V.srcs/asynchronous/cpu/pipelines/simple-pipeline.md: -------------------------------------------------------------------------------- 1 | Pipeline design 2 | =============== 3 | 4 | The pipeline takes a minimalist approach. 5 | 6 | 7 | ``` 8 | [Fetch] 9 | [Decode] 10 | [Semaphore] 11 | [Load] 12 | [Execute] 13 | [Store] 14 | [Retire] 15 | ``` 16 | The Semaphore is an atomic locking operation to keep computations in-order, 17 | activated before Load and on Retire. In pseudocode: 18 | 19 | ``` 20 | RWSemaphore(Resource, Read, Write) 21 | if Read 22 | ReadLock(Resource) 23 | if Write 24 | WriteLock(Resource) 25 | 26 | WriteLock(Resource) 27 | NoWaitWriteLock(Resource) 28 | 29 | ReadLock(Resource) 30 | WaitForWriteUnlock(Resource) 31 | ``` 32 | No locking occurs before the first semaphore stage. Write locks do not block 33 | when taken because all read locks from earlier instructions will be closed out 34 | before the current instruction reaches the Store stage. 35 | 36 | Read locks block when a write lock is held on the resource. This prevents the 37 | Load until the write lock is released. Write locks increment and decrement 38 | for this reason: multiple writes to a resource *without* reads will pipeline 39 | multiple non-blocked write locks. All writes must complete before a further 40 | read can occur. 
Only out-of-order execution environments need to track read 41 | locks. 42 | 43 | This behavior also means taking a read lock first avoids blocking on the 44 | instruction's own write lock, avoiding read-and-write lock logic. 45 | 46 | In the Retire stage, the write lock is atomically decremented. When the write 47 | lock hits zero, any instruction blocked at Semaphore waiting for that resource 48 | continues its execution. 49 | 50 | -------------------------------------------------------------------------------- /RISC-V.srcs/asynchronous/cpu/pipelines/simple-pipeline.vhdl: -------------------------------------------------------------------------------- 1 | -- pipeline 2 | -- 3 | -- A pipeline component carries out a particular stage in a pipeline. 4 | -- The component provides stage-to-stage synchronization. 5 | -- 6 | -- In this way, the pipeline is clockless. The pipeline may rely on external 7 | -- clocked components and so may in practice wait for those. 8 | -- 9 | -- Note that the pipelined component must handle all interdependencies. If 10 | -- an operation relies on a prior operation completing, it must coordinate 11 | -- with the further stages of the pipeline. For example: if an instruction 12 | -- decodes with a write, the write must be notated in some kind of semaphore 13 | -- system, and reads and writes must stall. 14 | -- 15 | -------------------------------------------------------------------------------- /RISC-V.srcs/asynchronous/cpu/roadmap.md: -------------------------------------------------------------------------------- 1 | Roadmap for CPU 2 | =============== 3 | 4 | # Minimal CPU 5 | 6 | The steps to create a working RISC-V RV32I implementation are simple: 7 | 8 | 1. Asynchronous component infrastructure 9 | * Handshake 10 | * Asynchronous register 11 | * Sync-Async transceiver 12 | 2. Infrastructure 13 | * Register file 14 | * Instruction fetch 15 | * Instruction decoder 16 | 3. 
Asynchronous RAM bus interface 17 | * Interface with synchronous memory 18 | 4. Asynchronous components 19 | * Adder 20 | * Incrementer (toggle bits until encountering the first 0 bit) 21 | * 2's complement 22 | 5. Basic asynchronous instruction implementations 23 | * Canonical NOP instruction 24 | * Detect `ADDI x0, x0, 0` and silently abort the insn 25 | * Other writes to `x0` are `HINT` insns 26 | * Execute-circuit implementations 27 | * Load (`LW`, `LUI`) 28 | * Sign-extending Load (`LB`, `LH`, `LBU`, `LHU`) 29 | * Store (`SW`, `SH`, `SB`) 30 | * Bitwise logic (`AND`, `OR`, `XOR`, `ANDI`, `ORI`, `XORI`) 31 | * Bit shifters (`SLLI`, `SRLI`, `SRAI`, `SLL`, `SRL`, `SRA`) 32 | * Arithmetic (`ADD`, `SUB`, `ADDI`) 33 | * Branch (`BEQ`, `BNE`, `BLT`, `BGE`, `BLTU`, `BGEU`) 34 | * Control flow (`AUIPC`, `JAL`, `JALR`) 35 | * Comparison instructions (`SLTI`, `SLTIU`, `SLT`, `SLTU`) 36 | 6. Asynchronous ALU 37 | 7. Asynchronous pipeline 38 | * Fetch 39 | * Decode 40 | * Locking 41 | * Load 42 | * Execute 43 | * Retire 44 | 45 | The above implements all the RV32I instructions except `FENCE`. This 46 | does not, however, implement machine mode: the CPU is not a proper 47 | RISC-V CPU. With the above implemented, test RISC-V code can run on 48 | the core. 49 | 50 | # Machine-Mode 51 | 52 | To implement a machine-mode RISC-V CPU, we need more infrastructure: 53 | 54 | 1. Machine-mode CSRs 55 | * `misa` 56 | * `mvendorid` 57 | * `marchid` 58 | * `mimpid` 59 | * `mhartid` 60 | * `mstatus` 61 | * `mstatush` 62 | * `medeleg` 63 | * `mideleg` 64 | * `mip` 65 | * `mie` 66 | * `mtime` 67 | * `mtimecmp` 68 | * `mcycle` 69 | * `minstret` 70 | * `mcounteren` 71 | * `mcountinhibit` 72 | * `mscratch` 73 | * `mepc` 74 | * `mcause` 75 | * `mtval` 76 | 2. Machine-level ISA 77 | * Environment call (`ECALL`, `EBREAK`) 78 | * Trap-return (`MRET`, `SRET`) 79 | * Wait for interrupt (`WFI`) 80 | 3.
Machine-level infrastructure 81 | * Reset state 82 | * NMI 83 | * Physical memory considerations 84 | * Memory protection 85 | * Paging 86 | 3. `FENCE` instruction to complete RV32I 87 | 88 | Machine mode is not overly complex, but does carry a lot of infrastructure. 89 | 90 | # Supervisor mode 91 | 92 | Supervisor mode extends a CPU with machine mode, providing all the facilities 93 | to run a modern Linux operating system. 94 | 95 | # RV32M extension 96 | 97 | Multiply and Divide add additional instructions and multipliers. 98 | 99 | 1. Infrastructure 100 | * Multipliers 101 | * Dividers (Paravartya using multipliers) 102 | 2. Instructions 103 | * Multiplication (`MUL`, `MULH`, `MULHSU`, `MULHU`) 104 | * Division (`DIV`, `DIVU`, `REM`, `REMU`) 105 | 106 | # RV64IM 107 | 108 | RV64I extends the addressing space and register size in 64-bit mode, and adds 109 | a few 64-bit instructions. 110 | 111 | 1. Infrastructure 112 | * 64-bit flag and proper behavior 113 | * Decoder 114 | 2. Instructions 115 | * 64-bit load/store (`LD`, `SD`) 116 | * 32-bit W instructions 117 | * Adjustments to base instructions for 64-bit operation 118 | 119 | Implementation of RV64M on top of all the above provides a full 64-bit 120 | asynchronous RISC-V processor, albeit without floating point. 121 | 122 | # Hypervisor Mode 123 | 124 | The hypervisor extension is in draft as of RISC-V privileged architectures 125 | V1.12 draft. 126 | 127 | Hypervisor extensions add a significant amount of infrastructure and 128 | instructions to the CPU and are far more challenging to implement than 129 | Supervisor-mode extensions. 
130 | -------------------------------------------------------------------------------- /RISC-V.srcs/asynchronous/cpu/shifters/barrel_shifter.vhdl: -------------------------------------------------------------------------------- 1 | -- vim: ts=4 sw=4 et 2 | -- Barrel shifter 3 | -- 4 | -- n-bit NCL barrel shifter with arithmetic right-shift 5 | library IEEE; 6 | use IEEE.std_logic_1164.all; 7 | use IEEE.math_real."ceil"; 8 | use ieee.math_real."log2"; 9 | library async_ncl; 10 | use async_ncl.ncl.all; 11 | 12 | -- NCL 2:1 mux 13 | -- 14 | -- Spurious outputs (glitch) happen if you use a non-NCL bit for 15 | -- the shifter, so yes the MUX selector input has to be NCL. 16 | -- 17 | -- Bit 1 Bit 0 18 | -- | | | | 19 | -- ------------------ 20 | -- \ /-- NCL 21 | -- \ OUT /---Select 22 | -- ------------ 23 | -- | | 24 | -- 25 | 26 | 27 | -- Barrel shifter 28 | -- 29 | -- Select bit of '1' selects the left (input) bit. 30 | -- 31 | -- bit 4 3 2 1 32 | -- | -| -| -|- --Arithmetic Shift 33 | -- | | | | | | | | | 34 | -- | | | | | | | AND 35 | -- | | | | | | | | 36 | -- | | | | | | | Sx Sign-extend bit 37 | -- | | | | | | | 38 | -- | | | | | | | Sx 39 | -- | | | | | | | | 40 | -- MUX MUX MUX MUX--Select bit 0 (To all stage-1 MUX) 41 | -- | -|---+ | 42 | -- | | | -|---+ 43 | -- | | | | | -|-+---Sx 44 | -- | | | | | | | | 45 | -- MUX MUX MUX MUX--Select bit 1 (To all stage-2 MUX) 46 | -- | | | | 47 | -- | -|-+-|-+-|-+---Sx 48 | -- | | | | | | | | 49 | -- MUX MUX MUX MUX--Select bit 2 (To all stage-3 MUX) 50 | -- | | | | 51 | -- 52 | -- In theory, it's faster to take the first stage if the shift is 53 | -- all off or all on, but that's more tests and gates. 54 | -- 55 | -- Barrel shifter r2 only has to be log(xlen), e.g 5 for 32-bit, 56 | -- 6 for 64-bit, 7 for 128-bit. 57 | entity e_barrel_shifter_ncl is 58 | -- Only feed this a power of 2!
59 | generic ( XLEN : positive; 60 | BitWidths : positive); -- BitWidths is the width of the BitWidth selector port 61 | port( 62 | Din : in ncl_logic_vector(XLEN-1 downto 0); 63 | Shift : in ncl_logic_vector(integer(ceil(log2(real(XLEN))))-1 downto 0); 64 | ShRight : in ncl_logic; 65 | Arithmetic : in ncl_logic; 66 | BitWidth : in ncl_logic_vector(BitWidths-1 downto 0); 67 | Dout : out ncl_logic_vector(XLEN-1 downto 0) 68 | ); 69 | end e_barrel_shifter_ncl; 70 | 71 | -- All computations require NCL-complete input signals and pass 72 | -- NULL if any signal is incomplete. This prevents invalid output. 73 | -- 74 | -- This barrel shifter is reversible by using n muxes on input and 75 | -- output to reverse the bit order (reverse input, shift left, 76 | -- reverse output). 77 | architecture barrel_shifter_ncl of e_barrel_shifter_ncl is 78 | type tree_array is array (Shift'HIGH downto SHIFT'LOW-1) of ncl_logic_vector(XLEN-1 downto 0); 79 | signal tree : tree_array := (others => (others => ('0', '0'))); 80 | signal SignEx : ncl_logic; 81 | signal result : ncl_logic_vector(XLEN-1 downto 0); 82 | begin 83 | 84 | -- This thing is actually inherently combinatorial 85 | barrel: process(all) is 86 | variable BWNumeric : integer := 0; 87 | variable MSBidx : integer := XLEN-1; 88 | begin 89 | -- Find the bit divisor 90 | -- If the MSB in BitWidth is set, then use full width. 91 | -- If MSB-1 is set, half width. 92 | -- If MSB-2, quarter width. 93 | -- So on. 94 | -- 95 | -- This works by returning 0, 1, and 2, respectively, 96 | -- for the three above. 2**0 = 1, 2**1 = 2, 2**2 = 4. 97 | -- This gives us both results. 98 | for i in BitWidth'HIGH downto BitWidth'LOW loop 99 | if (BitWidth(i) = '1') then 100 | BWNumeric := BitWidth'HIGH - i; 101 | exit; 102 | end if; 103 | end loop; 104 | 105 | -- Figure out the index of the most significant bit 106 | -- 107 | -- e.g.
RV128I and we're doing a 32-bit shift: 108 | -- BitWidth = "001" 109 | -- BWNumeric = 2 - 0 = 2 110 | -- Din(((127+1) / (2^^2)) - 1) 111 | -- = Din((128 / 4) - 1) 112 | -- = Din(31) -- i.e. (31 downto 0) 113 | MSBidx := ((Din'HIGH+1) / (2**BWNumeric)) - 1; 114 | 115 | -- SignBit Arithmetic 116 | -- | | 117 | -- AND ShRight 118 | -- | | 119 | -- AND 120 | -- | 121 | -- All shifted-out MUXes 122 | -- 123 | -- NULL if any of these are NULL, so incorporates the 124 | -- ShRight check. 125 | SignEx <= Din(MSBidx) AND Arithmetic AND ShRight; 126 | 127 | if (ncl_is_null(BitWidth) OR ncl_is_null(SignEx)) then 128 | -- if we don't check this, we might just use BWNumeric 129 | -- as derived above erroneously and get bad results. 130 | -- Same if we never check Arithmetic and ShRight. 131 | -- 132 | -- Until then we null the top of the tree, since no 133 | -- actual combinatorial circuit along the way CHECKS 134 | -- if BitWidth is null, and so will produce spurious 135 | -- non-null output otherwise. 136 | tree(-1) <= (others => ('0','0')); 137 | elsif (Shift(Shift'HIGH - BWNumeric) = '1') then -- NOTE(review): Shift is ceil(log2(XLEN)) bits wide, so this bit looks like a shift by half the active width rather than a full shift-out; confirm the intended width of Shift 138 | -- If last shift bit is high, it shifts out to zero, so 139 | -- just set all output to zero. Also true if arithmetic. 140 | -- 141 | -- Fun fact: no matter what the input, this is the 142 | -- result; so it's actually reasonable to drop the 143 | -- Ready signal and tell the component sending the 144 | -- shift that you've received the data as soon as 145 | -- Shift() has that bit on. 146 | -- 147 | -- This also applies in lower XLEN, such as when 148 | -- a 64-bit processor running in 32-bit mode 149 | -- or calling a 32-bit shift sets bit 6 rather. 150 | -- For narrower BitWidth, this does exactly that, 151 | -- e.g. 1/4 width BWNumeric = 2, so instead of 152 | -- bit 8 in 128-bit, we check bit 6 (32-bit) 153 | -- 154 | -- THIS IS A 0 OUTPUT, NOT A NULL OUTPUT.
155 | Dout <= (others => ncl_encode('0')); 156 | else 157 | if (ShRight = '0') then 158 | -- Put Din into the top of the tree to avoid breaking out special 159 | -- handling for the first row. The "top" is basically tree(-1). 160 | tree(Shift'LOW-1) <= Din; 161 | elsif (ShRight = '1') then 162 | -- Put it in backwards. This should just be a row of muxes. 163 | for j in Din'RANGE loop 164 | 165 | -- Assign Din(0) to tree(-1)(127) 166 | -- Assign SignEx to tree(-1)(32) when we're using 32-bit 167 | -- instructions or modes on 64-bit or 128-bit platforms 168 | -- etc. 169 | -- 170 | -- Accordingly, we want the most significant bit down. 171 | tree(Shift'LOW-1)(Din'HIGH - j) <= SignEx WHEN j > MSBidx 172 | ELSE Din(j); 173 | end loop; 174 | end if; 175 | 176 | -- It's going to compute them all in parallel; 177 | -- combinatorial logic is not any faster by using 178 | -- j in MSBidx downto 0 179 | for i in Shift'HIGH - BWNumeric downto Shift'LOW loop 180 | for j in Din'RANGE loop 181 | if (j < 2**i) then 182 | -- Sign-extend: only columns below 2**i lack a source column j-2**i 183 | -- This will actually test the Arithmetic and 184 | -- ShRight bits for non-NULL status. 185 | tree(i)(j) <= (tree(i-1)(j) AND NOT Shift(i)) 186 | OR (SignEx AND Shift(i)); 187 | else 188 | -- This part will NOT check Arithmetic or 189 | -- ShRight, which can lead to spurious outputs in 190 | -- contrived situations given valid input and handshake, 191 | -- hence the explicit SignEx NULL check above. 192 | -- 193 | -- If shift bit not on, take this column; 194 | -- if shift on, take the column 2**i to the right 195 | tree(i)(j) <= (tree(i-1)(j) AND NOT Shift(i)) 196 | OR (tree(i-1)(j-2**i) AND Shift(i)); 197 | end if; 198 | end loop; 199 | end loop; 200 | if (ShRight = '0') then 201 | -- Shift left doesn't care about the rest of the register 202 | Dout <= tree(Shift'HIGH - BWNumeric); 203 | else 204 | -- We have to reverse the lowest bits below MSBidx.
205 | for j in MSBidx downto 0 loop 206 | Dout(MSBidx - j) <= tree(Shift'HIGH - BWNumeric)(j); -- NOTE(review): the input reversal writes Din(j) to column Din'HIGH - j, so when MSBidx < Din'HIGH this appears to read different columns than were written; confirm narrow-width right shifts 207 | end loop; 208 | end if; 209 | end if; 210 | end process barrel; 211 | end barrel_shifter_ncl; 212 | -------------------------------------------------------------------------------- /RISC-V.srcs/asynchronous/cpu/shifters/barrel_shifter_no_signex.vhdl: -------------------------------------------------------------------------------- 1 | -- vim: ts=4 sw=4 et 2 | -- Barrel shifter 3 | -- 4 | -- n-bit NCL barrel shifter. Cannot do arithmetic (sign extension) 5 | library IEEE; 6 | use IEEE.std_logic_1164.all; 7 | use IEEE.math_real."ceil"; 8 | use ieee.math_real."log2"; 9 | library async_ncl; 10 | use async_ncl.ncl.all; 11 | -- NCL 2:1 mux 12 | -- 13 | -- Spurious outputs (glitch) happen if you use a non-NCL bit for 14 | -- the shifter, so yes the MUX selector input has to be NCL. 15 | -- 16 | -- Bit 1 Bit 0 17 | -- | | | | 18 | -- ------------------ 19 | -- \ /-- NCL 20 | -- \ OUT /---Select 21 | -- ------------ 22 | -- | | 23 | -- 24 | 25 | 26 | -- Barrel shifter 27 | -- 28 | -- Select bit of '1' selects the left (input) bit. 29 | -- 30 | -- bit 4 3 2 1 31 | -- | -| -| -| 32 | -- | | | | | | | 33 | -- MUX MUX MUX AND--NOT-- Select bit 0 34 | -- | -|---| | | 35 | -- | | | -|---+ - To all MUX on first stage 36 | -- | | | | | | 37 | -- | | | | | | 38 | -- MUX MUX AND AND--NOT-- Select bit 1 39 | -- | | | | | 40 | -- | | | | - to all MUX on this stage 41 | -- | | | | 42 | -- AND AND AND AND--NOT-- Select bit 2 (to all AND on this stage) 43 | -- | | | | 44 | -- 45 | -- In theory, it's faster to take the first stage if the shift is 46 | -- all off or all on, but that's more tests and gates. 47 | -- 48 | -- Barrel shifter r2 only has to be log(xlen), e.g 5 for 32-bit, 49 | -- 6 for 64-bit, 7 for 128-bit. 50 | entity e_barrel_shifter_ncl is 51 | -- Only feed this a power of 2!
52 | generic ( n : positive ); 53 | port( 54 | Din : in ncl_logic_vector(n-1 downto 0); 55 | Shift : in ncl_logic_vector(integer(ceil(log2(real(n))))-1 downto 0); 56 | ShRight : in ncl_logic; 57 | Dout : out ncl_logic_vector(n-1 downto 0) 58 | ); 59 | end e_barrel_shifter_ncl; 60 | 61 | -- All computations require NCL-complete input signals and pass 62 | -- NULL if any signal is incomplete. This prevents invalid output. 63 | -- 64 | -- This barrel shifter is reversible by using n muxes on input and 65 | -- output to reverse the bit order (reverse input, shift left, 66 | -- reverse output). 67 | architecture a_barrel_shifter_ncl of e_barrel_shifter_ncl is 68 | type tree_array is array (Shift'RANGE) of ncl_logic_vector(n-1 downto 0); 69 | signal tree : tree_array := (others => (others => ('0', '0'))); 70 | begin 71 | 72 | -- This thing is actually inherently combinatorial 73 | barrel: process(all) is 74 | begin 75 | if (Shift'HIGH = 1) then -- NOTE(review): this compares the index Shift'HIGH with the integer 1, not the value of the top shift bit; presumably Shift(Shift'HIGH) = '1' was meant -- confirm 76 | -- If last shift bit is high, it shifts out to zero, so 77 | -- just set all output to zero 78 | Dout <= (others => ('0','0')); 79 | else 80 | for i in Shift'RANGE loop 81 | for j in Din'RANGE loop 82 | -- First row from Din 83 | if (i = 0 and ShRight = '0') then 84 | -- Shift left 85 | if (j < 2**i) then 86 | -- AND gate instead of MUX (no source column j-2**i exists below 2**i) 87 | tree(i)(j) <= Din(j) 88 | AND NOT Shift(i); 89 | else 90 | -- If shift bit not on, take this column; 91 | -- if shift bit on, take the column 2**i right 92 | tree(i)(j) <= (Din(j) AND NOT Shift(i)) 93 | OR (Din(j-2**i) AND Shift(i)); 94 | end if; 95 | elsif (i = 0 and ShRight = '1') then 96 | -- Shift right 97 | if (j < 2**i) then 98 | -- AND gate 99 | -- First row from Din, reversed 100 | tree(i)(j) <= Din(Din'HIGH - j) 101 | AND NOT Shift(i); 102 | else 103 | -- If shift bit not on, take this column; 104 | -- if shift bit on, take the column 2**i to the right 105 | tree(i)(j) <= (Din(Din'HIGH - j) 106 | AND NOT Shift(i)) 107 | OR (Din(Din'HIGH - (j-2**i)) 108 | AND Shift(i)); 109 |
end if; 110 | elsif (i = Shift'HIGH) then 111 | -- Final row, already handled if the shift bit is on. 112 | -- Reverse back to normal if shifting right. 113 | if (ShRight = '0') then 114 | Dout <= tree(i-1); 115 | elsif (ShRight = '1') then 116 | Dout(j) <= tree(i-1)(Din'HIGH - j); 117 | end if; 118 | else -- 119 | if (j < 2**i) then 120 | -- AND gate instead of MUX (no source column j-2**i exists below 2**i) 121 | tree(i)(j) <= tree(i-1)(j) 122 | AND NOT Shift(i); 123 | else 124 | -- If shift bit not on, take this column; 125 | -- if shift on, take the column 2**i to the right 126 | tree(i)(j) <= (tree(i-1)(j) 127 | AND NOT Shift(i)) 128 | OR (tree(i-1)(j-2**i) 129 | AND Shift(i)); 130 | end if; 131 | end if; 132 | end loop; 133 | end loop; 134 | end if; 135 | end process barrel; 136 | end a_barrel_shifter_ncl; 137 | -------------------------------------------------------------------------------- /RISC-V.srcs/asynchronous/infrastructure/handshake/README.md: -------------------------------------------------------------------------------- 1 | Asynchronous Handshake 2 | ====================== 3 | 4 | A completion-detection handshake allows for delay-insensitive components. 5 | 6 | Consider a component shaped as below: 7 | ``` 8 | ______________________________ 9 | -| Ready (out) Ready (in) |- 10 | -| Waiting (in) Waiting (out) |- 11 | =| d[0..x] (in) d[0..x] (out) |= 12 | |______________________________| 13 | ``` 14 | The above has a data input and a data output. Another component with the same 15 | interface would send and receive data over the same interface. 16 | 17 | This allows only the transitions below: 18 | ``` 19 | Receiver: 20 | W[in] R[out] Transition 21 | 1 1 R[out] <= 0 22 | 0 0 R[out] <= 1 23 | 24 | Sender: 25 | R[in] W[out] Transition 26 | 1 0 W[out] <= 1 27 | 0 1 W[out] <= 0 28 | ``` 29 | 30 | The initialization state is `W[out]=0, R[out]=X`, with `R[out]` transitioning 31 | to `1` when ready.
To be more clear: 32 | ``` 33 | Sender Receiver 34 | _______________ _______________ 35 | | Ready (in) |-| Ready (out) | 36 | | Waiting (out) |-| Waiting (in) | 37 | | d[0..x] (out) |=| d[0..x] (in) | 38 | |_______________| |_______________| 39 | ``` 40 | This proceeds as follows: 41 | ``` 42 | Sender: W = 0 State: R=0, W=0 43 | Receiver: R = 0 State: R=0, W=0 44 | Sender becomes ready to send data (valid here...) 45 | Sender: d <= [data] 46 | Receiver becomes ready to accept new data 47 | Receiver: R <= 1 State: R=1, W=0 48 | (...or sender can become ready to send here) 49 | Sender: W <= 1 State: R=1, W=1 50 | Receiver stores the data and no longer needs d 51 | Receiver: R <= 0 State: R=0, W=1 52 | Sender acknowledges (any state on Sender's end) 53 | Sender: W <= 0 State: R=0, W=0 54 | ``` 55 | 56 | The Receiver must buffer the data or the entire asynchronous CPU will 57 | simply wait until each single instruction is 100% complete and the 58 | output sent to main RAM before fetching the next single instruction. 59 | 60 | To do this, we use a delay-insensitive flip flop as a one-bit NCL 61 | register: 62 | 63 | ![Delay-Insensitive Flip Flop Buffer](ncl_async_register.png) 64 | 65 | This register outputs `STORED` or `St` when `D` is non-NULL, `Q = D`, 66 | and `W` is `1` (i.e. sender is signaling the data on `D` is valid). 67 | In this way, it is insensitive to its own delay and the delay of the 68 | sending circuit. 69 | 70 | Assembling these into an n-bit register produces the below black box: 71 | ``` 72 | __________ 73 | =| D EN |- 74 | -| W CLR |- 75 | | St |- 76 | | Q |= 77 | |__________| 78 | ``` 79 | The receiver checks `Q` (for valid non-NULL data) and `St` (for the 80 | register indicating that it itself considers the data non-NULL and 81 | stored) before transitioning `R` from `1` to `0`.
82 | 83 | For proper operation, the receiver should not transition `R` from `0` 84 | to `1` until first setting `CLR`, then validating the circuit's final 85 | output is NULL. This ensures the circuit has been flushed and won't 86 | generate spurious non-NULL output when new data comes in. 87 | 88 | Once the receiver has validated its output is flushed, it transitions 89 | `CLR` to `0` and `R` to `1`. The `EN` signal is suppressed until `CLR` 90 | is `0` so as to avoid an `EN AND CLR` situation (although the DFF 91 | respects `CLR` over `EN` in its current implementation). 92 | 93 | Importantly, `D` must remain valid until `R` transitions to `0`. The 94 | sending circuit can output `D` continuously when `R` is `0`, and so 95 | can send its output directly to `D` with no buffer. 96 | 97 | The sender should transition its output to NULL and then to valid; 98 | however, the NCL check fails on both `00` and `11`, so even if the 99 | circuit glitches and stores `Q = D = 11`, it will not proceed. This 100 | suppresses the data hazard. Consider transitioning from `10` to 101 | `01`: 102 | ``` 103 | 10 => 00 => 01 104 | 10 => 11 => 01 105 | ``` 106 | The supposed flush moves to `00`, indicating NULL; but a glitch where 107 | the `0` to `1` transition arrives before the corresponding `1` to `0` 108 | transition moves to `11`, which is treated as NULL. The circuit 109 | behaves identically in either case. The flush is to ensure a 110 | fully-valid but *incorrect* value does not land on `D` while `W` is 111 | `1`, which would result in the receiver accepting the data as 112 | correct and processing it—a severe data hazard. 113 | 114 | Altogether, the handshake protocol and the delay-insensitive NCL 115 | registers provide for asynchronous data transfer between internal 116 | components. 
117 | 118 | # Example: Logical NOT 119 | 120 | The below circuit implements the receiver and sender handshake, a 121 | one-bit delay-insensitive register, and a logical NOT (which 122 | requires no gates itself: the signals are connected to inverse 123 | output). 124 | 125 | ![Delay-Insensitive Logical NOT](ncl_async_logical_not.png) 126 | 127 | The NCL completion check occurs several times in larger circuits: 128 | ``` 129 | -AH AL- 130 | | | | | 131 | | XOR | 132 | | _|_ | 133 | | | | | 134 | AND AND 135 | | | 136 | AH AL 137 | ``` 138 | This check uses one XOR gate and two AND gates per one NCL bit 139 | and is effectively a specialized four-to-two mux selecting 140 | between `[AH, AL]` when `AH XOR AL = 1` and `[0 0]` when 141 | `AH XOR AL = 0`. 142 | 143 | This component can wrap around other components: 144 | ``` 145 | AH AL- -BL BH 146 | | | | | | | 147 | XOR | | XOR 148 | | | | | 149 | | AND | 150 | | | | | 151 | | NOT | | 152 | | | | | 153 | -AND AND- 154 | | | 155 | OH OL 156 | ``` 157 | The above performs a logical `AND` of two bits `A` and `B`, 158 | outputting `NULL` when either input is `NULL`. The same 159 | circuit will be needed around the next component in the 160 | event the `XOR` signals propagate before the `AND`, causing 161 | the `NOT` to output a signal when the `AND` gate is 162 | receiving invalid input. This is unlikely, but 163 | mathematically possible in the event the signal propagates 164 | through the central `AND` gate later than it does from the 165 | peripheral `XOR` gates.
166 | -------------------------------------------------------------------------------- /RISC-V.srcs/asynchronous/infrastructure/handshake/handshake.vhdl: -------------------------------------------------------------------------------- 1 | -- vim: sw=4 ts=4 et 2 | -- Handshake 3 | -- 4 | -- Sender Receiver 5 | -- _______________ _______________ 6 | -- | Ready (in) |-| Ready (out) | 7 | -- | Waiting (out) |-| Waiting (in) | 8 | -- | d[0..x] (out) |=| d[0..x] (in) | 9 | -- |_______________| |_______________| 10 | -- 11 | -- Transitions: 12 | -- W[in] R[out] Transition 13 | -- 1 1 R[out] <= 0 14 | -- 0 0 R[out] <= 1 15 | -- 16 | -- R[in] W[out] Transition 17 | -- 1 0 W[out] <= 1 18 | -- 0 1 W[out] <= 0 19 | -- 20 | -- Ready and Waiting are TTL, not NCL: there is only 21 | -- one valid transition each way on the En input to the 22 | -- handshakes (0 to 1 or 1 to 0). State cannot go from 23 | -- enabled to null to enabled, because the circuit 24 | -- using the handshake MUST be able to identify with 25 | -- 100% certainty that it's transitioned TO a ready 26 | -- or waiting state and with 100% certainty that it's 27 | -- transitioned TO a not-ready or not-waiting state. 28 | -- This is why the asynchronous circuits using this 29 | -- handshake MUST verify complete data input BEFORE 30 | -- indicating they're no longer Ready (have received 31 | -- the data) AND confirm they've fully flushed the 32 | -- input buffer AND the output NULL BEFORE indicating 33 | -- they ARE ready: spurious mis-estimates of completion 34 | -- are fatal. 
35 | library IEEE;
36 | use IEEE.std_logic_1164.all;
37 | library async_ncl;
38 | use async_ncl.ncl.all;
39 | 
40 | -- Sender half of the handshake: observes the outgoing dual-rail bus
41 | -- and the receiver's Ready line, and drives Waiting.
42 | --
43 | --   n       : width of the bus in NCL bits
44 | --   Ready   : Ready(in), driven by the receiver
45 | --   Dout    : the outgoing data (mode in: observed here, not driven)
46 | --   Waiting : Waiting(out), tells the receiver data is pending
47 | entity e_ncl_handshake_sender is
48 |     generic ( n : positive );
49 |     port(
50 |         Ready   : in  std_logic;
51 |         -- Output data
52 |         Dout    : in  ncl_logic_vector(n-1 downto 0);
53 |         -- Waiting signal
54 |         Waiting : out std_logic
55 |     );
56 | end e_ncl_handshake_sender;
57 | 
58 | library IEEE;
59 | use IEEE.std_logic_1164.all;
60 | library async_ncl;
61 | use async_ncl.ncl.all;
62 | 
63 | -- Receiver half of the handshake.
64 | --
65 | --   Ready   : Ready(out), driven towards the sender
66 | --   En      : '1' when the owning circuit is ready for new data
67 | --   Waiting : Waiting(in), driven by the sender
68 | --   EnOut   : Ready AND Waiting AND En
69 | entity e_ncl_handshake_receiver is
70 |     port(
71 |         Ready   : out std_logic;
72 |         -- Enable "Ready" output
73 |         En      : in  std_logic;
74 |         Waiting : in  std_logic;
75 |         EnOut   : out std_logic
76 |     );
77 | end e_ncl_handshake_receiver;
78 | 
79 | architecture ncl_handshake_sender of e_ncl_handshake_sender is
80 |     -- '1' when no bit of Dout is NULL (or the "11" glitch)
81 |     signal data_complete   : std_logic;
82 |     -- '1' when every rail of every bit of Dout is '0'
83 |     signal data_flushed    : std_logic;
84 |     -- Per-bit "at least one rail is asserted" flags
85 |     signal data_complete_a : std_logic_vector(Dout'RANGE);
86 | begin
87 |     -- Track when outgoing data is all not null
88 |     data_complete <= NOT (OR ncl_is_null(Dout));
89 | 
90 |     -- Track when absolutely every outgoing data LINE is '0'
91 |     G1: for i in Dout'RANGE generate
92 |         data_complete_a(i) <= Dout(i).H OR Dout(i).L;
93 |     end generate G1;
94 |     data_flushed <= NOT (OR data_complete_a);
95 | 
96 |     -- Signal data is waiting when receiver is Ready AND
97 |     -- our data lines are complete;
98 |     --
99 |     -- Keep signaling data is waiting until our data
100 |     -- lines are flushed.
101 |     --
102 |     -- Circuit must NOT alter incoming data UNTIL the
103 |     -- incoming READY signal is dropped!
104 |     --
105 |     -- The hold term must be qualified by NOT data_flushed: holding
106 |     -- on data_flushed itself would drop Waiting as soon as Ready
107 |     -- falls while data is still on the bus, and would keep a stale
108 |     -- (or uninitialised 'U') Waiting alive for as long as the bus
109 |     -- stays flushed.
110 |     Waiting <= (Ready AND data_complete)
111 |                OR (Waiting AND NOT data_flushed);
112 | end ncl_handshake_sender;
113 | 
114 | architecture ncl_handshake_receiver of e_ncl_handshake_receiver is
115 | begin
116 |     -- Waiting MUST only transition from 0 to 1 when sending Ready!
117 |     -- En should be 1 when ready for new data.
118 |     EnOut <= Ready AND Waiting AND En;
119 |     -- NOTE(review): for '0'/'1' inputs this expression reduces to
120 |     -- Ready <= En ((NOT Waiting AND En) OR (Waiting AND En)), so
121 |     -- Waiting does not influence Ready here.  Confirm that is the
122 |     -- intended behaviour before simplifying or changing it.
123 |     Ready <= (Waiting NOR (NOT En)) OR (Waiting AND En);
124 | end ncl_handshake_receiver;
125 | 
--------------------------------------------------------------------------------
/RISC-V.srcs/asynchronous/infrastructure/handshake/ncl_async_logical_not.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jrmoserbaltimore/risc-v-cpu-asynchronous/3c0864c1b023da5a7b3475c484f0aca1b9310e09/RISC-V.srcs/asynchronous/infrastructure/handshake/ncl_async_logical_not.png
--------------------------------------------------------------------------------
/RISC-V.srcs/asynchronous/infrastructure/handshake/ncl_async_register.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jrmoserbaltimore/risc-v-cpu-asynchronous/3c0864c1b023da5a7b3475c484f0aca1b9310e09/RISC-V.srcs/asynchronous/infrastructure/handshake/ncl_async_register.png
--------------------------------------------------------------------------------
/RISC-V.srcs/asynchronous/infrastructure/handshake/register.vhdl:
--------------------------------------------------------------------------------
1 | -- vim: sw=4 ts=4 et
2 | -- NCL Buffer Register
3 | --
4 | -- INTEGRATION, the VLSI Journal, 59 (2017), 31-41,
5 | -- doi 10.1016/j.vlsi.2017.05.002
6 | -- "Simple Method of Asynchronous Circuits Implementation
7 | -- in Commercial FPGAs" by Zbigniew Hajduk
8 | --
9 | -- The article above describes an Asynchronous Pipeline
10 | -- Register (APR) as a single-ended data bus fed into a
11 | -- network of two comparators, two flip-flops, AND gates,
12 | -- inverters, and three multiplexers.
13 | --
14 | -- That design is not suitable for NULL convention logic.
15 | -- Instead, we use a simpler overall circuit with two D
16 | -- Flip-Flops, two Comparators, an N-Completion circuit,
17 | -- and three AND gates.
This circuit specifically
18 | -- interfaces with our handshake protocol and allows
19 | -- reliable register storage without clock.
20 | --
21 | -- In a sane world, this would be defined in async_ncl,
22 | -- and we could either have an architecture instantiate
23 | -- a component defined in the async_ncl package as
24 | -- being the entity(architecture) pair. VHDL as-is
25 | -- is analogous to pinouts being defined and sockets
26 | -- being sold, but nobody sells the chips or any
27 | -- design for the chips, so you have to make it yourself.
28 | library IEEE;
29 | use IEEE.std_logic_1164.all;
30 | library async_ncl;
31 | use async_ncl.ncl.all;
32 | -- One-bit NCL storage element: Q takes D while EN is active, CLR forces Q to NULL.
33 | entity e_ncl_latch is
34 |     port (
35 |         D : in ncl_logic;          -- dual-rail data input
36 |         EN, CLR : in std_logic;    -- EN: copy D to Q; CLR: force Q to ('0', '0')
37 |         Q : out ncl_logic          -- stored dual-rail value
38 |     );
39 | end e_ncl_latch;
40 | -- Q is assigned only under CLR or EN; with neither active the process assigns nothing and Q keeps its value.
41 | architecture ncl_latch of e_ncl_latch is
42 | begin
43 |     -- process(all) and the bare std_logic conditions below are VHDL-2008 constructs.
44 |     latch: process(all)
45 |     begin
46 |         -- Activating both CLR and EN is not valid! In practice CLR wins because it is tested first.
47 |         if (CLR) then
48 |             -- Clear Q to all NULL regardless of D
49 |             Q <= ('0', '0');
50 |         elsif (EN) then
51 |             Q <= D;    -- pass the input through while enabled
52 |         end if;
53 |     end process latch;
54 | end ncl_latch;
55 | 
56 | library IEEE;
57 | use IEEE.std_logic_1164.all;
58 | library async_ncl;
59 | use async_ncl.ncl.all;
60 | -- Registered logic for wide bus
61 | --
62 | -- When R<='1' and D is NCL-complete, EN is activated..
63 | --
64 | -- When D=Q and D is NCL-complete and W<='1', Stored is asserted.
65 | --
66 | --   n      : register width in NCL bits
67 | --   D      : dual-rail data input
68 | --   EN     : latch enable request (gated with NOT CLR internally)
69 | --   W      : Waiting(in) from the sender
70 | --   CLR    : clear request, passed to every bit latch
71 | --   Q      : stored data
72 | --   Stored : '1' when D is complete, D = Q and W = '1'
73 | entity e_ncl_logic_register is
74 |     generic ( n: positive );
75 |     port (
76 |         D : in ncl_logic_vector(n-1 downto 0);
77 |         -- Receiver R and W, that is, Ready(out) Waiting(in)
78 |         -- EN should usually come from the sender handshake
79 |         EN, W, CLR : in std_logic;
80 |         Q : out ncl_logic_vector(n-1 downto 0);
81 |         Stored : out std_logic
82 |     );
83 | end e_ncl_logic_register;
84 | 
85 | use work.e_ncl_latch;
86 | library async_ncl;
87 | use async_ncl.ncl.all;
88 | -- n-bit delay-insensitive asynchronous register
89 | architecture ncl_logic_register of e_ncl_logic_register is
90 |     -- On when time to enable the latch array
91 |     signal en_latch : std_logic;
92 | begin
93 |     -- One latch per data bit.  The range is taken from D so the
94 |     -- generate covers exactly indices n-1 downto 0; iterating
95 |     -- "n downto 0" would index D(n) and Q(n), which do not exist.
96 |     latches: for i in D'RANGE generate
97 |         latch: entity e_ncl_latch(ncl_latch)
98 |             port map ( D => D(i),
99 |                        EN => en_latch,
100 |                        CLR => CLR,
101 |                        Q => Q(i));
102 |     end generate;
103 |     -- Handshake protocol allows data in at all times, but
104 |     -- only stores it when R=1, and only sets R=0 when W=1 AND
105 |     -- the data has been stored (i.e. once the data-in lines
106 |     -- can change without impacting the circuit, R <= 0 ).
107 |     --
108 |     -- CLR should never be '1' while R is '1'; however, it is
109 |     -- intended to keep R = '0' until Q reads full NULL, then
110 |     -- set CLR <= '0' and R <= '1'. This creates a glitch
111 |     -- wherein R propagates more slowly than CLR, so we ignore
112 |     -- R when CLR is set.
113 |     --
114 |     -- Adding AND NOT Stored would tend to cut off EN just
115 |     -- slightly earlier: Stored <= '1' has to propagate for
116 |     -- R <= '0', which has to propagate through the AND gate
117 |     -- to set EN <= '0'.
118 |     en_latch <= '1' when EN AND (NOT CLR) else
119 |                 '0';
120 | 
121 |     -- D and Q must be distinct signals
122 |     Stored <= '1' when (NOT ncl_is_null(D)) AND (D = Q) AND W = '1' else
123 |               '0';
124 | end ncl_logic_register;
125 | 
--------------------------------------------------------------------------------
/RISC-V.srcs/asynchronous/infrastructure/ncl/README.md:
--------------------------------------------------------------------------------
1 | Delay-insensitive encoding
2 | ===========================
3 | 
4 | Asynchronous components use one-hot delay-insensitive encoding internally. In
5 | standard TTL, a 32-bit data bus has 32 wires; in dual-rail one-hot encoding,
6 | the same data bus has 64 wires. Each pair carries one bit in one-hot encoding:
7 | ```
8 | Dx0 Dx1 Value
9 | 0 0 NULL
10 | 1 0 0
11 | 0 1 1
12 | 1 1 Halt and catch fire
13 | ```
14 | Each step internally must encode in this way to ensure detectable completion.
15 | The final output reaches a completion-detection circuit which then signals
16 | completion.
17 | 
18 | The single-bit encoder looks as such:
19 | ```
20 | INPUT
21 | |----
22 | NOT |
23 | | |
24 | dx0 dx1
25 | ```
26 | The dx1 bit is just the input, while the dx0 bit is the input inverted, as
27 | shown in the encoding table above. Note that `1`-bits propagate slightly
28 | faster than `0` bits if the NOT introduces delay.
29 | 
30 | A `0` input gives `[1 0]`, while a `1` gives `[0 1]`. While switching from
31 | `1` to `0` is flawless, a glitch occurs switching from `0` to `1`: the `1`
32 | can propagate and produce `[1 1]` outputs. To handle this glitch, validation
33 | circuits must treat `[1 1]` identically to `[0 0]`, which is easy: `A XOR B`
34 | gives `1` if valid and `0` if `NULL` or invalid, so take `A XOR B = 1` as
35 | completion.
36 | 37 | The single-bit decoder looks as such: 38 | ``` 39 | Theoretical Optimized 40 | INPUT INPUT 41 | | |---- | | 42 | XOR | | 43 | | | OUTPUT 44 | ----AND 45 | | 46 | OUTPUT 47 | ``` 48 | The optimized decoder uses no gates. In either case, the component using 49 | the decoder must validate completion before considering the output valid. 50 | 51 | Consider a two-bit adder, as below: 52 | ``` 53 | INPUT: A1 B1 A0 B0 54 | 0 1 1 1 55 | | | | | 56 | [Encoder] [Encoder] 57 | 1 0 0 1 0 1 0 1 58 | | | | | | | | | 59 | [Full Adder]-0[Half Adder] (Cout=[0 1]=1) 60 | [ ]-1[ ] 61 | 0 1 1 0 1 0 (Carry=[0 1], S1=[1 0], S0=[1 0]) 62 | * | | | | | | 63 | [Decoder] [Decoder] 64 | OUTPUT: 1 0 0 65 | ``` 66 | The input is two binary values, `A=01` and `B=11`. The encoder encodes these 67 | to `A=[10 01]` and `B=[01 01]`. The adders themselves also encode in this 68 | manner (note this is a ripple-carry adder). 69 | 70 | The outputs go to a decoder, which asserts 00 on the output and 1 as the carry 71 | bit. Note that adding 1 to 0b11 overflows and produces 0 and a carry bit. 72 | 73 | The six output lines (marked `*`) also drive a gate tree as follows: 74 | ``` 75 | * | | | | | | 76 | XOR XOR XOR Ready[In]--(INPUT) 77 | | | | | 78 | AND AND 79 | | | 80 | -----AND----- 81 | | 82 | Waiting[Out] 83 | ``` 84 | Note the convention here: on NULL `[0 0]`, the XOR gates output nothing, and 85 | the component does not assert Waiting. Because `[1 1]` is an invalid state, 86 | OR gates also work rather than XOR; an XOR gate causes halt on invalid 87 | encoding, while OR causes invalid output. 88 | 89 | Also note Waiting[Out] is delayed by the gate delay of an XOR gate and *two* 90 | AND gates, while the assertion of a `1` bit through the decoder is delayed 91 | by an XOR gate and *one* AND gate. 
For single-bit output, the delay is the 92 | same as a `1` bit decode: 93 | ``` 94 | | | 95 | XOR Ready[In]--(INPUT) 96 | | | 97 | AND 98 | | 99 | Waiting[Out] 100 | ``` 101 | In general, for `n` bits of output, the delay to assert Waiting[Out] when 102 | Ready[In] is asserted and all data lines are available is one level of XOR 103 | gates plus `log(n+1,2)` levels of AND gates. 104 | 105 | When Ready[In] becomes `0`, Waiting[Out] automatically becomes `0`. 106 | 107 | The component can be made to also not pass Ready[Out] until all outputs read 108 | `[0 0]` and Waiting[In] reads `0`: 109 | ``` 110 | * | | | | | | 111 | NOR NOR NOR Waiting[In]--(INPUT) 112 | | | | | 113 | AND AND----NOT 114 | | | 115 | -----AND----- 116 | | 117 | Ready[Out] 118 | ``` 119 | In this way, the component asserts Waiting[Out] when all outputs are ready, 120 | and asserts Ready[Out] when all outputs are cleared and Waiting[in] is not 121 | asserted. 122 | 123 | These assertions should be latched and reset at appropriate times. For 124 | example: Ready[Out] must remain asserted until the component is no longer 125 | affected by state changes on the data bus. 126 | 127 | When two components are both asynchronous, there is no reason to encode and 128 | decode the output between them. For example, the binary adder above may be 129 | connected to an instruction pipeline which itself encodes and decodes the 130 | data, or which itself only interfaces with asynchronous components. In such 131 | a case, the decoding delay is zero and the validation delay is greater than 132 | zero. Such a configuration would need to convert when leaving its domain, 133 | such as when sending data to memory or peripherals. 
134 | 
--------------------------------------------------------------------------------
/RISC-V.srcs/asynchronous/infrastructure/ncl/ncl.vhdl:
--------------------------------------------------------------------------------
1 | -- vim: sw=4 ts=4 et
2 | -- 2 hot 2 handle
3 | --
4 | -- An NCL bit is basically two rail one-hot, carried here as a record
5 | -- with rails H and L:
6 | --
7 | --   H = '0', L = '0'   -- NULL
8 | --   H = '1', L = '0'   -- 0
9 | --   H = '0', L = '1'   -- 1
10 | --   H = '1', L = '1'   -- glitch, treated as NULL
11 | --
12 | -- Our NCL implementation operates as follows:
13 | --
14 | --   entity foo is
15 | --       port( d : in ncl_logic_vector(7 downto 0) );
16 | --   end foo;
17 | --
18 | -- d(0).L will give the low rail, d(0).H will give the high rail,
19 | -- on data bit 0.
20 | library IEEE;
21 | use IEEE.std_logic_1164.all;
22 | 
23 | package ncl is
24 |     type ncl_logic is record
25 |         L : std_logic;
26 |         H : std_logic;
27 |     end record;
28 | 
29 |     type ncl_logic_vector is array (natural range <>) of ncl_logic;
30 | 
31 |     -- NULL check.  Both "00" and the glitch state "11" count as NULL.
32 |     -- std_logic forms return '1' for NULL (one flag per element for a
33 |     -- vector); the boolean vector form is true when ANY element is NULL.
34 |     function ncl_is_null(d : ncl_logic) return std_logic;
35 |     function ncl_is_null(d : ncl_logic_vector) return std_logic_vector;
36 |     function ncl_is_null(d : ncl_logic) return boolean;
37 |     function ncl_is_null(d : ncl_logic_vector) return boolean;
38 |     -- Encoder and decoder
39 |     function ncl_encode (d : std_logic) return ncl_logic;
40 |     function ncl_encode (d : std_logic_vector) return ncl_logic_vector;
41 |     function ncl_decode (d : ncl_logic) return std_logic;
42 |     function ncl_decode (d : ncl_logic_vector) return std_logic_vector;
43 |     -- Logic operators
44 |     function "and" (l, r: ncl_logic) return ncl_logic;
45 |     function "nand" (l, r: ncl_logic) return ncl_logic;
46 |     function "or" (l, r: ncl_logic) return ncl_logic;
47 |     function "nor" (l, r: ncl_logic) return ncl_logic;
48 |     function "xor" (l, r: ncl_logic) return ncl_logic;
49 |     function "xnor" (l, r: ncl_logic) return ncl_logic;
50 |     function "not" (l : ncl_logic) return ncl_logic;
51 |     -- Logical operators on multiple bits
52 |     function "and" (l, r: ncl_logic_vector) return ncl_logic_vector;
53 |     function "nand" (l, r: ncl_logic_vector) return ncl_logic_vector;
54 |     function "or" (l, r: ncl_logic_vector) return ncl_logic_vector;
55 |     function "nor" (l, r: ncl_logic_vector) return ncl_logic_vector;
56 |     function "xor" (l, r: ncl_logic_vector) return ncl_logic_vector;
57 |     function "xnor" (l, r: ncl_logic_vector) return ncl_logic_vector;
58 |     function "not" (l : ncl_logic_vector) return ncl_logic_vector;
59 |     -- Comparators
60 |     function "=" (l, r: ncl_logic) return boolean;
61 |     function "=" (l : ncl_logic; r: std_logic) return boolean;
62 | end;
63 | 
64 | package body ncl is
65 |     -- '1' when d is NULL; returns the glitch "11" as NULL as well
66 |     function ncl_is_null(d: ncl_logic) return std_logic is
67 |     begin
68 |         return d.H XNOR d.L;
69 |     end function;
70 | 
71 |     -- Per-element NULL flags, indexed like d
72 |     function ncl_is_null(d : ncl_logic_vector) return std_logic_vector is
73 |         variable dout : std_logic_vector(d'RANGE);
74 |     begin
75 |         for i in d'RANGE loop
76 |             dout(i) := ncl_is_null(d(i));
77 |         end loop;
78 |         return dout;
79 |     end function;
80 | 
81 |     -- True when d is NULL.  d is valid only when exactly one rail is
82 |     -- '1', i.e. when H XOR L is '1'; any other result ('0' for "00"
83 |     -- and "11", or a metavalue such as 'U'/'X') is reported as NULL.
84 |     -- (Comparing H XNOR L against '1' here would invert the answer
85 |     -- and report valid data as NULL.)
86 |     function ncl_is_null(d: ncl_logic) return boolean is
87 |     begin
88 |         -- Any result that's not '1' is not non-NULL
89 |         return (d.H XOR d.L) /= '1';
90 |     end function;
91 | 
92 |     function ncl_is_null(d : ncl_logic_vector) return boolean is
93 |     begin
94 |         for i in d'RANGE loop
95 |             -- True if anything in here is null
96 |             if (ncl_is_null(d(i))) then
97 |                 return true;
98 |             end if;
99 |         end loop;
100 |         return false;
101 |     end function;
102 | 
103 |     function ncl_encode (d : std_logic) return ncl_logic is
104 |     begin
105 |         return (H => NOT d, L => d);
106 |     end function;
107 | 
108 |     function ncl_encode (d : std_logic_vector) return ncl_logic_vector is
109 |         variable dout : ncl_logic_vector(d'RANGE);
110 |     begin
111 |         for i in d'RANGE loop
112 |             dout(i) := ncl_encode(d(i));
113 |         end loop;
114 |         return dout;
115 |     end function;
116 | 
117 |     -- In NCL, the low bit represents the value and the high
118 |     -- bit is the inverse of the value.
119 |     --
120 |     -- Returns 'U' for NULL (or glitch) input.
121 |     function ncl_decode (d : ncl_logic) return std_logic is
122 |     begin
123 |         -- Invalid, can't decode.
124 |         -- Can't read this reliably, so check BEFORE decoding!
125 |         if (ncl_is_null(d)) then
126 |             return 'U';
127 |         end if;
128 |         return (d.L);
129 |     end function;
130 | 
131 |     function ncl_decode (d : ncl_logic_vector) return std_logic_vector is
132 |         variable dout : std_logic_vector(d'RANGE);
133 |     begin
134 |         for i in d'RANGE loop
135 |             dout(i) := ncl_decode(d(i));
136 |         end loop;
137 |         return dout;
138 |     end function;
139 | 
140 |     -- For all logical functions, the low bit is the logical
141 |     -- operator applied to the low bits, and the high bit is
142 |     -- the inverse applied to the high bits (or the low bit
143 |     -- inverted).
144 |     --
145 |     -- If either is NULL, return NULL.
146 | 
147 |     -- The AND circuit should look like this:
148 |     --
149 |     -- AH AL- -BL BH
150 |     -- | | | | | |
151 |     -- XOR | | XOR
152 |     -- | | | |
153 |     -- | AND |
154 |     -- | | | |
155 |     -- | NOT | |
156 |     -- | | | |
157 |     -- ---|-|-AND
158 |     -- | | |
159 |     -- -|-|---+
160 |     -- | | | |
161 |     -- AND AND-
162 |     -- | |
163 |     -- OH OL
164 |     --
165 |     function "and" (l, r : ncl_logic) return ncl_logic is
166 |     begin
167 |         if (ncl_is_null(l) OR ncl_is_null(r)) then
168 |             return (H=>'0', L=>'0');
169 |         end if;
170 |         return ncl_encode(l.L AND r.L);
171 |     end function;
172 | 
173 |     function "nand" (l, r : ncl_logic) return ncl_logic is
174 |     begin
175 |         if (ncl_is_null(l) OR ncl_is_null(r)) then
176 |             return (H=>'0', L=>'0');
177 |         end if;
178 |         return ncl_encode(l.L NAND r.L);
179 |     end function;
180 | 
181 |     function "or" (l, r : ncl_logic) return ncl_logic is
182 |     begin
183 |         if (ncl_is_null(l) OR ncl_is_null(r)) then
184 |             return (H=>'0', L=>'0');
185 |         end if;
186 |         return ncl_encode(l.L OR r.L);
187 |     end function;
188 | 
189 |     function "nor" (l, r : ncl_logic) return ncl_logic is
190 |     begin
191 |         if (ncl_is_null(l) OR ncl_is_null(r)) then
192 |             return (H=>'0', L=>'0');
193 |         end if;
194 |         return ncl_encode(l.L NOR r.L);
195 |     end function;
196 | 
197 |     function "xor" (l, r : ncl_logic) return ncl_logic is
198 |     begin
199 |         if (ncl_is_null(l) OR ncl_is_null(r)) then
200 |             return (H=>'0', L=>'0');
201 |         end if;
202 |         return ncl_encode(l.L XOR r.L);
203 |     end function;
204 | 
205 |     function "xnor" (l, r : ncl_logic) return ncl_logic is
206 |     begin
207 |         if (ncl_is_null(l) OR ncl_is_null(r)) then
208 |             return (H=>'0', L=>'0');
209 |         end if;
210 |         return ncl_encode(l.L XNOR r.L);
211 |     end function;
212 | 
213 |     -- The inverter is special: just swap the signals.
214 |     function "not" (l : ncl_logic) return ncl_logic is
215 |     begin
216 |         return (H=>l.L, L=>l.H);
217 |     end function;
218 | 
219 |     -- Above functions on arrays
220 |     -- r is indexed with l's index values, so r must cover l'RANGE.
221 |     function "and" (l, r : ncl_logic_vector) return ncl_logic_vector is
222 |         variable dout : ncl_logic_vector(l'RANGE);
223 |     begin
224 |         for i in l'RANGE loop
225 |             dout(i) := l(i) AND r(i);
226 |         end loop;
227 |         return dout;
228 |     end function;
229 | 
230 |     function "nand" (l, r : ncl_logic_vector) return ncl_logic_vector is
231 |         variable dout : ncl_logic_vector(l'RANGE);
232 |     begin
233 |         for i in l'RANGE loop
234 |             dout(i) := l(i) NAND r(i);
235 |         end loop;
236 |         return dout;
237 |     end function;
238 | 
239 |     function "or" (l, r : ncl_logic_vector) return ncl_logic_vector is
240 |         variable dout : ncl_logic_vector(l'RANGE);
241 |     begin
242 |         for i in l'RANGE loop
243 |             dout(i) := l(i) OR r(i);
244 |         end loop;
245 |         return dout;
246 |     end function;
247 | 
248 |     function "nor" (l, r : ncl_logic_vector) return ncl_logic_vector is
249 |         variable dout : ncl_logic_vector(l'RANGE);
250 |     begin
251 |         for i in l'RANGE loop
252 |             dout(i) := l(i) NOR r(i);
253 |         end loop;
254 |         return dout;
255 |     end function;
256 | 
257 |     function "xor" (l, r : ncl_logic_vector) return ncl_logic_vector is
258 |         variable dout : ncl_logic_vector(l'RANGE);
259 |     begin
260 |         for i in l'RANGE loop
261 |             dout(i) := l(i) XOR r(i);
262 |         end loop;
263 |         return dout;
264 |     end function;
265 | 
266 |     function "xnor" (l, r : ncl_logic_vector) return ncl_logic_vector is
267 |         variable dout : ncl_logic_vector(l'RANGE);
268 |     begin
269 |         for i in l'RANGE loop
270 |             dout(i) := l(i) XNOR r(i);
271 |         end loop;
272 |         return dout;
273 |     end function;
274 | 
275 |     function "not" (l : ncl_logic_vector) return ncl_logic_vector is
276 |         variable dout : ncl_logic_vector(l'RANGE);
277 |     begin
278 |         for i in l'RANGE loop
279 |             dout(i) := NOT l(i);
280 |         end loop;
281 |         return dout;
282 |     end function;
283 | 
284 |     -- Comparators
285 |     -- Equality is false whenever an operand is NULL; otherwise the L rails are compared.
286 |     function "=" (l, r: ncl_logic) return boolean is
287 |     begin
288 |         if (ncl_is_null(l) or ncl_is_null(r) or (l.L /= r.L)) then
289 |             return false;
290 |         end if;
291 |         return true;
292 |     end function;
293 | 
294 |     function "=" (l: ncl_logic; r: std_logic) return boolean is
295 |     begin
296 |         if (ncl_is_null(l) or (l.L /= r)) then
297 |             return false;
298 |         end if;
299 |         return true;
300 |     end function;
301 | 
302 | end package body;
303 | 
--------------------------------------------------------------------------------
/RISC-V.srcs/asynchronous/infrastructure/transceiver/transceiver_async_to_sync.vhdl:
--------------------------------------------------------------------------------
1 | -- vim: sw=4 ts=4 et
2 | -- Synchronous-Asynchronous Transceiver
3 | --
4 | -- Connects to/from a synchronous interface.
5 | --
6 | -- Sync to Async
7 | --
8 | -- Theory of operation:
9 | --
10 | -- An asynchronous interface recognizes when data is ready and negotiates
11 | -- completion state continuously. Synchronous circuits rely on a clock
12 | -- timed to at least the delay of the circuits.
13 | --
14 | -- An asynchronous interface is delay-insensitive and can wait forever for
15 | -- input or for a neighboring circuit to be ready to receive input.
As
16 | -- such, any asynchronous circuit can synchronize to a clock and interface
17 | -- with other asynchronous circuits via the asynchronous protocol, only
18 | -- experiencing additional delay.
19 | --
20 | -- Interfacing between asynchronous and synchronous circuits only requires
21 | -- an asynchronous circuit clocked to the synchronous circuit. The client
22 | -- circuits only connect to their respective interfaces, thus translating
23 | -- between the two.
24 | library IEEE;
25 | use IEEE.std_logic_1164.all;
26 | use work.ncl.all;
27 | -- Clocked bridge from an n-bit dual-rail (NCL) input to a plain std_logic_vector output with a write strobe.
28 | entity e_transceiver_async_to_sync is
29 |     generic( n: positive );    -- bus width in bits
30 |     port(
31 |         clk : in std_logic;    -- clock input
32 |         din : in ncl_logic_vector(n-1 downto 0);    -- dual-rail input
33 |         dout : out std_logic_vector(n-1 downto 0);  -- decoded output
34 |         -- Write signal
35 |         wr : out std_logic
36 |     );
37 | end e_transceiver_async_to_sync;
38 | 
39 | -- FIXME: these need a complete transceiver architecture with
40 | -- an appropriate handshake.
41 | -- NOTE(review): the process below reacts to both clock edges; confirm the target technology supports that.
42 | architecture transceiver_async_to_sync of e_transceiver_async_to_sync is
43 | begin
44 |     process(clk)
45 |     begin
46 |         if (rising_edge(clk) and not ncl_is_null(din)) then    -- rising edge and din not reported NULL by ncl_is_null
47 |             dout <= ncl_decode(din);    -- dout is not assigned on any other path, so it holds otherwise
48 |             wr <= '1';
49 |         elsif (falling_edge(clk)) then
50 |             wr <= '0';    -- strobe is dropped on every falling edge
51 |         end if;
52 |     end process;
53 | end transceiver_async_to_sync;
54 | 
--------------------------------------------------------------------------------
/RISC-V.srcs/asynchronous/infrastructure/transceiver/transceiver_sync_to_async.vhdl:
--------------------------------------------------------------------------------
1 | -- vim: sw=4 ts=4 et
2 | -- Synchronous-Asynchronous Transceiver
3 | --
4 | -- Connects to/from a synchronous interface.
5 | --
6 | -- Theory of operation:
7 | --
8 | -- An asynchronous interface recognizes when data is ready and negotiates
9 | -- completion state continuously. Synchronous circuits rely on a clock
10 | -- timed to at least the delay of the circuits.
11 | --
12 | -- An asynchronous interface is delay-insensitive and can wait forever for
13 | -- input or for a neighboring circuit to be ready to receive input. As
14 | -- such, any asynchronous circuit can synchronize to a clock and interface
15 | -- with other asynchronous circuits via the asynchronous protocol, only
16 | -- experiencing additional delay.
17 | --
18 | -- Interfacing between asynchronous and synchronous circuits only requires
19 | -- an asynchronous circuit clocked to the synchronous circuit. The client
20 | -- circuits only connect to their respective interfaces, thus translating
21 | -- between the two.
22 | library IEEE;
23 | use IEEE.std_logic_1164.all;
24 | use work.ncl.all;
25 | -- Clocked bridge from an n-bit std_logic_vector input to a dual-rail (NCL) output.
26 | entity e_transceiver_sync_to_async is
27 |     generic( n: positive );    -- bus width in bits
28 |     port(
29 |         clk : in std_logic;    -- clock input
30 |         din : in std_logic_vector(n-1 downto 0);    -- single-rail input
31 |         dout : out ncl_logic_vector(n-1 downto 0)   -- dual-rail output
32 |     );
33 | end e_transceiver_sync_to_async;
34 | -- NOTE(review): the process below reacts to both clock edges; confirm the target technology supports that.
35 | -- FIXME: these need a complete transceiver architecture with
36 | -- an appropriate handshake.
37 | architecture transceiver_sync_to_async of e_transceiver_sync_to_async is
38 | begin
39 |     process(clk)
40 |     begin
41 |         if (rising_edge(clk)) then
42 |             dout <= ncl_encode(din);    -- rising edge: present din in dual-rail form
43 |         elsif (falling_edge(clk)) then
44 |             -- Send null: every bit returns to ('0', '0') on the falling edge
45 |             dout <= (others => ('0', '0'));
46 |         end if;
47 |     end process;
48 | end transceiver_sync_to_async;
--------------------------------------------------------------------------------
/RISC-V.xpr:
--------------------------------------------------------------------------------
1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 160 | 161 | 162 | 163 | 164 | 166 | 167 | 168 | 169 | 170 | 179 | 180 | 181 | 182 | 183 | 185 | 186 | 187 | 188 | 189 | 192 | 193 | 195 | 196 | 198 | 199 | 201 | 202 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | 256 | 257 | 258 | 259 | 260 | 261 | 262 | default_dashboard 263 | 264 | 265 | 266 | --------------------------------------------------------------------------------