├── project └── build.properties ├── .gitmodules ├── board-test.mk ├── src ├── main │ ├── scala │ │ └── fpgatidbits │ │ │ ├── interfaces │ │ │ ├── AXIStreamDefs.scala │ │ │ └── AsymPipelinedDualPortBRAM.scala │ │ │ ├── platform-wrapper │ │ │ ├── zedboard-linux │ │ │ │ └── load-bitfile.sh │ │ │ ├── axi │ │ │ │ ├── PYNQU96Wrapper.scala │ │ │ │ ├── PYNQZCU104Wrapper.scala │ │ │ │ ├── ZC706Wrapper.scala │ │ │ │ ├── PYNQU96CCWrapper.scala │ │ │ │ ├── PYNQZCU104CCWrapper.scala │ │ │ │ ├── PYNQWrapper.scala │ │ │ │ ├── GenericSDAccelWrapper.scala │ │ │ │ └── ZedBoardWrapper.scala │ │ │ ├── convey │ │ │ │ └── Makefile │ │ │ └── GenericAccelerator.scala │ │ │ ├── ocm │ │ │ ├── SinglePortBRAM.scala │ │ │ ├── SimpleDualPortBRAM.scala │ │ │ ├── DualPortMaskedBRAM.scala │ │ │ └── AsymDualPortRAM.scala │ │ │ ├── examples │ │ │ ├── HelloAccel.scala │ │ │ ├── ExampleRegOps.scala │ │ │ ├── ExampleSinglePortBRAM.scala │ │ │ ├── ExampleBRAM.scala │ │ │ ├── ExampleBRAMMasked.scala │ │ │ ├── ExampleSeqWrite.scala │ │ │ ├── ExampleSum.scala │ │ │ ├── ExampleRandomRead.scala │ │ │ ├── ExampleMultiChanSum.scala │ │ │ ├── ExampleMemLatency.scala │ │ │ ├── ExampleCopy.scala │ │ │ └── ExampleGrayScale.scala │ │ │ ├── streams │ │ │ ├── StreamInterleaver.scala │ │ │ ├── StreamResizer.scala │ │ │ ├── StreamThrottle.scala │ │ │ ├── StreamFilter.scala │ │ │ ├── StreamJoin.scala │ │ │ ├── StreamReducer.scala │ │ │ ├── StreamFork.scala │ │ │ ├── AXIStreamInputMux.scala │ │ │ ├── AXIStreamOutputMux.scala │ │ │ ├── SequenceGenerator.scala │ │ │ ├── BlockSequenceGenerator.scala │ │ │ ├── StreamDeinterleaver.scala │ │ │ ├── StreamLimiter.scala │ │ │ ├── SearchableQueue.scala │ │ │ ├── AffineLoopNestIndGen.scala │ │ │ ├── StreamSync.scala │ │ │ ├── StreamCAM.scala │ │ │ └── AXIStreamUpsizer.scala │ │ │ ├── math │ │ │ ├── Counter.scala │ │ │ ├── PipelinedMul.scala │ │ │ └── MathDef.scala │ │ │ ├── utils │ │ │ └── utils.scala │ │ │ ├── hlstools │ │ │ ├── HLSTools.scala │ │ │ └── TemplatedHLSBlackBox.scala 
│ │ │ ├── profiler │ │ │ ├── StateProfiler.scala │ │ │ ├── LevelProfiler.scala │ │ │ └── OutstandingTxnProfiler.scala │ │ │ ├── dma │ │ │ ├── scatter-gather │ │ │ │ └── GatherIF.scala │ │ │ ├── RespDeinterleaver.scala │ │ │ ├── StreamWriter.scala │ │ │ └── ReqInterleaver.scala │ │ │ ├── regfile │ │ │ └── RegFile.scala │ │ │ └── interconnect │ │ │ └── AXILiteSwitch.scala │ ├── resources │ │ ├── cpp │ │ │ ├── platform-wrapper-regdriver │ │ │ │ ├── platform.h │ │ │ │ ├── platform-wolverine.cpp │ │ │ │ ├── platform-tester.cpp │ │ │ │ ├── platform-wolverine-debug.cpp │ │ │ │ ├── platform-zedboard.cpp │ │ │ │ ├── platform-verilatedtester.cpp │ │ │ │ ├── axiregdriver.hpp │ │ │ │ ├── platform-zc706-linux.cpp │ │ │ │ ├── wrapperregdriver.h │ │ │ │ ├── platform-zc706-plddr-linux.cpp │ │ │ │ ├── platform-zedboard-linux.cpp │ │ │ │ ├── zedboardregdriver.hpp │ │ │ │ ├── platform-genericsdaccel.cpp │ │ │ │ ├── platform-xlnk.cpp │ │ │ │ ├── platform-mpsoc-xlnk.cpp │ │ │ │ ├── platform-mpsoc-cc-xlnk.cpp │ │ │ │ ├── wolverineregdriver.hpp │ │ │ │ └── wolverineregdriverdebug.hpp │ │ │ ├── platform-wrapper-tests │ │ │ │ ├── HelloAccel.cpp │ │ │ │ ├── ExampleRegOps.cpp │ │ │ │ ├── ExampleBRAM.cpp │ │ │ │ ├── ExampleHostCopy.cpp │ │ │ │ ├── ExampleSum.cpp │ │ │ │ ├── ExampleBRAMMasked.cpp │ │ │ │ ├── ExampleSeqWrite.cpp │ │ │ │ ├── ExampleCopy.cpp │ │ │ │ ├── ExampleMultiChanSum.cpp │ │ │ │ ├── ExampleMemLatency.cpp │ │ │ │ ├── GrayScaleFilter.cpp │ │ │ │ └── ExampleGather.cpp │ │ │ └── platform-wrapper-integration-tests │ │ │ │ ├── TestExampleRegOps.cpp │ │ │ │ ├── TestExampleBRAM.cpp │ │ │ │ ├── TestExampleSum.cpp │ │ │ │ ├── TestExampleMultiChanSum.cpp │ │ │ │ └── TestExampleBRAMMasked.cpp │ │ ├── script │ │ │ ├── make-vivado-proj.sh │ │ │ ├── verilator-build.sh │ │ │ ├── VerilatorMakefile │ │ │ ├── gen_xo.tcl │ │ │ ├── hls_syn.tcl │ │ │ ├── vivado-platformwrapper-zc706.tcl │ │ │ └── vivado-platformwrapper-zedboard.tcl │ │ ├── vivado-ip-cores │ │ │ └── ZedBoardWrapper │ │ │ │ └── 
xgui │ │ │ │ └── ZedBoardWrapper_v1_0.tcl │ │ ├── xml │ │ │ └── kernel_GenericSDAccelWrapperTop.xml │ │ └── verilog │ │ │ └── DualPortBRAM.v │ └── cpp │ │ └── platform-wrapper-regdriver │ │ └── platform-zc706-linux.cpp └── test │ └── scala │ ├── streams │ └── TestStreamCAM.scala │ ├── utils │ ├── TestTesterWrapper.scala │ ├── TestCat.scala │ └── TestUtils.scala │ ├── TestCounter.scala │ └── TestAffineLoopNestIndGen.scala ├── integration-test.mk ├── README.md ├── .github └── workflows │ └── ci.yml ├── test-all.sh ├── .gitignore ├── Makefile └── LICENSE /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=1.3.13 2 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "src/main/resources/script/oh-my-xilinx"] 2 | path = src/main/resources/script/oh-my-xilinx 3 | url = https://bitbucket.org/maltanar/oh-my-xilinx.git 4 | -------------------------------------------------------------------------------- /board-test.mk: -------------------------------------------------------------------------------- 1 | BOARDS = ZedBoard PYNQZ1 PYNQU96 PYNQU96CC PYNQZCU104CC ZC706 GenericSDAccel WX690T 2 | %: 3 | echo "Compiling board tst for $@" 4 | sbt "run v ExampleMultiChanSum $@" 5 | 6 | .PHONY: board-test 7 | board-test: $(BOARDS) 8 | 9 | -------------------------------------------------------------------------------- /src/main/scala/fpgatidbits/interfaces/AXIStreamDefs.scala: -------------------------------------------------------------------------------- 1 | package fpgatidbits.axi 2 | 3 | import chisel3._ 4 | import chisel3.util._ 5 | 6 | class AXIStreamIF[T <: Data](gen: T) extends DecoupledIO(gen) { 7 | ready.suggestName("TREADY") 8 | valid.suggestName("TVALID") 9 | bits.suggestName("TDATA") 10 | } 11 | 
-------------------------------------------------------------------------------- /integration-test.mk: -------------------------------------------------------------------------------- 1 | # Run all the integration tests 2 | TESTS = ExampleMultiChanSum ExampleSum ExampleRegOps ExampleBRAM ExampleBRAMMasked 3 | 4 | Example%: 5 | echo "Compiling chisel for $@" 6 | sbt "run test $@ Tester" 7 | cd "integration-tests/$@"; make; ./emu 10 8 | 9 | .PHONY: integration-test 10 | integration-test: $(TESTS) 11 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # fpga-tidbits 2 | A collection of Chisel hardware generators for small but useful components for FPGA projects. 3 | 4 | There is some documentation available on the ([wiki](https://github.com/maltanar/fpga-tidbits/wiki)), but it is rather incomplete. The best source of information is taking a look at the source code (most components are pretty small) or sending me an ([e-mail](mailto:maltanar@gmail.com)). 5 | 6 | Contributors and pull requests are welcome! 
7 | -------------------------------------------------------------------------------- /src/test/scala/streams/TestStreamCAM.scala: -------------------------------------------------------------------------------- 1 | package streams 2 | 3 | import chisel3._ 4 | import chiseltest._ 5 | import org.scalatest.flatspec.AnyFlatSpec 6 | 7 | import fpgatidbits.streams._ 8 | 9 | class TestStreamCAM extends AnyFlatSpec with ChiselScalatestTester { 10 | behavior of "StreamCAM" 11 | it should "Initalize properly" in { 12 | test(new CAM(10, 8)) { 13 | c => 14 | c.io.hasFree.expect(true.B) 15 | c.io.freeInd.expect(0.U) 16 | } 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /src/main/resources/cpp/platform-wrapper-regdriver/platform.h: -------------------------------------------------------------------------------- 1 | // header file defining the platform init-deinit functions 2 | // include the appropriate platform-*.cpp file that implements 3 | // these, and call initPlatform() at the start of your program to 4 | // get a WrapperRegDriver handle 5 | 6 | #ifndef PLATFORM_H_ 7 | #define PLATFORM_H_ 8 | #include "wrapperregdriver.h" 9 | 10 | WrapperRegDriver * initPlatform(bool tracing=false); 11 | void deinitPlatform(WrapperRegDriver * driver); 12 | 13 | 14 | #endif /* PLATFORM_H_ */ 15 | -------------------------------------------------------------------------------- /src/main/scala/fpgatidbits/platform-wrapper/zedboard-linux/load-bitfile.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # See here for an example of using Vivado to convert regular bitfiles to 4 | # the appropriate binfile format for the ZedBoard: 5 | # https://github.com/maltanar/spmv-vector-cache/tree/master/bitfiles 6 | 7 | BITFILE_PATH="/root/bitfiles/$1.bin" 8 | 9 | if [ ! -f $BITFILE_PATH ]; then 10 | echo "File not found!" 
11 | exit 1 12 | else 13 | echo "Loading bitfile: $BITFILE_PATH" 14 | cat $BITFILE_PATH > /dev/xdevcfg 15 | fi 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /src/main/resources/cpp/platform-wrapper-regdriver/platform-wolverine.cpp: -------------------------------------------------------------------------------- 1 | // platform init-deinit functions for the Convey Wolverine WX690T 2 | 3 | #include "platform.h" 4 | #include "wolverineregdriver.hpp" 5 | 6 | WolverineRegDriver * platform = 0; 7 | 8 | WrapperRegDriver * initPlatform(bool tracing) { 9 | if(!platform) { 10 | platform = new WolverineRegDriver(); 11 | } 12 | return (WrapperRegDriver *) platform; 13 | } 14 | 15 | void deinitPlatform(WrapperRegDriver * driver) { 16 | (void) driver; 17 | delete platform; 18 | } 19 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: Continuous Integration 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | ci: 7 | name: ci 8 | runs-on: ubuntu-latest 9 | steps: 10 | - name: Checkout 11 | uses: actions/checkout@v2 12 | - name: Setup Scala 13 | uses: olafurpg/setup-scala@v10 14 | with: 15 | java-version: adopt@1.8 16 | - name: Install verilator 17 | run: sudo apt-get update && sudo apt-get install verilator -y 18 | - name: Cache Scala 19 | uses: coursier/cache-action@v5 20 | - name: Run tests 21 | run: make test -------------------------------------------------------------------------------- /src/main/scala/fpgatidbits/ocm/SinglePortBRAM.scala: -------------------------------------------------------------------------------- 1 | package fpgatidbits.ocm 2 | 3 | import chisel3._ 4 | 5 | // A simple single port BRAM which hopefully is inferred by the Synthesis tools 6 | class SinglePortBRAM(addrBits: Int, dataBits: Int) extends Module { 7 | val io = IO(new OCMSlaveIF(dataBits, 
dataBits, addrBits)) 8 | 9 | val mem = SyncReadMem(1 << addrBits, UInt(dataBits.W)) 10 | io.rsp.readData := DontCare 11 | 12 | val rdwrPort = mem(io.req.addr) 13 | when (io.req.writeEn) {rdwrPort := io.req.writeData} 14 | .otherwise {io.rsp.readData := rdwrPort} 15 | } 16 | 17 | 18 | -------------------------------------------------------------------------------- /src/main/resources/cpp/platform-wrapper-regdriver/platform-tester.cpp: -------------------------------------------------------------------------------- 1 | // platform init-deinit functions for the TesterWrapper 2 | // note that this assumes the peripheral lives at address 0x43c00000 3 | 4 | #include "platform.h" 5 | #include "testerdriver.hpp" 6 | 7 | TesterRegDriver * platform = 0; 8 | 9 | WrapperRegDriver * initPlatform(bool tracing) { 10 | if(!platform) { 11 | platform = new TesterRegDriver(); // real setup done inside attach() 12 | } 13 | return (WrapperRegDriver *) platform; 14 | } 15 | 16 | void deinitPlatform(WrapperRegDriver * driver) { 17 | // TODO deinit tester? 
18 | } 19 | -------------------------------------------------------------------------------- /src/main/resources/cpp/platform-wrapper-regdriver/platform-wolverine-debug.cpp: -------------------------------------------------------------------------------- 1 | // platform init-deinit functions for the Convey Wolverine WX690T 2 | // debug variant using the AEG registers 3 | 4 | #include "platform.h" 5 | #include "wolverineregdriverdebug.hpp" 6 | 7 | WolverineRegDriverDebug * platform = 0; 8 | 9 | WrapperRegDriver * initPlatform(bool tracing) { 10 | if(!platform) { 11 | platform = new WolverineRegDriverDebug(); 12 | } 13 | return (WrapperRegDriver *) platform; 14 | } 15 | 16 | void deinitPlatform(WrapperRegDriver * driver) { 17 | (void) driver; 18 | delete platform; 19 | } 20 | -------------------------------------------------------------------------------- /test-all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This script runs all the tests 4 | set -e 5 | # First run all the Chisel Unit tests 6 | #sbt 'test' 7 | 8 | N_TESTS=100 9 | 10 | # Then run all the integration tests 11 | declare -a testArr=("ExampleMultiChanSum" "ExampleSum" "ExampleRegOps" "ExampleBRAM" "ExampleBRAMMasked") 12 | 13 | for t in "${testArr[@]}" 14 | do 15 | echo "Compiling chisel for $t" 16 | sbt "run test $t Tester" > .sbt_log 17 | cd "integration-tests/$t" 18 | echo "Compiling Verilator for $t" 19 | eval "./verilator-build.sh" 20 | eval "./VerilatedTesterWrapper $N_TESTS" 21 | cd "../.." 
22 | done 23 | 24 | 25 | -------------------------------------------------------------------------------- /src/main/resources/cpp/platform-wrapper-tests/HelloAccel.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "platform.h" 3 | #include "HelloAccel.hpp" 4 | #include 5 | 6 | using namespace std; 7 | 8 | int main() { 9 | WrapperRegDriver * platform = initPlatform(); 10 | 11 | HelloAccel t(platform); 12 | 13 | cout << "Signature: " << hex << t.get_signature() << dec << endl; 14 | unsigned int op1, op2, res; 15 | 16 | while(true) { 17 | cin >> op1; 18 | cin >> op2; 19 | t.set_op1(op1); 20 | t.set_op2(op2); 21 | res = t.get_res(); 22 | cout < " >&2 11 | exit 1 12 | fi 13 | 14 | PLATFORM="$1" 15 | ACCEL_VERILOG="$2" 16 | PROJECT_NAME="$3" 17 | PROJECT_DIR="$4" 18 | FREQ_MHZ=$5 19 | 20 | TCL_PATH=$TIDBITS_ROOT/src/main/script/vivado-platformwrapper-$PLATFORM.tcl 21 | echo $TCL_PATH 22 | vivado -mode batch -source $TCL_PATH -tclargs $TIDBITS_ROOT $ACCEL_VERILOG $PROJECT_NAME $PROJECT_DIR $FREQ_MHZ 23 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | *.log 3 | 4 | # sbt specific 5 | .cache 6 | .history 7 | .lib/ 8 | dist/* 9 | target/ 10 | lib_managed/ 11 | src_managed/ 12 | project/boot/ 13 | project/plugins/project/ 14 | 15 | # Scala-IDE specific 16 | .scala_dependencies 17 | .worksheet 18 | .bsp 19 | .metals 20 | .sbt_log 21 | .vscode 22 | project/metals.sbt 23 | .idea 24 | 25 | *~ 26 | /*.v 27 | testOutput/ 28 | verilogOutput/ 29 | /*.hpp 30 | test_run_dir/ 31 | verilator/ 32 | *DS_STORE 33 | integration-tests/ 34 | 35 | 36 | # Qt 37 | *.pro.user 38 | 39 | # other 40 | emulator/ 41 | verilator/ 42 | emu-*/ 43 | vivado*.jou 44 | notes.md 45 | vivado 46 | Zedboard 47 | *.fir -------------------------------------------------------------------------------- 
/src/main/resources/cpp/platform-wrapper-regdriver/platform-zedboard.cpp: -------------------------------------------------------------------------------- 1 | // platform init-deinit functions for the ZedBoard 2 | // note that this assumes the peripheral lives at address 0x43c00000 3 | 4 | #include "platform.h" 5 | #include "zedboardregdriver.hpp" 6 | 7 | ZedBoardRegDriver * platform = 0; 8 | 9 | WrapperRegDriver * initPlatform(bool tracing) { 10 | if(!platform) { 11 | platform = new ZedBoardRegDriver((void *) 0x43c00000); 12 | } 13 | return (WrapperRegDriver *) platform; 14 | } 15 | 16 | void deinitPlatform(WrapperRegDriver * driver) { 17 | // TODO doing a delete here causes the zedboard to go in a loop, debug this 18 | } 19 | -------------------------------------------------------------------------------- /src/main/scala/fpgatidbits/examples/HelloAccel.scala: -------------------------------------------------------------------------------- 1 | package fpgatidbits.examples 2 | 3 | import chisel3._ 4 | import fpgatidbits.PlatformWrapper._ 5 | import fpgatidbits.ocm._ 6 | 7 | class HelloAccelIO(numMemPorts: Int, p: PlatformWrapperParams) extends GenericAcceleratorIF(numMemPorts, p) { 8 | val op1 = Input(UInt(32.W)) 9 | val op2 = Input(UInt(32.W)) 10 | val res = Output(UInt(32.W)) 11 | } 12 | 13 | class HelloAccel(p: PlatformWrapperParams) extends GenericAccelerator(p) { 14 | def numMemPorts=0 15 | val io = IO(new HelloAccelIO(numMemPorts, p)) 16 | io.signature := makeDefaultSignature() 17 | 18 | io.res := io.op1 + io.op2 19 | } 20 | -------------------------------------------------------------------------------- /src/main/resources/cpp/platform-wrapper-regdriver/platform-verilatedtester.cpp: -------------------------------------------------------------------------------- 1 | // platform init-deinit functions for the TesterWrapper 2 | // note that this assumes the peripheral lives at address 0x43c00000 3 | 4 | #include "platform.h" 5 | #include 
"verilatedtesterdriver.hpp" 6 | 7 | VerilatedTesterRegDriver * platform = 0; 8 | 9 | WrapperRegDriver * initPlatform(bool tracing) { 10 | if(!platform) { 11 | platform = new VerilatedTesterRegDriver(tracing); // real setup done inside attach() 12 | } 13 | return (WrapperRegDriver *) platform; 14 | } 15 | 16 | void deinitPlatform(WrapperRegDriver * driver) { 17 | if(platform) { 18 | delete platform; 19 | platform = 0; 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /src/main/scala/fpgatidbits/streams/StreamInterleaver.scala: -------------------------------------------------------------------------------- 1 | package fpgatidbits.streams 2 | 3 | import chisel3._ 4 | import chisel3.util._ 5 | 6 | // interleaves input streams onto a single output stream, 7 | // currently just a wrapper for a round-robin arbiter, may eventually 8 | // add some fancier variants and statistics/counting 9 | class StreamInterleaver[T <: Data](numSources: Int, gen: T) extends Module { 10 | val io = new Bundle { 11 | val in = Vec(numSources, Flipped(Decoupled(gen))) 12 | val out = Decoupled(gen) 13 | } 14 | 15 | val arb = Module(new RRArbiter(gen, n=numSources)).io 16 | for (i <- 0 until numSources) { 17 | arb.in(i) <> io.in(i) 18 | } 19 | arb.out <> io.out 20 | } 21 | -------------------------------------------------------------------------------- /src/main/resources/vivado-ip-cores/ZedBoardWrapper/xgui/ZedBoardWrapper_v1_0.tcl: -------------------------------------------------------------------------------- 1 | # Definitional proc to organize widgets for parameters. 
2 | proc init_gui { IPINST } { 3 | ipgui::add_param $IPINST -name "Component_Name" 4 | #Adding Page 5 | set Page_0 [ipgui::add_page $IPINST -name "Page 0"] 6 | ipgui::add_param $IPINST -name "NUM_MEM_PORTS" -parent ${Page_0} -widget comboBox 7 | 8 | 9 | } 10 | 11 | proc update_PARAM_VALUE.NUM_MEM_PORTS { PARAM_VALUE.NUM_MEM_PORTS } { 12 | # Procedure called to update NUM_MEM_PORTS when any of the dependent parameters in the arguments change 13 | } 14 | 15 | proc validate_PARAM_VALUE.NUM_MEM_PORTS { PARAM_VALUE.NUM_MEM_PORTS } { 16 | # Procedure called to validate NUM_MEM_PORTS 17 | return true 18 | } 19 | 20 | 21 | -------------------------------------------------------------------------------- /src/main/resources/cpp/platform-wrapper-regdriver/axiregdriver.hpp: -------------------------------------------------------------------------------- 1 | #ifndef AXIREGDRIVER_H 2 | #define AXIREGDRIVER_H 3 | 4 | #include 5 | #include "wrapperregdriver.h" 6 | 7 | class AXIRegDriver : public WrapperRegDriver { 8 | public: 9 | AXIRegDriver(void *baseAddr) { 10 | m_baseAddr = (AccelReg *) baseAddr; 11 | } 12 | 13 | virtual ~AXIRegDriver() {} 14 | 15 | virtual std::string platformID() { 16 | return "AXIDriver"; 17 | } 18 | 19 | virtual void writeReg(unsigned int regInd, AccelReg regValue) { 20 | m_baseAddr[regInd] = regValue; 21 | } 22 | 23 | virtual AccelReg readReg(unsigned int regInd) { 24 | return m_baseAddr[regInd]; 25 | } 26 | 27 | protected: 28 | AccelReg * m_baseAddr; 29 | 30 | }; 31 | 32 | #endif 33 | -------------------------------------------------------------------------------- /src/main/resources/script/verilator-build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # requires a recent version of verilator, e.g. 
3.916 4 | VERILATOR_SRC_DIR="/usr/share/verilator/include" 5 | 6 | # call verilator to translate verilog to C++ 7 | verilator -Iother-verilog --cc TesterWrapper.v -Wno-assignin -Wno-fatal -Wno-lint -Wno-style -Wno-COMBDLY -Wno-STMTDLY --Mdir verilated --trace 8 | # if verilator freezes while executing, consider adding +define+SYNTHESIS=1 9 | # to the cmdline here. this will disable the Chisel printfs though. 10 | 11 | # add verilated.cpp from source dirs 12 | cp -f $VERILATOR_SRC_DIR/verilated.cpp . 13 | cp -f $VERILATOR_SRC_DIR/verilated_vcd_c.cpp . 14 | # compile everything 15 | g++ -std=c++11 $@ -I$VERILATOR_SRC_DIR -Iverilated *.cpp verilated/*.cpp -o VerilatedTesterWrapper 16 | -------------------------------------------------------------------------------- /src/main/resources/cpp/platform-wrapper-tests/ExampleRegOps.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | using namespace std; 3 | 4 | #include "ExampleRegOps.hpp" 5 | #include "platform.h" 6 | 7 | bool Run_ExampleRegOps(WrapperRegDriver * platform) { 8 | ExampleRegOps t(platform); 9 | 10 | cout << "Signature: " << hex << t.get_signature() << dec << endl; 11 | cout << "Enter two operands to sum: "; 12 | unsigned int a, b; 13 | cin >> a >> b; 14 | 15 | t.set_op_0(a); 16 | t.set_op_1(b); 17 | 18 | cout << "Result: " << t.get_sum() << " expected: " << a+b << endl; 19 | 20 | return (a+b) == t.get_sum(); 21 | } 22 | 23 | int main() 24 | { 25 | WrapperRegDriver * platform = initPlatform(); 26 | 27 | Run_ExampleRegOps(platform); 28 | 29 | deinitPlatform(platform); 30 | 31 | return 0; 32 | } 33 | -------------------------------------------------------------------------------- /src/main/scala/fpgatidbits/examples/ExampleRegOps.scala: -------------------------------------------------------------------------------- 1 | package fpgatidbits.examples 2 | 3 | import chisel3._ 4 | import fpgatidbits.PlatformWrapper._ 5 | 6 | // test for register reads and writes: 7 
| // add two 64-bit values 8 | 9 | class ExampleRegOpsIO(n: Int, p: PlatformWrapperParams) extends GenericAcceleratorIF(n,p) { 10 | val op = Vec(2, Input(UInt(64.W))) 11 | val sum = Output(UInt(64.W)) 12 | val cc = Output(UInt(32.W)) 13 | } 14 | 15 | class ExampleRegOps(p: PlatformWrapperParams) extends GenericAccelerator(p) { 16 | val numMemPorts = 0 17 | val io = IO(new ExampleRegOpsIO(numMemPorts, p)) 18 | 19 | io.signature := makeDefaultSignature() 20 | io.sum := io.op(0) + io.op(1) 21 | 22 | val regCC = RegInit(0.U(32.W)) 23 | regCC := regCC + 1.U 24 | 25 | io.cc := regCC 26 | } 27 | -------------------------------------------------------------------------------- /src/main/scala/fpgatidbits/streams/StreamResizer.scala: -------------------------------------------------------------------------------- 1 | package fpgatidbits.streams 2 | 3 | import chisel3._ 4 | import chisel3.util._ 5 | import fpgatidbits.axi._ 6 | 7 | class StreamResizer(inWidth: Int, outWidth: Int) extends Module { 8 | val io = new Bundle { 9 | val in = Flipped(Decoupled(UInt(inWidth.W))) 10 | val out = Decoupled((UInt(outWidth.W))) 11 | } 12 | if(inWidth == outWidth) { 13 | // no need for any resizing, directly connect in/out 14 | io.out.valid := io.in.valid 15 | io.out.bits := io.in.bits 16 | io.in.ready := io.out.ready 17 | } else if(inWidth < outWidth) { 18 | Predef.assert(outWidth % inWidth == 0) 19 | StreamUpsizer(io.in, outWidth) <> io.out 20 | } else if(inWidth > outWidth) { 21 | Predef.assert(inWidth % outWidth == 0) 22 | StreamDownsizer(io.in, outWidth) <> io.out 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /src/main/scala/fpgatidbits/examples/ExampleSinglePortBRAM.scala: -------------------------------------------------------------------------------- 1 | package fpgatidbits.examples 2 | 3 | import chisel3._ 4 | import fpgatidbits.PlatformWrapper._ 5 | import fpgatidbits.ocm._ 6 | 7 | // instantiate a 512-wide 256-deep single-port 
BRAM and directly connect its 8 | // inputs to the module I/O (thus the register file) 9 | 10 | class ExampleSinglePortBRAMIO(w: Int, a:Int, n: Int, p: PlatformWrapperParams) extends GenericAcceleratorIF(n,p) { 11 | val ports = new OCMSlaveIF(w, a, 8) 12 | } 13 | class ExampleSinglePortBRAM(p: PlatformWrapperParams) extends GenericAccelerator(p) { 14 | val numMemPorts = 0 15 | val bramDataWidth = 512 16 | val io = IO(new ExampleSinglePortBRAMIO(bramDataWidth, bramDataWidth, numMemPorts,p)) 17 | io.signature := makeDefaultSignature() 18 | 19 | val mem = Module(new SinglePortBRAM(8, bramDataWidth)) 20 | mem.io <> io.ports 21 | } 22 | 23 | -------------------------------------------------------------------------------- /src/main/scala/fpgatidbits/math/Counter.scala: -------------------------------------------------------------------------------- 1 | package fpgatidbits.math 2 | import chisel3._ 3 | 4 | // a simple counter incrementing by 1 every clock cycle when io.enable is set 5 | // up to io.nsteps-1 (upon which io.full will be 1), then back to 0 6 | // make sure io.nsteps is held for at least 1 cycle after a change 
before 7 | // starting a new counting operation 8 | class Counter(w: Int) extends Module { 9 | val io = new Bundle { 10 | val nsteps = Input(UInt(w.W)) 11 | val current = Output(UInt(w.W)) 12 | val enable = Input(Bool()) 13 | val full = Output(Bool()) 14 | } 15 | val regCount = RegInit(0.U(w.W)) 16 | val regMax = RegNext(io.nsteps - 1.U) 17 | val limitReached = (regCount === regMax) 18 | when(io.enable) { 19 | regCount := Mux(limitReached, 0.U, regCount + 1.U) 20 | } 21 | io.full := limitReached 22 | io.current := regCount 23 | } 24 | -------------------------------------------------------------------------------- /src/main/scala/fpgatidbits/platform-wrapper/axi/PYNQU96Wrapper.scala: -------------------------------------------------------------------------------- 1 | package fpgatidbits.PlatformWrapper 2 | 3 | import chisel3._ 4 | 5 | // platform wrapper for PYNQ on Ultra-96 6 | 7 | object PYNQU96Params extends PlatformWrapperParams { 8 | val platformName = "PYNQU96" 9 | val memAddrBits = 32 10 | val memDataBits = 64 11 | val memIDBits = 6 12 | val memMetaBits = 1 13 | val numMemPorts = 4 14 | val sameIDInOrder = true 15 | val typicalMemLatencyCycles = 32 16 | val burstBeats = 8 // TODO why cap bursts at 8? 
AXI can do more 17 | val coherentMem = false 18 | } 19 | 20 | class PYNQU96Wrapper(instFxn: PlatformWrapperParams => GenericAccelerator, targetDir: String) 21 | extends AXIPlatformWrapper(PYNQU96Params, instFxn) { 22 | val platformDriverFiles = baseDriverFiles ++ Array[String]( 23 | "platform-mpsoc-xlnk.cpp", "xlnkdriver.hpp" 24 | ) 25 | suggestName("PYNQU96Wrapper") 26 | } 27 | -------------------------------------------------------------------------------- /src/main/scala/fpgatidbits/streams/StreamThrottle.scala: -------------------------------------------------------------------------------- 1 | package fpgatidbits.streams 2 | 3 | import chisel3._ 4 | import chisel3.util._ 5 | 6 | // throttle the requests passing from a producer to a consumer, controlled 7 | // by an explicit signal 8 | 9 | class StreamThrottle[T <: Data](gen: T) extends Module { 10 | val io = new Bundle { 11 | val in = Flipped(Decoupled(gen)) // input stream 12 | val out = Decoupled(gen) // output stream 13 | val throttle = Input(Bool()) // stop input to output when this is high 14 | } 15 | io.out.bits := io.in.bits 16 | io.out.valid := io.in.valid & !io.throttle 17 | io.in.ready := io.out.ready & !io.throttle 18 | } 19 | 20 | object StreamThrottle { 21 | def apply[T <: Data](in: DecoupledIO[T], throttle: Bool) = { 22 | val m = Module(new StreamThrottle(gen = in.bits)).io 23 | in <> m.in 24 | m.throttle := throttle 25 | m.out 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /src/main/scala/fpgatidbits/platform-wrapper/axi/PYNQZCU104Wrapper.scala: -------------------------------------------------------------------------------- 1 | package fpgatidbits.PlatformWrapper 2 | 3 | import chisel3._ 4 | 5 | // platform wrapper for PYNQ on ZCU104 6 | 7 | object PYNQZCU104Params extends PlatformWrapperParams { 8 | val platformName = "PYNQZCU104" 9 | val memAddrBits = 32 10 | val memDataBits = 64 11 | val memIDBits = 6 12 | val memMetaBits = 1 13 | val 
numMemPorts = 4 14 | val sameIDInOrder = true 15 | val typicalMemLatencyCycles = 32 16 | val burstBeats = 8 // TODO why cap bursts at 8? AXI can do more 17 | val coherentMem = false 18 | } 19 | 20 | class PYNQZCU104Wrapper(instFxn: PlatformWrapperParams => GenericAccelerator, targetDir: String) 21 | extends AXIPlatformWrapper(PYNQZCU104Params, instFxn) { 22 | val platformDriverFiles = baseDriverFiles ++ Array[String]( 23 | "platform-mpsoc-xlnk.cpp", "xlnkdriver.hpp" 24 | ) 25 | suggestName("PYNQZCU104Wrapper") 26 | } 27 | -------------------------------------------------------------------------------- /src/main/scala/fpgatidbits/ocm/SimpleDualPortBRAM.scala: -------------------------------------------------------------------------------- 1 | package fpgatidbits.ocm 2 | 3 | import chisel3._ 4 | 5 | // Targeting the Simple Dual Port BRAMs of Xilinx. 6 | // We can have 1 read port and 1 write port that are accessed simultaneously 7 | 8 | class SimpleDualPortBRAM(addrBits: Int, dataBits: Int) extends Module { 9 | 10 | val io = IO( new Bundle { 11 | val read = new OCMSlaveIF(dataBits, dataBits, addrBits) 12 | val write = new OCMSlaveIF(dataBits, dataBits, addrBits) 13 | }) 14 | // Port 0 is read port 15 | // Port 1 is write port 16 | assert(!io.read.req.writeEn, "[SDP BRAM] can only read from readport") 17 | io.write.rsp.readData := DontCare 18 | io.read.rsp.readData := DontCare 19 | 20 | val mem = SyncReadMem(1 << addrBits, UInt(dataBits.W)) 21 | 22 | val rdPort = mem(io.read.req.addr) 23 | 24 | io.read.rsp.readData := rdPort 25 | when (io.write.req.writeEn) {mem(io.write.req.addr) := io.write.req.writeData} 26 | } 27 | -------------------------------------------------------------------------------- /src/main/scala/fpgatidbits/platform-wrapper/axi/ZC706Wrapper.scala: -------------------------------------------------------------------------------- 1 | package fpgatidbits.PlatformWrapper 2 | 3 | import chisel3._ 4 | 5 | // platform wrapper for the ZC706, not using the 
dedicated PL DDR 6 | 7 | object ZC706Params extends PlatformWrapperParams { 8 | val platformName = "ZC706" 9 | val memAddrBits = 32 10 | val memDataBits = 64 11 | val memIDBits = 6 12 | val memMetaBits = 1 13 | val numMemPorts = 4 14 | val sameIDInOrder = true 15 | val typicalMemLatencyCycles = 32 16 | val burstBeats = 8 // TODO why cap bursts at 8? AXI can do more 17 | val coherentMem = false // TODO add CC version 18 | } 19 | 20 | 21 | class ZC706Wrapper(instFxn: PlatformWrapperParams => GenericAccelerator, targetDir: String) 22 | extends AXIPlatformWrapper(ZC706Params, instFxn) { 23 | val platformDriverFiles = baseDriverFiles ++ Array[String]( 24 | "platform-zc706-linux.cpp", "linuxphysregdriver.hpp", "axiregdriver.hpp" 25 | ) 26 | suggestName("ZC706Wrapper") 27 | } 28 | -------------------------------------------------------------------------------- /src/main/scala/fpgatidbits/examples/ExampleBRAM.scala: -------------------------------------------------------------------------------- 1 | package fpgatidbits.examples 2 | 3 | import chisel3._ 4 | import fpgatidbits.PlatformWrapper._ 5 | import fpgatidbits.ocm._ 6 | 7 | // instantiate a 32-wide 1024-deep dual-port BRAM and directly connect its 8 | // inputs to the module I/O (thus the register file) 9 | 10 | class ExampleBRAMIO(n: Int, p: PlatformWrapperParams) extends GenericAcceleratorIF(n,p) { 11 | val ports = Vec(2, new OCMSlaveIF(32, 32, 10)) 12 | } 13 | 14 | class ExampleBRAM(p: PlatformWrapperParams) extends GenericAccelerator(p) { 15 | val numMemPorts = 0 16 | val io = IO(new ExampleBRAMIO(numMemPorts, p)) 17 | io.signature := makeDefaultSignature() 18 | 19 | val memExt = Module(new DualPortBRAM(10, 32)).io 20 | val mem = Wire(new DualPortBRAMIOWrapper(10, 32)) 21 | memExt.clk := clock 22 | 23 | memExt.a.connect(mem.ports(0)) 24 | memExt.b.connect(mem.ports(1)) 25 | 26 | mem.ports(0) <> io.ports(0) 27 | mem.ports(1) <> io.ports(1) 28 | } 29 | 
-------------------------------------------------------------------------------- /src/main/scala/fpgatidbits/streams/StreamFilter.scala: -------------------------------------------------------------------------------- 1 | package fpgatidbits.streams 2 | 3 | import chisel3._ 4 | import chisel3.util._ 5 | 6 | class StreamFilter[Tin <: Data, Tout <: Data] 7 | (genI: Tin, genO: Tout, filterFxn: Tin => Tout ) extends Module { 8 | val io = IO(new Bundle { 9 | val in = Flipped(Decoupled(genI)) 10 | val out = Decoupled(genO) 11 | }) 12 | io.out.valid := io.in.valid 13 | io.out.bits := filterFxn(io.in.bits) 14 | io.in.ready := io.out.ready 15 | } 16 | 17 | object StreamFilter { 18 | def apply[Tin <: Data, Tout <: Data] 19 | (in: DecoupledIO[Tin], outGen: Tout, filterFxn: Tin => Tout ) = { 20 | val sf = Module(new StreamFilter[Tin,Tout](in.bits.cloneType,outGen.cloneType, filterFxn)).io 21 | sf.in <> in 22 | sf.out 23 | } 24 | } 25 | 26 | import fpgatidbits.dma._ 27 | 28 | object ReadRespFilter { 29 | def apply(in: DecoupledIO[GenericMemoryResponse]) = { 30 | val filterFxn = {r: GenericMemoryResponse => r.readData} 31 | StreamFilter(in, in.bits.readData, filterFxn) 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/main/resources/cpp/platform-wrapper-tests/ExampleBRAM.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "platform.h" 3 | #include "ExampleBRAM.hpp" 4 | #include 5 | 6 | using namespace std; 7 | 8 | int main() { 9 | WrapperRegDriver * platform = initPlatform(); 10 | 11 | ExampleBRAM t(platform); 12 | 13 | cout << "Signature: " << hex << t.get_signature() << dec << endl; 14 | 15 | string cmd; 16 | unsigned int addr, dat; 17 | 18 | cin >> cmd; 19 | 20 | while(cmd != "q") { 21 | if(cmd == "r") { 22 | cin >> addr; 23 | t.set_ports_0_req_addr(addr); 24 | cout << "addr " << addr << " = " << t.get_ports_0_rsp_readData() << endl; 25 | } else if (cmd == "w") { 
26 | cin >> addr >> dat; 27 | t.set_ports_0_req_addr(addr); 28 | t.set_ports_0_req_writeData(dat); 29 | t.set_ports_0_req_writeEn(1); 30 | t.set_ports_0_req_writeEn(0); 31 | cout << "wrote " << dat << " to " << addr << endl; 32 | } else cout << "unrecognized" << endl; 33 | 34 | cin >> cmd; 35 | } 36 | 37 | return 0; 38 | } 39 | -------------------------------------------------------------------------------- /src/main/resources/xml/kernel_GenericSDAccelWrapperTop.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /src/main/resources/script/VerilatorMakefile: -------------------------------------------------------------------------------- 1 | EMULATOR_BIN:=emu 2 | TRACE ?= false 3 | VERILATOR_SRC_DIR=/usr/share/verilator/include 4 | VERIlATOR_SRC_GEN_DIR=verilated 5 | VERILATOR_OPTIONS=-Iother-verilog -Wno-assignin -Wno-fatal -Wno-lint -Wno-style -Wno-COMBDLY -Wno-STMTDLY 6 | TARGET_LIB=tidbits.a 7 | VER_BUILD_DIR=obj_dir 8 | 9 | ifeq ($(TRACE),true) 10 | VERILATOR_OPTIONS += --trace 11 | CC_OPTIONS := -CFLAGS "-DTRACE" 12 | endif 13 | 14 | PLATFORM := TesterWrapper 15 | SRCS=main.cpp platform-verilatedtester.cpp 16 | 17 | build: $(EMULATOR_BIN) 18 | 19 | lib: $(SRCS) 20 | verilator --cc $(PLATFORM).v $(VERILATOR_OPTIONS) $(CC_OPTIONS) --exe --build $^ 21 | @rm $(VER_BUILD_DIR)/main.o 22 | ar -rcs $(TARGET_LIB) $(VER_BUILD_DIR)/*.o 23 | 24 | $(EMULATOR_BIN): main.cpp platform-verilatedtester.cpp 25 | @verilator --cc $(PLATFORM).v $(VERILATOR_OPTIONS) $(CC_OPTIONS) --exe --build main.cpp platform-verilatedtester.cpp 26 | @cp obj_dir/V$(PLATFORM) $(EMULATOR_BIN) 27 | 28 | clean: 29 | @rm -rf obj_dir 30 | @rm -f $(EMULATOR_BIN) 31 | 32 | rebuild: clean build 33 | 34 | run: build 35 | @./$(EMULATOR_BIN) 36 | 37 | .PHONY: build clean rebuild run 38 | 
-------------------------------------------------------------------------------- /src/main/scala/fpgatidbits/streams/StreamJoin.scala: -------------------------------------------------------------------------------- 1 | package fpgatidbits.streams 2 | 3 | import chisel3._ 4 | import chisel3.util._ 5 | 6 | // combinational join for two streams -> one stream 7 | // join function can be customized 8 | 9 | class StreamJoin[TiA <: Data, TiB <: Data, TO <: Data] 10 | (genA: TiA, genB: TiB, genOut: TO, join: (TiA, TiB) => TO) extends Module { 11 | val io = IO(new Bundle { 12 | val inA = Flipped(Decoupled(genA)) 13 | val inB = Flipped(Decoupled(genB)) 14 | val out = Decoupled(genOut) 15 | }) 16 | 17 | io.out.valid := io.inA.valid & io.inB.valid 18 | io.out.bits := join(io.inA.bits, io.inB.bits) 19 | 20 | io.inA.ready := io.out.ready & io.inB.valid 21 | io.inB.ready := io.out.ready & io.inA.valid 22 | } 23 | 24 | object StreamJoin { 25 | def apply[TiA <: Data, TiB <: Data, TO <: Data] 26 | (inA: DecoupledIO[TiA], 27 | inB: DecoupledIO[TiB], 28 | genO: TO, 29 | join: (TiA, TiB) => TO): DecoupledIO[TO] = { 30 | val joiner = Module(new StreamJoin(inA.bits.cloneType, inB.bits.cloneType, genO.cloneType, join)).io 31 | joiner.inA <> inA 32 | joiner.inB <> inB 33 | joiner.out 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /src/main/scala/fpgatidbits/utils/utils.scala: -------------------------------------------------------------------------------- 1 | package fpgatidbits.utils 2 | 3 | import chisel3._ 4 | import chisel3.util._ 5 | 6 | // For testing 7 | //import chisel3.tester._ 8 | //import chisel3.tester.RawTester.test 9 | 10 | object BitExtraction { 11 | def apply(word: UInt, high: UInt, low: UInt): UInt = { 12 | 13 | val width = word.getWidth 14 | var res = UInt(width.W) 15 | val highMask = (( (1.U << (high+1.U)).asUInt) - 1.U)(31,0) 16 | val lowMask = ~((1.U(width.W) << low).asUInt - 1.U)(31,0) 17 | val mask = 
highMask.asUInt & lowMask.asUInt 18 | res = ((word & mask) >> low).asUInt 19 | 20 | res 21 | } 22 | } 23 | 24 | 25 | object SubWordAssignment { 26 | def apply(word: UInt, high: Int, low:Int, subWord: UInt): UInt = { 27 | val wireOut = WireInit(word) 28 | // Clean out the desired area 29 | val highMask = (( (1.U << (high+1)).asUInt) - 1.U) 30 | val lowMask = ~((1.U << low).asUInt - 1.U) 31 | val cleanMask = highMask.asUInt & lowMask.asUInt 32 | 33 | // Overwrite that area 34 | val writeMask = ((~cleanMask).asUInt & (subWord << low).asUInt) 35 | 36 | wireOut & cleanMask | writeMask 37 | } 38 | } 39 | 40 | -------------------------------------------------------------------------------- /src/main/scala/fpgatidbits/platform-wrapper/axi/PYNQU96CCWrapper.scala: -------------------------------------------------------------------------------- 1 | package fpgatidbits.PlatformWrapper 2 | 3 | import chisel3._ 4 | 5 | // platform wrapper for PYNQ on Ultra-96 with one cache-coherent port 6 | 7 | object PYNQU96CCParams extends PlatformWrapperParams { 8 | val platformName = "PYNQU96CC" 9 | val memAddrBits = 32 10 | val memDataBits = 64 11 | val memIDBits = 6 12 | val memMetaBits = 1 13 | val numMemPorts = 1 14 | val sameIDInOrder = true 15 | val typicalMemLatencyCycles = 32 16 | val burstBeats = 8 // TODO why cap bursts at 8? 
AXI can do more
17 |   val coherentMem = true
18 | }
19 |
20 | class PYNQU96CCWrapper(instFxn: PlatformWrapperParams => GenericAccelerator, targetDir: String)
21 |   extends AXIPlatformWrapper(PYNQU96CCParams, instFxn) { // CC variant: use the coherent, single-port PYNQU96CCParams rather than PYNQU96Params
22 |   val platformDriverFiles = baseDriverFiles ++ Array[String](
23 |     "platform-mpsoc-cc-xlnk.cpp", "xlnkdriver.hpp"
24 |   )
25 |   suggestName("PYNQU96CCWrapper")
26 |   // override the AXI cache/prot signals of the (only) memory port for cache coherency
27 |   mem(0).readAddr.bits.cache := "b1100".U
28 |   mem(0).writeAddr.bits.cache := "b1100".U
29 |   mem(0).readAddr.bits.prot := "b10".U
30 |   mem(0).writeAddr.bits.prot := "b10".U
31 | }
32 |
--------------------------------------------------------------------------------
/src/main/scala/fpgatidbits/platform-wrapper/axi/PYNQZCU104CCWrapper.scala:
--------------------------------------------------------------------------------
1 | package fpgatidbits.PlatformWrapper
2 |
3 | import chisel3._
4 |
5 | // platform wrapper for PYNQ on ZCU104 with one cache-coherent port
6 |
7 | object PYNQZCU104CCParams extends PlatformWrapperParams {
8 |   val platformName = "PYNQZCU104CC"
9 |   val memAddrBits = 32
10 |   val memDataBits = 64
11 |   val memIDBits = 6
12 |   val memMetaBits = 1
13 |   val numMemPorts = 1
14 |   val sameIDInOrder = true
15 |   val typicalMemLatencyCycles = 32
16 |   val burstBeats = 8 // TODO why cap bursts at 8?
AXI can do more
17 |   val coherentMem = true
18 | }
19 |
20 | class PYNQZCU104CCWrapper(instFxn: PlatformWrapperParams => GenericAccelerator, targetDir: String)
21 |   extends AXIPlatformWrapper(PYNQZCU104CCParams, instFxn) { // CC variant: use the coherent, single-port PYNQZCU104CCParams rather than PYNQZCU104Params
22 |   val platformDriverFiles = baseDriverFiles ++ Array[String](
23 |     "platform-mpsoc-cc-xlnk.cpp", "xlnkdriver.hpp"
24 |   )
25 |   suggestName("PYNQZCU104CCWrapper")
26 |   // override the AXI cache/prot signals of the (only) memory port for cache coherency
27 |   mem(0).readAddr.bits.cache := "b1100".U
28 |   mem(0).writeAddr.bits.cache := "b1100".U
29 |   mem(0).readAddr.bits.prot := "b10".U
30 |   mem(0).writeAddr.bits.prot := "b10".U
31 | }
32 |
--------------------------------------------------------------------------------
/src/main/resources/verilog/DualPortBRAM.v:
--------------------------------------------------------------------------------
1 | // the dual-port BRAM Verilog below is adapted from Dan Strother's example:
2 | // http://danstrother.com/2010/09/11/inferring-rams-in-fpgas/
3 |
4 | module DualPortBRAM #(
5 |     parameter DATA = 72,
6 |     parameter ADDR = 10
7 | ) (
8 |     input wire clk,
9 |
10 |     // Port A
11 |     input wire a_wr,
12 |     input wire [ADDR-1:0] a_addr,
13 |     input wire [DATA-1:0] a_din,
14 |     output reg [DATA-1:0] a_dout,
15 |
16 |     // Port B
17 |     input wire b_wr,
18 |     input wire [ADDR-1:0] b_addr,
19 |     input wire [DATA-1:0] b_din,
20 |     output reg [DATA-1:0] b_dout
21 | );
22 |
23 | // Shared memory
24 | reg [DATA-1:0] mem [(2**ADDR)-1:0];
25 |
26 | // Port A
27 | always @(posedge clk) begin
28 |     a_dout <= mem[a_addr];
29 |     if(a_wr) begin
30 |         a_dout <= a_din;
31 |         mem[a_addr] <= a_din;
32 |     end
33 | end
34 |
35 | // Port B
36 | always @(posedge clk) begin
37 |     b_dout <= mem[b_addr];
38 |     if(b_wr) begin
39 |         b_dout <= b_din;
40 |         mem[b_addr] <= b_din;
41 |     end
42 | end
43 |
44 | endmodule
45 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | SBT ?= sbt
2 | SBT_FLAGS ?= -Dsbt.log.noformat=true 3 | CC = gcc 4 | 5 | CHISEL_FLAGS := 6 | 7 | top_srcdir ?= . 8 | top_file := src/main/scala/Main.scala 9 | executables := $(filter-out top, $(notdir $(basename $(wildcard $(srcdir)/*.scala)))) 10 | integration_test_script = test-all.sh 11 | 12 | 13 | default: emulator 14 | 15 | all: verilog 16 | 17 | clean: 18 | -rm -f *.h *.hex *.flo *.cpp *.o *.out *.v *.vcd $(executables) 19 | -rm -rf project/target/ target/ verilator integration-tests 20 | 21 | verilog: 22 | $(SBT) $(SBT_FLAGS) "verilog $(ACCEL) $(PLATFORM)" 23 | 24 | driver: 25 | $(SBT) $(SBT_FLAGS) "driver $(ACCEL) $(PLATFORM)" 26 | 27 | emulator: 28 | $(SBT) $(SBT_FLAGS) "emulator $(ACCEL) $(PLATFORM)" 29 | 30 | # ---------------------------------------------------------------------------------------------------------------------- 31 | # Tests 32 | # ---------------------------------------------------------------------------------------------------------------------- 33 | include integration-test.mk 34 | include board-test.mk 35 | 36 | unit-test: 37 | $(SBT) $(SBT_FLAGS) test 38 | 39 | test: unit-test integration-test board-test 40 | 41 | .PHONY: all emulator verilog driver test unit-test 42 | -------------------------------------------------------------------------------- /src/main/scala/fpgatidbits/examples/ExampleBRAMMasked.scala: -------------------------------------------------------------------------------- 1 | package fpgatidbits.examples 2 | 3 | import chisel3._ 4 | import fpgatidbits.PlatformWrapper._ 5 | import fpgatidbits.ocm._ 6 | 7 | // instantiate a 32-wide 1024-deep dual-port BRAM and directly connect its 8 | // inputs to the module I/O (thus the register file) 9 | // the difference from ExampleBRAM is that we use the partially-writable variant 10 | // here, e.g. 
it is possible to do a write to a BRAM address that modifies 11 | // only part of the data there, depending on the write enable signals 12 | 13 | class ExampleBRAMMaskedIO(maskWidth: Int, n: Int, p: PlatformWrapperParams) extends GenericAcceleratorIF(n,p) { 14 | val ports = Vec(2, new OCMMaskedSlaveIF(32, 32, maskWidth)) 15 | } 16 | class ExampleBRAMMasked(p: PlatformWrapperParams) extends GenericAccelerator(p) { 17 | val numMemPorts = 0 18 | val dataWidth = 32 19 | val addrWidth = 10 20 | val maskUnit = 8 21 | val maskWidth = dataWidth/maskUnit 22 | val io = IO(new ExampleBRAMMaskedIO(maskWidth, numMemPorts,p)) 23 | io.signature := makeDefaultSignature() 24 | 25 | val mem = Module(new DualPortMaskedBRAM(addrWidth, dataWidth)).io 26 | mem.ports(0) <> io.ports(0) 27 | mem.ports(1) <> io.ports(1) 28 | } 29 | -------------------------------------------------------------------------------- /src/test/scala/utils/TestTesterWrapper.scala: -------------------------------------------------------------------------------- 1 | package PlatformWrapper 2 | 3 | import org.scalatest.flatspec.AnyFlatSpec 4 | import chiseltest._ 5 | import chisel3._ 6 | import fpgatidbits.PlatformWrapper._ 7 | import fpgatidbits.examples.ExampleSum 8 | 9 | class TestTesterWrapper extends AnyFlatSpec with ChiselScalatestTester { 10 | def initClocks(c: TesterWrapper): Unit = { 11 | c.accio.memPort.map(mp => { 12 | mp.memRdRsp.initSink().setSinkClock(c.clock) 13 | mp.memRdReq.initSource().setSourceClock(c.clock) 14 | mp.memWrRsp.initSink().setSinkClock(c.clock) 15 | mp.memWrReq.initSource().setSourceClock(c.clock) 16 | mp.memWrDat.initSink().setSinkClock(c.clock) 17 | } 18 | ) 19 | } 20 | 21 | type AccelInstFxn = PlatformWrapperParams => GenericAccelerator 22 | type AccelMap = Map[String, AccelInstFxn] 23 | 24 | def makeInstFxn(): AccelInstFxn = { 25 | return { (p: PlatformWrapperParams) => new ExampleSum(p) } 26 | } 27 | 28 | behavior of "TesterWrapper" 29 | 30 | it should "Initialize correctly" in { 
31 |     test(new TesterWrapper(makeInstFxn(), targetDir = "._tmp")) { c => // constructor call is closed here so the braces are the chiseltest body, not an anonymous-subclass body
32 |       initClocks(c)
33 |       c.accio.memPort(0).memRdReq.ready.expect(true.B)
34 |       c.accio.memPort(0).memRdRsp.valid.expect(false.B)
35 |     }
36 |   }
37 | }
--------------------------------------------------------------------------------
/src/main/scala/fpgatidbits/hlstools/HLSTools.scala:
--------------------------------------------------------------------------------
1 | package fpgatidbits.hlstools
2 | import sys.process._
3 | import java.io.File
4 |
5 | // Collection of utilities for Vivado HLS
6 |
7 | object TidbitsHLSTools {
8 |   // quick-and-dirty single file HLS synthesis
9 |   def hlsToVerilog(
10 |     inFile: String,
11 |     outDir: String,
12 |     synDir: String,
13 |     projName: String,
14 |     topFxnName: String,
15 |     inclDirs: Seq[String] = Seq(),
16 |     fpgaPart: String = "xc7z020clg400-1",
17 |     nsClk: String = "5.0"
18 |   ) = {
19 |     // get path to hls_syn.tcl
20 |     val synthScriptPath = getClass.getResource("/script/hls_syn.tcl").getPath
21 |     // need to provide include dirs as a single string argument, parsing
22 |     // done in tcl. note: dirs here should have no spaces!
23 |     val inclDirString = inclDirs.mkString(" ")
24 |     // call the actual synthesis script
25 |     val cmdline = Seq(
26 |       "vivado_hls",
27 |       "-f", synthScriptPath,
28 |       "-tclargs", projName, inFile, fpgaPart, nsClk, topFxnName, inclDirString
29 |     )
30 |     val status = Process(cmdline, new File(synDir)) ! ProcessLogger(stdout append _+"\n", stderr append _+"\n") // NOTE(review): exit status is captured but not checked in the visible code
31 |     // copy results to outDir
32 |     s"cp -a $synDir/$projName/sol1/impl/verilog/. $outDir/".!!
33 | } 34 | } 35 | -------------------------------------------------------------------------------- /src/main/resources/cpp/platform-wrapper-tests/ExampleHostCopy.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | using namespace std; 3 | #include 4 | #include "platform.h" 5 | 6 | bool Run_TestHostCopy(WrapperRegDriver * platform) { 7 | platform->attach("TestHostCopy"); 8 | cout << "HostCopy test" << endl; 9 | unsigned int ub = 0; 10 | cout << "Enter number of words to generate and copy: " << endl; 11 | cin >> ub; 12 | 13 | uint64_t * hostSrc = new uint64_t[ub]; 14 | unsigned int bufsize = ub * sizeof(uint64_t); 15 | unsigned int golden = (ub*(ub+1))/2; 16 | 17 | for(uint64_t i = 0; i < ub; i++) { hostSrc[i] = i+1; } 18 | 19 | void * accelBuf = platform->allocAccelBuffer(bufsize); 20 | platform->copyBufferHostToAccel(hostSrc, accelBuf, bufsize); 21 | 22 | uint64_t * hostDst = new uint64_t[ub]; 23 | platform->copyBufferAccelToHost(accelBuf, hostDst, bufsize); 24 | 25 | platform->deallocAccelBuffer(accelBuf); 26 | 27 | int res = memcmp(hostSrc, hostDst, bufsize); 28 | 29 | delete [] hostSrc; 30 | delete [] hostDst; 31 | 32 | cout << "memcmp result: " << res << endl; 33 | 34 | return res == 0; 35 | } 36 | 37 | int main() 38 | { 39 | WrapperRegDriver * platform = initPlatform(); 40 | 41 | Run_TestHostCopy(platform); 42 | 43 | deinitPlatform(platform); 44 | 45 | return 0; 46 | } 47 | -------------------------------------------------------------------------------- /src/test/scala/utils/TestCat.scala: -------------------------------------------------------------------------------- 1 | package utils 2 | import chisel3._ 3 | import chisel3.util._ 4 | import chiseltest._ 5 | import org.scalatest.flatspec.AnyFlatSpec 6 | 7 | class CatExample extends Module { 8 | val io = IO(new Bundle { 9 | val out = Vec(2, Output(UInt(32.W))) 10 | val in = Vec(2, Vec(4, Input(UInt(8.W)))) 11 | }) 12 | val wiresReadOut 
=VecInit(Seq.fill(2)(VecInit(Seq.fill(4)(WireInit(0.U(8.W)))))) 13 | 14 | for (i <- 0 until 4) { 15 | for (p <- 0 until 2) { 16 | wiresReadOut(p)(i) := io.in(p)(i) 17 | } 18 | } 19 | 20 | for (i <- 0 until 2) { 21 | io.out(i) := wiresReadOut(i).asUInt 22 | } 23 | 24 | } 25 | 26 | 27 | class TestCat extends AnyFlatSpec with ChiselScalatestTester { 28 | 29 | behavior of "SubWordAssignment" 30 | it should "work" in { 31 | test(new CatExample) { 32 | c => 33 | c.io.in(0)(0).poke( "hde".U) 34 | c.io.in(0)(1).poke( "had".U) 35 | c.io.in(0)(2).poke( "hbe".U) 36 | c.io.in(0)(3).poke( "hef".U) 37 | 38 | c.io.in(1)(0).poke( "hab".U) 39 | c.io.in(1)(1).poke( "hed".U) 40 | c.io.in(1)(2).poke( "hde".U) 41 | c.io.in(1)(3).poke( "hce".U) 42 | 43 | c.io.out(0).expect("hef_be_ad_de".U) 44 | c.io.out(1).expect("hce_de_ed_ab".U) 45 | 46 | } 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /src/main/resources/cpp/platform-wrapper-integration-tests/TestExampleRegOps.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | using namespace std; 3 | 4 | #include "ExampleRegOps.hpp" 5 | #include "platform.h" 6 | #include 7 | #include 8 | 9 | 10 | bool Run_ExampleRegOps(WrapperRegDriver * platform, uint32_t a, uint32_t b) { 11 | ExampleRegOps t(platform); 12 | t.set_op_0(a); 13 | t.set_op_1(b); 14 | 15 | return (a+b) == t.get_sum(); 16 | } 17 | 18 | int main(int argc, char **argv) 19 | { 20 | if (argc != 2) { 21 | cout << "Please pass the number of tests to run as the only command line argument" < GenericAccelerator, targetDir: String) 24 | extends AXIPlatformWrapper(PYNQZ1Params, instFxn) { 25 | val platformDriverFiles = baseDriverFiles ++ Array[String]( 26 | "platform-xlnk.cpp", "xlnkdriver.hpp" 27 | ) 28 | suggestName("PYNQZ1Wrapper") 29 | override def desiredName = "PYNQZ1Wrapper" 30 | // Generate the RegFile driver 31 | generateRegDriver(targetDir) 32 | 33 | // Copy over the other needed 
files 34 | resourceCopyBulk("cpp/platform-wrapper-regdriver/", targetDir, platformDriverFiles) 35 | println(s"=======> Driver files copied to ${targetDir}") 36 | } 37 | -------------------------------------------------------------------------------- /src/main/scala/fpgatidbits/platform-wrapper/axi/GenericSDAccelWrapper.scala: -------------------------------------------------------------------------------- 1 | package fpgatidbits.PlatformWrapper 2 | 3 | import java.nio.file.Paths 4 | import chisel3._ 5 | import chisel3.util._ 6 | import fpgatidbits.TidbitsMakeUtils.{resourceCopy, resourceCopyBulk} 7 | 8 | // wrapper for a "generic" SDAccel platform 9 | 10 | object GenericSDAccelParams extends PlatformWrapperParams { 11 | val platformName = "GenericSDAccel" 12 | val memAddrBits = 64 13 | val memDataBits = 64 14 | val memIDBits = 1 15 | val memMetaBits = 1 16 | val numMemPorts = 1 17 | val sameIDInOrder = true 18 | val typicalMemLatencyCycles = 128 19 | val burstBeats = 8 20 | val coherentMem = false 21 | } 22 | 23 | class GenericSDAccelWrapper(instFxn: PlatformWrapperParams => GenericAccelerator, val targetDir: String) 24 | extends AXIPlatformWrapper(GenericSDAccelParams, instFxn) { 25 | val platformDriverFiles = baseDriverFiles ++ Array[String]( 26 | "platform-genericsdaccel.cpp", "xclhalwrapper.hpp" 27 | ) 28 | suggestName("GenericSDAccelWrapper") 29 | override def desiredName = "GenericSDAccelWrapper" 30 | 31 | // Generate the RegFile driver 32 | generateRegDriver(targetDir) 33 | 34 | // Copy over the other needed files 35 | resourceCopyBulk("cpp/platform-wrapper-regdriver/", targetDir, platformDriverFiles) 36 | println(s"=======> Driver files copied to ${targetDir}") 37 | } 38 | -------------------------------------------------------------------------------- /src/main/cpp/platform-wrapper-regdriver/platform-zc706-linux.cpp: -------------------------------------------------------------------------------- 1 | // platform init-deinit functions for Linux on the ZC706 
2 | // note that this assumes the peripheral lives at address 0x43c00000, and that 256MB of 3 | // unmanaged memory is available at address 0x10000000 4 | 5 | /* 6 | something like this can be used for the ZC706 to ensure the kernel leaves the upper half 7 | of the DDR alone: 8 | 9 | env set fdt_high 0x10000000 10 | env set initrd_high 0x10000000 11 | env set bootargs "console=ttyPS0,115200 root=/dev/mmcblk0p2 rw rootwait earlyprintk cma=16m mem=256m" 12 | */ 13 | 14 | #include "platform.h" 15 | #include "linuxphysregdriver.hpp" 16 | #include 17 | #include 18 | using namespace std; 19 | 20 | extern "C" { 21 | #include 22 | #include 23 | #include 24 | #include 25 | } 26 | 27 | LinuxPhysRegDriver * platform = 0; 28 | 29 | WrapperRegDriver * initPlatform(bool tracing) { 30 | if(!platform) { 31 | platform = new LinuxPhysRegDriver((void *) 0x43c00000, (void *) 0x10000000, 256 * 1024 * 1024); 32 | } 33 | return (WrapperRegDriver *) platform; 34 | } 35 | 36 | void deinitPlatform(WrapperRegDriver * driver) { 37 | // TODO doing a delete here causes it to go in a loop, debug this 38 | } 39 | 40 | void loadBitfile(const char * accelName) { 41 | // TODO add bitfile loader here if desired 42 | } 43 | -------------------------------------------------------------------------------- /src/main/resources/cpp/platform-wrapper-regdriver/platform-zc706-linux.cpp: -------------------------------------------------------------------------------- 1 | // platform init-deinit functions for Linux on the ZC706 2 | // note that this assumes the peripheral lives at address 0x43c00000, and that 256MB of 3 | // unmanaged memory is available at address 0x10000000 4 | 5 | /* 6 | something like this can be used for the ZC706 to ensure the kernel leaves the upper half 7 | of the DDR alone: 8 | 9 | env set fdt_high 0x10000000 10 | env set initrd_high 0x10000000 11 | env set bootargs "console=ttyPS0,115200 root=/dev/mmcblk0p2 rw rootwait earlyprintk cma=16m mem=256m" 12 | */ 13 | 14 | #include 
"platform.h" 15 | #include "linuxphysregdriver.hpp" 16 | #include 17 | #include 18 | using namespace std; 19 | 20 | extern "C" { 21 | #include 22 | #include 23 | #include 24 | #include 25 | } 26 | 27 | LinuxPhysRegDriver * platform = 0; 28 | 29 | WrapperRegDriver * initPlatform(bool tracing) { 30 | if(!platform) { 31 | platform = new LinuxPhysRegDriver((void *) 0x43c00000, (void *) 0x10000000, 256 * 1024 * 1024); 32 | } 33 | return (WrapperRegDriver *) platform; 34 | } 35 | 36 | void deinitPlatform(WrapperRegDriver * driver) { 37 | // TODO doing a delete here causes it to go in a loop, debug this 38 | } 39 | 40 | void loadBitfile(const char * accelName) { 41 | // TODO add bitfile loader here if desired 42 | } 43 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2018, Norwegian University of Science and Technology (NTNU) 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 15 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 18 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 20 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 21 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 22 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 23 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 | -------------------------------------------------------------------------------- /src/test/scala/utils/TestUtils.scala: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import chisel3._ 4 | import org.scalatest._ 5 | 6 | import fpgatidbits.utils.BitExtraction 7 | 8 | class TestWrapper extends Module { 9 | val io = IO( new Bundle { 10 | val word = Input(UInt(32.W)) 11 | val high = Input(UInt(8.W)) 12 | val low = Input(UInt(8.W)) 13 | val result = Output(UInt(32.W)) 14 | }) 15 | 16 | io.result := BitExtraction(io.word, io.high, io.low) 17 | } 18 | 19 | 20 | 21 | //class TestUtils(dut: TestWrapper) extends PeekPokeTester(dut) { 22 | // def TestBitExtraction(in: Int, h: Int, l: Int) = { 23 | // 24 | // poke(dut.io.word, in.U) 25 | // poke(dut.io.high, h.U) 26 | // poke(dut.io.low, l.U) 27 | // step(1) 28 | // expect(dut.io.result, in.U(32.W)(h,l)) 29 | // } 30 | // 31 | // TestBitExtraction(123456789,8,1) 32 | // TestBitExtraction(123456789,7,6) 33 | // TestBitExtraction(123456789,32,4) 34 | // TestBitExtraction(123456789,3,0) 35 | //} 36 | // 37 | // 38 | // 39 | //class SimpleSpec extends FlatSpec with Matchers { 40 | ///* 41 | // val t = new TestUtils 42 | // "BitExtraction" should "pass" in { 43 | // t.TestBitExtraction() 44 | // } 45 | // */ 46 | // "BitExtraction" should "pass" in { 47 | // chisel3.iotesters.Driver( 48 | // () => new TestWrapper) 49 | // { 50 | // c 
=> new TestUtils(c) 51 | // } should be (true) 52 | // } 53 | // 54 | //} -------------------------------------------------------------------------------- /src/main/scala/fpgatidbits/examples/ExampleSeqWrite.scala: -------------------------------------------------------------------------------- 1 | package fpgatidbits.examples 2 | 3 | import chisel3._ 4 | import fpgatidbits.PlatformWrapper._ 5 | import fpgatidbits.dma._ 6 | import fpgatidbits.streams._ 7 | 8 | class ExampleSeqWriteIO(n: Int, p: PlatformWrapperParams) extends GenericAcceleratorIF(n,p) { 9 | val start = Input(Bool()) 10 | val finished = Output(Bool()) 11 | val baseAddr = Input(UInt(64.W)) 12 | val init = Input(UInt(32.W)) 13 | val step = Input(UInt(32.W)) 14 | val count = Input(UInt(32.W)) 15 | } 16 | 17 | class ExampleSeqWrite(p: PlatformWrapperParams) extends GenericAccelerator(p) { 18 | val numMemPorts = 1 19 | val io = IO(new ExampleSeqWriteIO(numMemPorts, p)) 20 | plugMemReadPort(0) // read port not used 21 | io.signature := makeDefaultSignature() 22 | 23 | val sw = Module(new StreamWriter(new StreamWriterParams( 24 | streamWidth = p.memDataBits, mem = p.toMemReqParams(), chanID = 0 25 | ))).io 26 | 27 | sw.start := io.start 28 | sw.baseAddr := io.baseAddr 29 | sw.byteCount := io.count * (p.memDataBits/8).U 30 | io.finished := sw.finished 31 | 32 | val sg = Module(new SequenceGenerator(p.memDataBits)).io 33 | sg.start := io.start 34 | sg.init := io.init 35 | sg.step := io.step 36 | sg.count := io.count 37 | 38 | sg.seq <> sw.in 39 | sw.req <> io.memPort(0).memWrReq 40 | sw.wdat <> io.memPort(0).memWrDat 41 | io.memPort(0).memWrRsp <> sw.rsp 42 | } 43 | -------------------------------------------------------------------------------- /src/main/scala/fpgatidbits/profiler/StateProfiler.scala: -------------------------------------------------------------------------------- 1 | package fpgatidbits.profiler 2 | 3 | import chisel3._ 4 | import chisel3.util._ 5 | 6 | class StateProfiler(StateCount: Int) 
extends Module {
7 |   val io = IO(new Bundle { // ports must be wrapped in IO(...) in chisel3
8 |     val start = Input(Bool())      // high while profiling should run
9 |     val probe = Input(UInt(32.W))  // state value being observed
10 |     val count = Output(UInt(32.W)) // counter selected by sel
11 |     val sel = Input(UInt(log2Ceil(StateCount).W))
12 |   })
13 |
14 |   // create profiling registers for keeping state counts
15 |   val regStateCount = RegInit(VecInit(Seq.fill(StateCount)(0.U(32.W))))
16 |   // register input state before treatment
17 |   val regInState = RegInit(0.U(32.W))
18 |   regInState := io.probe
19 |
20 |   // finite state machine for control (NOTE(review): sFinished is declared but never entered)
21 |   val sIdle :: sRun :: sFinished :: Nil = Enum(3)
22 |   val regState = RegInit(sIdle)
23 |
24 |   // default outputs
25 |   io.count := regStateCount(io.sel)
26 |
27 |   switch(regState) {
28 |     is(sIdle) {
29 |       when ( io.start ) {
30 |         // move to running state
31 |         regState := sRun
32 |         // reset all profiling registers
33 |         for(i <- 0 until StateCount) {
34 |           regStateCount(i) := 0.U
35 |         }
36 |       }
37 |     }
38 |
39 |     is(sRun) {
40 |       regStateCount(regInState) := regStateCount(regInState) + 1.U // one tick per cycle for the (registered) probed state
41 |       // finish profiling when start goes low
42 |       when( !io.start ) {
43 |         regState := sIdle
44 |       }
45 |     }
46 |   }
47 | }
48 |
--------------------------------------------------------------------------------
/src/main/resources/cpp/platform-wrapper-tests/ExampleSum.cpp:
--------------------------------------------------------------------------------
1 | #include
2 | using namespace std;
3 |
4 | #include "ExampleSum.hpp"
5 | #include "platform.h"
6 |
7 | bool Run_ExampleSum(WrapperRegDriver * platform) {
8 |   ExampleSum t(platform);
9 |
10 |   cout << "Signature: " << hex << t.get_signature() << dec << endl;
11 |   unsigned int ub = 0;
12 |   cout << "Enter upper bound of sum: " << endl;
13 |   cin >> ub;
14 |
15 |   unsigned int * hostBuf = new unsigned int[ub];
16 |   unsigned int bufsize = ub * sizeof(unsigned int);
17 |   unsigned int golden = (ub*(ub+1))/2;
18 |
19 |   for(unsigned int i = 0; i < ub; i++) { hostBuf[i] = i+1; }
20 |
21 |   void * accelBuf = platform->allocAccelBuffer(bufsize);
22 |
/** Reduces a stream of `w`-bit elements into a single `w`-bit value.
  *
  * While idle the accumulator is held at `initVal` and the byte counter tracks
  * `io.byteCount`. Raising `io.start` moves the module to the running state,
  * where every accepted `streamIn` element is folded into the accumulator with
  * `fxn` and the byte counter is decremented by `w/8`. Once fewer bytes than
  * one whole element remain, `io.finished` is asserted and held until
  * `io.start` is lowered again.
  *
  * @param w       element and accumulator width in bits; must be at least 8
  * @param initVal accumulator value before the first element is consumed
  * @param fxn     combinational reduction: (accumulator, element) => accumulator
  */
class StreamReducer(w: Int, initVal: Int, fxn: (UInt,UInt)=>UInt) extends Module {
  // w/8 truncates to zero for w < 8; the byte counter would then never move
  // and the module would never reach the finished state
  require(w >= 8, s"StreamReducer: element width must be at least 8 bits, got $w")

  val io = IO(new Bundle {
    val start = Input(Bool())
    val finished = Output(Bool())
    val reduced = Output(UInt(w.W))
    val byteCount = Input(UInt(32.W))
    val streamIn = Flipped(Decoupled(UInt(w.W)))
  })
  val bytesPerElem = w/8

  val sIdle :: sRunning :: sFinished :: Nil = Enum(3)
  val regState = RegInit((sIdle))
  val regReduced = RegInit(initVal.U(w.W))
  val regBytesLeft = RegInit(0.U(32.W))

  // defaults; overridden per state below
  io.finished := false.B
  io.reduced := regReduced
  io.streamIn.ready := false.B

  switch(regState) {
    is(sIdle) {
      // keep re-arming the accumulator and byte counter until started
      regReduced := initVal.U
      regBytesLeft := io.byteCount

      when (io.start) { regState := sRunning }
    }

    is(sRunning) {
      // Finish once less than one whole element is left. Testing for
      // "< bytesPerElem" instead of "=== 0" gives the same result when
      // byteCount is a multiple of the element size, but also terminates
      // (ignoring the trailing partial element) when it is not, instead of
      // letting the 32-bit counter wrap around and never reach zero.
      when (regBytesLeft < bytesPerElem.U) { regState := sFinished }
      .otherwise {
        io.streamIn.ready := true.B
        when (io.streamIn.valid) {
          regReduced := fxn(regReduced, io.streamIn.bits)
          regBytesLeft := regBytesLeft - (bytesPerElem).U
        }
      }
    }

    is(sFinished) {
      // hold the result until the host acknowledges by lowering start
      io.finished := true.B
      when (!io.start) { regState := sIdle }
    }
  }
}
/** Combinational fork of one input stream into two output streams.
  *
  * Each output carries a function of the input element (`forkA` / `forkB`).
  * The input element is consumed only in a cycle where both outputs are ready,
  * and each output is presented as valid only when the other output is ready,
  * so both outputs always transfer in the same cycle.
  *
  * @param genIn Chisel type of the input elements
  * @param genA  Chisel type of the elements on `outA`
  * @param genB  Chisel type of the elements on `outB`
  * @param forkA maps an input element to the `outA` element
  * @param forkB maps an input element to the `outB` element
  */
class StreamFork[Ti <: Data, ToA <: Data, ToB <: Data]
  (genIn: Ti, genA: ToA, genB: ToB, forkA: Ti => ToA, forkB: Ti => ToB)
  extends Module {
  val io = IO(new Bundle {
    val in = Flipped(Decoupled(genIn))
    val outA = Decoupled(genA)
    val outB = Decoupled(genB)
  })

  io.in.ready := io.outA.ready & io.outB.ready

  io.outA.bits := forkA(io.in.bits)
  io.outB.bits := forkB(io.in.bits)

  // valid of one side is gated by ready of the other side
  io.outA.valid := io.in.valid & io.outB.ready
  io.outB.valid := io.in.valid & io.outA.ready
}

/** Convenience constructors that duplicate a stream onto several consumers. */
object StreamCopy {
  /** Copies `in` onto `outA` and `outB` through a [[StreamFork]] instance. */
  def apply[T <: Data]
    (in: DecoupledIO[T], outA: DecoupledIO[T], outB: DecoupledIO[T]) = {
    // StreamFork builds ports from its gen* arguments, so it must be handed
    // Chisel types rather than the already-bound hardware of the streams
    val m = Module(new StreamFork(
      genIn = chiselTypeOf(in.bits), genA = chiselTypeOf(outA.bits),
      genB = chiselTypeOf(outB.bits),
      forkA = {x: T => x}, forkB = {x: T => x}
    )).io
    in <> m.in
    m.outA <> outA
    m.outB <> outB
  }

  /** Copies `in` onto every stream in `out` using plain combinational logic.
    *
    * Each output is valid only when all *other* outputs are ready; the input
    * is ready only when all outputs are ready. `out` must not be empty.
    */
  def apply[T <: Data]
    (in: DecoupledIO[T], out: Seq[DecoupledIO[T]]) = {
    require(out.nonEmpty, "StreamCopy needs at least one output stream")
    for(o <- out) {
      o.bits := in.bits
      // foldLeft with a true.B seed also covers a single output, where the
      // list of "other" outputs is empty and reduce would throw
      val othersReady = out.filterNot(_ == o).map(_.ready).foldLeft(true.B)(_ & _)
      o.valid := in.valid & othersReady
    }
    in.ready := out.map(_.ready).reduce(_ & _)
  }
}
/** Accumulates a sampled level over the cycles in which monitoring is active.
  *
  * `io.enable` and `io.probe` are each registered once. On the rising edge of
  * `enable` both counters are cleared; on every later cycle in which the
  * registered enable is high, the cycle counter is incremented and the
  * registered probe value is added to the sum. When monitoring stops, the
  * totals are printed via a simulation printf.
  *
  * @param inpW width of the probed level in bits
  * @param ctrW width of the sum and cycle counters in bits
  * @param name prefix used in the printf message
  */
class LevelProfiler(inpW: Int, ctrW: Int, name: String = "lvl") extends Module {
  // ports must be wrapped in IO(...) to become hardware in chisel3
  val io = IO(new Bundle {
    val probe = Input(UInt(inpW.W))  // level to measure
    val enable = Input(Bool())       // enable/disable monitoring
    val out = new LevelProfilerOutput(ctrW)
  })

  val regActive = RegNext(io.enable)
  val regSum = RegInit(0.U(ctrW.W))
  val regCycleCount = RegInit(0.U(ctrW.W))
  val regProbeValue = RegNext(io.probe)

  when(!regActive & io.enable) {
    // rising edge of enable: start a fresh measurement
    regSum := 0.U
    regCycleCount := 0.U
  } .elsewhen(regActive) {
    regCycleCount := regCycleCount + 1.U
    regSum := regSum + regProbeValue
  }

  when(regActive & !io.enable) {
    // falling edge of enable: report the totals
    //printf(name + " avg level: %d \n", regSum / regCycleCount)
    printf(name + " sum = %d cycles = %d \n", regSum, regCycleCount)
  }

  io.out.sum := regSum
  io.out.cycles := regCycleCount
}

object LevelProfiler {
  /** Instantiates a profiler with 32-bit counters for `probe`, gated by
    * `enable`, and returns its output bundle.
    */
  def apply(probe: UInt, enable: Bool, name: String): LevelProfilerOutput = {
    val profiler = Module(new LevelProfiler(probe.getWidth, 32, name)).io
    profiler.probe := probe
    profiler.enable := enable
    profiler.out
  }
}
given 9 | // two arrays ind and val, produce a third array res where 10 | // the res array is constructed by indexing the val array by the ind array 11 | 12 | // a single gather request 13 | class GatherReq(indWidth: Int, tagWidth: Int) extends PrintableBundle { 14 | val ind = UInt(indWidth.W) // index to be loaded 15 | val tag = UInt(tagWidth.W) // tag associated with this request 16 | 17 | val printfStr = "gatherReq: ind = %d tag = %d \n" 18 | val printfElems = {() => Seq(ind, tag)} 19 | 20 | } 21 | 22 | // response to a single gather request 23 | class GatherRsp(datWidth: Int, tagWidth: Int) extends PrintableBundle { 24 | val dat = UInt(datWidth.W) // return data 25 | val tag = UInt(tagWidth.W) // tag of original request 26 | 27 | val printfStr = "GatherRsp: dat = %d tag = %d \n" 28 | val printfElems = {() => Seq(dat, tag)} 29 | } 30 | 31 | // interface used by gather accelerators, taking in a stream of requests, and 32 | // emitting a stream of responses 33 | /* TODO IMPROVEMENT: carry in-order / out-of-order info here? 
*/ 34 | class GatherIF(indWidth: Int, datWidth: Int, tagWidth: Int, mrp: MemReqParams) 35 | extends Bundle { 36 | val base = Input(UInt(mrp.addrWidth.W)) 37 | val in = Flipped(Decoupled(new GatherReq(indWidth, tagWidth))) 38 | val out = Decoupled(new GatherRsp(datWidth, tagWidth)) 39 | 40 | } 41 | -------------------------------------------------------------------------------- /src/test/scala/TestCounter.scala: -------------------------------------------------------------------------------- 1 | //import chisel3._ 2 | //import chisel3.util._ 3 | //import fpgatidbits.math.Counter 4 | 5 | /* 6 | // Tester-derived class to give stimulus and observe the outputs for the 7 | // Module to be tested 8 | class CounterTester(c: Counter) extends Tester(c) { 9 | val r = scala.util.Random 10 | val n_reps = 5 11 | 12 | def resetAndSetSteps(n: Int) = { 13 | poke(c.io.enable, 0) 14 | reset(1) 15 | poke(c.io.nsteps, n) 16 | step(1) 17 | peek(c.io.current) 18 | } 19 | 20 | for(rep <- 0 until n_reps) { 21 | val nsteps = r.nextInt(10) 22 | var acc_golden = 0 23 | var overflow_golden = 0 24 | resetAndSetSteps(nsteps) 25 | poke(c.io.enable, 0) 26 | for(i <- 0 to 20) { 27 | expect(c.io.current, acc_golden) 28 | val incr = r.nextInt(2) 29 | poke(c.io.enable, incr) 30 | step(1) 31 | acc_golden += incr 32 | if(acc_golden == nsteps) { acc_golden = 0 } 33 | } 34 | } 35 | } 36 | 37 | class TestCounter extends JUnitSuite { 38 | @Test def CounterTest { 39 | for(w <- 32 to 32) { 40 | // Chisel arguments to pass to chiselMainTest 41 | def testArgs = TestHelpers.stdArgs 42 | // function that instantiates the Module to be tested 43 | def testModuleInstFxn = () => { Module(new Counter(w = w)) } 44 | // function that instantiates the Tester to test the Module 45 | def testTesterInstFxn = (c: Counter) => new CounterTester(c) 46 | // actually run the test 47 | chiselMainTest( 48 | testArgs, 49 | testModuleInstFxn 50 | ) { 51 | testTesterInstFxn 52 | } 53 | } 54 | } 55 | } 56 | 57 | */ 58 | 
-------------------------------------------------------------------------------- /src/main/resources/cpp/platform-wrapper-tests/ExampleSeqWrite.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | using namespace std; 3 | #include 4 | #include "TestSeqWrite.hpp" 5 | #include "platform.h" 6 | 7 | bool Run_TestSeqWrite(WrapperRegDriver * platform) { 8 | TestSeqWrite t(platform); 9 | cout << "TestSeqWrite test" << endl; 10 | cout << "Signature: " << hex << t.get_signature() << dec << endl; 11 | unsigned int init, step, count; 12 | cout << "Enter init, step and count: " << endl; 13 | cin >> init >> step >> count; 14 | 15 | uint64_t * hostSrc = new uint64_t[count]; 16 | unsigned int bufsize = count * sizeof(uint64_t); 17 | 18 | for(uint64_t i = 0; i < count; i++) { hostSrc[i] = init+step*i; } 19 | 20 | void * accelSrc = platform->allocAccelBuffer(bufsize); 21 | 22 | t.set_init(init); 23 | t.set_step(step); 24 | t.set_count(count); 25 | t.set_baseAddr((AccelDblReg) accelSrc); 26 | 27 | 28 | t.set_start(1); 29 | while(t.get_finished() != 1); 30 | 31 | uint64_t * hostDst = new uint64_t[count]; 32 | platform->copyBufferAccelToHost(accelSrc, hostDst, bufsize); 33 | 34 | t.set_start(0); 35 | 36 | platform->deallocAccelBuffer(accelSrc); 37 | 38 | int res = memcmp(hostSrc, hostDst, bufsize); 39 | 40 | if(res != 0) 41 | for(uint64_t i = 0; i < count; i++) { 42 | cout << i << " " << hostSrc[i] << " " << hostDst[i] << endl; 43 | } 44 | 45 | delete [] hostSrc; 46 | delete [] hostDst; 47 | 48 | cout << "memcmp result: " << res << endl; 49 | 50 | return res == 0; 51 | } 52 | 53 | int main() 54 | { 55 | WrapperRegDriver * platform = initPlatform(); 56 | 57 | Run_TestSeqWrite(platform); 58 | 59 | deinitPlatform(platform); 60 | 61 | return 0; 62 | } 63 | -------------------------------------------------------------------------------- /src/main/resources/cpp/platform-wrapper-tests/ExampleCopy.cpp: 
-------------------------------------------------------------------------------- 1 | #include 2 | using namespace std; 3 | #include 4 | #include "TestCopy.hpp" 5 | #include "platform.h" 6 | 7 | bool Run_TestCopy(WrapperRegDriver * platform) { 8 | TestCopy t(platform); 9 | cout << "TestCopy test" << endl; 10 | cout << "Signature: " << hex << t.get_signature() << dec << endl; 11 | unsigned int ub = 0; 12 | cout << "Enter number of words to generate and copy: " << endl; 13 | cin >> ub; 14 | 15 | uint64_t * hostSrc = new uint64_t[ub]; 16 | unsigned int bufsize = ub * sizeof(uint64_t); 17 | unsigned int golden = (ub*(ub+1))/2; 18 | 19 | for(uint64_t i = 0; i < ub; i++) { hostSrc[i] = i+1; } 20 | 21 | void * accelSrc = platform->allocAccelBuffer(bufsize); 22 | platform->copyBufferHostToAccel(hostSrc, accelSrc, bufsize); 23 | 24 | void * accelDst = platform->allocAccelBuffer(bufsize); 25 | 26 | t.set_srcAddr((AccelDblReg) accelSrc); 27 | t.set_dstAddr((AccelDblReg) accelDst); 28 | t.set_byteCount(bufsize); 29 | 30 | t.set_start(1); 31 | 32 | while(t.get_finished() != 1); 33 | 34 | uint64_t * hostDst = new uint64_t[ub]; 35 | platform->copyBufferAccelToHost(accelDst, hostDst, bufsize); 36 | 37 | t.set_start(0); 38 | 39 | platform->deallocAccelBuffer(accelSrc); 40 | platform->deallocAccelBuffer(accelDst); 41 | 42 | int res = memcmp(hostSrc, hostDst, bufsize); 43 | 44 | delete [] hostSrc; 45 | delete [] hostDst; 46 | 47 | cout << "memcmp result: " << res << endl; 48 | 49 | return res == 0; 50 | } 51 | 52 | int main() 53 | { 54 | WrapperRegDriver * platform = initPlatform(); 55 | 56 | Run_TestCopy(platform); 57 | 58 | deinitPlatform(platform); 59 | 60 | return 0; 61 | } 62 | -------------------------------------------------------------------------------- /src/main/scala/fpgatidbits/streams/AXIStreamInputMux.scala: -------------------------------------------------------------------------------- 1 | package fpgatidbits.streams 2 | 3 | import chisel3._ 4 | import chisel3.util._ 5 
// Two-input multiplexer for AXI streams: `sel` picks which of in0/in1 drives
// `strm`; only the selected input sees the downstream ready.
class AXIStreamInputMux(dataWidth: Int) extends Module {
  val io = IO(new Bundle {
    val sel = Input(UInt(1.W))
    val in0 = Flipped(new AXIStreamIF(UInt(dataWidth.W)))
    val in1 = Flipped(new AXIStreamIF(UInt(dataWidth.W)))
    val strm = new AXIStreamIF(UInt(dataWidth.W))
  })

  io.strm.suggestName("strm")
  io.in0.suggestName("in0")
  io.in1.suggestName("in1")

  // forward path: in1 by default, in0 when sel is 0
  io.strm.bits := io.in1.bits
  io.strm.valid := io.in1.valid
  when(io.sel === 0.U) {
    io.strm.bits := io.in0.bits
    io.strm.valid := io.in0.valid
  }

  // backpressure reaches only the selected input
  io.in0.ready := Mux(io.sel === 0.U, io.strm.ready, false.B)
  io.in1.ready := Mux(io.sel === 1.U, io.strm.ready, false.B)
}


// Ports of the N-input decoupled multiplexer: a select index, numChans
// input streams and a single output stream, all carrying `gen`.
class DecoupledInputMuxIO[T <: Data](gen: T, numChans: Int) extends Bundle {
  val sel = Input(UInt(log2Ceil(numChans).W))
  val in = Vec(numChans, Flipped(Decoupled(gen)))
  val out = Decoupled(gen)

}

// N-input multiplexer for decoupled streams: the input indexed by `sel` is
// connected through to `out`; every other input is held not-ready.
class DecoupledInputMux[T <: Data](gen: T, numChans: Int) extends Module {
  val io = IO(new DecoupledInputMuxIO(gen, numChans))

  (0 until numChans).foreach { idx =>
    io.in(idx).ready := (io.sel === idx.U) & io.out.ready
  }

  io.out.valid := io.in(io.sel).valid
  io.out.bits := io.in(io.sel).bits
}

object DecoupledInputMux {
  // Builds a DecoupledInputMux over `chans`, drives its select from `sel`
  // and returns the multiplexed output stream.
  def apply[T <: Data](sel: UInt, chans: Seq[DecoupledIO[T]]): DecoupledIO[T] = {
    val inst = Module(new DecoupledInputMux(chans(0).bits.cloneType, chans.size)).io
    inst.sel := sel
    chans.indices.foreach { i => inst.in(i) <> chans(i) }
    inst.out
  }
}
// Ports of the N-output decoupled demultiplexer: a select index, a single
// input stream and numChans output streams, all carrying `gen`.
class DecoupledOutputDemuxIO[T <: Data](gen: T, numChans: Int) extends Bundle {
  val sel = Input(UInt(log2Ceil(numChans).W))
  val in = Flipped(Decoupled(gen))
  val out = Vec(numChans,Decoupled(gen))

}

// N-output demultiplexer for decoupled streams: the input element is offered
// to the output indexed by `sel`; all outputs see the same data bits, but
// only the selected one is marked valid.
class DecoupledOutputDemux[T <: Data](gen: T, numChans: Int) extends Module {
  val io = IO(new DecoupledOutputDemuxIO(gen, numChans))

  (0 until numChans).foreach { idx =>
    io.out(idx).bits := io.in.bits
    io.out(idx).valid := (io.sel === idx.U) & io.in.valid
  }

  // backpressure comes from the selected output only
  io.in.ready := io.out(io.sel).ready
}

object DecoupledOutputDemux {
  // Builds a DecoupledOutputDemux feeding `chans`, drives its select from
  // `sel` and returns the input stream of the demultiplexer.
  def apply[T <: Data](sel: UInt, chans: Seq[DecoupledIO[T]]): DecoupledIO[T] = {
    val inst = Module(new DecoupledOutputDemux(chans(0).bits.cloneType, chans.size)).io
    inst.sel := sel
    chans.indices.foreach { i => inst.out(i) <> chans(i) }
    inst.in
  }
}
virtual ~WrapperRegDriver() {} 15 | // (optional) functions for host-accelerator buffer management 16 | virtual void copyBufferHostToAccel(const void * hostBuffer, void * accelBuffer, unsigned int numBytes) {(void) hostBuffer; (void) accelBuffer; (void) numBytes;} 17 | virtual void copyBufferAccelToHost(const void * accelBuffer, void * hostBuffer, unsigned int numBytes) {(void) hostBuffer; (void) accelBuffer; (void) numBytes;} 18 | virtual void * allocAccelBuffer(unsigned int numBytes) { (void)numBytes; throw "allocAccelBuffer not supported"; } 19 | virtual void deallocAccelBuffer(void * buffer) {(void) buffer;} 20 | // return CPU-accessible address for a buffer returned from allocAccelBuffer 21 | // only makes sense for some (shared-memory) platforms 22 | // facilitates SW that takes advantage of cache coherency 23 | virtual void * phys2virt(void * accelBuffer) {(void) accelBuffer; throw "phys2virt not supported"; } 24 | virtual bool is_coherent() { return false; } 25 | 26 | // (optional) functions for accelerator attach-detach handling 27 | virtual void attach(const char * name) {(void)name; } 28 | virtual void detach() {} 29 | 30 | // (mandatory) register access methods for the platform wrapper 31 | virtual void writeReg(unsigned int regInd, AccelReg regValue) = 0; 32 | virtual AccelReg readReg(unsigned int regInd) = 0; 33 | virtual std::string platformID() = 0; 34 | 35 | }; 36 | 37 | #endif // WRAPPERREGDRIVER_H 38 | -------------------------------------------------------------------------------- /src/main/resources/cpp/platform-wrapper-integration-tests/TestExampleBRAM.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "platform.h" 3 | #include "ExampleBRAM.hpp" 4 | #include 5 | #include 6 | 7 | using namespace std; 8 | 9 | int main(int argc, char** argv) { 10 | 11 | if (argc != 2) { 12 | cout << "Please pass the number of tests to run as the only command line argument" < 2 | using namespace std; 3 | 
4 | #include "ExampleSum.hpp" 5 | #include "platform.h" 6 | 7 | bool Run_ExampleSum(WrapperRegDriver * platform, int upper) { 8 | ExampleSum t(platform); 9 | 10 | unsigned int * hostBuf = new unsigned int[upper]; 11 | unsigned int bufsize = upper * sizeof(unsigned int); 12 | unsigned int golden = (upper*(upper+1))/2; 13 | 14 | for(unsigned int i = 0; i < upper; i++) { hostBuf[i] = i+1; } 15 | 16 | void * accelBuf = platform->allocAccelBuffer(bufsize); 17 | platform->copyBufferHostToAccel(hostBuf, accelBuf, bufsize); 18 | 19 | t.set_baseAddr((AccelDblReg) accelBuf); 20 | t.set_byteCount(bufsize); 21 | 22 | t.set_start(1); 23 | 24 | while(t.get_finished() != 1); 25 | 26 | platform->deallocAccelBuffer(accelBuf); 27 | delete [] hostBuf; 28 | 29 | AccelReg res = t.get_sum(); 30 | unsigned int cc = t.get_cycleCount(); 31 | t.set_start(0); 32 | return res == golden; 33 | } 34 | 35 | int main(int argc, char** argv) 36 | { 37 | if (argc != 2) { 38 | cout << "Please pass the number of tests to run as the only command line argument" < 11 | #include 12 | using namespace std; 13 | 14 | extern "C" { 15 | #include 16 | #include 17 | #include 18 | #include 19 | } 20 | 21 | LinuxPhysRegDriver * platform = 0; 22 | 23 | WrapperRegDriver * initPlatform(bool tracing) { 24 | if(!platform) { 25 | /* TODO correct the slave reg addresses */ 26 | platform = new LinuxPhysRegDriver((void *) 0x50000000, (void *) 0x40000000, 1024 * 1024 * 1024); 27 | } 28 | return (WrapperRegDriver *) platform; 29 | } 30 | 31 | void deinitPlatform(WrapperRegDriver * driver) { 32 | // TODO doing a delete here causes the zedboard to go in a loop, debug this 33 | } 34 | 35 | void loadBitfile(const char * accelName) { 36 | pid_t c_pid, pid; 37 | int status; 38 | 39 | // call a shell script to do the bitfile loading, fork & exec & wait 40 | char * loader = getenv("ZYNQ_BITFILE_LOADER"); 41 | if(!loader) 42 | throw "ZYNQ_BITFILE_LOADER must be set"; 43 | 44 | c_pid = fork(); 45 | 46 | if (c_pid == 0){ 47 | 
execl(loader, loader, accelName, NULL); 48 | throw "execl failed"; 49 | } else if (c_pid > 0){ 50 | if( (pid = wait(&status)) < 0){ 51 | throw "wait failed"; 52 | _exit(1); 53 | } 54 | } else{ 55 | throw ("fork failed"); 56 | _exit(1); 57 | } 58 | cout << "loadBitfile finished: " << accelName << endl; 59 | } 60 | -------------------------------------------------------------------------------- /src/main/scala/fpgatidbits/platform-wrapper/convey/Makefile: -------------------------------------------------------------------------------- 1 | ifdef VERBOSE 2 | Q = 3 | E = @true 4 | else 5 | Q = @ 6 | E = @echo 7 | endif 8 | 9 | CFILES := $(shell find src -mindepth 1 -maxdepth 4 -name "*.c") 10 | CXXFILES := $(shell find src -mindepth 1 -maxdepth 4 -name "*.cpp") 11 | 12 | INFILES := $(CFILES) $(CXXFILES) 13 | 14 | OBJFILES := $(CXXFILES:src/%.cpp=%) $(CFILES:src/%.c=%) 15 | DEPFILES := $(CXXFILES:src/%.cpp=%) $(CFILES:src/%.c=%) 16 | OFILES := $(OBJFILES:%=obj/%.o) 17 | 18 | BINFILE = conveyapp 19 | 20 | INCLUDEDIRS = -I/opt/convey/include -I/opt/convey/pdk2/latest/wx-690/include 21 | LIBDIRS = -L/opt/convey/lib -L/opt/convey/pdk2/latest/wx-690/lib 22 | COMMONFLAGS = -Wall $(INCLUDEDIRS) 23 | LDFLAGS = $(LIBDIRS) -lcnyfwd -lwx_runtime 24 | 25 | ifdef DEBUG 26 | COMMONFLAGS := $(COMMONFLAGS) -g 27 | endif 28 | CFLAGS = $(COMMONFLAGS) --std=c99 29 | CXXFLAGS = $(COMMONFLAGS) --std=c++0x 30 | DEPDIR = deps 31 | all: $(BINFILE) 32 | ifeq ($(MAKECMDGOALS),) 33 | -include Makefile.dep 34 | endif 35 | ifneq ($(filter-out clean, $(MAKECMDGOALS)),) 36 | -include Makefile.dep 37 | endif 38 | 39 | CC = gcc 40 | CXX = g++ 41 | 42 | 43 | -include Makefile.local 44 | 45 | .PHONY: clean all depend 46 | .SUFFIXES: 47 | obj/%.o: src/%.c 48 | $(E)C-compiling $< 49 | $(Q)if [ ! -d `dirname $@` ]; then mkdir -p `dirname $@`; fi 50 | $(Q)$(CC) -o $@ -c $< $(CFLAGS) 51 | obj/%.o: src/%.cpp 52 | $(E)C++-compiling $< 53 | $(Q)if [ ! 
-d `dirname $@` ]; then mkdir -p `dirname $@`; fi 54 | $(Q)$(CXX) -o $@ -c $< $(CXXFLAGS) 55 | Makefile.dep: $(CFILES) $(CXXFILES) 56 | $(E)Depend 57 | $(Q)for i in $(^); do $(CXX) $(CXXFLAGS) -MM "$${i}" -MT obj/`basename $${i%.*}`.o; done > $@ 58 | 59 | 60 | $(BINFILE): $(OFILES) 61 | $(E)Linking $@ 62 | $(Q)$(CXX) -o $@ $(OFILES) $(LDFLAGS) 63 | clean: 64 | $(E)Removing files 65 | $(Q)rm -f $(BINFILE) obj/* Makefile.dep 66 | -------------------------------------------------------------------------------- /src/main/resources/cpp/platform-wrapper-tests/ExampleMultiChanSum.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | using namespace std; 3 | 4 | #include "ExampleMultiChanSum.hpp" 5 | #include "platform.h" 6 | 7 | bool Run_ExampleMultiChanSum(WrapperRegDriver * platform) { 8 | ExampleMultiChanSum t(platform); 9 | 10 | cout << "Signature: " << hex << t.get_signature() << dec << endl; 11 | unsigned int ub = 0, offs = 0; 12 | cout << "Enter upper bound of sum and channel 1 const offset: " << endl; 13 | cin >> ub >> offs; 14 | 15 | unsigned int * hostBuf0 = new unsigned int[ub]; 16 | unsigned int * hostBuf1 = new unsigned int[ub]; 17 | unsigned int bufsize = ub * sizeof(unsigned int); 18 | 19 | for(unsigned int i = 0; i < ub; i++) {hostBuf0[i] = i+1; hostBuf1[i] = i+1 + offs;} 20 | 21 | void * accBuf0 = platform->allocAccelBuffer(bufsize); 22 | void * accBuf1 = platform->allocAccelBuffer(bufsize); 23 | 24 | platform->copyBufferHostToAccel((void *) hostBuf0, accBuf0, bufsize); 25 | platform->copyBufferHostToAccel((void *) hostBuf1, accBuf1, bufsize); 26 | 27 | t.set_byteCount_0(bufsize); t.set_byteCount_1(bufsize); 28 | t.set_baseAddr_0((AccelDblReg) accBuf0); t.set_baseAddr_1((AccelDblReg) accBuf1); 29 | 30 | t.set_start(1); 31 | 32 | while(t.get_status() != 1); 33 | 34 | unsigned int res0 = t.get_sum_0(); unsigned int res1 = t.get_sum_1(); 35 | unsigned int exp0 = (ub*(ub+1))/2; unsigned int exp1 = exp0 + 
ub*offs; 36 | 37 | t.set_start(0); 38 | 39 | cout << "Chan 0 sum = " << res0 << " expected = " << exp0 << endl; 40 | cout << "Chan 1 sum = " << res1 << " expected = " << exp1 << endl; 41 | 42 | platform->deallocAccelBuffer(accBuf0); 43 | platform->deallocAccelBuffer(accBuf1); 44 | 45 | delete [] hostBuf0; 46 | delete [] hostBuf1; 47 | 48 | return (res0 == exp0) && (res1 == exp1); 49 | } 50 | 51 | int main() 52 | { 53 | WrapperRegDriver * platform = initPlatform(); 54 | 55 | Run_ExampleMultiChanSum(platform); 56 | 57 | deinitPlatform(platform); 58 | 59 | return 0; 60 | } 61 | -------------------------------------------------------------------------------- /src/main/scala/fpgatidbits/examples/ExampleSum.scala: -------------------------------------------------------------------------------- 1 | package fpgatidbits.examples 2 | 3 | import chisel3._ 4 | import fpgatidbits.PlatformWrapper._ 5 | import fpgatidbits.dma._ 6 | import fpgatidbits.streams._ 7 | 8 | 9 | class ExampleSumIO(p: PlatformWrapperParams) extends GenericAcceleratorIF(1, p) { 10 | val start = Input(Bool()) 11 | val finished = Output(Bool()) 12 | val baseAddr = Input(UInt(64.W)) 13 | val byteCount = Input(UInt(32.W)) 14 | val sum = Output(UInt(32.W)) 15 | val cycleCount = Output(UInt(32.W)) 16 | } 17 | // read and sum a contiguous stream of 32-bit uints from main memory 18 | class ExampleSum(p: PlatformWrapperParams) extends GenericAccelerator(p) { 19 | val numMemPorts = 1 20 | val io = IO(new ExampleSumIO(p)) 21 | io.signature := makeDefaultSignature() 22 | plugMemWritePort(0) 23 | 24 | val rdP = new StreamReaderParams( 25 | streamWidth = 32, fifoElems = 8, mem = p.toMemReqParams(), 26 | maxBeats = 1, chanID = 0, disableThrottle = true 27 | ) 28 | 29 | val reader = Module(new StreamReader(rdP)).io 30 | val red = Module(new StreamReducer(32, 0, {_+_})).io 31 | 32 | reader.start := io.start 33 | reader.baseAddr := io.baseAddr 34 | reader.byteCount := io.byteCount 35 | 36 | // Added by erlingrj because 
chisel3 complains they are not initialized 37 | // when inspecting verilog output of chisel2 synthesis they are commented out of the 38 | // module interface of StreamReader, how? 39 | reader.doInit := false.B 40 | reader.initCount := 8.U 41 | 42 | red.start := io.start 43 | red.byteCount := io.byteCount 44 | 45 | io.sum := red.reduced 46 | io.finished := red.finished 47 | 48 | reader.req <> io.memPort(0).memRdReq 49 | io.memPort(0).memRdRsp <> reader.rsp 50 | 51 | reader.out <> red.streamIn 52 | 53 | val regCycleCount = RegInit(0.U(32.W)) 54 | io.cycleCount := regCycleCount 55 | when(!io.start) {regCycleCount := 0.U} 56 | .elsewhen(io.start & !io.finished) {regCycleCount := regCycleCount + 1.U} 57 | } 58 | -------------------------------------------------------------------------------- /src/main/resources/cpp/platform-wrapper-regdriver/platform-zedboard-linux.cpp: -------------------------------------------------------------------------------- 1 | // platform init-deinit functions for Linux on the ZedBoard 2 | // note that this assumes the peripheral lives at address 0x43c00000, and that 256MB of 3 | // unmanaged memory is available at address 0x10000000 4 | 5 | /* 6 | something like this can be used for the ZedBoard to ensure the kernel leaves the upper half 7 | of the DDR alone: 8 | 9 | env set fdt_high 0x10000000 10 | env set initrd_high 0x10000000 11 | env set bootargs "console=ttyPS0,115200 root=/dev/mmcblk0p2 rw rootwait earlyprintk cma=16m mem=256m" 12 | */ 13 | 14 | #include "platform.h" 15 | #include "linuxphysregdriver.hpp" 16 | #include 17 | #include 18 | using namespace std; 19 | 20 | extern "C" { 21 | #include 22 | #include 23 | #include 24 | #include 25 | } 26 | 27 | LinuxPhysRegDriver * platform = 0; 28 | 29 | WrapperRegDriver * initPlatform(bool tracing) { 30 | if(!platform) { 31 | platform = new LinuxPhysRegDriver((void *) 0x43c00000, (void *) 0x10000000, 256 * 1024 * 1024); 32 | } 33 | return (WrapperRegDriver *) platform; 34 | } 35 | 36 | void 
// Loads the bitfile for `accelName` by running the external loader program
// named by the ZYNQ_BITFILE_LOADER environment variable as
// `<loader> <accelName>` and waiting for it to exit.
// Throws a C-string (const char *) when the variable is unset, when fork or
// waitpid fails, or when the loader could not be executed.
void loadBitfile(const char * accelName) {
  const char * loader = getenv("ZYNQ_BITFILE_LOADER");
  if(!loader)
    throw "ZYNQ_BITFILE_LOADER must be set";

  const pid_t c_pid = fork();
  if(c_pid < 0)
    throw "fork failed";

  if(c_pid == 0) {
    // child: replace this process with the loader
    execl(loader, loader, accelName, (char *) NULL);
    // Only reached when execl failed. Leave immediately with the
    // conventional "could not execute" status: throwing here would unwind
    // into the caller and leave a second copy of the program running.
    _exit(127);
  }

  // parent: wait for this particular child rather than for any child
  int status = 0;
  if(waitpid(c_pid, &status, 0) < 0)
    throw "wait failed";

  // report an exec failure in the process that asked for the load
  if(WIFEXITED(status) && WEXITSTATUS(status) == 127)
    throw "execl failed";

  cout << "loadBitfile finished: " << accelName << endl;
}
/** Base class for BlackBoxes whose implementation is configured through
  * preprocessor defines generated from `hlsTemplateParams`.
  *
  * Every (name, value) pair in `hlsTemplateParams` becomes one line of the
  * form `#define <templateParamPrefix><name> <value>`. For instance
  * `Map("A" -> "12", "B" -> "55")` yields
  * {{{
  * #define TEMPLATE_PARAM_A 12
  * #define TEMPLATE_PARAM_B 55
  * }}}
  */
abstract class TemplatedHLSBlackBox() extends BlackBox {
  /** Template parameter names mapped to their values. */
  def hlsTemplateParams: Map[String, String]
  val templateParamPrefix = "TEMPLATE_PARAM_"

  /** Generates the define block, writes it to `fileName` (created or
    * overwritten) and returns it.
    *
    * Throws a `java.io.IOException` when the file cannot be opened or the
    * write fails.
    */
  def generateTemplateDefines(fileName: String): String = {
    val templateDefines = generateTemplateDefines()
    val writer = new java.io.PrintWriter(new java.io.File(fileName))
    try {
      writer.write(templateDefines)
      // PrintWriter never throws on write errors; it only records them,
      // so surface a failed write explicitly
      if (writer.checkError()) {
        throw new java.io.IOException(s"failed to write template defines to $fileName")
      }
    } finally {
      // release the file handle even when the write failed
      writer.close()
    }
    templateDefines
  }

  /** Returns one `#define` line per entry of `hlsTemplateParams`, in the
    * map's iteration order; empty when there are no parameters.
    */
  def generateTemplateDefines(): String = {
    hlsTemplateParams.map { case (name, value) =>
      s"#define ${templateParamPrefix}${name} ${value}\n"
    }.mkString
  }
}
/** Results of an [[OutstandingTxnProfiler]], both `w` bits wide. */
class OutstandingTxnProfilerOutput(w: Int) extends Bundle {
  val sum = Output(UInt(w.W))     // per-cycle outstanding count, accumulated
  val cycles = Output(UInt(w.W))  // number of cycles monitored
}

/** Profiles the number of outstanding transactions between a request and a
  * response channel.
  *
  * On the first enabled cycle all counters are cleared. On every following
  * enabled cycle the cycle counter is incremented, completed request and
  * response handshakes are counted, and the current difference between the
  * two totals is added to `sum`. `sum / cycles` is therefore the average
  * number of outstanding transactions over the monitored period.
  *
  * @param w width in bits of all counters and of both outputs
  */
class OutstandingTxnProfiler(w: Int) extends Module {
  // ports must be wrapped in IO(...) to become hardware in chisel3
  val io = IO(new Bundle {
    val enable = Input(Bool())
    val probeReqValid = Input(Bool())
    val probeReqReady = Input(Bool())
    val probeRspValid = Input(Bool())
    val probeRspReady = Input(Bool())

    val out = new OutstandingTxnProfilerOutput(w)
  })

  // counters follow the configured width instead of a fixed 32 bits, so they
  // match the width of the outputs they drive
  val regCycles = RegInit(0.U(w.W))
  val regTotalReq = RegInit(0.U(w.W))
  val regTotalRsp = RegInit(0.U(w.W))
  val regActiveTxns = RegInit(0.U(w.W))

  val regActive = RegInit(false.B)
  regActive := io.enable

  // a transaction happens when valid and ready are high in the same cycle
  val reqTxn = io.probeReqReady & io.probeReqValid
  val rspTxn = io.probeRspValid & io.probeRspReady

  when(!regActive & io.enable) {
    // reset all counters when first enabled
    regCycles := 0.U
    regTotalReq := 0.U
    regTotalRsp := 0.U
    regActiveTxns := 0.U
  } .elsewhen(regActive & io.enable) {
    regCycles := regCycles + 1.U
    when(reqTxn) { regTotalReq := regTotalReq + 1.U }
    when(rspTxn) { regTotalRsp := regTotalRsp + 1.U }
    // accumulate the number of requests still waiting for a response
    regActiveTxns := regActiveTxns + (regTotalReq - regTotalRsp)
  }

  io.out.cycles := regCycles
  io.out.sum := regActiveTxns
}


object OutstandingTxnProfiler {
  /** Attaches a 32-bit profiler to the handshake signals of `req` and `rsp`
    * and returns its output bundle.
    */
  def apply[T <: Data](
    req: DecoupledIO[T],
    rsp: DecoupledIO[T],
    enable: Bool): OutstandingTxnProfilerOutput = {
    val mon = Module(new OutstandingTxnProfiler(32))
    mon.io.enable := enable
    mon.io.probeReqValid := req.valid
    mon.io.probeReqReady := req.ready
    mon.io.probeRspValid := rsp.valid
    mon.io.probeRspReady := rsp.ready

    return mon.io.out
  }
}
-------------------------------------------------------------------------------- 1 | package fpgatidbits.streams 2 | 3 | import chisel3._ 4 | import chisel3.util._ 5 | 6 | // Generates an arithmetic sequence with a given <init> and <step> 7 | // with <count> elements. Example: <init>=1 <step>=2 <count>=4 8 | // seq = 1 3 5 7 9 | class SequenceGenerator(w: Int, a: Int = 32) extends Module { 10 | val io = IO(new Bundle { 11 | val start = Input(Bool()) 12 | val init = Input(UInt(w.W)) 13 | val count = Input(UInt(a.W)) 14 | val step = Input(UInt(w.W)) 15 | val finished = Output(Bool()) 16 | val seq = Decoupled(UInt(w.W)) 17 | }) 18 | // element and step registers are w bits (data width); the counters are a bits (count width) 19 | val regSeqElem = RegInit(0.U(w.W)) 20 | val regCounter = RegInit(0.U(a.W)) 21 | val regMaxCount = RegInit(0.U(a.W)) 22 | val regStep = RegInit(0.U(w.W)) 23 | io.finished := false.B 24 | io.seq.valid := false.B 25 | io.seq.bits := regSeqElem 26 | 27 | val sIdle :: sRun :: sFinished :: Nil = Enum(3) 28 | val regState = RegInit((sIdle)) 29 | 30 | switch(regState) { 31 | is(sIdle) { 32 | when(io.start) { 33 | regStep := io.step 34 | regState := sRun 35 | regCounter := 0.U 36 | regMaxCount := io.count 37 | regSeqElem := io.init 38 | } 39 | } 40 | // sRun: emit one element per accepted handshake until regCounter reaches regMaxCount 41 | is(sRun) { 42 | when (regCounter === regMaxCount) { 43 | regState := sFinished 44 | } .otherwise { 45 | io.seq.valid := true.B 46 | when(io.seq.ready) { 47 | regCounter := regCounter + 1.U 48 | regSeqElem := regSeqElem + regStep 49 | } 50 | } 51 | } 52 | // sFinished: hold finished high until start is deasserted 53 | is(sFinished) { 54 | io.finished := true.B 55 | when(!io.start) { 56 | regState := sIdle 57 | } 58 | } 59 | } 60 | } 61 | 62 | // convenience constructor for natural numbers 63 | object NaturalNumbers { 64 | def apply(w: Int, start: Bool, count: UInt) = { 65 | val m = Module(new SequenceGenerator(w)).io 66 | m.start := start 67 | m.init := 0.U 68 | m.count := count 69 | m.step := 1.U 70 | m.seq 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /src/main/scala/fpgatidbits/platform-wrapper/axi/ZedBoardWrapper.scala:
-------------------------------------------------------------------------------- 1 | package fpgatidbits.PlatformWrapper 2 | 3 | import chisel3._ 4 | import chisel3.util._ 5 | import java.nio.file.Paths 6 | 7 | import fpgatidbits.TidbitsMakeUtils._ 8 | 9 | import scala.io.Source 10 | // platform wrapper for the ZedBoard 11 | 12 | object ZedBoardParams extends PlatformWrapperParams { 13 | val platformName = "ZedBoard" 14 | val memAddrBits = 32 15 | val memDataBits = 64 16 | val memIDBits = 6 17 | val memMetaBits = 1 18 | val numMemPorts = 4 19 | val sameIDInOrder = true 20 | val typicalMemLatencyCycles = 32 21 | val burstBeats = 8 // TODO why cap bursts at 8? AXI can do more 22 | val coherentMem = false // TODO add CC version 23 | } 24 | 25 | 26 | class ZedBoardWrapper(instFxn: PlatformWrapperParams => GenericAccelerator, targetDir: String, generateRegDriver: Boolean = true) 27 | extends AXIPlatformWrapper(ZedBoardParams, instFxn) { 28 | val platformDriverFiles = baseDriverFiles ++ Array[String]( 29 | "platform-zedboard.cpp", "zedboardregdriver.hpp", "axiregdriver.hpp" 30 | ) 31 | 32 | if (generateRegDriver) { 33 | // Generate the RegFile driver 34 | println("Generating Register Driver at directory:" + targetDir) 35 | generateRegDriver(targetDir) 36 | // Copy over the other needed files 37 | //val resRoot = getClass.getResource("").getPath 38 | resourceCopyBulk("cpp/platform-wrapper-regdriver/", targetDir, platformDriverFiles) 39 | println(s"=======> Driver files copied to ${targetDir}") 40 | 41 | } 42 | 43 | } 44 | 45 | class ZedBoardLinuxWrapper(instFxn: PlatformWrapperParams => GenericAccelerator, targetDir: String) 46 | extends AXIPlatformWrapper(ZedBoardParams, instFxn) { 47 | val platformDriverFiles = baseDriverFiles ++ Array[String]( 48 | "platform-zedboard-linux.cpp", "linuxphysregdriver.hpp", "axiregdriver.hpp" 49 | ) 50 | 51 | // Generate the RegFile driver 52 | generateRegDriver(targetDir) 53 | 54 | // Copy over the other needed files 55 | 
resourceCopyBulk("/cpp/platform-wrapper-regdriver/", targetDir, platformDriverFiles) 56 | println(s"=======> Driver files copied to ${targetDir}") 57 | } 58 | -------------------------------------------------------------------------------- /src/main/scala/fpgatidbits/streams/BlockSequenceGenerator.scala: -------------------------------------------------------------------------------- 1 | package fpgatidbits.streams 2 | 3 | import chisel3._ 4 | import chisel3.util._ 5 | 6 | // Generates a "blocked" sequence from a sequence descriptor 7 | // example: descriptor start = 100 count = 10 blockSize = 3 8 | // output sequence: 9 | // start: 100 103 106 109 10 | // count: 3 3 3 1 11 | 12 | class BlockSequenceDescriptor(w: Int) extends Bundle { 13 | val start = UInt(w.W) // starting element 14 | val count = UInt(w.W) // total elements 15 | val blockSize = UInt(w.W) // preferred block size 16 | } 17 | 18 | class BlockSequenceOutput(w: Int) extends Bundle { 19 | val count = UInt(w.W) 20 | val start = UInt(w.W) 21 | 22 | } 23 | // Splits each command into chunks of at most blockSize elements, emitting one (start, count) pair per chunk. 24 | class BlockSequenceGenerator(w: Int) extends Module { 25 | val io = IO(new Bundle { 26 | val cmd = Flipped(Decoupled(new BlockSequenceDescriptor(w))) 27 | val out = Decoupled(new BlockSequenceOutput(w)) 28 | }) 29 | // all three registers are w bits wide, matching the descriptor fields they latch 30 | val regPtr = RegInit(0.U(w.W)) 31 | val regBlockSize = RegInit(0.U(w.W)) 32 | val regElemsLeft = RegInit(0.U(w.W)) 33 | 34 | io.cmd.ready := false.B 35 | io.out.valid := false.B 36 | io.out.bits.count := 0.U 37 | io.out.bits.start := regPtr 38 | 39 | val sIdle :: sRun :: sLast :: Nil = Enum(3) 40 | val regState = RegInit(sIdle) 41 | 42 | switch(regState) { 43 | is(sIdle) { 44 | io.cmd.ready := true.B 45 | when(io.cmd.valid) { 46 | regPtr := io.cmd.bits.start 47 | regBlockSize := io.cmd.bits.blockSize 48 | regElemsLeft := io.cmd.bits.count 49 | regState := sRun 50 | } 51 | } 52 | // sRun: emit full blocks while more than one block of elements remains 53 | is(sRun) { 54 | when(regElemsLeft > regBlockSize) { 55 | io.out.valid := true.B 56 | io.out.bits.count := regBlockSize 57 | when(io.out.ready) { 58 |
regElemsLeft := regElemsLeft - regBlockSize 59 | regPtr := regPtr + regBlockSize 60 | } 61 | } .otherwise { 62 | regState := sLast 63 | } 64 | } 65 | // sLast: emit the remaining elements as the final block, then return to idle 66 | is(sLast) { 67 | io.out.valid := true.B 68 | io.out.bits.count := regElemsLeft 69 | when(io.out.ready) { 70 | regState := sIdle 71 | } 72 | } 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /src/main/scala/fpgatidbits/examples/ExampleRandomRead.scala: -------------------------------------------------------------------------------- 1 | package fpgatidbits.examples 2 | 3 | import chisel3._ 4 | import chisel3.util._ 5 | import fpgatidbits.PlatformWrapper._ 6 | import fpgatidbits.dma._ 7 | import fpgatidbits.streams._ 8 | 9 | class ExampleRandomReadIO(n: Int, p: PlatformWrapperParams) extends GenericAcceleratorIF(n,p) { 10 | val start = Input(Bool()) 11 | val finished = Output(Bool()) 12 | val indsBase = Input(UInt(64.W)) 13 | val valsBase = Input(UInt(64.W)) 14 | val count = Input(UInt(32.W)) 15 | val sum = Output(UInt(32.W)) 16 | } 17 | class ExampleRandomRead(p: PlatformWrapperParams) extends GenericAccelerator(p) { 18 | val numMemPorts = 2 19 | val io = IO(new ExampleRandomReadIO(numMemPorts, p)) 20 | io.signature := makeDefaultSignature() 21 | // plug unused ports 22 | plugMemWritePort(0) 23 | plugMemWritePort(1) 24 | 25 | val rrgInds = Module(new ReadReqGen(p.toMemReqParams(), 0, 1)).io 26 | val opBytes = (p.memDataBits/8).U 27 | 28 | rrgInds.ctrl.start := io.start 29 | rrgInds.ctrl.throttle := false.B 30 | rrgInds.ctrl.baseAddr := io.indsBase 31 | rrgInds.ctrl.byteCount := io.count * opBytes 32 | rrgInds.reqs <> io.memPort(0).memRdReq 33 | // reqs is a Wire because its fields are driven with := in this function 34 | def idsToReqs(ids: DecoupledIO[UInt]): DecoupledIO[GenericMemoryRequest] = { 35 | val reqs = Wire(Decoupled(new GenericMemoryRequest(p.toMemReqParams()))) 36 | val req = reqs.bits 37 | req.channelID := 0.U // TODO parametrize!
38 | req.isWrite := false.B 39 | req.addr := io.valsBase + ids.bits * opBytes 40 | req.numBytes := opBytes // single-beat burst 41 | req.metaData := 0.U 42 | 43 | reqs.valid := ids.valid 44 | ids.ready := reqs.ready 45 | 46 | return reqs 47 | } 48 | val readDataFilter = {x: GenericMemoryResponse => x.readData} 49 | 50 | val readInds = StreamFilter(io.memPort(0).memRdRsp, UInt(p.memDataBits.W), readDataFilter) 51 | io.memPort(1).memRdReq <> idsToReqs(readInds) 52 | 53 | val red = Module(new StreamReducer(p.memDataBits, 0, {_+_})).io 54 | red.start := io.start 55 | red.byteCount := io.count * opBytes 56 | red.streamIn <> StreamFilter(io.memPort(1).memRdRsp, UInt(p.memDataBits.W), readDataFilter) 57 | 58 | io.sum := red.reduced 59 | io.finished := red.finished 60 | } 61 | -------------------------------------------------------------------------------- /src/main/scala/fpgatidbits/streams/StreamDeinterleaver.scala: -------------------------------------------------------------------------------- 1 | package fpgatidbits.streams 2 | 3 | import chisel3._ 4 | import chisel3.util._ 5 | import fpgatidbits.ocm.FPGAQueue 6 | 7 | // deinterleaves an input stream with identifiers onto one of the output 8 | // streams, based on the ID value and a routing function 9 | // TODO the current implementation is likely to cause timing problems 10 | // due to high-fanout signals and combinational paths 11 | // - to avoid high-fanout signals: implement decoding as e.g shiftreg 12 | // - to avoid combinational paths, pipeline the deinterleaver 13 | 14 | class StreamDeinterleaverIF[T <: Data](numDests: Int, gen: T) extends Bundle { 15 | val in = Flipped(Decoupled(gen)) 16 | val out = Vec(numDests, Decoupled(gen)) 17 | val decodeErrors = Output(UInt(32.W)) 18 | } 19 | // Routes each input element to out(route(bits)); elements whose destination is >= numDests are dropped and counted. 20 | class StreamDeinterleaver[T <: Data](numDests: Int, gen: T, route: T => UInt) 21 | extends Module { 22 | val io = IO(new StreamDeinterleaverIF(numDests, gen)) 23 | 24 | val regDecodeErrors = RegInit(0.U(32.W)) 25 | 26 | for(i <- 
0 until numDests) { 27 | io.out(i).bits := io.in.bits 28 | io.out(i).valid := false.B 29 | } 30 | 31 | io.in.ready := false.B 32 | io.decodeErrors := regDecodeErrors 33 | 34 | val destPipe = route(io.in.bits) 35 | val invalidChannel = (destPipe >= (numDests.U)) 36 | val canProceed = io.in.valid && io.out(destPipe).ready 37 | 38 | when (invalidChannel) { 39 | // do not let the entire pipe stall because head of line has invalid dest 40 | // count the error only when the input is actually valid, then move on 41 | when (io.in.valid) { regDecodeErrors := regDecodeErrors + 1.U } 42 | io.in.ready := true.B 43 | } 44 | .elsewhen (canProceed) { 45 | io.in.ready := true.B 46 | io.out(destPipe).valid := true.B 47 | } 48 | } 49 | // StreamDeinterleaver with a 2-entry queue on the input and a capacity-entry queue on every output. 50 | class StreamDeinterleaverQueued[T <: Data](numDests: Int, gen: T, route: T => UInt, capacity: Int) 51 | extends Module { 52 | val io = IO(new StreamDeinterleaverIF(numDests, gen)) 53 | 54 | val deintl = Module(new StreamDeinterleaver(numDests, gen, route)).io 55 | 56 | FPGAQueue(io.in, 2) <> deintl.in 57 | io.decodeErrors := deintl.decodeErrors 58 | 59 | for(i <- 0 until numDests) { 60 | val q = Module(new FPGAQueue(gen, capacity)).io 61 | q.enq <> deintl.out(i) 62 | io.out(i) <> q.deq 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /src/main/scala/fpgatidbits/streams/StreamLimiter.scala: -------------------------------------------------------------------------------- 1 | package fpgatidbits.streams 2 | 3 | import chisel3._ 4 | import chisel3.util._ 5 | 6 | // limits the amount of data passing through a stream 7 | // - when not started, the stream just passes through unhindered 8 | // - after start (must be held high), first byteCount bytes of the 9 | // stream pass through. afterwards, no further data is sent to the output. 10 | // the input stream is allowed to drain out in order not to clog the upstream.
11 | // Convenience constructor: limits `in` to `count` bytes once `start` is high and returns the limited stream. 12 | object StreamLimiter { 13 | def apply(in: DecoupledIO[UInt], start: Bool, count: UInt): DecoupledIO[UInt] = { 14 | val limiter = Module(new StreamLimiter(in.bits.getWidth)).io 15 | limiter.start := start 16 | limiter.byteCount := count 17 | limiter.streamIn <> in 18 | limiter.streamOut 19 | } 20 | } 21 | 22 | class StreamLimiter(w: Int) extends Module { 23 | val io = IO(new Bundle { 24 | val start = Input(Bool()) 25 | val done = Output(Bool()) 26 | val byteCount = Input(UInt(32.W)) 27 | val streamIn = Flipped(Decoupled(UInt(w.W))) 28 | val streamOut = Decoupled(UInt(w.W)) 29 | }) 30 | 31 | io.done := false.B 32 | // default: plain pass-through between input and output 33 | io.streamOut.bits := io.streamIn.bits 34 | io.streamOut.valid := io.streamIn.valid 35 | io.streamIn.ready := io.streamOut.ready 36 | 37 | val sIdle :: sRun :: sFinished :: Nil = Enum(3) 38 | val regState = RegInit((sIdle)) 39 | 40 | val regBytesLeft = RegInit(0.U(32.W)) 41 | 42 | switch(regState) { 43 | is(sIdle) { 44 | regBytesLeft := io.byteCount 45 | when(io.start) {regState := Mux(io.byteCount === 0.U, sFinished, sRun)} // nothing to pass for a zero byte count 46 | } 47 | 48 | is(sRun) { 49 | // count each transaction and decrement counter 50 | when (io.streamIn.valid & io.streamOut.ready) { 51 | regBytesLeft := regBytesLeft - (w/8).U 52 | when (regBytesLeft <= (w/8).U) {regState := sFinished} // <= also terminates when byteCount is not a multiple of w/8 53 | } 54 | } 55 | 56 | is(sFinished) { 57 | // do not let any more transactions through towards out 58 | io.streamOut.valid := false.B 59 | // let upstream sources continue, do not clog the pipes 60 | io.streamIn.ready := true.B 61 | // signal finished 62 | io.done := true.B 63 | when(!io.start) {regState := sIdle} 64 | } 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /src/main/resources/script/gen_xo.tcl: -------------------------------------------------------------------------------- 1 | # /******************************************************************************* 2 | # Copyright (c) 2018, Xilinx, Inc. 3 | # All rights reserved.
4 | # 5 | # Redistribution and use in source and binary forms, with or without modification, 6 | # are permitted provided that the following conditions are met: 7 | # 8 | # 1. Redistributions of source code must retain the above copyright notice, 9 | # this list of conditions and the following disclaimer. 10 | # 11 | # 12 | # 2. Redistributions in binary form must reproduce the above copyright notice, 13 | # this list of conditions and the following disclaimer in the documentation 14 | # and/or other materials provided with the distribution. 15 | # 16 | # 17 | # 3. Neither the name of the copyright holder nor the names of its contributors 18 | # may be used to endorse or promote products derived from this software 19 | # without specific prior written permission. 20 | # 21 | # 22 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 23 | # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,THE IMPLIED 24 | # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 25 | # IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 26 | # INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 | # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 29 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 30 | # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 31 | # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
32 | # 33 | # *******************************************************************************/ 34 | 35 | if { $::argc != 4 } { 36 | puts "ERROR: Program \"$::argv0\" requires 4 arguments!\n" 37 | puts "Usage: $::argv0 \n" 38 | exit 39 | } 40 | 41 | set xoname [lindex $::argv 0] 42 | set kernel_name [lindex $::argv 1] 43 | set ip_dir [lindex $::argv 2] 44 | set kernel_xml [lindex $::argv 3] 45 | 46 | package_xo -xo_path ${xoname} -kernel_name ${kernel_name} -ip_directory ${ip_dir} -kernel_xml ${kernel_xml} 47 | -------------------------------------------------------------------------------- /src/main/scala/fpgatidbits/streams/SearchableQueue.scala: -------------------------------------------------------------------------------- 1 | package fpgatidbits.streams 2 | import chisel3._ 3 | import chisel3.util._ 4 | 5 | // a content-searchable queue 6 | // mostly a straightforward copy from ChiselUtils Queue; with modifications 7 | // to permit making the content searchable 8 | 9 | class SearchableQueueIO[T <: Data](gen: T, n: Int) extends QueueIO(gen, n) { 10 | val searchVal = Input(gen) 11 | val foundVal = Output(Bool()) 12 | 13 | } 14 | 15 | class SearchableQueue[T <: Data](gen: T, entries: Int) extends Module { 16 | val io = new SearchableQueueIO(gen, entries) 17 | 18 | // mostly copied from Chisel Queue, with a few modifications: 19 | // - simplified to pipe = false flow = false 20 | // - vector of registers instead of Mem, to expose all outputs 21 | 22 | val ram: Vec[UInt] = RegInit(VecInit(Seq.fill(entries)(0.U(gen.getWidth.W)))) 23 | val ramValid = RegInit(VecInit(Seq.fill(entries)(false.B))) 24 | 25 | val enq_ptr = Counter(entries) 26 | val deq_ptr = Counter(entries) 27 | val maybe_full = RegInit(false.B) 28 | 29 | val ptr_match = enq_ptr.value === deq_ptr.value 30 | val empty = ptr_match && !maybe_full 31 | val full = ptr_match && maybe_full 32 | 33 | val do_enq = io.enq.ready && io.enq.valid 34 | val do_deq = io.deq.ready && io.deq.valid 35 | when (do_enq) { 36 
| ram(enq_ptr.value) := io.enq.bits 37 | ramValid(enq_ptr.value) := true.B 38 | enq_ptr.inc() 39 | } 40 | when (do_deq) { 41 | ramValid(deq_ptr.value) := false.B 42 | deq_ptr.inc() 43 | } 44 | when (do_enq =/= do_deq) { 45 | maybe_full := do_enq 46 | } 47 | 48 | // 49 | val newData = io.searchVal 50 | val hits = VecInit(Seq.tabulate(entries)(i => ram(i) === newData.asUInt && ramValid(i))) 51 | io.foundVal := hits.asUInt.orR 52 | // 53 | 54 | io.deq.valid := !empty 55 | io.enq.ready := !full 56 | io.deq.bits := ram(deq_ptr.value) 57 | 58 | val ptr_diff = enq_ptr.value - deq_ptr.value 59 | if (isPow2(entries)) { 60 | io.count := Cat(maybe_full && ptr_match, ptr_diff) 61 | } else { 62 | io.count := Mux(ptr_match, 63 | Mux(maybe_full, entries.U, 0.U), 64 | Mux(deq_ptr.value > enq_ptr.value, 65 | entries.U + ptr_diff, ptr_diff) 66 | ) 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /src/main/scala/fpgatidbits/examples/ExampleMultiChanSum.scala: -------------------------------------------------------------------------------- 1 | package fpgatidbits.examples 2 | 3 | import chisel3._ 4 | import fpgatidbits.PlatformWrapper._ 5 | import fpgatidbits.dma._ 6 | import fpgatidbits.streams._ 7 | 8 | 9 | class ExampleMultiChanSumIO(n: Int, p: PlatformWrapperParams, numChans: Int) extends GenericAcceleratorIF(n,p) { 10 | val start = Input(Bool()) 11 | val baseAddr = Vec(numChans, Input(UInt(64.W))) 12 | val byteCount = Vec(numChans, Input(UInt(32.W))) 13 | val sum = Vec(numChans, Output(UInt(32.W))) 14 | val status = Output(Bool()) 15 | } 16 | class ExampleMultiChanSum(p: PlatformWrapperParams) extends GenericAccelerator(p) { 17 | val numMemPorts = 1 18 | val numChans = 2 19 | val io = IO(new ExampleMultiChanSumIO(numMemPorts, p, numChans)) 20 | plugMemWritePort(0) // write ports not used 21 | io.signature := makeDefaultSignature() 22 | val mrp = p.toMemReqParams() 23 | 24 | def makeReader(id: Int) = { 25 | Module(new 
StreamReader(new StreamReaderParams( 26 | streamWidth = 32, fifoElems = 8, mem = mrp, 27 | maxBeats = 1, chanID = id, disableThrottle = true 28 | ))).io 29 | } 30 | 31 | val readers = VecInit(Seq.tabulate(numChans) {i:Int => makeReader(i)}) 32 | val reducers = VecInit(Seq.fill(numChans) { 33 | Module(new StreamReducer(32, 0, {_+_})).io 34 | }) 35 | 36 | val intl = Module(new ReqInterleaver(numChans, mrp)).io 37 | val deintl = Module(new QueuedDeinterleaver(numChans, mrp, 4)).io 38 | 39 | // regGen -> intl -> (memRdReq) -> (memRdRsp) -> deintl -> reducer 40 | 41 | for(i <- 0 until numChans) { 42 | readers(i).start := io.start 43 | readers(i).baseAddr := io.baseAddr(i) 44 | readers(i).byteCount := io.byteCount(i) 45 | 46 | readers(i).doInit := false.B 47 | readers(i).initCount := 0.U 48 | 49 | readers(i).req <> intl.reqIn(i) 50 | deintl.rspOut(i) <> readers(i).rsp 51 | readers(i).out.ready := reducers(i).streamIn.ready 52 | reducers(i).streamIn.valid := readers(i).out.valid 53 | reducers(i).streamIn.bits := readers(i).out.bits 54 | 55 | reducers(i).start := io.start 56 | reducers(i).byteCount := io.byteCount(i) 57 | io.sum(i) := reducers(i).reduced 58 | } 59 | 60 | intl.reqOut <> io.memPort(0).memRdReq 61 | deintl.rspIn <> io.memPort(0).memRdRsp 62 | 63 | io.status := reducers.forall(x => x.finished) 64 | } 65 | -------------------------------------------------------------------------------- /src/main/resources/cpp/platform-wrapper-tests/ExampleMemLatency.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | using namespace std; 4 | 5 | #include "TestMemLatency.hpp" 6 | #include "platform.h" 7 | 8 | // issue a number of 8-beat bursts, with a parametrizable number of outstanding 9 | // memory requests. 
the memory latency can be estimated from the number of 10 | // cycles it takes per word; when the cycles/word is approx TODO, the latency 11 | // is completely hidden by outstanding requests 12 | // thus, the minimum # of outstanding reqs (OMR) that hides the latency can 13 | // be used to estimate the average latency as L = OMR * 8 14 | // (since the accelerator uses 8-beat bursts) 15 | 16 | bool Run_TestMemLatency(WrapperRegDriver * platform) { 17 | TestMemLatency t(platform); 18 | 19 | cout << "Signature: " << hex << t.get_signature() << dec << endl; 20 | 21 | while(1) { 22 | unsigned int omr = 16; 23 | cout << "Enter # of outstanding mem requests (max 16, 0 to exit):" << endl; 24 | cin >> omr; 25 | 26 | if(omr == 0) break; 27 | 28 | unsigned int ub = 0; 29 | cout << "Enter upper bound of sum (divisable by 8): " << endl; 30 | cin >> ub; 31 | 32 | typedef uint64_t AccelWord; 33 | AccelWord * hostBuf = new AccelWord[ub]; 34 | unsigned int bufsize = ub * sizeof(AccelWord); 35 | unsigned int golden = (ub*(ub+1))/2; 36 | 37 | for(unsigned int i = 0; i < ub; i++) { hostBuf[i] = i+1; } 38 | 39 | void * accelBuf = platform->allocAccelBuffer(bufsize); 40 | platform->copyBufferHostToAccel(hostBuf, accelBuf, bufsize); 41 | 42 | t.set_baseAddr((AccelDblReg) accelBuf); 43 | t.set_byteCount(bufsize); 44 | 45 | // set # outstanding mem requests and pulse doInit to reinitialize pool 46 | t.set_initCount(omr); 47 | t.set_doInit(1); 48 | t.set_doInit(0); 49 | 50 | t.set_start(1); 51 | 52 | while(t.get_finished() != 1); 53 | 54 | platform->deallocAccelBuffer(accelBuf); 55 | delete [] hostBuf; 56 | 57 | AccelReg res = t.get_sum(); 58 | cout << "Result = " << res << " expected " << golden << endl; 59 | unsigned int cc = t.get_cycleCount(); 60 | cout << "#cycles = " << cc << " cycles per word = " << (float)cc/(float)ub << endl; 61 | t.set_start(0); 62 | } 63 | 64 | return true; 65 | } 66 | 67 | int main() 68 | { 69 | WrapperRegDriver * platform = initPlatform(); 70 | 71 | 
Run_TestMemLatency(platform); 72 | 73 | deinitPlatform(platform); 74 | 75 | return 0; 76 | } 77 | -------------------------------------------------------------------------------- /src/main/scala/fpgatidbits/examples/ExampleMemLatency.scala: -------------------------------------------------------------------------------- 1 | package fpgatidbits.examples 2 | 3 | import chisel3._ 4 | import fpgatidbits.PlatformWrapper._ 5 | import fpgatidbits.dma._ 6 | import fpgatidbits.streams._ 7 | 8 | // very similar to ExampleSum, except a StreamReader with configurable # of 9 | // outstanding memory requests is used. by increasing the number of outstanding 10 | // requests in software, the cycles per word should converge to TODO 11 | // (the +TODO is due to inefficiencies in the ReadOrderCache -- needs 2 cycles 12 | // between bursts, so here we need x+2 cycles for x words) 13 | 14 | class ExampleMemLatencyIO(n: Int, p: PlatformWrapperParams) extends GenericAcceleratorIF(n,p) { 15 | val start = Input(Bool()) 16 | val finished = Output(Bool()) 17 | val baseAddr = Input(UInt(64.W)) 18 | val byteCount = Input(UInt(32.W)) 19 | val sum = Output(UInt(32.W)) 20 | val cycleCount = Output(UInt(32.W)) 21 | // controls for ID pool reinit 22 | val doInit = Input(Bool()) // pulse this to re-init ID pool 23 | val initCount = Input(UInt(8.W)) // # IDs to initialize 24 | } 25 | 26 | class ExampleMemLatency(p: PlatformWrapperParams) extends GenericAccelerator(p) { 27 | val numMemPorts = 1 28 | val io = IO(new ExampleMemLatencyIO(numMemPorts, p)) 29 | io.signature := makeDefaultSignature() 30 | plugMemWritePort(0) 31 | 32 | val rdP = new StreamReaderParams( 33 | streamWidth = 64, fifoElems = 8, mem = p.toMemReqParams(), 34 | maxBeats = 8, chanID = 0, 35 | disableThrottle = true, // outstanding reqs limits request rate 36 | readOrderCache = true, // enable read order cache 37 | readOrderTxns = 16 // outstanding mem reqs. 
capped at 16 38 | ) 39 | 40 | val reader = Module(new StreamReader(rdP)).io 41 | val red = Module(new StreamReducer(64, 0, {_+_})).io 42 | 43 | reader.start := io.start 44 | reader.baseAddr := io.baseAddr 45 | reader.byteCount := io.byteCount 46 | reader.doInit := io.doInit 47 | reader.initCount := io.initCount 48 | 49 | red.start := io.start 50 | red.byteCount := io.byteCount 51 | 52 | io.sum := red.reduced 53 | io.finished := red.finished 54 | 55 | reader.req <> io.memPort(0).memRdReq 56 | io.memPort(0).memRdRsp <> reader.rsp 57 | 58 | reader.out <> red.streamIn 59 | 60 | val regCycleCount = RegInit(0.U(32.W)) 61 | io.cycleCount := regCycleCount 62 | when(!io.start) {regCycleCount := 0.U} 63 | .elsewhen(io.start & !io.finished) {regCycleCount := regCycleCount + 1.U} 64 | } 65 | -------------------------------------------------------------------------------- /src/main/resources/cpp/platform-wrapper-integration-tests/TestExampleMultiChanSum.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | using namespace std; 3 | 4 | #include "ExampleMultiChanSum.hpp" 5 | #include "platform.h" 6 | 7 | bool Run_ExampleMultiChanSum(WrapperRegDriver * platform, uint ub, uint offs) { 8 | ExampleMultiChanSum t(platform); 9 | 10 | 11 | unsigned int * hostBuf0 = new unsigned int[ub]; 12 | unsigned int * hostBuf1 = new unsigned int[ub]; 13 | unsigned int bufsize = ub * sizeof(unsigned int); 14 | 15 | for(unsigned int i = 0; i < ub; i++) {hostBuf0[i] = i+1; hostBuf1[i] = i+1 + offs;} 16 | 17 | void * accBuf0 = platform->allocAccelBuffer(bufsize); 18 | void * accBuf1 = platform->allocAccelBuffer(bufsize); 19 | 20 | platform->copyBufferHostToAccel((void *) hostBuf0, accBuf0, bufsize); 21 | platform->copyBufferHostToAccel((void *) hostBuf1, accBuf1, bufsize); 22 | 23 | t.set_byteCount_0(bufsize); t.set_byteCount_1(bufsize); 24 | t.set_baseAddr_0((AccelDblReg) accBuf0); t.set_baseAddr_1((AccelDblReg) accBuf1); 25 | 26 | t.set_start(1); 27 | 
28 | while(t.get_status() != 1); 29 | 30 | unsigned int res0 = t.get_sum_0(); unsigned int res1 = t.get_sum_1(); 31 | unsigned int exp0 = (ub*(ub+1))/2; unsigned int exp1 = exp0 + ub*offs; 32 | 33 | t.set_start(0); 34 | 35 | 36 | platform->deallocAccelBuffer(accBuf0); 37 | platform->deallocAccelBuffer(accBuf1); 38 | 39 | delete [] hostBuf0; 40 | delete [] hostBuf1; 41 | 42 | return (res0 == exp0) && (res1 == exp1); 43 | } 44 | 45 | 46 | int main(int argc, char **argv) { 47 | 48 | if (argc != 2) { 49 | cout << "Please pass the number of tests to run as the only command line argument" < readPort.memRdReq 38 | 39 | wrg.ctrl.start := io.start 40 | wrg.ctrl.throttle := false.B 41 | wrg.ctrl.baseAddr := io.dstAddr 42 | wrg.ctrl.byteCount := io.byteCount 43 | wrg.reqs <> writePort.memWrReq 44 | 45 | // pull out read response as write data 46 | ReadRespFilter(readPort.memRdRsp) <> writePort.memWrDat 47 | 48 | // count write responses to determine finished 49 | val regNumPendingReqs = RegInit(0.U(32.W)) 50 | val regRequestedBytes = RegInit(0.U(32.W)) 51 | 52 | writePort.memWrRsp.ready := true.B 53 | 54 | when(!io.start) { 55 | regNumPendingReqs := 0.U 56 | regRequestedBytes := 0.U 57 | } .otherwise { 58 | val reqFired = writePort.memWrReq.valid & writePort.memWrReq.ready 59 | val rspFired = writePort.memWrRsp.valid & writePort.memWrRsp.ready 60 | 61 | regRequestedBytes := regRequestedBytes + Mux(reqFired, writePort.memWrReq.bits.numBytes, 0.U) 62 | 63 | when(reqFired && !rspFired) { regNumPendingReqs := regNumPendingReqs + 1.U} 64 | .elsewhen(!reqFired && rspFired) { regNumPendingReqs := regNumPendingReqs - 1.U } 65 | } 66 | 67 | io.finished := io.start & (regRequestedBytes === io.byteCount) & (regNumPendingReqs === 0.U) 68 | io.finBytes := regRequestedBytes 69 | } 70 | -------------------------------------------------------------------------------- /src/main/scala/fpgatidbits/streams/AffineLoopNestIndGen.scala: 
--------------------------------------------------------------------------------
package fpgatidbits.streams

import chisel3._
import chisel3.util._

import fpgatidbits.math.Counter

// bundle that describes the iteration space, or a single point
// in the iteration space, of an affine loop
// n: number of nesting levels in the loop
// w: bitwidth of iteration count of a level
class AffineLoopNestDescriptor(n: Int, w: Int) extends Bundle {
  // element 0 is the innermost loop
  val inds = Vec(n, UInt(w.W))
}

// given the number of iterations for a nested affine loop,
// generate the iteration space
// n: number of nesting levels in the loop
// w: bitwidth of iteration count of a level
//
// protocol, as implemented by the state machine below:
// - io.in accepts one descriptor (the per-level bounds) while idle
// - one wait cycle follows before the first index tuple is offered
// - io.out then offers the current counter values; every accepted tuple
//   steps the counters, and the generator returns to idle once the outermost
//   counter is full and enabled
class AffineLoopNestIndGen(val n: Int, val w: Int) extends Module {
  // the FSM below inspects cntrs(n-1), so at least one loop level is needed
  require(n >= 1, "AffineLoopNestIndGen needs at least one loop level")

  // ports must be wrapped in IO(...) to be bound as module ports
  val io = IO(new Bundle {
    val in = Flipped(Decoupled(new AffineLoopNestDescriptor(n, w)))
    val out = Decoupled(new AffineLoopNestDescriptor(n, w))
  })
  val doStep = WireDefault(false.B)
  // register to keep current descriptor with bounds
  // NOTE(review): the reset value is the live input port value; a plain
  // Reg(new AffineLoopNestDescriptor(n, w)) may be what is intended -- confirm
  val regBounds = RegInit(io.in.bits)
  // instantiate counters, one for each loop level
  val cntrs = for (i <- 0 until n) yield Module(new Counter(w)).io
  // default values for signals
  io.in.ready := false.B
  io.out.valid := false.B
  // wire up counters
  for(i <- 0 until n) {
    io.out.bits.inds(i) := cntrs(i).current
    cntrs(i).nsteps := regBounds.inds(i)
    if(i == 0) {
      // innermost level advances on every step
      cntrs(i).enable := doStep
    } else {
      // outer levels advance only when the level below wraps on this step
      cntrs(i).enable := cntrs(i-1).full & cntrs(i-1).enable & doStep
    }
  }
  // finite state machine for decoupled logic
  val sIdle :: sWaitCounterInit :: sRun :: Nil = Enum(3)
  val regState = RegInit(sIdle)
  switch(regState) {
    is(sIdle) {
      io.in.ready := true.B
      when(io.in.valid) {
        regBounds := io.in.bits
        regState := sWaitCounterInit
      }
    }
    is(sWaitCounterInit) {
      regState := sRun
      // TODO consider removing regMax reg from counter to avoid
      // the sWaitCounterInit state
    }
    is(sRun) {
      io.out.valid := true.B
      when(io.out.ready) {
        // note: we send doStep to make all ctrs go back to 0 also at the end
        doStep := true.B
        // finished when the outermost loop level is finished
        when(cntrs(n-1).full && cntrs(n-1).enable) {
          regState := sIdle
        }
      }
    }
  }
}

--------------------------------------------------------------------------------
/src/main/scala/fpgatidbits/dma/RespDeinterleaver.scala:
--------------------------------------------------------------------------------
package fpgatidbits.dma

import chisel3._
import chisel3.util._
import fpgatidbits.ocm._

class RespDeinterleaverIF(numPipes: Int, p: MemReqParams) extends Bundle {
  // interleaved responses in
  val rspIn = Flipped(Decoupled(new GenericMemoryResponse(p)))
  // deinterleaved responses out
  val rspOut = Vec(numPipes, Decoupled(new GenericMemoryResponse(p)))
  // number of decode errors (ID width no matching pipe)
  val decodeErrors = Output(UInt(32.W))

}

// RespDeinterleaver followed by one FPGAQueue (n entries) per output pipe
class QueuedDeinterleaver(numPipes: Int, p: MemReqParams, n: Int,
  routeFxn: GenericMemoryResponse => UInt = {x: GenericMemoryResponse => x.channelID}
) extends Module {

  val io = IO(new RespDeinterleaverIF(numPipes,p))
  val deint = Module(new RespDeinterleaver(numPipes, p, routeFxn)).io
  deint.rspIn <> io.rspIn
  io.decodeErrors := deint.decodeErrors


  for(i <- 0 until numPipes) {
    val rspQ = Module(new FPGAQueue(new GenericMemoryResponse(p), n)).io
    rspQ.deq <> io.rspOut(i)
    rspQ.enq <> deint.rspOut(i)
  }
}

// routes each incoming response to the output pipe selected by routeFxn
// (by default the response's channelID).
// numPipes: number of output pipes
// p: memory request parameters used to size the response bundles
// routeFxn: maps a response to the index of its destination pipe
// responses whose destination is >= numPipes are consumed and counted in
// decodeErrors instead of being forwarded.
class RespDeinterleaver(numPipes: Int, p: MemReqParams,
  routeFxn: GenericMemoryResponse => UInt = {x: GenericMemoryResponse => x.channelID}
) extends Module {
  val io = IO(new RespDeinterleaverIF(numPipes, p))

  val regDecodeErrors = RegInit(0.U(32.W))

  // TODO the current implementation is likely to cause timing problems
  // due to high-fanout signals and combinational paths
  // - to avoid high-fanout signals: implement decoding as e.g shiftreg
  // - to avoid combinational paths, pipeline the deinterleaver
  for(i <- 0 until numPipes) {
    io.rspOut(i).bits := io.rspIn.bits
    io.rspOut(i).valid := false.B
  }

  io.rspIn.ready := false.B
  io.decodeErrors := regDecodeErrors

  val destPipe = routeFxn(io.rspIn.bits)
  val invalidChannel = (destPipe >= (numPipes).U)
  val canProceed = io.rspIn.valid && io.rspOut(destPipe).ready
  // only a *valid* response can be a decode error: while rspIn.valid is low
  // the bits are meaningless and must not bump the counter or print
  val hasDecodeError = io.rspIn.valid && invalidChannel

  when (hasDecodeError) {
    // do not let the entire pipe stall because head of line has invalid dest
    // increment error counter and move on
    regDecodeErrors := regDecodeErrors + 1.U
    io.rspIn.ready := true.B
    printf("RespDeinterleaver decode error! chanID = %d dest = %d \n",
      io.rspIn.bits.channelID, destPipe
    )
  }
  .elsewhen (canProceed) {
    io.rspIn.ready := true.B
    io.rspOut(destPipe).valid := true.B
  }
}

--------------------------------------------------------------------------------
/src/main/scala/fpgatidbits/streams/StreamSync.scala:
--------------------------------------------------------------------------------
package fpgatidbits.streams

import chisel3._
import chisel3.util._
import fpgatidbits.ocm._

// make two streams A and B go in lockstep, i.e. A is only popped when B is
// popped, and vice versa.
// this is achieved by first joining then forking the two streams.
// depending on the I/O behavior on the input and output, it may be necessary
// to add input/output queues.

object StreamSync {
  // convenience constructor: instantiates a StreamSync between the given
  // input and output streams, using the input payloads as type templates
  def apply[TA <: Data, TB <: Data](
    inA: DecoupledIO[TA], inB: DecoupledIO[TB],
    outA: DecoupledIO[TA], outB: DecoupledIO[TB],
    queueInput: Boolean = false,  // whether the inputs should be queued
    queueOutput: Boolean = false  // whether the outputs should be queued
  ) = {
    val ss = Module(new StreamSync(
      genA = inA.bits, genB = inB.bits, queueInput = queueInput,
      queueOutput = queueOutput
    )).io
    inA <> ss.inA
    inB <> ss.inB
    ss.outA <> outA
    ss.outB <> outB
  }
}

// joins streams A and B into one packet stream and forks it again, with
// optional 2-entry FPGAQueues on the inputs and/or outputs
class StreamSync[TA <: Data, TB <: Data](
  genA: TA, // clonetype for first stream
  genB: TB, // clonetype for second stream
  queueInput: Boolean = false,  // whether the inputs should be queued
  queueOutput: Boolean = false  // whether the outputs should be queued
) extends Module {
  // ports must be wrapped in IO(...) to be bound as module ports.
  // genA/genB are cloned (as SyncPacket already does) because callers such as
  // StreamSync.apply pass hardware (inA.bits) rather than a pure type
  val io = IO(new Bundle {
    val inA = Flipped(Decoupled(genA.cloneType))
    val inB = Flipped(Decoupled(genB.cloneType))
    val outA = Decoupled(genA.cloneType)
    val outB = Decoupled(genB.cloneType)
  })
  // define an internal packet type for the synced stream
  class SyncPacket extends Bundle {
    val compA = genA.cloneType
    val compB = genB.cloneType
  }
  val syncedData = new SyncPacket()

  // pack one element of each stream into a SyncPacket
  def joinFxn(a: TA, b: TB): SyncPacket = {
    // a bare `new SyncPacket()` is only a type; it must be a Wire before its
    // fields can be driven
    val ret = Wire(new SyncPacket())
    ret.compA := a
    ret.compB := b
    ret
  }

  val join = Module(new StreamJoin(
    genA = genA, genB = genB, genOut = syncedData, join = joinFxn
  )).io

  val fork = Module(new StreamFork(
    genIn = syncedData, genA = genA, genB = genB,
    forkA = {s: SyncPacket => s.compA}, forkB = {s: SyncPacket => s.compB}
  )).io

  if(queueInput) {
    FPGAQueue(io.inA, 2) <> join.inA
    FPGAQueue(io.inB, 2) <> join.inB
  } else {
    io.inA <> join.inA
    io.inB <> join.inB
  }

  join.out <> fork.in

  if(queueOutput) {
    FPGAQueue(fork.outA, 2) <> io.outA
    FPGAQueue(fork.outB, 2) <> io.outB
  } else {
    fork.outA <> io.outA
    fork.outB <> io.outB
  }
}

--------------------------------------------------------------------------------
/src/main/scala/fpgatidbits/ocm/DualPortMaskedBRAM.scala:
--------------------------------------------------------------------------------
package fpgatidbits.ocm

import chisel3._
import chisel3.util._
import fpgatidbits.utils.SubWordAssignment

// creates a BRAM of desired size, which supports partial writes at "unit"
// granularity. which parts will be written is determined by the writeMask.
// internally, this is accomplished by instantiating a number of standard
// dual-port BRAMs of width = unit, and routing the write/read data accordingly

class DualPortMaskedBRAM(addrBits: Int, dataBits: Int, unit: Int = 8)
extends Module {
  // the data word is split into dataBits/unit banks; any remainder bits would
  // silently get no bank at all, so reject such configurations up front
  require(unit > 0 && dataBits % unit == 0,
    "DualPortMaskedBRAM needs dataBits to be a multiple of unit")
  val numBanks = dataBits/unit
  val io = IO(new DualPortMaskedBRAMIO(addrBits, dataBits, numBanks))

  val banksExt = VecInit(Seq.fill(numBanks) {
    Module(new DualPortBRAM(addrBits, unit)).io
  })

  val banks = for (i <- 0 until numBanks) yield {
    Wire(new DualPortBRAMIOWrapper(addrBits, unit))
  }

  // hook each bank instance up to its wrapper wire and give the wrapper
  // default drives; the per-port assignments further down override them
  (banksExt zip banks).foreach({
    case (ext, int) =>
      ext.clk := clock
      ext.a.connect(int.ports(0))
      ext.b.connect(int.ports(1))
      int.ports.foreach(_.driveDefaults())
  })

  val wiresReadOut =VecInit(Seq.fill(2)(VecInit(Seq.fill(numBanks)(WireInit(0.U(unit.W))))))



  for(i <- 0 until numBanks) {
    for(p <- 0 until 2) {
      // base request data goes to all banks
      banks(i).ports(p).req.addr := io.ports(p).req.addr
      // each bank gets one byte of data
      val bankWrData = io.ports(p).req.writeData((i+1)*unit-1, i*unit)
      banks(i).ports(p).req.writeData := bankWrData
      // each bank's write enable is computed separately
      val bankWrEn = io.ports(p).req.writeEn & io.ports(p).req.writeMask(i)
      banks(i).ports(p).req.writeEn := bankWrEn
      // use partial assignment to concatenate read data
      // erlingr: chisel3 doesnt support subword assignment
      wiresReadOut(p)(i) := banks(i).ports(p).rsp.readData
    }
  }

  // Concatenate output
  for (p <- 0 until 2) {
    io.ports(p).rsp.readData := wiresReadOut(p).asUInt
  }
}

class OCMMaskedRequest(writeWidth: Int, addrWidth: Int, maskWidth: Int)
extends OCMRequest(writeWidth, addrWidth) {
  if(writeWidth % maskWidth != 0)
    throw new Exception("Mask-writable BRAM needs data width % mask width = 0")

  val writeMask = Vec(maskWidth, Bool())

}

class OCMMaskedSlaveIF(dataWidth: Int, addrWidth: Int, maskWidth: Int)
extends Bundle {
  val req = Input(new OCMMaskedRequest(dataWidth, addrWidth, maskWidth))
  val rsp = Output(new OCMResponse(dataWidth))

}

class DualPortMaskedBRAMIO(addrBits: Int, dataBits: Int, maskBits: Int)
extends Bundle {
  val ports = Vec(2, new OCMMaskedSlaveIF(dataBits, addrBits, maskBits))
}

--------------------------------------------------------------------------------
/src/main/scala/fpgatidbits/ocm/AsymDualPortRAM.scala:
--------------------------------------------------------------------------------
package fpgatidbits.ocm

import chisel3._
import chisel3.util._
//import chisel3.iotesters._

// simple model for a dual-port OCM with asymmetric r/w widths
// not intended for synthesis, only for simulation
class AsymDualPortRAM(p: OCMParameters) extends Module {
  val ocmParams = p
  val io = IO(new Bundle {
    val ports = Vec(2, new OCMSlaveIF(p.writeWidth, p.readWidth, p.addrWidth))
  })
  // note that we assume the user left-shifts the address as necessary
  // (this is what Xilinx BRAMs assume as well -- not sure about Altera)
  // e.g the address is always given in the terms of the smallest granularity

  // calculate the minimum of required port widths & instantiate memory
  // according to this
  val minWidth = math.min(p.writeWidth, p.readWidth)
  val memBits = p.writeDepth * p.writeWidth
  // NOTE(review): SyncReadMem reads are already registered; together with the
  // ShiftRegister below the observed latency may be readLatency + 1 -- confirm
  val mem = SyncReadMem(memBits/minWidth, UInt(minWidth.W))

  for(i <- 0 to 1) { // for each memory port
    val base = io.ports(i).req.addr
    // logic depends on whether read or write width is smaller
    if(p.readWidth > p.writeWidth) {
      // big reads, small writes
      // address corresponds directly to write cell address
      when (io.ports(i).req.writeEn) {
        mem(base) := io.ports(i).req.writeData
      }
      // reads need to concatenate multiple cells
      val wordsToRead = p.readWidth / p.writeWidth
      val rdData = Cat((wordsToRead-1 to 0 by -1).map( {k: Int => mem(base+k.U)}))
      // use shift register to satisfy read latency requirement
      io.ports(i).rsp.readData := ShiftRegister(rdData, p.readLatency)
    } else {
      // small reads, big writes
      // address corresponds directly to read cell address
      val rdData = mem(base)
      io.ports(i).rsp.readData := ShiftRegister(rdData, p.readLatency)
      // need to write to multiple cells
      val wordsToWrite = p.writeWidth / p.readWidth
      when (io.ports(i).req.writeEn) {
        for(j <- 0 until wordsToWrite) {
          mem(base+j.U) := io.ports(i).req.writeData((j+1)*minWidth-1, j*minWidth)
        }
      }
    }
  }
}

// TODO this test will only work on 8w/32r OCM and tests very little

//
//class AsymDualPortRAMTester(c: AsymDualPortRAM) extends PeekPokeTester(c) {
//  val p = c.ocmParams
//  val p0 = c.io.ports(0)
//
//  var wr_data = List(0xef, 0xbe, 0xad, 0xde)
//  var addr = 4
//  for (i <- wr_data) {
//    poke(p0.req.addr, addr)
//    poke(p0.req.writeData, i)
//    poke(p0.req.writeEn, 1)
//    step(1)
//    addr = addr + 1
//  }
//  poke(p0.req.writeEn, 0)
//  poke(p0.req.addr, 4)
//
//  step(p.readLatency)
//  expect(p0.rsp.readData, 0xdeadbeef)
//}

--------------------------------------------------------------------------------
/src/main/scala/fpgatidbits/math/PipelinedMul.scala:
--------------------------------------------------------------------------------
package fpgatidbits.math

import chisel3._
import chisel3.util._

// data carried between multiplier pipeline stages: operand signs and
// magnitudes, the latest partial product and the running sum
class PipelinedMultStageData(w: Int, wMul: Int) extends Bundle {
  val signA = Bool()
  val a = UInt(w.W)
  val signB = Bool()
  val b = UInt(w.W)
  val mulRes = UInt((2*wMul).W)
  val addRes = UInt(w.W)
}

// pipelined 64-bit signed multiplier with backpressure support, hardwired
// for 5 stages. not really optimized but it does the job.
// TODOs:
// - add a more flexible generator for configuring latency, op schedule etc.
// - internally uses signed magnitude, overflow may differ from 2s complement
// - control DSP inference for FPGAs that support it (how?)
class SystolicSInt64Mul_5Stage extends BinaryMathOp(64) {
  val latency = 5
  val wMul: Int = 32
  val metad = new PipelinedMultStageData(64, wMul)

  // stage 0: convert to signed magnitude form
  // (the result bundle is a Wire: a bare `new` bundle is only a type and
  // cannot be driven)
  val fxnS0 = {i: BinaryMathOperands => val m = Wire(new PipelinedMultStageData(64, wMul))
    m.signA := i.first(63)
    m.signB := i.second(63)
    m.a := Mux(i.first(63), ~i.first+ 1.U, i.first)
    m.b := Mux(i.second(63), ~i.second + 1.U, i.second)
    m.mulRes := 0.U
    m.addRes := 0.U
    m
  }
  val s0 = SystolicReg(io.in.bits, metad, fxnS0, io.in)

  // stages 1-4: pipelined multiply; 1 wMul-bit multiply and 1 64-bit add per
  // stage.
  // fMaker generates the stage transfer function, where offA and offB control
  // where the values to be multiplied will be taken from within the operands
  // and shiftAdd (should be equal to wMul*sum of offsets for prev stage)
  val fMaker = { (offA: Int, offB: Int, shiftAdd: Int) =>
    {i: PipelinedMultStageData => val m = Wire(new PipelinedMultStageData(64, wMul))
      // pass everything through, then override the two result fields
      m := i
      // multiply offA-th wMul-wide word of A, off-Bth of B
      m.mulRes := i.a((wMul*(offA+1))-1, wMul*offA) * i.b((wMul*(offB+1))-1, wMul*offB)
      // add partial product and addRes from previous stage
      // (shiftAdd is an elaboration-time constant, so shift statically; the
      // sum is truncated to the width of addRes on assignment)
      m.addRes := (i.mulRes << shiftAdd) + i.addRes
      m
    }
  }

  val s1 = SystolicReg(metad, metad, fMaker(0, 0, 0), s0)
  val s2 = SystolicReg(metad, metad, fMaker(0, 1, 0), s1)
  val s3 = SystolicReg(metad, metad, fMaker(1, 0, wMul), s2)
  // note that we don't use the highest offsets (1, 1) since this multiplier
  // only returns an 64-bit result (offset 1,1 generates only overflow)
  // the last stage is just used for the add (mul is unused)
  val s4 = SystolicReg(metad, metad, fMaker(0, 0, wMul), s3)

  // convert from signed magnitude back to 2s complement on the way out
  s4.ready := io.out.ready
  io.out.valid := s4.valid

  val magnRes = Cat(0.U(1.W), s4.bits.addRes(62, 0))
  val isResultNegative = s4.bits.signA ^ s4.bits.signB
  io.out.bits := Mux(isResultNegative, ~magnRes + 1.U, magnRes)
}
--------------------------------------------------------------------------------
/src/main/scala/fpgatidbits/dma/StreamWriter.scala:
--------------------------------------------------------------------------------
package fpgatidbits.dma

import chisel3._
import chisel3.util._
import fpgatidbits.streams._

// write contiguous streams of data to main memory
// note that start address and total byte count to write must be aligned to
// the memory bus size

// TODO add
// support for burst writes
class StreamWriterParams(
  val streamWidth: Int,
  val mem: MemReqParams,
  val chanID: Int,
  val maxBeats: Int = 1
)

class StreamWriterIF(w: Int, p: MemReqParams) extends Bundle {
  val start = Input(Bool())
  val active = Output(Bool())
  val finished = Output(Bool())
  val error = Output(Bool())
  val baseAddr = Input(UInt(p.addrWidth.W))
  val byteCount = Input(UInt(32.W))
  // stream data input
  val in = Flipped(Decoupled(UInt(w.W)))
  // interface towards memory port
  val req = Decoupled(new GenericMemoryRequest(p))
  val wdat = Decoupled(UInt(p.dataWidth.W))
  val rsp = Flipped(Decoupled(new GenericMemoryResponse(p)))
}

class StreamWriter(val p: StreamWriterParams) extends Module {
  val io = IO(new StreamWriterIF(p.streamWidth, p.mem))
  val StreamElem = UInt(p.streamWidth.W)

  // always ready to receive write responses
  io.rsp.ready := true.B
  // count write responses to determine finished
  val regNumPendingReqs = RegInit(0.U(32.W))
  val regRequestedBytes = RegInit(0.U(32.W))
  when(!io.start) {
    regNumPendingReqs := 0.U
    regRequestedBytes := 0.U
  } .otherwise {
    val reqFired = io.req.valid & io.req.ready
    val rspFired = io.rsp.valid & io.rsp.ready
    regRequestedBytes := regRequestedBytes + Mux(reqFired, io.req.bits.numBytes, 0.U)
    when(reqFired && !rspFired) { regNumPendingReqs := regNumPendingReqs + 1.U}
    .elsewhen(!reqFired && rspFired) { regNumPendingReqs := regNumPendingReqs - 1.U }
  }
  // finished when:
  // - all bytes have been requested
  // - there are no pending (un-responded) requests left
  val fin = (regRequestedBytes === io.byteCount) & (regNumPendingReqs === 0.U)
  io.finished := io.start & fin

  // write request generator
  val wg = Module(new WriteReqGen(p.mem, p.chanID, p.maxBeats)).io
  wg.ctrl.start := io.start
  wg.ctrl.baseAddr := io.baseAddr
  wg.ctrl.byteCount := io.byteCount // TODO must be multiple of write size!
  wg.ctrl.throttle := false.B
  io.active := (io.start & !fin)
  io.error := wg.stat.error

  // push out the generated write requests
  wg.reqs <> io.req

  // add a resizer between input data and write data
  if(p.streamWidth == p.mem.dataWidth) {io.in <> io.wdat}
  else if(p.streamWidth > p.mem.dataWidth) {
    StreamDownsizer(io.in, p.mem.dataWidth) <> io.wdat
  } else {
    StreamUpsizer(io.in, p.mem.dataWidth) <> io.wdat
  }
}

--------------------------------------------------------------------------------
/src/main/scala/fpgatidbits/regfile/RegFile.scala:
--------------------------------------------------------------------------------
package fpgatidbits.regfile

import chisel3._
import chisel3.util._

// command bundle for read/writes to AEG/CSR registers
class RegCommand(idBits: Int, dataBits: Int) extends Bundle {
  val regID = UInt(idBits.W)
  val read = Bool()
  val write = Bool()
  val writeData = UInt(dataBits.W)

  def driveDefaults() = {
    regID := 0.U
    read := false.B
    write := false.B
    writeData := 0.U
  }
}

// register file interface
class RegFileSlaveIF(idBits: Int, dataBits: Int) extends Bundle {
  // register read/write commands
  // the "valid" signal here should be connected to (.read OR .write)
  val cmd = Flipped(Valid(new RegCommand(idBits, dataBits)))
  // returned read data
  val readData = Valid(UInt(dataBits.W))
  // number of registers
  val regCount = Output(UInt(idBits.W))

}


// register file with an external command interface and per-register
// internal write ports.
// numRegs: number of registers
// idBits: width of the register ID in external commands
// dataBits: width of each register
// external commands are latched for one cycle before they take effect
class RegFile(numRegs: Int, idBits: Int, dataBits: Int) extends Module {
  val io = IO(new Bundle {
    // external command interface
    val extIF = new RegFileSlaveIF(idBits, dataBits)
    // exposed values of all registers, for internal use
    val regOut = Vec(numRegs, Output(UInt(dataBits.W)))
    // valid pipes for writing new values for all registers, for internal use
    // (extIF takes priority over this)
    val regIn = Vec(numRegs, Flipped(Valid(UInt(dataBits.W))))
  })
  // drive num registers to compile-time constant
  io.extIF.regCount := numRegs.U

  // instantiate the registers in the file
  val regFile = RegInit(VecInit(Seq.fill(numRegs){0.U(dataBits.W)}))
  for (i <- 0 until numRegs) {
    dontTouch(regFile(i))
  }


  // latch the incoming commands
  val regCommand = RegNext(io.extIF.cmd.bits)
  val regDoCmd = RegNext(next=io.extIF.cmd.valid, init=false.B)


  val hasExtReadCommand = (regDoCmd && regCommand.read)
  val hasExtWriteCommand = (regDoCmd && regCommand.write)
  // regID may encode values beyond the last register when numRegs is not a
  // power of two; both the read and the write path check this
  val regIDInRange = regCommand.regID < (numRegs).U

  // register read logic
  io.extIF.readData.valid := hasExtReadCommand
  // make sure regID stays within range for memory read
  when (regIDInRange) {
    io.extIF.readData.bits := regFile(regCommand.regID)
  } .otherwise {
    // return 0 otherwise
    io.extIF.readData.bits := 0.U
  }

  // register write logic
  // to avoid multiple ports, we prioritize the extIF writes over the internal
  // ones (e.g if there is an external write present, the internal write will
  // be ignored if it arrives simultaneously)
  when (hasExtWriteCommand) {
    // an out-of-range external write is dropped, mirroring the read guard;
    // it still takes priority over (and so suppresses) internal writes
    when (regIDInRange) {
      regFile(regCommand.regID) := regCommand.writeData
    }
  } .otherwise {
    for(i <- 0 until numRegs) {
      when (io.regIn(i).valid) { regFile(i) := io.regIn(i).bits }
    }
  }

  // expose all reg outputs for personality's access
  for (i <- 0 until numRegs) {
    io.regOut(i) := regFile(i)
  }

  // TODO add testbench for regfile logic
}

--------------------------------------------------------------------------------
/src/main/resources/cpp/platform-wrapper-tests/GrayScaleFilter.cpp:
-------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | using namespace std; 4 | 5 | #include "ExampleGrayScale.hpp" 6 | #include "platform.h" 7 | 8 | bool Run_ExampleGrayScale(WrapperRegDriver * platform, unsigned char *rgb_image, int rgb_size, unsigned char *grayscale_image) { 9 | ExampleGrayScale t(platform); 10 | int rgb_size_aligned = (rgb_size >> 3) << 3; 11 | int grayscale_size = rgb_size/3; 12 | int grayscale_size_aligned = (grayscale_size >> 3) << 3; 13 | cout << "Signature: " << hex << t.get_signature() << dec << endl; 14 | cout << "Running Grayscale accelerator on image of size " <allocAccelBuffer(rgb_size_aligned); 17 | void * accelBufRes = platform->allocAccelBuffer(grayscale_size_aligned); 18 | platform->copyBufferHostToAccel(rgb_image, accelBuf, rgb_size_aligned); 19 | 20 | t.set_baseAddr((AccelDblReg) accelBuf); 21 | t.set_resBaseAddr((AccelDblReg) accelBufRes); 22 | t.set_byteCount(rgb_size_aligned); 23 | t.set_resByteCount(grayscale_size_aligned); 24 | 25 | t.set_start(1); 26 | 27 | cout << "Waiting for Accel" <(i, j)[0]; 65 | unsigned char g = image.at(i, j)[1]; 66 | unsigned char r = image.at(i, j)[2]; 67 | rgb_data[idx_rgb++] = r; 68 | rgb_data[idx_rgb++] = g; 69 | rgb_data[idx_rgb++] = b; 70 | } 71 | } 72 | 73 | WrapperRegDriver * platform = initPlatform(); 74 | 75 | Run_ExampleGrayScale(platform, rgb_data, rgb_size, grayscale_data); 76 | 77 | deinitPlatform(platform); 78 | 79 | // Convert the grayscale_data back to a cv::Mat for saving 80 | cv::Mat grayscale_image(height, width, CV_8U, grayscale_data); 81 | cv::imwrite("grayscale_output.jpg", grayscale_image); 82 | 83 | delete[] rgb_data; 84 | delete[] grayscale_data; 85 | 86 | return 0; 87 | } 88 | -------------------------------------------------------------------------------- /src/main/scala/fpgatidbits/interconnect/AXILiteSwitch.scala: -------------------------------------------------------------------------------- 1 | package 
fpgatidbits.interconnect

import chisel3._
import chisel3.util.RRArbiter
import fpgatidbits.axi._
import fpgatidbits.streams._

// connect an AXI-lite master interface to multiple AXI-lite slaves,
// routing the requests based on the address. note that the addresses are NOT
// rebased when going into the slaves.
class AXILiteSwitch(
  addrBits: Int, dataBits: Int, // AXI lite interface widths
  numSlaves: Int,               // number of slaves to connect to
  routingFxn: UInt => UInt      // map address to slave number (0...numSlaves-1)
) extends Module {
  // ports must be wrapped in IO(...) to be bound as module ports
  val io = IO(new Bundle {
    val in = new AXILiteSlaveIF(addrBits, dataBits)
    val out = Vec(numSlaves, new AXILiteMasterIF(addrBits, dataBits))
  })

  // type templates for the helper modules below. the ports themselves are
  // hardware, so take their Chisel types instead of passing the ports
  val writeAddrType = chiselTypeOf(io.in.writeAddr.bits)
  val writeDataType = chiselTypeOf(io.in.writeData.bits)
  val readDataType = chiselTypeOf(io.in.readData.bits)
  val writeRespType = chiselTypeOf(io.in.writeResp.bits)

  // route incoming read commands to appropriate slave, based on the output
  // of the routingFxn (applied on the incoming address)
  io.in.readAddr <> DecoupledOutputDemux(
    sel = routingFxn(io.in.readAddr.bits.addr),
    chans = io.out.map(x => x.readAddr)
  )

  // writes are a bit trickier, since we have to ensure that the write data
  // (which has no routing indicator) follows the same path as the write request
  // to achieve this, we force the incoming write requests and data to go in
  // lockstep, with queueing so that they can arrive at different times:

  val syncWrIn = Module(new StreamSync(
    genA = writeAddrType, genB = writeDataType,
    queueInput = true, queueOutput = true
  )).io
  io.in.writeAddr <> syncWrIn.inA
  io.in.writeData <> syncWrIn.inB

  // to ensure lockstep on the output, we need a number of StreamSync comps,
  // this time with output queueing so that the slaves can pop the write addr
  // and data separately if they want to:
  val syncWrOut = VecInit(Seq.fill(numSlaves)(Module(new StreamSync(
    genA = writeAddrType, genB = writeDataType,
    queueOutput = true
  )).io))

  for(i <- 0 until numSlaves) {
    syncWrOut(i).outA <> io.out(i).writeAddr
    syncWrOut(i).outB <> io.out(i).writeData
  }

  // finally, we connect the synchronizers using a pair of DecoupledOutputDemux:
  // write address
  syncWrIn.outA <> DecoupledOutputDemux(
    sel = routingFxn(syncWrIn.outA.bits.addr),
    chans = syncWrOut.map(x => x.inA)
  )
  // write data
  syncWrIn.outB <> DecoupledOutputDemux(
    sel = routingFxn(syncWrIn.outA.bits.addr), // possible w/addr-data sync
    chans = syncWrOut.map(x => x.inB)
  )

  // use round-robin arbitration between incoming read data and write resps
  val arbReadData = Module(new RRArbiter(readDataType, numSlaves)).io
  val arbWriteRsp = Module(new RRArbiter(writeRespType, numSlaves)).io

  for(i <- 0 until numSlaves) {
    io.out(i).readData <> arbReadData.in(i)
    io.out(i).writeResp <> arbWriteRsp.in(i)
  }
  arbReadData.out <> io.in.readData
  arbWriteRsp.out <> io.in.writeResp
}

--------------------------------------------------------------------------------
/src/test/scala/TestAffineLoopNestIndGen.scala:
--------------------------------------------------------------------------------
//import chisel3._
//import chisel3.util._
//import fpgatidbits.streams._

// Tester-derived class to give stimulus and observe the outputs for the
// Module to be tested

/*
class AffineLoopNestIndGenTester(c: AffineLoopNestIndGen) extends Tester(c) {
  val levels = c.n
  val r = scala.util.Random
  val reps = 1 + r.nextInt(5)

  // functions to create the golden values for the affine loop nest
  def balance(current: Seq[Int], desc: Seq[Int]): Seq[Int] = {
    if(current.size == 0) {
      return current
    } else if(current.size == 1) {
      if(current(0) >= desc(0)) {
        return Seq(0)
      } else {
        return current
      }
    } else {
      if(current(0) >= desc(0)) {
        return
Seq(0) ++ balance(Seq(current(1)+1) ++ current.drop(2), desc.drop(1)) 27 | } else { 28 | return Seq(current(0)) ++ balance(current.drop(1), desc.drop(1)) 29 | } 30 | } 31 | } 32 | def make_golden_util(current: Seq[Int], desc: Seq[Int], n: Int): Seq[Seq[Int]] = { 33 | if(n == 0) { 34 | return Seq() 35 | } else { 36 | val nxt = Seq(current(0)+1) ++ current.drop(1) 37 | return Seq(current) ++ make_golden_util(balance(nxt, desc), desc, n-1) 38 | } 39 | } 40 | def make_golden(desc: Seq[Int]): Seq[Seq[Int]] = { 41 | val total_iters = desc.reduce(_*_) 42 | val start_iter = Seq.fill(desc.length){0} 43 | return make_golden_util(start_iter, desc, total_iters) 44 | } 45 | 46 | for(rep <- 0 until reps) { 47 | // create new random affine loop nest with given #levels 48 | // avoid 0-sized dims 49 | val descr = (0 until levels).map(i => r.nextInt(10) + 1) 50 | // create the expected traversal 51 | val golden = make_golden(descr) 52 | // set up the loop nest descriptor 53 | for(i <- 0 until levels) { 54 | poke(c.io.in.bits.inds(i), descr(i)) 55 | } 56 | poke(c.io.in.valid, 1) 57 | step(1) 58 | poke(c.io.in.valid, 0) 59 | // wait for traversal to start 60 | while(peek(c.io.out.valid) == 0) { step(1) } 61 | poke(c.io.out.ready, 1) 62 | for(current_iter <- golden) { 63 | for(i <- 0 until levels) { 64 | expect(c.io.out.bits.inds(i), current_iter(i)) 65 | } 66 | step(1) 67 | } 68 | expect(c.io.out.valid, 0) 69 | } 70 | } 71 | 72 | class TestAffineLoopNestIndGen extends JUnitSuite { 73 | @Test def AffineLoopNestIndGenTest { 74 | for(w <- 32 to 32) { 75 | for(n <- 2 to 4) { 76 | // Chisel arguments to pass to chiselMainTest 77 | def testArgs = TestHelpers.stdArgs 78 | // function that instantiates the Module to be tested 79 | def testModuleInstFxn = () => { Module(new AffineLoopNestIndGen( 80 | w = w, n = n 81 | ))} 82 | // function that instantiates the Tester to test the Module 83 | def testTesterInstFxn = (c: AffineLoopNestIndGen) => new AffineLoopNestIndGenTester(c) 84 | // actually 
run the test 85 | chiselMainTest( 86 | testArgs, 87 | testModuleInstFxn 88 | ) { 89 | testTesterInstFxn 90 | } 91 | } 92 | } 93 | } 94 | } 95 | */ -------------------------------------------------------------------------------- /src/main/scala/fpgatidbits/dma/ReqInterleaver.scala: -------------------------------------------------------------------------------- 1 | package fpgatidbits.dma 2 | 3 | import chisel3._ 4 | import chisel3.util._ 5 | 6 | class ReqInterleaver(numPipes: Int, p: MemReqParams) extends Module { 7 | val io = IO(new Bundle { 8 | // individual request pipes 9 | val reqIn = Vec(numPipes, Flipped(Decoupled(new GenericMemoryRequest(p)))) 10 | // interleaved request pipe 11 | val reqOut = Decoupled(new GenericMemoryRequest(p)) 12 | }) 13 | // TODO for now, we just use a round-robin arbiter 14 | // TODO report statistics from the interleaved mix? 15 | val arb = Module(new RRArbiter(gen=new GenericMemoryRequest(p), n=numPipes)) 16 | for (i <- 0 until numPipes) { 17 | arb.io.in(i) <> io.reqIn(i) 18 | } 19 | arb.io.out <> io.reqOut 20 | } 21 | 22 | class TestReqInterleaverWrapper() extends Module { 23 | val p = new MemReqParams(48, 64, 4, 1) 24 | val burstBeats = 8 25 | val io = IO(new Bundle { 26 | val reqOut = Decoupled(new GenericMemoryRequest(p)) 27 | val allFinished = Output(Bool()) 28 | val allActive = Output(Bool()) 29 | }) 30 | val N = 4 31 | val bytesPerPipe = 1024 32 | val reqPipes = VecInit.tabulate(N) {i => Module(new ReadReqGen(p, i, burstBeats)).io} 33 | val dut = Module(new ReqInterleaver(N, p)) 34 | for(i <- 0 until N) { 35 | reqPipes(i).reqs <> dut.io.reqIn(i) 36 | reqPipes(i).ctrl.throttle := false.B 37 | reqPipes(i).ctrl.start := true.B 38 | reqPipes(i).ctrl.baseAddr := (bytesPerPipe*i).U 39 | reqPipes(i).ctrl.byteCount := bytesPerPipe.U 40 | } 41 | val reqQ = Module(new Queue(new GenericMemoryRequest(p), 1024)) 42 | reqQ.io.enq <> dut.io.reqOut 43 | reqQ.io.deq <> io.reqOut 44 | 45 | io.allFinished := reqPipes.forall( x => 
x.stat.finished )
  io.allActive := reqPipes.forall( x => x.stat.active )
}


//class TestReqInterleaver(c: TestReqInterleaverWrapper) extends PeekPokeTester(c) {
//  poke(c.io.reqOut.ready, 0)
//  step(1)
//  expect(c.io.allActive, 1)
//  expect(c.io.allFinished, 0)
//  while(peek(c.io.allFinished) != 1) {
//    peek(c.reqQ.io.enq.valid)
//    peek(c.reqQ.io.enq.bits)
//    peek(c.reqQ.io.count)
//    step(1)
//  }
//  // verify number of requests in the interleaved queue
//  val bytesPerBurst = (c.burstBeats*c.p.dataWidth/8)
//  val expReqsPerPipe = c.bytesPerPipe / bytesPerBurst
//  val expTotalReqs = c.N * expReqsPerPipe
//  expect(c.reqQ.io.count, expTotalReqs)
//  // verify the request mix from different channels
//  var reqsFromChannel:Array[Int] = Array.fill[Int](c.N)(0)
//  val channelExpReq:Array[Int] = (0 to c.N-1).map({ i => c.bytesPerPipe*i }).toArray
//  while(peek(c.io.reqOut.valid) == 1) {
//    val chanID = peek(c.io.reqOut.bits.channelID).toInt
//    expect(c.io.reqOut.bits.addr, channelExpReq(chanID))
//    reqsFromChannel(chanID) += 1
//    channelExpReq(chanID) += bytesPerBurst
//    poke(c.io.reqOut.ready, 1)
//    step(1)
//  }
//  poke(c.io.reqOut.ready, 0)
//  expect(c.reqQ.io.count, 0)
//  for(i <- 0 until c.N) {
//    println("Channel " + i.toString + " #reqs= " + reqsFromChannel(i).toString)
//    expect(reqsFromChannel(i) == expReqsPerPipe, "Channel has correct #reqs")
//  }
//}
--------------------------------------------------------------------------------
/src/main/resources/cpp/platform-wrapper-regdriver/zedboardregdriver.hpp:
--------------------------------------------------------------------------------
#ifndef ZEDBOARDREGDRIVER_H
#define ZEDBOARDREGDRIVER_H

#include "axiregdriver.hpp"
// NOTE(review): the names of the system headers included here were lost when
// this file was extracted; the three below are the ones this header actually
// needs (malloc/free, memcpy, std::string). Confirm against the repository.
#include <stdlib.h>
#include <string.h>
#include <string>
#include "xil_cache.h"

// Register driver for the ZedBoard. Accelerator buffers are ordinary heap
// allocations; the data cache is flushed after host->accel copies and
// invalidated before accel->host copies, using the xil_cache.h routines.
class ZedBoardRegDriver : public AXIRegDriver {
public:
  ZedBoardRegDriver(void *baseAddr) : AXIRegDriver(baseAddr) {}

  // identifier string for this platform driver
  virtual std::string platformID() {
    return "ZedBoardDriver";
  }

  // functions for host-accelerator buffer management

  // copy numBytes from hostBuffer into accelBuffer, then flush the written
  // range from the data cache
  virtual void copyBufferHostToAccel(const void * hostBuffer, void * accelBuffer, unsigned int numBytes) {
    memcpy(accelBuffer, hostBuffer, numBytes);
    // NOTE(review): the pointer is narrowed to unsigned int -- presumably fine
    // on the 32-bit target this driver is written for; confirm before reuse.
    Xil_DCacheFlushRange((unsigned int) accelBuffer, numBytes);
  }

  // invalidate the cached copy of accelBuffer first, then copy numBytes from
  // it into hostBuffer
  virtual void copyBufferAccelToHost(const void * accelBuffer, void * hostBuffer, unsigned int numBytes) {
    Xil_DCacheInvalidateRange((unsigned int) accelBuffer, numBytes);
    memcpy(hostBuffer,accelBuffer, numBytes);
  }

  // accelerator buffers are 64-byte aligned heap blocks
  virtual void * allocAccelBuffer(unsigned int numBytes) { return malloc_aligned(64, numBytes);}
  virtual void deallocAccelBuffer(void * buffer) { free_aligned(buffer);}

protected:
  // custom aligned malloc-free from http://stackoverflow.com/questions/6563120/what-does-posix-memalign-memalign-do

  // Allocate `bytes` bytes whose start address is a multiple of `alignment`.
  // The address returned by malloc is stored in the sizeof(size_t) bytes just
  // before the returned pointer, so the block must be released with
  // free_aligned(). Throws a string literal if malloc fails.
  void *malloc_aligned(size_t alignment, size_t bytes)
  {
    // we need to allocate enough storage for the requested bytes, some
    // book-keeping (to store the location returned by malloc) and some extra
    // padding to allow us to find an aligned byte. the pointer is advanced by
    // sizeof(size_t) plus at most `alignment` bytes below, so 2 * alignment
    // of padding is sufficient.
    const size_t total_size = bytes + (2 * alignment) + sizeof(size_t);

    // use malloc to allocate the memory.
    char *data = (char *) malloc(sizeof(char) * total_size);

    if (data)
    {
      // store the original start of the malloc'd data.
      const void * const data_start = data;

      // dedicate enough space to the book-keeping.
      data += sizeof(size_t);

      // find a memory location with correct alignment. the alignment minus
      // the remainder of this mod operation is how many bytes forward we need
      // to move to find an aligned byte (a full `alignment` step when data is
      // already aligned).
      const size_t offset = alignment - (((size_t)data) % alignment);

      // set data to the aligned memory.
      data += offset;

      // write the book-keeping.
      size_t *book_keeping = (size_t*)(data - sizeof(size_t));
      *book_keeping = (size_t)data_start;
    } else throw "Failure in malloc_aligned"; // freak out

    return data;
  }

  // Release a block obtained from malloc_aligned(); a null pointer is ignored.
  void free_aligned(void *raw_data)
  {
    if (raw_data)
    {
      char *data = (char *) raw_data;

      // we have to assume this memory was allocated with malloc_aligned.
      // this means the sizeof(size_t) bytes before data are the book-keeping
      // which points to the location we need to pass to free.
      data -= sizeof(size_t);

      // set data to the location stored in book-keeping.
      data = (char*)(*((size_t*)data));

      // free the memory.
      free(data);
    }
  }
};

#endif
--------------------------------------------------------------------------------
/src/main/scala/fpgatidbits/interfaces/AsymPipelinedDualPortBRAM.scala:
--------------------------------------------------------------------------------
package fpgatidbits.ocm

import chisel3._
import chisel3.util._

// Dual-port pipelined BRAM with asymmetric r/w widths.
// p supplies the read/write widths, depths and the address width; regIn and
// regOut are passed straight through to the underlying PipelinedDualPortBRAM
// instances.
class AsymPipelinedDualPortBRAM(
  p: OCMParameters, regIn: Int, regOut: Int
) extends Module {
  val ocmParams = p
  // ports must be wrapped in IO(...) under chisel3
  val io = IO(new Bundle {
    val ports = Vec(2, new OCMSlaveIF(p.writeWidth, p.readWidth, p.addrWidth))
  })
  // note that we assume the user left-shifts the address as necessary
  // (this is what Xilinx BRAMs assume as well -- not sure about Altera)
  // e.g the address is always given in the terms of the smallest granularity

  // calculate the minimum of required port widths & instantiate memory
  // according to this
  val minWidth = math.min(p.writeWidth, p.readWidth)
  val maxWidth = math.max(p.writeWidth, p.readWidth)
  val asymRatio = maxWidth / minWidth
  val memBits = p.writeDepth * p.writeWidth
  // address bits at minWidth granularity, split into "which unit" (low bits)
  // and "where inside the unit" (high bits)
  val addr_bits = log2Ceil(memBits / minWidth)
  val addrOfUnit_bits = log2Ceil(asymRatio)
  val addrInUnit_bits = addr_bits - addrOfUnit_bits
  if(asymRatio == 1) {
    // just instantiate a regular PipelinedDualPortBRAM
    val mem = Module(new PipelinedDualPortBRAM(
      addrBits = addr_bits, dataBits = minWidth, regIn = regIn,
      regOut = regOut
    )).io

    mem <> io
  } else {
    Predef.assert(asymRatio > 1)
    Predef.assert(isPow2(asymRatio))
    Predef.assert(p.writeDepth * p.writeWidth == p.readDepth * p.readWidth)
    // instantiate unit-sized mems, based on the minimum width
    val mem = VecInit(Seq.fill(asymRatio)(
      Module(new PipelinedDualPortBRAM(
        addrBits = addrInUnit_bits, dataBits = minWidth, regIn = regIn,
        regOut = regOut
      )).io
    ))

    // use both ports
    for(pn <- 0 until 2) {
      val addrOfUnit = io.ports(pn).req.addr(addrOfUnit_bits-1, 0)
      val addrInUnit = io.ports(pn).req.addr(addr_bits-1, addrOfUnit_bits)
      // all units get the same addrInUnit
      mem.foreach(_.ports(pn).req.addr := addrInUnit)
      // rest of logic depends on whether read or write width is smaller
      if(p.readWidth > p.writeWidth) {
        // small writes, large reads
        // all units get the same write data
        mem.foreach(_.ports(pn).req.writeData := io.ports(pn).req.writeData)
        // only right unit gets write enable
        for(j <- 0 until asymRatio) {
          mem(j).ports(pn).req.writeEn := io.ports(pn).req.writeEn & (j.U === addrOfUnit)
        }
        // read data is several unit reads concatenated
        io.ports(pn).rsp.readData := Cat(mem.map(_.ports(pn).rsp.readData).reverse)
      } else {
        // small reads, large writes
        // all ports get the same write enable
        mem.foreach(_.ports(pn).req.writeEn := io.ports(pn).req.writeEn)
        // split up the write data between units
        for(j <- 0 until asymRatio) {
          mem(j).ports(pn).req.writeData := io.ports(pn).req.writeData((j+1)*minWidth-1, j*minWidth)
        }
        // read data is returned from the correct unit
        // NOTE(review): the unit is selected with the *current* request
        // address, not one delayed by the read latency -- presumably callers
        // hold the address until the data returns; confirm.
        io.ports(pn).rsp.readData := mem(addrOfUnit).ports(pn).rsp.readData
      }
    }
  }
}
--------------------------------------------------------------------------------
/src/main/resources/cpp/platform-wrapper-regdriver/platform-genericsdaccel.cpp:
--------------------------------------------------------------------------------
/******************************************************************************
 * Copyright (c) 2018, Xilinx, Inc.
 * All rights reserved.
 * Author: Yaman Umuroglu
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR 24 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 25 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 26 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 27 | * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 28 | * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 29 | * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 30 | * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | * 32 | *****************************************************************************/ 33 | /****************************************************************************** 34 | * 35 | * 36 | * @file platform-genericsdaccel.cpp 37 | * 38 | * Definition of the platform init-deinit functions 39 | * Call initPlatform() at the start of your program to 40 | * get a WrapperRegDriver handle 41 | * 42 | * 43 | *****************************************************************************/ 44 | #include 45 | #include 46 | 47 | #include 48 | 49 | #include "platform.h" 50 | #include "xclhalwrapper.hpp" 51 | 52 | extern "C" { 53 | #include 54 | #include 55 | #include 56 | #include 57 | } 58 | 59 | static XCLHalWrapperRegDriver* platform = 0; 60 | 61 | void platformSIGINTHandler(int signum) { 62 | std::cout << "Caught SIGINT, forcing exit" << std::endl; 63 | if(platform) { 64 | platform->detach(); 65 | } 66 | delete platform; 67 | exit(1); 68 | } 69 | 70 | WrapperRegDriver* initPlatform() { 71 | if (!platform) { 72 | platform = new XCLHalWrapperRegDriver(); 73 | } 74 | 75 | struct sigaction action; 76 | std::memset(&action, 0, sizeof(struct sigaction)); 77 | action.sa_handler = &platformSIGINTHandler; 78 | int res = sigaction(SIGINT, &action, NULL); 79 | 80 | return static_cast(platform); 81 | } 82 | 83 | void deinitPlatform(WrapperRegDriver* driver) { 84 | delete platform; 85 | platform = 0; 86 | } 87 | 
--------------------------------------------------------------------------------
/src/main/resources/script/hls_syn.tcl:
--------------------------------------------------------------------------------
# /*******************************************************************************
# Copyright (c) 2018, Xilinx, Inc.
# All rights reserved.
# Author: Yaman Umuroglu
#
# Redistribution and use in source and binary forms, with or without modification,
# are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
#
# 3. Neither the name of the copyright holder nor the names of its contributors
# may be used to endorse or promote products derived from this software
# without specific prior written permission.
#
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
# IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
# INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# *******************************************************************************/

# quick-and-dirty tcl script for single-file HLS synthesis
# start reading args from index 2 since vivado HLS also passes -f tclname here
#
# expected arguments (argv indices 2..7):
#   2: project name          3: HW source file     4: part
#   5: clock period (ns)     6: top-level function 7: include dirs (optional,
#                                                     whitespace-separated)

# indices 2..6 are mandatory; fail with a clear message instead of letting
# open_project/add_files run with empty strings
if {[llength $::argv] < 7} {
    puts stderr "hls_syn.tcl: expected <proj_name> <hwsrc> <part> <clkperiod> <toplevelfxn> ?incldirs?"
    exit 1
}

set config_proj_name [lindex $::argv 2]
set config_hwsrc [lindex $::argv 3]
set config_proj_part [lindex $::argv 4]
set config_clkperiod [lindex $::argv 5]
set config_toplevelfxn [lindex $::argv 6]
set config_incldirs [lindex $::argv 7]

puts "HLS project: $config_proj_name"
puts "HW source file: $config_hwsrc"
puts "Part: $config_proj_part"
puts "Clock period: $config_clkperiod ns"
puts "Top level function name: $config_toplevelfxn"
puts "Include dirs: $config_incldirs"

# split the include-dir argument on whitespace and prefix every entry with -I.
# a plain foreach is used rather than defining a global proc named "lmap":
# Tcl 8.6 has a builtin lmap with a different signature, and redefining it
# would clobber the builtin for the rest of the interpreter.
set inclDirList [regexp -inline -all -- {\S+} $config_incldirs]
set includeDirs [list]
foreach inclDir $inclDirList {
    lappend includeDirs [format -I%s $inclDir]
}

puts "inclDirList: $inclDirList"
puts "includeDirs: $includeDirs"

# set up project
open_project $config_proj_name
add_files $config_hwsrc -cflags "-std=c++0x $includeDirs"
set_top $config_toplevelfxn
open_solution sol1
set_part $config_proj_part
config_compile -name_max_length 300

# use 64-bit AXI MM addresses
config_interface -m_axi_addr64

# synthesize
create_clock -name clk -period $config_clkperiod
csynth_design
exit 0
--------------------------------------------------------------------------------
/src/main/scala/fpgatidbits/streams/StreamCAM.scala:
--------------------------------------------------------------------------------
package fpgatidbits.streams

import chisel3._
import chisel3.util._

// StreamCAM: a CAM with stream-style (DecoupledIO) interfaces.
// a CAM with <entries> slots, each slot <tag_bits> wide, is instantiated.
// the <in> interface is used for insertions, and the <rm> interface for
// removals. if the <in> interface blocks (!ready), either the CAM is full
// (indicated by the <full> signal) or the insertion candidate is already
// present (indicated by the <hazard> signal)

class StreamCAM(entries: Int, tag_bits: Int) extends Module {
  val io = IO(new Bundle {
    val hazard = Output(Bool())
    val full = Output(Bool())
    val in = Flipped(Decoupled(UInt(tag_bits.W)))
    val rm = Flipped(Decoupled(UInt(tag_bits.W)))
  })

  val cam = Module(new CAM(entries, tag_bits)).io

  // lookup + insertion path: the candidate tag is both searched for and
  // offered as the value to store
  cam.tag := io.in.bits
  cam.write_tag := io.in.bits
  // an insertion is accepted only if the tag is not present and a slot is free
  val canInsert = !cam.hit && cam.hasFree
  cam.write := io.in.valid && canInsert
  io.in.ready := canInsert

  // status outputs explaining why <in> may be stalled
  io.full := !cam.hasFree
  io.hazard := io.in.valid && cam.hit

  // removal path: <rm> is ready whenever its tag matches a stored entry
  cam.clear_tag := io.rm.bits
  cam.clear_hit := io.rm.valid
  io.rm.ready := cam.is_clear_hit
}


// adapted from J. Bachrach's "Advanced Chisel" slides
// interface & implementation for a combinational content-addressable memory

// Port bundle of the CAM. addr_bits is accepted but not used by any field.
class CAMIO(entries: Int, addr_bits: Int, tag_bits: Int) extends Bundle {
  // removal: request strobe, "tag found" response, tag to remove
  val clear_hit = Input(Bool())
  val is_clear_hit = Output(Bool())
  val clear_tag = Input(UInt(tag_bits.W))

  // lookup: tag to search, any-match flag, per-slot match bits
  val tag = Input(UInt(tag_bits.W))
  val hit = Output(Bool())
  val hits = Output(UInt(entries.W))
  // per-slot occupancy bits
  val valid_bits = Output(UInt(entries.W))
  // insertion: strobe and tag to store
  val write = Input(Bool())
  val write_tag = Input(UInt(tag_bits.W))
  // free-slot status: whether one exists, and its index
  val hasFree = Output(Bool())
  val freeInd = Output(UInt(log2Ceil(entries).W))

}

// TODO make the CAM search/match function customizable?
// (e.g compare only a subset of tag bits or such)

// Content-addressable memory with <entries> slots of <tag_bits> bits each.
// Lookup (tag/hit/hits) and removal matching (clear_tag/is_clear_hit) are
// compared against every valid slot; write stores write_tag into the lowest
// free slot. A write and a clear may happen in the same cycle.
class CAM(entries: Int, tag_bits: Int) extends Module {
  val addr_bits = log2Up(entries)
  val io = IO(new CAMIO(entries, addr_bits, tag_bits))
  // tag storage. Mem (combinational read) rather than SyncReadMem: every slot
  // is compared against the presented tag in the same cycle, and the compare
  // is gated by the *current* valid bit, so the stored tag must be visible
  // in the cycle right after it was written.
  val cam_tags = Mem(entries, UInt(tag_bits.W))
  // valid (fullness) of each slot in the CAM
  val vb_array = RegInit(0.U(entries.W))
  // hit status for clearing
  val clearHits = VecInit(Seq.tabulate(entries){i => vb_array(i) && cam_tags(i) === io.clear_tag})
  io.is_clear_hit := clearHits.asUInt.orR

  // index of first free slot in the CAM (least significant first)
  val freeLocation = PriorityEncoder(~vb_array)
  io.freeInd := freeLocation
  // whether there are any free slots at all.
  // the parentheses matter: `~vb_array.orR` parses as `~(vb_array.orR)`,
  // which is only true when the CAM is completely empty.
  io.hasFree := (~vb_array).orR

  // produce masks to allow simultaneous write+clear
  val writeMask = Mux(io.write, UIntToOH(freeLocation), (0.U(entries.W)))
  val clearMask = Mux(io.clear_hit, (~clearHits.asUInt).asUInt, (~0.U(entries.W)).asUInt)

  vb_array := ((vb_array | writeMask) & clearMask).asUInt

  // NOTE(review): the write is not gated by hasFree here; the user of the
  // CAM is presumably expected to check hasFree before asserting write.
  when (io.write) { cam_tags(freeLocation) := io.write_tag }

  // per-slot lookup result: slot is valid and holds the searched tag
  val hits = VecInit(Seq.tabulate(entries) { (i => vb_array(i) && cam_tags(i) === io.tag) })
  io.valid_bits := vb_array
  io.hits := hits.asUInt
  io.hit := io.hits.orR
}
--------------------------------------------------------------------------------
/src/main/resources/cpp/platform-wrapper-regdriver/platform-xlnk.cpp:
--------------------------------------------------------------------------------
// adapted from the xlnk driver in BNN-PYNQ:
// https://github.com/Xilinx/BNN-PYNQ

/******************************************************************************
 * Copyright (c) 2016, Xilinx, Inc.
 * All rights reserved.
7 | * 8 | * Redistribution and use in source and binary forms, with or without 9 | * modification, are permitted provided that the following conditions are met: 10 | * 11 | * 1. Redistributions of source code must retain the above copyright notice, 12 | * this list of conditions and the following disclaimer. 13 | * 14 | * 2. Redistributions in binary form must reproduce the above copyright 15 | * notice, this list of conditions and the following disclaimer in the 16 | * documentation and/or other materials provided with the distribution. 17 | * 18 | * 3. Neither the name of the copyright holder nor the names of its 19 | * contributors may be used to endorse or promote products derived from 20 | * this software without specific prior written permission. 21 | * 22 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 23 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 24 | * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 25 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR 26 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 27 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 28 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 29 | * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 30 | * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 31 | * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 32 | * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
33 | * 34 | *****************************************************************************/ 35 | /****************************************************************************** 36 | * 37 | * 38 | * @file platform-xlnk.cpp 39 | * 40 | * Definition of the platform init-deinit functions 41 | * Call initPlatform() at the start of your program to 42 | * get a WrapperRegDriver handle 43 | * 44 | * 45 | *****************************************************************************/ 46 | #include 47 | #include 48 | 49 | #include 50 | 51 | #include "platform.h" 52 | #include "xlnkdriver.hpp" 53 | 54 | extern "C" { 55 | #include 56 | #include 57 | #include 58 | #include 59 | } 60 | 61 | static XlnkDriver* platform = 0; 62 | 63 | void platformSIGINTHandler(int signum) { 64 | std::cout << "Caught SIGINT, forcing exit" << std::endl; 65 | if(platform) { 66 | platform->detach(); 67 | } 68 | delete platform; 69 | exit(1); 70 | } 71 | 72 | WrapperRegDriver* initPlatform() { 73 | if (!platform) { 74 | platform = new XlnkDriver(0x43c00000, 64 * 1024); 75 | } 76 | 77 | struct sigaction action; 78 | std::memset(&action, 0, sizeof(struct sigaction)); 79 | action.sa_handler = &platformSIGINTHandler; 80 | int res = sigaction(SIGINT, &action, NULL); 81 | 82 | return static_cast(platform); 83 | } 84 | 85 | void deinitPlatform(WrapperRegDriver* driver) { 86 | delete platform; 87 | platform = 0; 88 | } 89 | 90 | void loadBitfile(const char* accelName) { 91 | // TODO add bitfile loader here, if desired 92 | } 93 | -------------------------------------------------------------------------------- /src/main/resources/cpp/platform-wrapper-regdriver/platform-mpsoc-xlnk.cpp: -------------------------------------------------------------------------------- 1 | // adapted from the xlnk driver in BNN-PYNQ: 2 | // https://github.com/Xilinx/BNN-PYNQ 3 | 4 | /****************************************************************************** 5 | * Copyright (c) 2016, Xilinx, Inc. 6 | * All rights reserved. 
7 | * 8 | * Redistribution and use in source and binary forms, with or without 9 | * modification, are permitted provided that the following conditions are met: 10 | * 11 | * 1. Redistributions of source code must retain the above copyright notice, 12 | * this list of conditions and the following disclaimer. 13 | * 14 | * 2. Redistributions in binary form must reproduce the above copyright 15 | * notice, this list of conditions and the following disclaimer in the 16 | * documentation and/or other materials provided with the distribution. 17 | * 18 | * 3. Neither the name of the copyright holder nor the names of its 19 | * contributors may be used to endorse or promote products derived from 20 | * this software without specific prior written permission. 21 | * 22 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 23 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 24 | * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 25 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR 26 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 27 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 28 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 29 | * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 30 | * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 31 | * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 32 | * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
33 | * 34 | *****************************************************************************/ 35 | /****************************************************************************** 36 | * 37 | * 38 | * @file platform-xlnk.cpp 39 | * 40 | * Definition of the platform init-deinit functions 41 | * Call initPlatform() at the start of your program to 42 | * get a WrapperRegDriver handle 43 | * 44 | * 45 | *****************************************************************************/ 46 | #include 47 | #include 48 | 49 | #include 50 | 51 | #include "platform.h" 52 | #include "xlnkdriver.hpp" 53 | 54 | extern "C" { 55 | #include 56 | #include 57 | #include 58 | #include 59 | } 60 | 61 | static XlnkDriver* platform = 0; 62 | 63 | void platformSIGINTHandler(int signum) { 64 | std::cout << "Caught SIGINT, forcing exit" << std::endl; 65 | if(platform) { 66 | platform->detach(); 67 | } 68 | delete platform; 69 | exit(1); 70 | } 71 | 72 | WrapperRegDriver* initPlatform() { 73 | if (!platform) { 74 | platform = new XlnkDriver(0xa0000000, 64 * 1024); 75 | } 76 | 77 | struct sigaction action; 78 | std::memset(&action, 0, sizeof(struct sigaction)); 79 | action.sa_handler = &platformSIGINTHandler; 80 | int res = sigaction(SIGINT, &action, NULL); 81 | 82 | return static_cast(platform); 83 | } 84 | 85 | void deinitPlatform(WrapperRegDriver* driver) { 86 | delete platform; 87 | platform = 0; 88 | } 89 | 90 | void loadBitfile(const char* accelName) { 91 | // TODO add bitfile loader here, if desired 92 | } 93 | -------------------------------------------------------------------------------- /src/main/scala/fpgatidbits/math/MathDef.scala: -------------------------------------------------------------------------------- 1 | package fpgatidbits.math 2 | 3 | import chisel3._ 4 | import chisel3.util._ 5 | 6 | class BinaryMathOperands(val w: Int) extends Bundle { 7 | val first = UInt(w.W) 8 | val second = UInt(w.W) 9 | 10 | } 11 | 12 | object BinaryMathOperands { 13 | def apply(first: UInt, second: 
UInt) = { 14 | if(first.getWidth != second.getWidth) { 15 | throw new Exception("Operand widths do not match") 16 | } 17 | val sop = new BinaryMathOperands(first.getWidth) 18 | sop.first := first 19 | sop.second := second 20 | sop 21 | } 22 | } 23 | 24 | class BinaryMathOpIO(w: Int) extends Bundle { 25 | val in = Flipped(Decoupled(new BinaryMathOperands(w))) 26 | val out = Decoupled(UInt(w.W)) 27 | } 28 | 29 | // abstract base class for binary operators 30 | // exposes a Valid-wrapped (UInt, UInt) => UInt interface, and the op latency 31 | abstract class BinaryMathOp(val w: Int) extends Module { 32 | val io = new BinaryMathOpIO(w) 33 | def latency: Int 34 | } 35 | 36 | // systolic reg to parametrize op stages flexibly 37 | // this essentially behaves like a single-element queue with no 38 | // fallthrough, so it can be used to add forced latency to an op 39 | // the ready signal is still combinatorially linked to allow fast 40 | // handshakes, like a Chisel queue with pipe=true flow=false 41 | // supports defining a transfer function from input to output, to support 42 | // building decoupled pipelined operators 43 | // parameters: 44 | 45 | class SystolicRegParams[TI <: Data, TO <: Data]( 46 | val tIn: TI, // wIn: width of input stream in bits 47 | val tOut: TO, // wOut: width of output stream in bits 48 | val fxn: TI => TO // fxn: function to apply on the way out 49 | ) 50 | 51 | class SystolicReg[TI <: Data, TO <: Data](val p: SystolicRegParams[TI, TO]) 52 | extends Module { 53 | val io = new Bundle { 54 | val in = Flipped(Decoupled(p.tIn.cloneType)) 55 | val out = Decoupled(p.tOut.cloneType) 56 | } 57 | val regValid = RegInit(false.B) 58 | val resetVal = 0.U(p.tOut.getWidth.W) 59 | val regData: TO = RegInit[TO](resetVal.asInstanceOf[TO]) 60 | val allowNewData = (!regValid || io.out.ready) 61 | 62 | io.out.bits := regData 63 | io.out.valid := regValid 64 | io.in.ready := allowNewData 65 | // somehow this needs to be outside the when (mux) below, 66 | // 
otherwise Chisel complains about "no default value on wire" 67 | val updData: TO = p.fxn(io.in.bits) 68 | 69 | when(allowNewData) { 70 | regData := updData 71 | regValid := io.in.valid 72 | } 73 | } 74 | 75 | // convenience constructor for SystolicReg 76 | object SystolicReg { 77 | def apply(w: Int) = { 78 | val uintP = new SystolicRegParams[UInt, UInt]( 79 | UInt(w.W), UInt(w.W), fxn = {x: UInt => x} 80 | ) 81 | Module(new SystolicReg[UInt, UInt](uintP)).io 82 | } 83 | def apply[TI <: Data, TO <: Data](tIn: TI, tOut: TO, fxn: TI => TO) = { 84 | val p = new SystolicRegParams[TI,TO](tIn, tOut, fxn) 85 | Module(new SystolicReg[TI,TO](p)).io 86 | } 87 | 88 | def apply[TI <: Data, TO <: Data](tIn: TI, tOut: TO, fxn: TI => TO, 89 | in: DecoupledIO[TI] 90 | ) = { 91 | val p = new SystolicRegParams[TI,TO](tIn, tOut, fxn) 92 | val mod = Module(new SystolicReg[TI,TO](p)).io 93 | in <> mod.in 94 | mod.out 95 | } 96 | } -------------------------------------------------------------------------------- /src/main/resources/cpp/platform-wrapper-tests/ExampleGather.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | using namespace std; 6 | 7 | #include "TestGather.hpp" 8 | #include "platform.h" 9 | 10 | typedef uint64_t AccelWord; 11 | typedef uint32_t RandAccInd; 12 | 13 | RandAccInd * readInfData(std::string fileName, unsigned int & numInds) { 14 | FILE *f = fopen(fileName.c_str(), "rb"); 15 | if(!f) throw (std::string("Could not open file: ") + fileName).c_str(); 16 | fseek(f, 0, SEEK_END); 17 | unsigned int fsize = ftell(f); 18 | fseek(f, 0, SEEK_SET); 19 | 20 | numInds = fsize / sizeof(RandAccInd); 21 | 22 | RandAccInd * buf = new RandAccInd[numInds]; 23 | unsigned int r = fread((void *) buf, 1, fsize, f); 24 | 25 | if(r != fsize) throw "Read error"; 26 | 27 | fclose(f); 28 | 29 | return buf; 30 | } 31 | 32 | bool Run_TestGather(WrapperRegDriver * platform) { 33 | TestGather t(platform); 
34 | 35 | cout << "Signature: " << hex << t.get_signature() << dec << endl; 36 | unsigned int numVals, numInds; 37 | cout << "Number of values to generate: "; 38 | cin >> numVals; 39 | 40 | // allocate memory and generate indices with predictable structure 41 | AccelWord * hostBufVal = new AccelWord[numVals]; 42 | unsigned int valbufsize = numVals * sizeof(AccelWord); 43 | for(unsigned int i = 0; i < numVals; i++) { hostBufVal[i] = i; } 44 | 45 | void * accelBufVal = platform->allocAccelBuffer(valbufsize); 46 | platform->copyBufferHostToAccel(hostBufVal, accelBufVal, valbufsize); 47 | 48 | t.set_valsBase((AccelDblReg) accelBufVal); 49 | 50 | // read random access indices from a file and copy into accel memory 51 | cout << "Enter filename to get rand.acc. indices (or eye): "; 52 | string indsFileName; 53 | cin >> indsFileName; 54 | 55 | RandAccInd * hostBufInds; 56 | if(indsFileName == "eye") { 57 | numInds = numVals; 58 | hostBufInds = new RandAccInd[numInds]; 59 | for(unsigned int i = 0; i < numInds; i++) { hostBufInds[i] = i; } 60 | } else { 61 | hostBufInds = readInfData(indsFileName, numInds); 62 | } 63 | 64 | unsigned int indsbufsize = numInds * sizeof(RandAccInd); 65 | 66 | void * accelBufInds = platform->allocAccelBuffer(indsbufsize); 67 | platform->copyBufferHostToAccel(hostBufInds, accelBufInds, indsbufsize); 68 | 69 | t.set_indsBase((AccelDblReg) accelBufInds); 70 | t.set_count((AccelReg) numInds); 71 | 72 | cout << "Starting accelerator..." 
<< endl; 73 | 74 | t.set_start(1); 75 | 76 | while(t.get_finished() != 1); 77 | 78 | cout << "Passed: " << t.get_resultsOK() << endl; 79 | cout << "Failed: " << t.get_resultsNotOK() << endl; 80 | 81 | // display performance counters 82 | cout << endl << "Performance counters: " << endl << "=====================" << endl; 83 | map> regMap = t.getStatusRegs(); 84 | string prefix = "perf_"; 85 | 86 | for(auto & keyVal : regMap) { 87 | if(keyVal.first.substr(0, prefix.size()) == prefix) 88 | cout << keyVal.first << " : " << t.readStatusReg(keyVal.first) << endl; 89 | } 90 | 91 | t.set_start(0); 92 | 93 | platform->deallocAccelBuffer(accelBufInds); 94 | platform->deallocAccelBuffer(accelBufVal); 95 | delete [] hostBufVal; 96 | delete [] hostBufInds; 97 | 98 | return true; 99 | } 100 | 101 | int main() 102 | { 103 | WrapperRegDriver * platform = initPlatform(); 104 | Run_TestGather(platform); 105 | deinitPlatform(platform); 106 | return 0; 107 | } 108 | -------------------------------------------------------------------------------- /src/main/resources/cpp/platform-wrapper-regdriver/platform-mpsoc-cc-xlnk.cpp: -------------------------------------------------------------------------------- 1 | // adapted from the xlnk driver in BNN-PYNQ: 2 | // https://github.com/Xilinx/BNN-PYNQ 3 | 4 | /****************************************************************************** 5 | * Copyright (c) 2016, Xilinx, Inc. 6 | * All rights reserved. 7 | * 8 | * Redistribution and use in source and binary forms, with or without 9 | * modification, are permitted provided that the following conditions are met: 10 | * 11 | * 1. Redistributions of source code must retain the above copyright notice, 12 | * this list of conditions and the following disclaimer. 13 | * 14 | * 2. Redistributions in binary form must reproduce the above copyright 15 | * notice, this list of conditions and the following disclaimer in the 16 | * documentation and/or other materials provided with the distribution. 
17 | * 18 | * 3. Neither the name of the copyright holder nor the names of its 19 | * contributors may be used to endorse or promote products derived from 20 | * this software without specific prior written permission. 21 | * 22 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 23 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 24 | * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 25 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR 26 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 27 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 28 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 29 | * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 30 | * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 31 | * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 32 | * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
33 | * 34 | *****************************************************************************/ 35 | /****************************************************************************** 36 | * 37 | * 38 | * @file platform-xlnk.cpp 39 | * 40 | * Definition of the platform init-deinit functions, assumes cache coherency 41 | * Call initPlatform() at the start of your program to 42 | * get a WrapperRegDriver handle 43 | * 44 | * 45 | *****************************************************************************/ 46 | #include 47 | #include 48 | 49 | #include 50 | 51 | #include "platform.h" 52 | #include "xlnkdriver.hpp" 53 | 54 | extern "C" { 55 | #include 56 | #include 57 | #include 58 | #include 59 | } 60 | 61 | static XlnkDriver* platform = 0; 62 | 63 | void platformSIGINTHandler(int signum) { 64 | std::cout << "Caught SIGINT, forcing exit" << std::endl; 65 | if(platform) { 66 | platform->detach(); 67 | } 68 | delete platform; 69 | exit(1); 70 | } 71 | 72 | WrapperRegDriver* initPlatform() { 73 | if (!platform) { 74 | platform = new XlnkDriver(0xa0000000, 64 * 1024, true); 75 | } 76 | 77 | struct sigaction action; 78 | std::memset(&action, 0, sizeof(struct sigaction)); 79 | action.sa_handler = &platformSIGINTHandler; 80 | int res = sigaction(SIGINT, &action, NULL); 81 | 82 | return static_cast(platform); 83 | } 84 | 85 | void deinitPlatform(WrapperRegDriver* driver) { 86 | delete platform; 87 | platform = 0; 88 | } 89 | 90 | void loadBitfile(const char* accelName) { 91 | // TODO add bitfile loader here, if desired 92 | } 93 | -------------------------------------------------------------------------------- /src/main/scala/fpgatidbits/streams/AXIStreamUpsizer.scala: -------------------------------------------------------------------------------- 1 | package fpgatidbits.streams 2 | 3 | import chisel3._ 4 | import chisel3.util._ 5 | import fpgatidbits.axi._ 6 | 7 | class SerialInParallelOutIO(parWidth: Int, serWidth: Int) extends Bundle { 8 | val serIn = Input(UInt(serWidth.W)) 9 
/** Serial-in, parallel-out shift register.
  *
  * Holds `parWidth / serWidth` stages of `serWidth` bits each, all reset to 0.
  * While `shiftEn` is high, every stage takes the value of the stage above it
  * and the highest stage takes `serIn`. `parOut` is the concatenation of all
  * stages with the highest stage in the most significant bits, so the element
  * shifted in first ends up in the least significant bits once the register
  * is full.
  */
class SerialInParallelOut(parWidth: Int, serWidth: Int) extends Module {
  val numShiftSteps = parWidth/serWidth

  val io = IO(new SerialInParallelOutIO(parWidth, serWidth))
  val stages = RegInit(VecInit(Seq.fill(numShiftSteps)(0.U(serWidth.W))))

  when (io.shiftEn) {
    // fill highest stage from serial input
    stages(numShiftSteps-1) := io.serIn
    // shift all stages to the right
    for(i <- 0 until numShiftSteps-1) {
      stages(i) := stages(i+1)
    }
  }
  // Cat puts its first argument in the most significant bits, so reverse the
  // Vec to get stage N-1 .. stage 0 from MSB to LSB
  io.parOut := Cat(stages.reverse)
}


/** Collects `outWidth / inWidth` consecutive input beats into one wide output
  * beat.
  *
  * Two-state FSM: in `sWaitInput` the module accepts input beats and shifts
  * them into a [[SerialInParallelOut]]; after the last beat of a group it
  * moves to `sWaitOutput`, where the assembled word is presented on `out`
  * until it is accepted. No input is accepted while an output is pending.
  *
  * @param inWidth  width of the input stream in bits, must be < outWidth
  * @param outWidth width of the output stream in bits
  */
class AXIStreamUpsizer(inWidth: Int, outWidth: Int) extends Module {
  // Fail elaboration with an exception instead of killing the whole JVM
  // (System.exit would also take down sbt or a test runner).
  require(inWidth < outWidth, "AXIStreamUpsizer needs inWidth < outWidth")

  val io = IO(new Bundle {
    val in = Flipped(new AXIStreamIF(UInt(inWidth.W)))
    val out = new AXIStreamIF(UInt(outWidth.W))
  })
  val numShiftSteps = outWidth/inWidth
  val shiftReg = Module(new SerialInParallelOut(outWidth, inWidth)).io
  shiftReg.serIn := io.in.bits
  shiftReg.shiftEn := false.B

  // defaults, overridden per state below
  io.in.ready := false.B
  io.out.valid := false.B
  io.out.bits := shiftReg.parOut

  val sWaitInput :: sWaitOutput :: Nil = Enum(2)
  val regState = RegInit(sWaitInput)

  // Counts beats acquired for the current output word; it reaches
  // numShiftSteps just before being cleared, so size it to hold that value.
  val regAcquiredStages = RegInit(0.U(log2Ceil(numShiftSteps + 1).W))
  // true while the beat currently being offered is the last one of the group
  val readyForOutput = (regAcquiredStages === (numShiftSteps-1).U)

  switch(regState) {
    is(sWaitInput) {
      io.in.ready := true.B
      when (io.in.valid) {
        shiftReg.shiftEn := true.B
        regAcquiredStages := regAcquiredStages + 1.U
        regState := Mux(readyForOutput, sWaitOutput, sWaitInput)
      }
    }
    is(sWaitOutput) {
      io.out.valid := true.B
      when (io.out.ready) {
        regAcquiredStages := 0.U
        regState := sWaitInput
      }
    }
  }
}
/** Convenience constructor: instantiates an [[AXIStreamUpsizer]] whose input
  * width is taken from `in`, connects `in` to it and returns the widened
  * output stream.
  */
object StreamUpsizer {
  def apply(in: DecoupledIO[UInt], outW: Int): DecoupledIO[UInt] = {
    val inW = in.bits.getWidth
    val upsizer = Module(new AXIStreamUpsizer(inW, outW))
    upsizer.io.in <> in
    upsizer.io.out
  }
}
49 | t.set_ports_0_req_writeEn(1); 50 | t.set_ports_0_req_writeEn(0); 51 | 52 | t.set_ports_1_req_addr(addr); 53 | read_data = t.get_ports_1_rsp_readData(); 54 | } else { 55 | t.set_ports_1_req_addr(addr); 56 | t.set_ports_1_req_writeData(data); 57 | t.set_ports_1_req_writeMask_0(writeMask & 1); 58 | t.set_ports_1_req_writeMask_1((writeMask & 2) >> 1); 59 | t.set_ports_1_req_writeMask_2((writeMask & 4) >> 2); 60 | t.set_ports_1_req_writeMask_3((writeMask & 8) >> 3); 61 | t.set_ports_1_req_writeEn(1); 62 | t.set_ports_1_req_writeEn(0); 63 | 64 | t.set_ports_0_req_addr(addr); 65 | read_data = t.get_ports_0_rsp_readData(); 66 | } 67 | 68 | if (read_data == expected_read_data) { 69 | passed_tests++; 70 | 71 | // Then we have to write back zeros 72 | t.set_ports_1_req_addr(addr); 73 | t.set_ports_1_req_writeData(0); 74 | t.set_ports_1_req_writeMask_0(1); 75 | t.set_ports_1_req_writeMask_1(1); 76 | t.set_ports_1_req_writeMask_2(1); 77 | t.set_ports_1_req_writeMask_3(1); 78 | t.set_ports_1_req_writeEn(1); 79 | t.set_ports_1_req_writeEn(0); 80 | 81 | t.set_ports_0_req_addr(addr); 82 | t.set_ports_0_req_writeData(0); 83 | t.set_ports_0_req_writeMask_0(1); 84 | t.set_ports_0_req_writeMask_1(1); 85 | t.set_ports_0_req_writeMask_2(1); 86 | t.set_ports_0_req_writeMask_3(1); 87 | t.set_ports_0_req_writeEn(1); 88 | t.set_ports_0_req_writeEn(0); 89 | 90 | } else { 91 | cout <<"ExampleBRAM failed for addr=" < 8 | } 9 | 10 | #include 11 | #include "wrapperregdriver.h" 12 | #include 13 | using namespace std; 14 | 15 | void *memset(void *dst, int c, size_t n) 16 | { 17 | if (n) { 18 | char *d = (char *)dst; 19 | 20 | do { 21 | *d++ = c; 22 | } while (--n); 23 | } 24 | return dst; 25 | } 26 | 27 | // TODO add error checks on all cny_fwd* functions 28 | 29 | class WolverineRegDriver : public WrapperRegDriver 30 | { 31 | public: 32 | virtual std::string platformID() { 33 | return "WolverineDriver"; 34 | } 35 | 36 | virtual void attach(const char * name) { 37 | m_coproc = WDM_INVALID; 38 
| 39 | // reserve and attach to the coprocessor 40 | m_coproc = wdm_reserve(WDM_CPID_ANY, NULL); 41 | 42 | if (m_coproc == WDM_INVALID) { 43 | throw "Unable to reserve coprocessor"; 44 | return; 45 | } 46 | 47 | if (wdm_attach(m_coproc, name)) { 48 | throw "Unable to load personality"; 49 | return; 50 | } 51 | 52 | // open connection to the firmware daemon 53 | if(cny_fwd_open() != 0) 54 | throw "cny_fwd_open failed"; 55 | // enable access to CSRs for CSR read/write 56 | if(cny_fwd_cmd((char *)"fpga sca aemc0 1\n", PEEKPOKE_DEFAULT, NULL, NULL) != 0) 57 | throw "cny_fwd_cmd failed!"; 58 | 59 | // do an instruction dispatch, should never return 60 | wdm_dispatch_t ds; 61 | memset((void *)&ds, 0, sizeof(ds)); 62 | if (wdm_dispatch(m_coproc, &ds)) { 63 | throw "Dispatch error"; 64 | return; 65 | } 66 | } 67 | 68 | virtual void detach() { 69 | // hack: do a write to register 0 to unset the busy flag 70 | writeReg(0, 2); 71 | // wait until returned from dispatch? 72 | int stat = 0; 73 | while (!(stat = wdm_dispatch_status(m_coproc))); 74 | 75 | if (stat < 0) { 76 | throw "Dispatch status error"; 77 | } 78 | // close firmware daemon connection and release coprocessor 79 | cny_fwd_close(); 80 | wdm_detach(m_coproc); 81 | wdm_release(m_coproc); 82 | } 83 | 84 | // functions to ensure coherency across host-accelerator 85 | virtual void copyBufferHostToAccel(const void * hostBuffer, void * accelBuffer, unsigned int numBytes) { 86 | if(!wdm_memcpy(m_coproc, accelBuffer, hostBuffer, numBytes)) 87 | throw "Error in copyBufferHostToAccel"; 88 | } 89 | 90 | virtual void copyBufferAccelToHost(const void * accelBuffer, void * hostBuffer, unsigned int numBytes) { 91 | if(!wdm_memcpy(m_coproc, hostBuffer, accelBuffer, numBytes)) 92 | throw "Error in copyBufferAccelToHost"; 93 | } 94 | 95 | virtual void * allocAccelBuffer(unsigned int numBytes) { 96 | void * accelBuf; 97 | if(wdm_posix_memalign(m_coproc, &accelBuf, 64, numBytes) != 0) 98 | throw "Error in allocAccelBuffer"; 99 | 
return accelBuf; 100 | } 101 | 102 | // register access methods for the platform wrapper 103 | virtual void writeReg(unsigned int regInd, AccelReg regValue) { 104 | // TODO support finer-grained writes by adjusting mask 105 | uint64_t v = regValue; 106 | cny_fwd_write((char *)"aemc0", 0x30000 + 0x8 * regInd, v, 0xffffffffffffffff); 107 | } 108 | 109 | virtual AccelReg readReg(unsigned int regInd) { 110 | uint64_t ret = 0; 111 | cny_fwd_read((char *)"aemc0", 0x30000 + 0x8*regInd, &ret); 112 | return (AccelReg) ret; 113 | } 114 | 115 | protected: 116 | wdm_coproc_t m_coproc; 117 | }; 118 | 119 | #endif // WOLVERINEREGDRIVER_H 120 | -------------------------------------------------------------------------------- /src/main/scala/fpgatidbits/examples/ExampleGrayScale.scala: -------------------------------------------------------------------------------- 1 | package fpgatidbits.examples 2 | 3 | import chisel3._ 4 | import chisel3.util.{Decoupled, DecoupledIO} 5 | import fpgatidbits.PlatformWrapper._ 6 | import fpgatidbits.dma._ 7 | 8 | class ExampleGrayScaleIO(p: PlatformWrapperParams) extends GenericAcceleratorIF(1, p) { 9 | val start = Input(Bool()) 10 | val finished = Output(Bool()) 11 | val baseAddr = Input(UInt(64.W)) 12 | val byteCount = Input(UInt(32.W)) 13 | val resBaseAddr = Input(UInt(64.W)) 14 | val resByteCount = Input(UInt(32.W)) 15 | val cycleCount = Output(UInt(32.W)) 16 | } 17 | // read and sum a contiguous stream of 32-bit uints from main memory 18 | class ExampleGrayScale(p: PlatformWrapperParams) extends GenericAccelerator(p) { 19 | val numMemPorts = 1 20 | val io = IO(new ExampleGrayScaleIO(p)) 21 | io.signature := makeDefaultSignature() 22 | 23 | val rdP = new StreamReaderParams( 24 | streamWidth = 24, fifoElems = 8, mem = p.toMemReqParams(), 25 | maxBeats = 1, chanID = 0, disableThrottle = true 26 | ) 27 | 28 | val wrP = new StreamWriterParams( 29 | streamWidth = 8, mem=p.toMemReqParams(), chanID = 0, maxBeats = 1 30 | ) 31 | 32 | val reader = 
/** One RGB pixel, 8 bits per channel. `r` is the first field, so it occupies
  * the most significant byte when a 24-bit UInt is reinterpreted with
  * `asTypeOf`.
  */
class Colour extends Bundle {
  val r = UInt(8.W)
  val g = UInt(8.W)
  val b = UInt(8.W)
}

/** Two-stage pipelined RGB -> grayscale converter with Decoupled handshakes.
  *
  * Stage 1 registers six shifted copies of the channels, stage 2 registers
  * their sum:
  *   gray = r/4 + r/32 + g/2 + g/16 + b/16 + b/32
  * i.e. roughly 0.281*R + 0.563*G + 0.094*B, using only shifts and adds. The
  * largest possible sum is 234, so it fits in 8 bits.
  *
  * The whole pipeline advances together whenever the output register is empty
  * or its content is being accepted. It advances even when no new pixel is
  * offered: a bubble is inserted instead, so every output is presented exactly
  * once and the last pixels of a stream drain without needing further input.
  */
class GrayScaleFilter extends Module {
  val rgbIn = IO(Flipped(Decoupled(new Colour)))
  val grayOut = IO(Decoupled(UInt(8.W)))

  val s1_valid = RegInit(false.B)
  val s1_r1Shifted = RegInit(0.U(8.W))
  val s1_r2Shifted = RegInit(0.U(8.W))
  val s1_g1Shifted = RegInit(0.U(8.W))
  val s1_g2Shifted = RegInit(0.U(8.W))
  val s1_b1Shifted = RegInit(0.U(8.W))
  val s1_b2Shifted = RegInit(0.U(8.W))

  val s2_valid = RegInit(false.B)
  val s2_gray = RegInit(0.U(8.W))

  // The pipeline may move when stage 2 is empty or is being drained this
  // cycle. Equivalent to the previous `!s2_valid || grayOut.fire`.
  val advance = !s2_valid || grayOut.ready

  rgbIn.ready := advance
  grayOut.valid := s2_valid
  grayOut.bits := s2_gray

  when(advance) {
    // Stage 1: capture a new pixel if one is offered, otherwise a bubble
    s1_valid := rgbIn.valid
    when(rgbIn.valid) {
      val rgb = rgbIn.bits
      val (r,g,b) = (rgb.r, rgb.g, rgb.b)
      s1_r1Shifted := (r >> 2).asUInt
      s1_r2Shifted := (r >> 5).asUInt
      s1_g1Shifted := (g >> 1).asUInt
      s1_g2Shifted := (g >> 4).asUInt
      s1_b1Shifted := (b >> 4).asUInt
      s1_b2Shifted := (b >> 5).asUInt
    }

    // Stage 2: takes over whatever stage 1 held, valid or bubble
    s2_valid := s1_valid
    s2_gray := s1_r1Shifted + s1_r2Shifted + s1_g1Shifted + s1_g2Shifted + s1_b1Shifted + s1_b2Shifted
  }
}
wdm_detach(m_coproc); 59 | wdm_release(m_coproc); 60 | } 61 | 62 | void start() { 63 | // do an instruction dispatch, should never return 64 | wdm_dispatch_t ds; 65 | memset((void *)&ds, 0, sizeof(ds)); 66 | if (wdm_dispatch(m_coproc, &ds)) { 67 | throw "Dispatch error"; 68 | return; 69 | } 70 | } 71 | 72 | // functions to ensure coherency across host-accelerator 73 | virtual void copyBufferHostToAccel(const void * hostBuffer, void * accelBuffer, unsigned int numBytes) { 74 | if(!wdm_memcpy(m_coproc, accelBuffer, hostBuffer, numBytes)) 75 | throw "Error in copyBufferHostToAccel"; 76 | } 77 | 78 | virtual void copyBufferAccelToHost(const void * accelBuffer, void * hostBuffer, unsigned int numBytes) { 79 | if(!wdm_memcpy(m_coproc, hostBuffer, accelBuffer, numBytes)) 80 | throw "Error in copyBufferAccelToHost"; 81 | } 82 | 83 | virtual void * allocAccelBuffer(unsigned int numBytes) { 84 | void * accelBuf; 85 | if(wdm_posix_memalign(m_coproc, &accelBuf, 64, numBytes) != 0) 86 | throw "Error in allocAccelBuffer"; 87 | return accelBuf; 88 | } 89 | 90 | // register access methods for the platform wrapper 91 | virtual void writeReg(unsigned int regInd, AccelReg regValue) { 92 | wdm_dispatch_t ds; 93 | memset((void *)&ds, 0, sizeof(ds)); 94 | uint64_t reg = regValue; 95 | ds.ae[0].aeg_ptr_s = ® 96 | ds.ae[0].aeg_cnt_s = 1; 97 | ds.ae[0].aeg_base_s = regInd; 98 | if(wdm_aeg_write_read(m_coproc, &ds) != 0) 99 | throw "wdm_aeg_write_read failed in writeReg"; 100 | } 101 | 102 | virtual AccelReg readReg(unsigned int regInd) { 103 | uint64_t ret; 104 | wdm_dispatch_t ds; 105 | memset((void *)&ds, 0, sizeof(ds)); 106 | ds.ae[0].aeg_ptr_r = &ret; 107 | ds.ae[0].aeg_cnt_r = 1; 108 | ds.ae[0].aeg_base_r = regInd; 109 | if(wdm_aeg_write_read(m_coproc, &ds) != 0) 110 | throw "wdm_aeg_write_read failed in readReg"; 111 | 112 | return (AccelReg) ret; 113 | } 114 | 115 | protected: 116 | wdm_coproc_t m_coproc; 117 | 118 | }; 119 | 120 | #endif // WOLVERINEREGDRIVERDEBUG_H 121 | 
# Creates a Vivado project for the ZC706 board containing a Zynq PS block
# design with the PlatformWrapper blackbox IP attached to GP0 (CSR) and the
# four HP ports (memory), then adds the accelerator Verilog.
#
# Arguments: tidbits_root accel_verilog proj_name proj_dir freq_mhz
if {$argc != 5} {
  puts "Expected: <tidbits_root> <accel_verilog> <proj_name> <proj_dir> <freq_mhz>"
  # non-zero status so calling scripts/Makefiles notice the usage error
  exit 1
}

# pull cmdline variables to use during setup
set config_tidbits_root [lindex $argv 0]
set config_blackboxip_repo "$config_tidbits_root/src/main/vivado-ip-cores"
set config_tidbits_verilog "$config_tidbits_root/src/main/verilog"
set config_accel_verilog [lindex $argv 1]
set config_proj_name [lindex $argv 2]
set config_proj_dir [lindex $argv 3]
set config_freq [lindex $argv 4]
puts $config_tidbits_verilog
# fixed for platform
set config_proj_part "xc7z045ffg900-2"
set config_proj_board "xilinx.com:zc706:part0:1.2"

# set up project
create_project $config_proj_name $config_proj_dir -part $config_proj_part
set_property board_part $config_proj_board [current_project]
set_property ip_repo_paths $config_blackboxip_repo [current_project]
update_ip_catalog

# create block design
create_bd_design "procsys"
create_bd_cell -type ip -vlnv xilinx.com:ip:processing_system7:5.5 processing_system7_0
apply_bd_automation -rule xilinx.com:bd_rule:processing_system7 -config {make_external "FIXED_IO, DDR" apply_board_preset "1" Master "Disable" Slave "Disable" } [get_bd_cells processing_system7_0]

# instantiate PlatformWrapper blackbox IP
# NOTE(review): the ZC706 flow instantiates the IP named ZedBoardWrapper -- confirm this is intended
create_bd_cell -type ip -vlnv ntnueecs:eecsaccel:ZedBoardWrapper:1.0 ZedBoardWrapper_0
apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config {Master "/processing_system7_0/M_AXI_GP0" Clk "Auto" } [get_bd_intf_pins ZedBoardWrapper_0/csr]
# enable AXI HP ports, set the FPGA0 clock to the frequency given on the command line
set_property -dict [list CONFIG.PCW_FPGA0_PERIPHERAL_FREQMHZ $config_freq CONFIG.PCW_USE_S_AXI_HP0 {1} CONFIG.PCW_USE_S_AXI_HP1 {1} CONFIG.PCW_USE_S_AXI_HP2 {1} CONFIG.PCW_USE_S_AXI_HP3 {1}] [get_bd_cells processing_system7_0]
# set number of ports to four on the blackbox IP
set_property -dict [list CONFIG.NUM_MEM_PORTS {4}] [get_bd_cells ZedBoardWrapper_0]
# connect IP to Zynq PS
apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config {Master "/ZedBoardWrapper_0/mem0" Clk "Auto" } [get_bd_intf_pins processing_system7_0/S_AXI_HP0]
apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config {Master "/ZedBoardWrapper_0/mem1" Clk "Auto" } [get_bd_intf_pins processing_system7_0/S_AXI_HP1]
apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config {Master "/ZedBoardWrapper_0/mem2" Clk "Auto" } [get_bd_intf_pins processing_system7_0/S_AXI_HP2]
apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config {Master "/ZedBoardWrapper_0/mem3" Clk "Auto" } [get_bd_intf_pins processing_system7_0/S_AXI_HP3]
# make the block design look prettier
regenerate_bd_layout
validate_bd_design
save_bd_design
# create HDL wrapper
make_wrapper -files [get_files $config_proj_dir/$config_proj_name.srcs/sources_1/bd/procsys/procsys.bd] -top
add_files -norecurse $config_proj_dir/$config_proj_name.srcs/sources_1/bd/procsys/hdl/procsys_wrapper.v
update_compile_order -fileset sources_1
update_compile_order -fileset sim_1
# use manual compile order to ensure accel verilog is processed prior to block design
update_compile_order -fileset sources_1
set_property source_mgmt_mode DisplayOnly [current_project]
# add the real Verilog implementation for the accelerator and move to top
add_files -norecurse $config_accel_verilog
reorder_files -front $config_accel_verilog
# add misc verilog files used by fpga-tidbits
add_files -norecurse $config_tidbits_verilog/Q_srl.v $config_tidbits_verilog/DualPortBRAM.v
reorder_files -before $config_accel_verilog $config_tidbits_verilog/Q_srl.v $config_tidbits_verilog/DualPortBRAM.v
# Creates a Vivado project for the ZedBoard containing a Zynq PS block design
# with the ZedBoardWrapper blackbox IP attached to GP0 (CSR) and the four HP
# ports (memory), then adds the accelerator Verilog.
#
# Arguments: tidbits_root accel_verilog proj_name proj_dir freq_mhz
if {$argc != 5} {
  puts "Expected: <tidbits_root> <accel_verilog> <proj_name> <proj_dir> <freq_mhz>"
  # non-zero status so calling scripts/Makefiles notice the usage error
  exit 1
}

# pull cmdline variables to use during setup
set config_tidbits_root [lindex $argv 0]
set config_blackboxip_repo "$config_tidbits_root/src/main/vivado-ip-cores"
set config_tidbits_verilog "$config_tidbits_root/src/main/verilog"
set config_accel_verilog [lindex $argv 1]
set config_proj_name [lindex $argv 2]
set config_proj_dir [lindex $argv 3]
set config_freq [lindex $argv 4]
puts $config_tidbits_verilog
# fixed for platform
set config_proj_part "xc7z020clg484-1"
set config_proj_board "em.avnet.com:zed:part0:1.3"

# set up project
create_project $config_proj_name $config_proj_dir -part $config_proj_part
set_property board_part $config_proj_board [current_project]
set_property ip_repo_paths $config_blackboxip_repo [current_project]
update_ip_catalog

# create block design
create_bd_design "procsys"
create_bd_cell -type ip -vlnv xilinx.com:ip:processing_system7:5.5 processing_system7_0
apply_bd_automation -rule xilinx.com:bd_rule:processing_system7 -config {make_external "FIXED_IO, DDR" apply_board_preset "1" Master "Disable" Slave "Disable" } [get_bd_cells processing_system7_0]

# instantiate PlatformWrapper blackbox IP
create_bd_cell -type ip -vlnv ntnueecs:eecsaccel:ZedBoardWrapper:1.0 ZedBoardWrapper_0
apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config {Master "/processing_system7_0/M_AXI_GP0" Clk "Auto" } [get_bd_intf_pins ZedBoardWrapper_0/csr]
# enable AXI HP ports, set the FPGA0 clock to the frequency given on the command line
set_property -dict [list CONFIG.PCW_FPGA0_PERIPHERAL_FREQMHZ $config_freq CONFIG.PCW_USE_S_AXI_HP0 {1} CONFIG.PCW_USE_S_AXI_HP1 {1} CONFIG.PCW_USE_S_AXI_HP2 {1} CONFIG.PCW_USE_S_AXI_HP3 {1}] [get_bd_cells processing_system7_0]
# set number of ports to four on the blackbox IP
set_property -dict [list CONFIG.NUM_MEM_PORTS {4}] [get_bd_cells ZedBoardWrapper_0]
# connect IP to Zynq PS
apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config {Master "/ZedBoardWrapper_0/mem0" Clk "Auto" } [get_bd_intf_pins processing_system7_0/S_AXI_HP0]
apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config {Master "/ZedBoardWrapper_0/mem1" Clk "Auto" } [get_bd_intf_pins processing_system7_0/S_AXI_HP1]
apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config {Master "/ZedBoardWrapper_0/mem2" Clk "Auto" } [get_bd_intf_pins processing_system7_0/S_AXI_HP2]
apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config {Master "/ZedBoardWrapper_0/mem3" Clk "Auto" } [get_bd_intf_pins processing_system7_0/S_AXI_HP3]
# make the block design look prettier
regenerate_bd_layout
validate_bd_design
save_bd_design
# create HDL wrapper
make_wrapper -files [get_files $config_proj_dir/$config_proj_name.srcs/sources_1/bd/procsys/procsys.bd] -top
add_files -norecurse $config_proj_dir/$config_proj_name.srcs/sources_1/bd/procsys/hdl/procsys_wrapper.v
update_compile_order -fileset sources_1
update_compile_order -fileset sim_1
# use manual compile order to ensure accel verilog is processed prior to block design
update_compile_order -fileset sources_1
set_property source_mgmt_mode DisplayOnly [current_project]
# add the real Verilog implementation for the accelerator and move to top
add_files -norecurse $config_accel_verilog
reorder_files -front $config_accel_verilog
# add misc verilog files used by fpga-tidbits
add_files -norecurse $config_tidbits_verilog/Q_srl.v $config_tidbits_verilog/DualPortBRAM.v
reorder_files -before $config_accel_verilog $config_tidbits_verilog/Q_srl.v $config_tidbits_verilog/DualPortBRAM.v
60 | --------------------------------------------------------------------------------