├── README.md ├── compiler ├── compiler.py ├── driver.py ├── fsim.py ├── npu_layers.py ├── pac_dump │ └── README.md ├── pcie_dump │ └── README.md └── perf_sim.sh ├── how_to_run_cpu_fpga_system_demo.pdf ├── how_to_run_npu_on_fpga.pdf ├── npu_readme.pdf ├── patch ├── kernel │ ├── Makefile │ ├── intel_fpga_pcie_chr.c │ ├── intel_fpga_pcie_dma.c │ ├── intel_fpga_pcie_ioctl.c │ └── intel_fpga_pcie_setup.c ├── npu_test │ ├── Makefile │ ├── dma_test.hpp │ ├── gemv256_batch1026 │ │ ├── golden_data │ │ ├── input_data │ │ ├── inputs.dat │ │ ├── inst_data │ │ ├── instructions.dat │ │ ├── mrf_data │ │ ├── mrfs.dat │ │ └── outputs.dat │ ├── gemv256_batch6 │ │ ├── inputs.dat │ │ ├── instructions.dat │ │ ├── mrfs.dat │ │ └── outputs.dat │ ├── gemv256_batch768 │ │ ├── inputs.dat │ │ ├── instructions.dat │ │ ├── mrfs.dat │ │ └── outputs.dat │ ├── gen_input.py │ ├── mlp_batch2052 │ │ ├── inputs.dat │ │ ├── instructions.dat │ │ ├── mrfs.dat │ │ └── outputs.dat │ ├── mlp_batch4104 │ │ ├── inputs.dat │ │ ├── instructions.dat │ │ ├── mrfs.dat │ │ └── outputs.dat │ └── real_npu_test.cpp ├── pcie_ed_MEM.v ├── setup.sh └── user │ ├── intel_fpga_pcie_api.hpp │ └── intel_fpga_pcie_api_linux.cpp ├── rtl ├── altera_syncram.sv ├── asymmetric_fifo.sv ├── axbs.sv ├── bram_accum.sv ├── daisy_chain_interconnect.sv ├── dma_buffer.v ├── dpe.sv ├── dpe_mrf.sv ├── evrf.sv ├── evrf_sched.sv ├── fifo.sv ├── inst_fifo.sv ├── inst_ram.sv ├── ld.sv ├── ld_sched.sv ├── mfu.sv ├── mfu_sched.sv ├── mrf_ram.sv ├── mvu.sv ├── mvu_sched.sv ├── mvu_tile.sv ├── mvu_vrf.sv ├── npu.sv ├── npu.vh ├── npu_tb.sv ├── nx_axbs.sv ├── nx_axbs_core.sv ├── nx_axbs_slice.sv ├── nx_dot6_int8.sv ├── nx_dot_product_int8.sv ├── pipeline_interconnect.sv ├── prime_dsp_tensor_int8.sv ├── ram.sv ├── run_sim.sh ├── self_tester_shim.sv ├── self_tester_tb.v ├── setup.sh ├── shim.sv ├── sigmoid.mif ├── sigmoid.sv ├── sigmoid.ver ├── star_interconnect.sv ├── tanh.mif ├── tanh.sv ├── tanh.ver ├── tester_rom.sv └── top_sched.sv ├── scripts 
├── perf_baseline ├── perf_tests.py ├── reports │ └── README.md ├── rtl_baseline ├── rtl_tests.py └── workloads │ ├── 01_gemv_512x512.py │ ├── 02_gemv_1024x1024.py │ ├── 03_gemv_1152x1152.py │ ├── 04_gemv_1536x1536.py │ ├── 05_gemv_1792x1792.py │ ├── 06_rnn_512_8.py │ ├── 07_rnn_1024_8.py │ ├── 08_rnn_1152_8.py │ ├── 09_rnn_1536_8.py │ ├── 10_rnn_1792_8.py │ ├── 11_gru_512_8.py │ ├── 12_gru_1024_8.py │ ├── 13_gru_1152_8.py │ ├── 14_lstm_512_8.py │ ├── 15_lstm_1024_8.py │ ├── 16_mlp5_512.py │ ├── 17_mlp5_1024.py │ ├── 18_mlp3_1024_512_256_256.py │ └── 19_mlp3_1024_512_256_256_batched.py └── simulator ├── Makefile ├── inc ├── accumulator.h ├── channel.h ├── datapath.h ├── decoder.h ├── defines.h ├── dpe.h ├── evrf.h ├── input.h ├── inst.h ├── loader.h ├── mfu.h ├── module.h ├── mvu.h ├── mvu_vrf.h ├── npu.h ├── output.h ├── port.h ├── register_file.h ├── tile.h └── utils.h ├── main ├── npu_sim.cpp └── obj │ └── README.md ├── perf_sim_log ├── register_files └── README.md └── src ├── accumulator.cpp ├── channel.cpp ├── datapath.cpp ├── decoder.cpp ├── dpe.cpp ├── evrf.cpp ├── input.cpp ├── loader.cpp ├── mfu.cpp ├── mvu.cpp ├── mvu_vrf.cpp ├── npu.cpp ├── obj └── README.md ├── output.cpp ├── port.cpp ├── register_file.cpp ├── tile.cpp └── utils.cpp /README.md: -------------------------------------------------------------------------------- 1 | # DISCONTINUATION OF PROJECT # 2 | This project will no longer be maintained by Intel. 3 | Intel has ceased development and contributions including, but not limited to, maintenance, bug fixes, new releases, or updates, to this project. 4 | Intel no longer accepts patches to this project. 5 | If you have an ongoing need to use this project, are interested in independently developing it, or would like to maintain patches for the open source software community, please create your own fork of this project. 
6 | 7 | # The Neural Processing Unit (NPU) 8 | 9 | ## Introduction 10 | The Neural Processing Unit (NPU) is an FPGA soft processor (i.e., overlay) architecture for low latency, low batch AI inference. It adopts the "persistent AI" approach, in which all model weights are kept persistent in the on-chip SRAM memory of one or more network-connected FPGAs to eliminate the expensive off-chip memory accesses. The NPU is a domain-specific software-programmable processor. Therefore, once the NPU bitstream is compiled and deployed on an FPGA, users can rapidly program it to run different AI workloads using a high-level domain-specific language or a deep learning framework (e.g. TensorFlow Keras) purely in software. This approach enables AI application developers to use FPGAs for AI inference acceleration without the need for FPGA design expertise or suffering from the long runtime of FPGA CAD tools. 11 | 12 | ## License 13 | Copyright 2022 Intel Corporation 14 | 15 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 16 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 17 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 18 | 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
"""Example driver: build a small Keras-style model and run it through the NPU flow.

The script defines a model with ``NPUSequential``, calls it once on a random
input batch, prints its summary, then initializes the NPU from the
command-line arguments, compiles the model for it and runs the flow.
"""
import os
# ``sys`` is imported explicitly: the script reads ``sys.argv`` below and must
# not depend on ``from compiler import *`` happening to re-export the module.
import sys

# Set before TensorFlow is imported so the setting is visible when its
# native runtime starts up (raises the C++ log threshold to cut console noise).
os.environ['TF_CPP_MIN_LOG_LEVEL'] = "2"
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Project-local helpers. ``NPUSequential`` and ``initialize_npu`` are expected
# to come from these star imports -- NOTE(review): confirm which module
# defines each.
from compiler import *
from npu_layers import *

###### START OF MODEL DEFINITION ######

# Define constants
INPUT_VEC_SIZE = 256
DENSE_L1_SIZE = 256
DENSE_L2_SIZE = 256
DENSE_L3_SIZE = 256

# Define model architecture using Keras Sequential Model.
# Only the first dense layer is active; the remaining layers are left
# commented out so they can be re-enabled to build a deeper MLP.
model = NPUSequential([
    layers.Dense(DENSE_L1_SIZE, activation="relu", name="layer1"),
    #layers.Dense(DENSE_L2_SIZE, activation="relu", name="layer2"),
    #layers.Dense(DENSE_L3_SIZE, activation="relu", name="layer3"),
])

# Random test inputs for different types of layers:
# 6 input vectors of INPUT_VEC_SIZE elements, drawn uniformly from [-128, 127).
test_input = tf.random.uniform(shape=[6, INPUT_VEC_SIZE], minval=-128, maxval=127)

# Call model on example input
y = model(test_input)

# Print model summary
model.summary()

####### END OF MODEL DEFINITION #######

# Initialize NPU (configuration is taken from the command-line arguments)
npu = initialize_npu(sys.argv)
# Compile model for NPU
model.compile_for_npu(npu, test_input)
# Run NPU flow
npu.run_flow()
# Out-of-tree kernel module build for the Intel FPGA PCIe driver.
# The real work is delegated to the kernel build system found at $(KDIR).

# Name of the resulting module; obj-m asks kbuild to build it as a loadable
# module, and $(MODULE_NAME)-y lists the objects that are linked into it.
MODULE_NAME := intel_fpga_pcie_drv
obj-m += $(MODULE_NAME).o
$(MODULE_NAME)-y := intel_fpga_pcie_chr.o intel_fpga_pcie_dma.o intel_fpga_pcie_setup.o intel_fpga_pcie_ioctl.o
# AVX flags are added by default; override with `make USE_AVX=0`.
USE_AVX ?= 1

PWD := $(shell pwd)
# Kernel build tree; defaults to the one matching the running kernel.
KDIR ?= /lib/modules/$(shell uname -r)/build
# Force-include the kernel's generated configuration header.
CPPFLAGS += -include $(KDIR)/include/generated/autoconf.h
EXTRA_CFLAGS += -Wall

ifeq ($(USE_AVX), 1)
# Enable wide accesses up to 32B
EXTRA_CFLAGS += -mavx -mpreferred-stack-boundary=4
endif

# Build the module: run the kernel's own makefile against this directory.
all:
	$(MAKE) -C $(KDIR) M=$(PWD) modules

# Remove build products, again via the kernel's makefile.
clean:
	$(MAKE) -C $(KDIR) M=$(PWD) clean
// Constants shared by the NPU PCIe test program: version numbers, menu option
// codes, NPU address/offset values and buffer fill modes.
#ifndef DMA_TEST_HPP
#define DMA_TEST_HPP

// Version number constants (2.0).
const int version_major = 2;
const int version_minor = 0;

// Defined without a value, so it can only be used as an #ifdef-style switch.
// Presumably enables printing of NPU results -- TODO confirm in real_npu_test.cpp.
#define NPU_PRINT

// Welcome-menu option codes; WELCOME_OPT_MAXNR equals the highest code.
#define WELCOME_OPT_AUTO 0
#define WELCOME_OPT_MANUAL 1
#define WELCOME_OPT_MAXNR 1

// NPU input-side addresses/offsets.
// NOTE(review): presumably targets written by the host (inputs, RAM contents,
// input FIFO, start trigger) -- confirm against the users of these macros.
#define NPU_INPUT 0x4000
#define NPU_INPUT_1 0x4100
#define NPU_RAM1 0x4040
#define NPU_RAM2 0x4140
#define NPU_IN_FIFO 0x4080
#define NPU_START 0x4240

// NPU output-side addresses/offsets.
// Several values intentionally or accidentally coincide with the group above
// (NPU_DONE == NPU_RAM1, NPU_OUT_FIFO_0 == NPU_INPUT, NPU_OUT_FIFO_1 ==
// NPU_INPUT_1). Presumably the same location means different things for reads
// and writes -- TODO confirm against the hardware shim.
#define NPU_DONE 0x4040
#define NPU_OUT_DEQ 0x40c0
#define NPU_OUT_FIFO_0 0x4000
#define NPU_OUT_FIFO_1 0x4100
#define NPU_OUT_FIFO_2 0x4200
#define NPU_OUT_FIFO_3 0x4300
#define NPU_OUT_FIFO_4 0x4400

// Status-poll and soft-reset addresses/offsets (note the different 0x80xxx range).
#define POLL_RAM_STATUS 0x80100
#define NPU_SOFT_RST 0x80200

// Separator line for the selection menu, followed by buffer fill-mode codes
// (zero, random, incrementing -- judging by the names).
#define SEL_MENU_DELIMS "*********************************************************"
#define FILL_ZERO 0
#define FILL_RAND 1
#define FILL_INCR 2

#endif /* DMA_TEST_HPP */
0 0 0 0 0 6 | 92 0 0 0 0 40 64 96 0 64 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 7 | 92 0 0 0 0 40 64 96 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 8 | 92 0 0 0 0 40 64 96 0 0 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 9 | 0 0 0 0 0 0 0 0 0 0 0 192 64 0 12 0 0 0 0 0 0 0 3 1 48 0 0 0 0 0 0 0 252 0 6 0 0 0 240 0 3 12 0 8 160 0 129 1 10 | 125 0 0 0 0 16 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 11 | 125 0 0 0 0 16 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 12 | 125 0 0 0 0 16 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 13 | 125 0 0 0 0 16 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 14 | 125 0 0 0 0 16 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 15 | 125 0 0 0 0 16 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 16 | 127 0 0 0 0 16 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 17 | 124 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 18 | 171 0 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 19 | -------------------------------------------------------------------------------- /patch/npu_test/gemv256_batch6/inputs.dat: -------------------------------------------------------------------------------- 1 | 42 40 42 40 42 2 | 1 3 -29 91 -107 10 -31 126 22 73 -125 81 82 8 -123 93 -66 36 18 -121 -125 -1 -126 -99 50 -3 125 56 65 -55 -3 39 97 -9 -74 -45 8 -35 109 -105 3 | 1 3 -29 91 -107 10 -31 126 22 73 -125 81 82 8 -123 93 -66 36 18 -121 -125 -1 -126 -99 50 -3 125 56 65 -55 -3 39 97 -9 -74 -45 8 -35 109 -105 4 | 60 107 
83 37 -102 -10 4 -75 92 -78 -77 -41 5 -11 42 -54 18 -43 1 95 60 -11 -76 7 22 -68 -121 40 16 -14 -64 33 -97 -118 120 -74 -3 9 49 86 5 | 60 107 83 37 -102 -10 4 -75 92 -78 -77 -41 5 -11 42 -54 18 -43 1 95 60 -11 -76 7 22 -68 -121 40 16 -14 -64 33 -97 -118 120 -74 -3 9 49 86 6 | 50 -121 -116 24 -64 71 29 -55 -106 87 49 110 84 31 115 -92 78 3 -18 96 -113 89 -82 -1 20 115 68 -97 54 81 -60 -112 19 -5 -105 85 55 -40 15 -34 7 | 50 -121 -116 24 -64 71 29 -55 -106 87 49 110 84 31 115 -92 78 3 -18 96 -113 89 -82 -1 20 115 68 -97 54 81 -60 -112 19 -5 -105 85 55 -40 15 -34 8 | 76 -31 -65 -8 11 -11 -53 4 -127 43 -100 -67 -23 -102 110 78 2 19 57 -119 12 -116 123 -107 46 -14 120 -31 108 -50 113 -42 35 -20 66 -40 54 75 -41 -67 9 | 76 -31 -65 -8 11 -11 -53 4 -127 43 -100 -67 -23 -102 110 78 2 19 57 -119 12 -116 123 -107 46 -14 120 -31 108 -50 113 -42 35 -20 66 -40 54 75 -41 -67 10 | -114 2 109 -105 0 28 -123 44 -20 -15 68 -94 99 74 74 -68 126 47 -66 -55 25 64 -46 -71 41 -1 45 -32 -46 121 43 15 -82 -127 12 84 -81 23 75 70 11 | -114 2 109 -105 0 28 -123 44 -20 -15 68 -94 99 74 74 -68 126 47 -66 -55 25 64 -46 -71 41 -1 45 -32 -46 121 43 15 -82 -127 12 84 -81 23 75 70 12 | 29 11 -111 -112 63 -107 -30 -45 118 58 116 47 -88 -118 90 101 -26 91 116 -94 -5 40 -43 -69 -25 -114 -104 2 -107 -13 -29 11 83 84 46 37 86 -83 111 -17 13 | 29 11 -111 -112 63 -107 -30 -45 118 58 116 47 -88 -118 90 101 -26 91 116 -94 -5 40 -43 -69 -25 -114 -104 2 -107 -13 -29 11 83 84 46 37 86 -83 111 -17 14 | 32 5 56 13 86 111 -84 -92 34 -125 3 -104 -51 106 -2 -113 96 96 117 -19 -41 103 0 100 121 -20 126 -63 15 -115 -14 -122 -100 -104 -34 -33 104 -80 -88 51 15 | 32 5 56 13 86 111 -84 -92 34 -125 3 -104 -51 106 -2 -113 96 96 117 -19 -41 103 0 100 121 -20 126 -63 15 -115 -14 -122 -100 -104 -34 -33 104 -80 -88 51 16 | -88 -108 -3 -117 -18 123 18 69 99 -65 -123 76 -115 -91 -121 -28 -59 -12 48 57 -24 81 90 -19 -53 -44 -117 -79 -75 69 -60 -11 77 86 -107 -114 -16 -94 -32 19 17 | -88 -108 -3 -117 -18 123 18 69 99 -65 -123 76 
-115 -91 -121 -28 -59 -12 48 57 -24 81 90 -19 -53 -44 -117 -79 -75 69 -60 -11 77 86 -107 -114 -16 -94 -32 19 18 | 37 -90 -1 126 -116 -108 38 125 87 -56 16 50 -77 -21 -103 -87 45 -16 -95 62 -121 45 -13 94 52 96 77 -5 -46 32 10 66 111 101 -62 111 28 -10 -50 -71 19 | 37 -90 -1 126 -116 -108 38 125 87 -56 16 50 -77 -21 -103 -87 45 -16 -95 62 -121 45 -13 94 52 96 77 -5 -46 32 10 66 111 101 -62 111 28 -10 -50 -71 20 | -21 22 -62 -53 33 71 -45 123 69 80 -17 114 83 -27 -66 -96 110 103 125 -51 -62 -116 -124 108 17 125 -12 -81 -78 -60 44 118 -10 -62 -72 58 79 76 -50 83 21 | -21 22 -62 -53 33 71 -45 123 69 80 -17 114 83 -27 -66 -96 110 103 125 -51 -62 -116 -124 108 17 125 -12 -81 -78 -60 44 118 -10 -62 -72 58 79 76 -50 83 22 | -39 95 83 57 113 -123 -59 -61 69 -26 28 57 65 52 69 51 13 -81 -13 -98 -108 66 -121 108 -13 -85 37 33 103 -31 -82 -67 -64 -12 -97 113 -73 126 -82 -111 23 | -39 95 83 57 113 -123 -59 -61 69 -26 28 57 65 52 69 51 13 -81 -13 -98 -108 66 -121 108 -13 -85 37 33 103 -31 -82 -67 -64 -12 -97 113 -73 126 -82 -111 24 | -65 32 -90 54 -17 76 101 25 -97 -51 -117 28 125 -41 87 -74 76 -62 -14 101 126 -123 22 -109 78 117 -89 -119 26 48 25 -26 -93 -87 -103 -108 44 57 75 -50 25 | -65 32 -90 54 -17 76 101 25 -97 -51 -117 28 125 -41 87 -74 76 -62 -14 101 126 -123 22 -109 78 117 -89 -119 26 48 25 -26 -93 -87 -103 -108 44 57 75 -50 26 | -126 1 -77 115 -75 -31 -32 51 -100 74 99 -121 -1 18 33 57 67 102 -81 -79 35 61 0 104 -70 120 91 114 14 40 119 -40 -1 -6 91 -41 100 73 -83 24 27 | -126 1 -77 115 -75 -31 -32 51 -100 74 99 -121 -1 18 33 57 67 102 -81 -79 35 61 0 104 -70 120 91 114 14 40 119 -40 -1 -6 91 -41 100 73 -83 24 28 | 28 4 53 27 124 -103 -8 87 -81 64 117 102 77 -36 -103 49 -13 51 76 -61 -122 3 55 101 -127 28 -17 73 74 22 37 -17 51 37 68 -65 59 -115 -67 -110 29 | 28 4 53 27 124 -103 -8 87 -81 64 117 102 77 -36 -103 49 -13 51 76 -61 -122 3 55 101 -127 28 -17 73 74 22 37 -17 51 37 68 -65 59 -115 -67 -110 30 | -78 -86 126 19 119 -89 39 48 120 109 -121 -68 55 34 72 -75 -80 
-98 78 61 -120 22 57 17 -86 24 123 102 -12 21 46 40 31 25 79 -73 93 -45 107 -61 31 | -78 -86 126 19 119 -89 39 48 120 109 -121 -68 55 34 72 -75 -80 -98 78 61 -120 22 57 17 -86 24 123 102 -12 21 46 40 31 25 79 -73 93 -45 107 -61 32 | -55 -113 122 80 -71 72 -46 73 -99 66 -104 87 13 -83 -44 90 24 2 -4 -39 6 75 -5 -25 66 -53 44 -67 124 25 40 -6 1 -59 -14 -35 -89 105 41 -17 33 | -55 -113 122 80 -71 72 -46 73 -99 66 -104 87 13 -83 -44 90 24 2 -4 -39 6 75 -5 -25 66 -53 44 -67 124 25 40 -6 1 -59 -14 -35 -89 105 41 -17 34 | -3 59 -68 36 102 -64 -23 54 -81 -100 100 -78 -21 92 52 101 -30 -94 -58 53 75 119 -81 -102 84 -48 96 71 71 -1 108 -40 14 88 42 100 106 54 -10 86 35 | -3 59 -68 36 102 -64 -23 54 -81 -100 100 -78 -21 92 52 101 -30 -94 -58 53 75 119 -81 -102 84 -48 96 71 71 -1 108 -40 14 88 42 100 106 54 -10 86 36 | -8 18 -90 -128 -97 21 43 -119 -2 63 29 -120 38 -89 -120 -79 124 -110 -17 -4 -14 110 106 -103 -19 -124 -55 -42 1 44 95 126 101 -69 34 -40 -51 -113 85 92 37 | -8 18 -90 -128 -97 21 43 -119 -2 63 29 -120 38 -89 -120 -79 124 -110 -17 -4 -14 110 106 -103 -19 -124 -55 -42 1 44 95 126 101 -69 34 -40 -51 -113 85 92 38 | -113 113 -65 -56 122 -57 6 71 -97 15 -97 13 -32 -7 97 -126 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 39 | -113 113 -65 -56 122 -57 6 71 -97 15 -97 13 -32 -7 97 -126 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 40 | 23 -75 -4 -78 29 93 -75 -112 -5 99 125 -86 59 98 -60 72 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 41 | 23 -75 -4 -78 29 93 -75 -112 -5 99 125 -86 59 98 -60 72 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 42 | -52 -57 85 -2 -67 -71 -29 37 84 -32 19 13 115 -92 -109 -60 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 43 | -52 -57 85 -2 -67 -71 -29 37 84 -32 19 13 115 -92 -109 -60 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 44 | -------------------------------------------------------------------------------- /patch/npu_test/gemv256_batch6/instructions.dat: 
-------------------------------------------------------------------------------- 1 | 17 48 2 | 92 0 0 0 0 40 64 96 64 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 | 92 0 0 0 0 40 64 96 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 4 | 92 0 0 0 0 40 64 96 0 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 5 | 92 0 0 0 0 40 64 96 0 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 6 | 92 0 0 0 0 40 64 96 0 64 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 7 | 92 0 0 0 0 40 64 96 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 8 | 92 0 0 0 0 40 64 96 0 0 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 9 | 0 0 0 0 0 0 0 0 0 0 0 192 64 0 12 0 0 0 0 0 0 0 3 1 48 0 0 0 0 0 0 0 252 0 6 0 0 0 240 0 3 12 0 8 160 0 129 1 10 | 125 0 0 0 0 16 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 11 | 125 0 0 0 0 16 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 12 | 125 0 0 0 0 16 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 13 | 125 0 0 0 0 16 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 14 | 125 0 0 0 0 16 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 15 | 125 0 0 0 0 16 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 16 | 127 0 0 0 0 16 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 17 | 124 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 18 | 1 0 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 19 | 
-------------------------------------------------------------------------------- /patch/npu_test/gemv256_batch6/outputs.dat: -------------------------------------------------------------------------------- 1 | 12 222 163 45 31 113 131 226 161 64 203 167 245 230 5 15 239 225 250 79 120 165 224 25 243 169 168 141 206 177 94 58 155 217 30 24 46 103 84 170 2 | 12 222 163 45 31 113 131 226 161 64 203 167 245 230 5 15 239 225 250 79 120 165 224 25 243 169 168 141 206 177 94 58 155 217 30 24 46 103 84 170 3 | 65 59 214 221 247 215 119 167 42 203 69 112 241 164 186 55 242 103 138 168 185 9 199 209 38 242 104 56 154 40 110 181 209 225 20 201 147 126 136 28 4 | 65 59 214 221 247 215 119 167 42 203 69 112 241 164 186 55 242 103 138 168 185 9 199 209 38 242 104 56 154 40 110 181 209 225 20 201 147 126 136 28 5 | 95 187 188 41 12 206 149 145 14 22 198 76 14 206 187 164 139 218 127 31 226 57 90 43 117 94 159 41 229 231 195 209 128 41 82 9 4 67 217 67 6 | 95 187 188 41 12 206 149 145 14 22 198 76 14 206 187 164 139 218 127 31 226 57 90 43 117 94 159 41 229 231 195 209 128 41 82 9 4 67 217 67 7 | 146 255 130 147 65 22 0 115 36 46 230 155 42 177 37 29 245 73 222 131 72 233 82 143 172 185 248 141 166 96 160 150 5 130 68 97 134 38 35 102 8 | 146 255 130 147 65 22 0 115 36 46 230 155 42 177 37 29 245 73 222 131 72 233 82 143 172 185 248 141 166 96 160 150 5 130 68 97 134 38 35 102 9 | 110 252 146 28 23 89 112 167 89 46 207 113 57 157 191 88 156 34 252 10 157 242 94 190 206 63 77 13 246 137 47 50 225 68 85 248 215 51 201 251 10 | 110 252 146 28 23 89 112 167 89 46 207 113 57 157 191 88 156 34 252 10 157 242 94 190 206 63 77 13 246 137 47 50 225 68 85 248 215 51 201 251 11 | 200 3 254 248 7 121 2 160 195 164 79 232 5 31 39 82 185 245 175 143 97 10 34 242 168 200 28 251 105 172 180 181 90 217 140 156 255 253 27 2 12 | 200 3 254 248 7 121 2 160 195 164 79 232 5 31 39 82 185 245 175 143 97 10 34 242 168 200 28 251 105 172 180 181 90 217 140 156 255 253 27 2 13 | 210 221 77 13 93 251 251 
164 68 173 140 199 29 243 197 164 67 143 252 126 48 107 73 214 195 220 4 188 188 128 26 231 74 191 174 7 183 38 217 182 14 | 210 221 77 13 93 251 251 164 68 173 140 199 29 243 197 164 67 143 252 126 48 107 73 214 195 220 4 188 188 128 26 231 74 191 174 7 183 38 217 182 15 | 117 220 65 202 42 93 126 185 65 239 122 209 182 78 139 202 251 102 155 236 16 96 1 45 211 0 219 51 73 61 252 240 174 160 89 166 254 124 251 138 16 | 117 220 65 202 42 93 126 185 65 239 122 209 182 78 139 202 251 102 155 236 16 96 1 45 211 0 219 51 73 61 252 240 174 160 89 166 254 124 251 138 17 | 100 247 101 113 102 107 232 220 0 117 36 252 248 65 41 49 198 86 37 250 193 134 7 130 18 52 97 225 48 224 38 192 243 35 202 149 103 47 154 56 18 | 100 247 101 113 102 107 232 220 0 117 36 252 248 65 41 49 198 86 37 250 193 134 7 130 18 52 97 225 48 224 38 192 243 35 202 149 103 47 154 56 19 | 207 116 61 228 164 37 104 252 63 180 233 4 202 54 125 117 202 58 10 17 170 55 97 44 45 252 6 60 232 44 176 49 124 188 133 26 208 79 173 110 20 | 207 116 61 228 164 37 104 252 63 180 233 4 202 54 125 117 202 58 10 17 170 55 97 44 45 252 6 60 232 44 176 49 124 188 133 26 208 79 173 110 21 | 253 101 232 47 82 117 217 220 179 54 5 12 117 12 173 24 216 102 65 138 119 59 222 32 102 249 141 121 75 249 4 163 233 141 156 134 25 98 138 221 22 | 253 101 232 47 82 117 217 220 179 54 5 12 117 12 173 24 216 102 65 138 119 59 222 32 102 249 141 121 75 249 4 163 233 141 156 134 25 98 138 221 23 | 44 158 197 70 236 80 175 108 33 234 26 142 84 21 176 115 92 187 207 31 62 21 214 218 203 152 123 237 201 45 9 127 99 50 15 79 123 192 96 127 24 | 44 158 197 70 236 80 175 108 33 234 26 142 84 21 176 115 92 187 207 31 62 21 214 218 203 152 123 237 201 45 9 127 99 50 15 79 123 192 96 127 25 | 148 86 168 241 77 229 114 63 229 230 80 154 200 165 36 163 60 45 89 172 219 47 135 175 39 128 191 79 43 217 150 237 115 2 64 71 54 214 76 235 26 | 148 86 168 241 77 229 114 63 229 230 80 154 200 165 36 163 60 45 89 172 219 47 135 175 39 128 191 79 43 
217 150 237 115 2 64 71 54 214 76 235 27 | 133 82 62 193 192 64 172 196 197 187 199 187 147 57 72 70 122 19 231 204 73 169 104 228 249 122 238 232 112 55 234 85 88 164 40 220 16 137 165 81 28 | 133 82 62 193 192 64 172 196 197 187 199 187 147 57 72 70 122 19 231 204 73 169 104 228 249 122 238 232 112 55 234 85 88 164 40 220 16 137 165 81 29 | 0 21 202 162 226 121 151 48 214 51 155 72 242 67 34 132 180 26 221 49 35 242 7 149 38 64 2 237 30 102 65 49 118 18 165 1 74 223 23 39 30 | 0 21 202 162 226 121 151 48 214 51 155 72 242 67 34 132 180 26 221 49 35 242 7 149 38 64 2 237 30 102 65 49 118 18 165 1 74 223 23 39 31 | 187 98 66 153 106 41 34 33 143 174 210 189 254 120 159 170 102 33 142 75 56 158 231 174 172 30 12 169 6 28 207 116 153 133 87 141 103 74 43 254 32 | 187 98 66 153 106 41 34 33 143 174 210 189 254 120 159 170 102 33 142 75 56 158 231 174 172 30 12 169 6 28 207 116 153 133 87 141 103 74 43 254 33 | 98 240 139 147 254 203 75 35 90 236 30 83 91 11 41 229 134 104 7 56 218 41 38 41 229 217 128 139 222 25 219 145 52 228 39 160 158 205 163 162 34 | 98 240 139 147 254 203 75 35 90 236 30 83 91 11 41 229 134 104 7 56 218 41 38 41 229 217 128 139 222 25 219 145 52 228 39 160 158 205 163 162 35 | 195 197 15 161 98 158 38 189 114 87 133 107 244 225 85 138 178 72 8 8 23 116 60 68 87 190 220 219 254 166 140 15 243 5 242 68 123 206 213 168 36 | 195 197 15 161 98 158 38 189 114 87 133 107 244 225 85 138 178 72 8 8 23 116 60 68 87 190 220 219 254 166 140 15 243 5 242 68 123 206 213 168 37 | 251 42 192 178 140 198 105 57 131 254 232 72 47 192 111 169 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 38 | 251 42 192 178 140 198 105 57 131 254 232 72 47 192 111 169 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 39 | 172 74 129 99 144 151 9 225 175 96 147 247 252 175 77 84 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 40 | 172 74 129 99 144 151 9 225 175 96 147 247 252 175 77 84 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 41 | 191 145 223 204 87 232 178 180 99 46 118 185 
233 142 135 82 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 42 | 191 145 223 204 87 232 178 180 99 46 118 185 233 142 135 82 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 43 | -------------------------------------------------------------------------------- /patch/npu_test/gemv256_batch768/instructions.dat: -------------------------------------------------------------------------------- 1 | 17 48 2 | 92 0 0 0 0 40 64 96 64 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 | 92 0 0 0 0 40 64 96 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 4 | 92 0 0 0 0 40 64 96 0 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 5 | 92 0 0 0 0 40 64 96 0 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 6 | 92 0 0 0 0 40 64 96 0 64 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 7 | 92 0 0 0 0 40 64 96 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 8 | 92 0 0 0 0 40 64 96 0 0 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 9 | 0 0 0 0 0 0 0 0 0 0 0 192 64 0 12 0 0 0 0 0 0 0 3 1 48 0 0 0 0 0 0 0 252 0 6 0 0 0 240 0 3 12 0 8 160 0 129 1 10 | 125 0 0 0 0 16 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 11 | 125 0 0 0 0 16 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 12 | 125 0 0 0 0 16 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 13 | 125 0 0 0 0 16 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 14 | 125 0 0 0 0 16 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 15 | 125 0 0 0 0 16 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 16 | 127 0 0 0 0 16 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 17 | 124 1 0 0 
# Generate 'test_input.dat': a header line followed by `length` rows of 40
# space-terminated integers. Every value in row i is (i % 256) - 128, so the
# rows ramp from -128 up to 127 and then wrap around.
length = 8192
with open('test_input.dat', 'w') as out:
    # Header: number of rows, then the number of values per row.
    out.write("%d 40\n" % length)
    for i in range(length):
        # One cell, including its trailing space, repeated across the row.
        cell = "%d " % ((i % 256) - 128)
        out.write(cell * 40)
        out.write("\n")
188 0 0 0 0 32 32 0 0 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 13 | 188 0 0 0 0 32 32 0 0 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 14 | 188 0 0 0 0 32 32 0 0 64 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 15 | 188 0 0 0 0 32 32 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 16 | 124 0 0 0 0 32 32 0 0 0 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 17 | 0 0 0 0 0 0 0 0 0 0 0 192 64 0 12 0 0 0 0 0 0 0 3 1 48 0 0 0 0 0 0 0 220 1 6 0 0 0 208 1 3 24 208 16 128 128 0 0 18 | 124 0 0 0 0 160 48 65 66 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 19 | 124 0 0 0 0 160 48 65 2 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 20 | 124 0 0 0 0 160 48 65 2 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 21 | 124 0 0 0 0 160 48 65 2 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 22 | 124 0 0 0 0 160 48 65 2 64 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 23 | 124 0 0 0 0 160 48 65 2 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 24 | 124 0 0 0 0 160 48 65 2 0 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 25 | 124 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 26 | 0 0 0 0 0 0 0 0 0 0 0 192 64 0 12 0 0 0 0 0 0 0 3 1 48 0 0 0 0 0 0 0 188 2 6 0 0 0 176 2 3 12 48 9 128 194 4 9 27 | 125 0 0 0 0 184 96 161 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 28 | 125 0 0 0 0 184 96 161 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 29 | 125 0 0 0 0 184 96 161 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 30 | 125 0 0 0 0 184 96 161 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 31 | 125 0 0 0 0 184 96 161 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 32 | 125 0 0 0 0 184 96 161 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 33 | 127 0 0 0 0 184 96 161 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 34 | 124 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 35 | 86 1 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 36 | -------------------------------------------------------------------------------- /patch/npu_test/mlp_batch4104/instructions.dat: -------------------------------------------------------------------------------- 1 | 34 48 2 | 28 1 0 0 0 112 160 192 64 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 | 28 1 0 0 0 112 160 192 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 4 | 28 1 0 0 0 112 160 192 0 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 5 | 28 1 0 0 0 112 160 192 0 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 6 | 28 1 0 0 0 112 160 192 0 64 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 7 | 28 1 0 0 0 112 160 192 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 8 | 28 1 0 0 0 112 160 192 0 0 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 9 | 0 0 0 0 0 0 0 0 0 0 0 192 64 0 13 0 0 0 0 0 0 0 3 1 52 0 0 0 0 0 0 0 252 128 6 0 0 0 240 64 3 52 0 32 192 129 2 3 10 | 188 0 0 0 0 32 32 0 64 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 11 | 188 0 0 0 0 32 32 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 12 | 188 0 0 0 0 32 32 0 0 4 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 13 | 188 0 0 0 0 32 32 0 0 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 14 | 188 0 0 0 0 32 32 0 0 64 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 15 | 188 0 0 0 0 32 32 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 16 | 124 0 0 0 0 32 32 0 0 0 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 17 | 0 0 0 0 0 0 0 0 0 0 0 192 64 0 12 0 0 0 0 0 0 0 3 1 48 0 0 0 0 0 0 0 220 1 6 0 0 0 208 1 3 24 208 16 128 128 0 0 18 | 124 0 0 0 0 160 48 65 66 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 19 | 124 0 0 0 0 160 48 65 2 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 20 | 124 0 0 0 0 160 48 65 2 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 21 | 124 0 0 0 0 160 48 65 2 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 22 | 124 0 0 0 0 160 48 65 2 64 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 23 | 124 0 0 0 0 160 48 65 2 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 24 | 124 0 0 0 0 160 48 65 2 0 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 25 | 124 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 26 | 0 0 0 0 0 0 0 0 0 0 0 192 64 0 12 0 0 0 0 0 0 0 3 1 48 0 0 0 0 0 0 0 188 2 6 0 0 0 176 2 3 12 48 9 128 194 4 9 27 | 125 0 0 0 0 184 96 161 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 28 | 125 0 0 0 0 184 96 161 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 29 | 125 0 0 0 0 184 96 161 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 30 | 125 0 0 0 0 184 96 161 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 31 | 125 0 0 0 0 184 96 161 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 32 | 125 0 0 0 0 184 96 161 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 33 | 127 0 0 0 0 184 96 161 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 34 | 124 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 35 | 172 2 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 36 | -------------------------------------------------------------------------------- /patch/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd ../DUT_example_design/ip/pcie_ed/pcie_ed_MEM/altera_avalon_onchip_memory2_1920/synth 4 | 5 | FILE=pcie_ed_MEM_altera_avalon_onchip_memory2_1920_2xhjmhi.v 6 | if test -f "$FILE"; then 7 | sed -i "s/\[ 13\: 0\]/\[ 12\: 0\]/g" $FILE 8 | sed -i "s/16384/8192/g" $FILE 9 | sed -i "s/14/13/g" $FILE 10 | else 11 | echo "IP not configured correctly." 
12 | exit 1 13 | fi 14 | 15 | # DMA double buffer controller 16 | cd ../../synth 17 | cp ../../../../../patch/pcie_ed_MEM.v ./ 18 | 19 | 20 | # modify npu.vh path name 21 | cd ../../../../../rtl/ 22 | cwd=$(pwd) 23 | sed -i "s~/nfs/sc/disks/swuser_work_aboutros/npu_demo/npu-s10-nx/rtl/~$cwd~" npu.vh 24 | 25 | # add npu src files to Quartus prj 26 | cd ../DUT_example_design/ 27 | for f in ../rtl/*.sv 28 | do 29 | if [[ $f != "../rtl/altera_syncram.sv" ]] && [[ $f != "../rtl/self_tester_shim.sv" ]] && [[ $f != "../rtl/tester_rom.sv" ]] 30 | then 31 | echo "set_global_assignment -name SYSTEMVERILOG_FILE $f" >> pcie_ed.qsf 32 | fi 33 | done 34 | 35 | echo "set_global_assignment -name VERILOG_FILE ../rtl/dma_buffer.v" >> pcie_ed.qsf 36 | echo "set_global_assignment -name VERILOG_INCLUDE_FILE ../rtl/npu.vh" >> pcie_ed.qsf 37 | echo "set_global_assignment -name MIF_FILE ../rtl/tanh.mif" >> pcie_ed.qsf 38 | echo "set_global_assignment -name MIF_FILE ../rtl/sigmoid.mif" >> pcie_ed.qsf 39 | echo "set_global_assignment -name OPTIMIZATION_MODE \"SUPERIOR PERFORMANCE WITH MAXIMUM PLACEMENT EFFORT\"" >> pcie_ed.qsf 40 | 41 | # host program 42 | cd ./software 43 | cp ../../patch/kernel/* ./kernel/linux/ 44 | cp ../../patch/user/*.hpp ./user/api/ 45 | cp ../../patch/user/*.cpp ./user/api/linux/ 46 | cp -r ../../patch/npu_test ./user/
-------------------------------------------------------------------------------- /rtl/asymmetric_fifo.sv: --------------------------------------------------------------------------------
`include "npu.vh"

// Width-converting FIFO built from three parallel ODW-bit FIFOs.
//
// Write side: one wr_en pushes the three ODW-bit slices of wr_data into
//             fifo0 (bits [ODW-1:0]), fifo1 and fifo2 at the same time.
// Read side:  each rd_en pops a single ODW-bit word; a one-hot selector
//             rotates fifo0 -> fifo1 -> fifo2 -> fifo0, so the lowest slice of
//             every written word is returned first.
//
// The lane count is fixed at three: only wr_data[3*ODW-1:0] is consumed,
// whatever IDW is set to.
module asymmetric_fifo # (
    parameter IDW   = 3*`ACCW,       // write-side word width
    parameter ODW   = `ACCW,         // read-side word width (one lane)
    parameter DEPTH = `QDEPTH,       // depth handed to each lane FIFO
    parameter ID    = 0,             // ID forwarded to the lane FIFOs
    parameter AW    = $clog2(DEPTH)
) (
    input            clk,
    input            rst,
    input            wr_en,
    input  [IDW-1:0] wr_data,
    output           wr_ok,
    input            rd_en,
    output [ODW-1:0] rd_data,
    output           rd_ok,
    output [AW-1:0]  usedw
);

wire [2:0]     fifo_wr_ok;
wire [2:0]     fifo_rd_ok;
wire [ODW-1:0] fifo_rd_data [0:2];
wire [AW-1:0]  fifo_usedw   [0:2];

// One-hot read selector: bit i set means lane i supplies the next read.
reg [2:0] sel;

fifo # (
    .ID    (ID),
    .DW    (ODW),
    .DEPTH (DEPTH)
) fifo0 (
    .clk     (clk),
    .rst     (rst),
    .wr_en   (wr_en),
    .wr_data (wr_data[ODW-1:0]),
    .wr_ok   (fifo_wr_ok[0]),
    .rd_ok   (fifo_rd_ok[0]),
    .rd_data (fifo_rd_data[0]),
    .rd_en   (rd_en && sel[0]),
    .usedw   (fifo_usedw[0])
);

fifo # (
    .ID    (ID),
    .DW    (ODW),
    .DEPTH (DEPTH)
) fifo1 (
    .clk     (clk),
    .rst     (rst),
    .wr_en   (wr_en),
    .wr_data (wr_data[2*ODW-1:ODW]),
    .wr_ok   (fifo_wr_ok[1]),
    .rd_ok   (fifo_rd_ok[1]),
    .rd_data (fifo_rd_data[1]),
    .rd_en   (rd_en && sel[1]),
    .usedw   (fifo_usedw[1])
);

fifo # (
    .ID    (ID),
    .DW    (ODW),
    .DEPTH (DEPTH)
) fifo2 (
    .clk     (clk),
    .rst     (rst),
    .wr_en   (wr_en),
    .wr_data (wr_data[3*ODW-1:2*ODW]),
    .wr_ok   (fifo_wr_ok[2]),
    .rd_ok   (fifo_rd_ok[2]),
    .rd_data (fifo_rd_data[2]),
    .rd_en   (rd_en && sel[2]),
    .usedw   (fifo_usedw[2])
);

// Advance the selector on every read; wrap from lane 2 back to lane 0.
always @ (posedge clk) begin
    if (rst) begin
        sel <= 3'b001;
    end else begin
        if (rd_en) begin
            sel <= (sel == 3'b100)? 3'b001: (sel << 1);
        end
    end
end

// Read-side mux over the selected lane. This is combinational logic, so it
// uses blocking assignments (the original used non-blocking '<=' here).
// Any selector value other than 3'b001 / 3'b010 falls through to lane 2,
// exactly as before.
reg            rd_ok_out;
reg [ODW-1:0]  rd_data_out;
reg [AW-1:0]   usedw_out;
always @ (*) begin
    if (sel == 3'b001) begin
        rd_ok_out   = fifo_rd_ok[0];
        rd_data_out = fifo_rd_data[0];
        usedw_out   = fifo_usedw[0];
    end else if (sel == 3'b010) begin
        rd_ok_out   = fifo_rd_ok[1];
        rd_data_out = fifo_rd_data[1];
        usedw_out   = fifo_usedw[1];
    end else begin
        rd_ok_out   = fifo_rd_ok[2];
        rd_data_out = fifo_rd_data[2];
        usedw_out   = fifo_usedw[2];
    end
end

assign rd_ok   = rd_ok_out;
assign rd_data = rd_data_out;
assign usedw   = usedw_out;
// All three lanes are written together, so lane 0's status stands for all.
assign wr_ok   = fifo_wr_ok[0];

endmodule
-------------------------------------------------------------------------------- /rtl/axbs.sv: -------------------------------------------------------------------------------- 1 | // Copyright 2020 Intel Corporation. 2 | // 3 | // This reference design file is subject licensed to you by the terms and 4 | // conditions of the applicable License Terms and Conditions for Hardware 5 | // Reference Designs and/or Design Examples (either as signed by you or 6 | // found at https://www.altera.com/common/legal/leg-license_agreement.html ). 7 | // 8 | // As stated in the license, you agree to only use this reference design 9 | // solely in conjunction with Intel FPGAs or Intel CPLDs. 10 | // 11 | // THE REFERENCE DESIGN IS PROVIDED "AS IS" WITHOUT ANY EXPRESS OR IMPLIED 12 | // WARRANTY OF ANY KIND INCLUDING WARRANTIES OF MERCHANTABILITY, 13 | // NONINFRINGEMENT, OR FITNESS FOR A PARTICULAR PURPOSE. Intel does not 14 | // warrant or assume responsibility for the accuracy or completeness of any 15 | // information, links or other items within the Reference Design and any 16 | // accompanying materials. 17 | // 18 | // In the event that you do not agree with such terms and conditions, do not 19 | // use the reference design file.
20 | /////////////////////////////////////////////////////////////////////////////

// Signed SIZE_A x SIZE_B multiplier with a registered product followed by
// three further register stages: dout reflects din_a * din_b from four
// rising clock edges earlier.
(* altera_attribute = "-name FRACTAL_SYNTHESIS ON; -name SYNCHRONIZER_IDENTIFICATION OFF" *)
module axbs #(
    parameter SIZE_A = 27,
    parameter SIZE_B = 27
) (
    input                                  clk,
    input  signed [SIZE_A-1:0]             din_a,
    input  signed [SIZE_B-1:0]             din_b,
    output reg signed [SIZE_A+SIZE_B-1:0]  dout
);

reg signed [SIZE_A+SIZE_B-1:0] dout_r;
reg signed [SIZE_A+SIZE_B-1:0] dout_rr;
reg signed [SIZE_A+SIZE_B-1:0] dout_rrr;

always @(posedge clk) begin
    dout_r   <= din_a * din_b;
    dout_rr  <= dout_r;
    dout_rrr <= dout_rr;
    dout     <= dout_rrr;
end
endmodule
-------------------------------------------------------------------------------- /rtl/bram_accum.sv: --------------------------------------------------------------------------------
`include "npu.vh"

// RAM-backed accumulators, one "ram" instance per lane (3*NDPE lanes).
//
// For every lane a read address cycles through the first
// accum_ctrl[ACCIDW-1:0] entries of its RAM whenever the lane's control word
// is valid. The control word, address and input are delayed to line up with
// the RAM read data; the input is then either stored as-is (SET,
// SET_AND_WB) or added to the value read back (UPD, WB), and the result is
// written to the same address. The registered write data is also what is
// presented on accum_out; valid_out flags the cycles on which a WB or
// SET_AND_WB operation reaches the output.
module bram_accum # (
    parameter ACCW        = `ACCW,
    parameter NDPE        = `NDPE,
    parameter DOTW        = `DOTW,
    parameter PRIME_DOTW  = `PRIME_DOTW,
    parameter DOT_PER_DSP = `DOT_PER_DSP,
    parameter NUM_DSP     = `NUM_DSP,
    parameter NUM_CHUNKS  = NDPE/DOTW,
    parameter NUM_ACCUM   = `NUM_ACCUM,
    parameter ACCIDW      = `ACCIDW
)(
    input                     clk,
    input                     rst,
    // Per-lane control word, as decoded below:
    //   [ACCIDW+2]          valid
    //   [ACCIDW+1:ACCIDW]   operation (ACC_OP_*)
    //   [ACCIDW-1:0]        number of entries the read address cycles through
    input  [3+ACCIDW-1:0]     accum_ctrl [0:3*NDPE-1],
    input  [3*ACCW*NDPE-1:0]  accum_in,
    output [NDPE-1:0]         valid_out,
    output [3*ACCW*NDPE-1:0]  accum_out
);

localparam ACCUM_DEPTH  = NUM_ACCUM*2;
localparam ACCUM_ADDRW  = $clog2(ACCUM_DEPTH);
localparam BRAM_LATENCY = 2;
localparam [1:0] ACC_OP_SET = 0, ACC_OP_UPD = 1, ACC_OP_WB = 2, ACC_OP_SET_AND_WB = 3;

reg  [ACCUM_ADDRW-1:0] accum_rd_addr [0:3*NDPE-1];
wire [ACCW-1:0]        accum_rd_data [0:3*NDPE-1];

// Delay lines that keep input, address and control aligned with the RAM
// read data (index 0 is the youngest stage).
reg [3*ACCW*NDPE-1:0]  r_accum_in      [0:BRAM_LATENCY];
reg [ACCUM_ADDRW-1:0]  r_accum_rd_addr [0:BRAM_LATENCY][0:3*NDPE-1];
reg [3+ACCIDW-1:0]     r_accum_ctrl    [0:BRAM_LATENCY][0:3*NDPE-1];
reg [ACCW-1:0]         accum_wr_data   [0:3*NDPE-1];
reg [NDPE-1:0]         valid           [0:BRAM_LATENCY];

wire [3*ACCW*NDPE-1:0] accum_res;

// Loop indices for the clocked block only; the combinational block further
// down has its own index so the two processes never share a variable.
integer a, p;
always @ (posedge clk) begin
    if (rst) begin
        for(a = 0; a < 3*NDPE; a = a + 1) begin
            accum_rd_addr[a] <= 0;
        end
        for(p = 0; p < BRAM_LATENCY+1; p = p + 1) begin
            valid[p] <= 'd0;
        end
    end else begin
        // The input bus is shared by all lanes, so its delay line is
        // advanced once here rather than once per lane.
        r_accum_in[0] <= accum_in;
        for(p = 1; p < BRAM_LATENCY+1; p = p + 1) begin
            r_accum_in[p] <= r_accum_in[p-1];
        end

        for(a = 0; a < 3*NDPE; a = a + 1) begin
            // If valid input, increment read address (wrapping after the
            // last entry named by the control word)
            if(accum_ctrl[a][3+ACCIDW-1]) begin
                if(accum_rd_addr[a] == accum_ctrl[a][ACCIDW-1:0]-1) begin
                    accum_rd_addr[a] <= 0;
                end else begin
                    accum_rd_addr[a] <= ACCUM_ADDRW'(accum_rd_addr[a] + 1'b1);
                end
            end

            // Pipeline ctrl and address to align with read value (then an
            // extra pipeline stage for the addition)
            r_accum_rd_addr[0][a] <= accum_rd_addr[a];
            r_accum_ctrl[0][a]    <= accum_ctrl[a];
            for(p = 1; p < BRAM_LATENCY+1; p = p + 1) begin
                r_accum_rd_addr[p][a] <= r_accum_rd_addr[p-1][a];
                r_accum_ctrl[p][a]    <= r_accum_ctrl[p-1][a];
            end

            // Perform addition: SET-type operations overwrite, the others
            // add the input to the value read from the RAM.
            accum_wr_data[a] <= ((r_accum_ctrl[BRAM_LATENCY-1][a][ACCIDW+:2] == ACC_OP_SET)
                || (r_accum_ctrl[BRAM_LATENCY-1][a][ACCIDW+:2] == ACC_OP_SET_AND_WB))?
                r_accum_in[BRAM_LATENCY-1][a*ACCW+:ACCW]:
                r_accum_in[BRAM_LATENCY-1][a*ACCW+:ACCW] + accum_rd_data[a];
        end

        // Valid pipeline: driven by the control word of every third lane
        // (lane a*3) for output a.
        for(a = 0; a < NDPE; a = a + 1) begin
            valid[0][a] <= ((accum_ctrl[a*3][ACCIDW+:2] == ACC_OP_WB) || (accum_ctrl[a*3][ACCIDW+:2] == ACC_OP_SET_AND_WB))
                && (accum_ctrl[a*3][3+ACCIDW-1]);
            for(p = 1; p < BRAM_LATENCY+1; p = p + 1) begin
                valid[p][a] <= valid[p-1][a];
            end
        end
    end
end

genvar accum_id;
generate
for(accum_id = 0; accum_id < 3*NDPE; accum_id = accum_id + 1) begin: gen_accum_ram
    ram #(
        .ID        (accum_id),
        .DW        (ACCW),
        .AW        (ACCUM_ADDRW),
        .DEPTH     (ACCUM_DEPTH),
        .MODULE_ID ("accum")
    ) accum_ram (
        // Write back to the address that was read, using the control
        // valid bit from the last delay stage as the write enable.
        .wr_en   (r_accum_ctrl[BRAM_LATENCY][accum_id][3+ACCIDW-1]),
        .wr_addr (r_accum_rd_addr[BRAM_LATENCY][accum_id]),
        .wr_data (accum_wr_data[accum_id]),
        .rd_addr (accum_rd_addr[accum_id]),
        .rd_data (accum_rd_data[accum_id]),
        .clk     (clk),
        .rst     (rst)
    );

    assign accum_res[accum_id*ACCW+:ACCW] = accum_wr_data[accum_id];
end
endgenerate

// Re-order the result bus. accum_res is laid out as three consecutive
// ACCW*NDPE-bit sections; the output interleaves them in ACCW*DOTW-bit
// groups: for chunk k, groups 3k, 3k+1 and 3k+2 come from chunk k of
// section 0, 1 and 2 respectively.
integer g;
reg [3*ACCW*NDPE-1:0] accum_out_arranged;
always @(*) begin
    for (g = 0; g < NUM_CHUNKS*3; g = g + 3) begin
        accum_out_arranged[(g*ACCW*DOTW)+:(ACCW*DOTW)]     = accum_res[(g/3*ACCW*DOTW)+:(ACCW*DOTW)];
        accum_out_arranged[((g+1)*ACCW*DOTW)+:(ACCW*DOTW)] = accum_res[(ACCW*NDPE)+(g/3*ACCW*DOTW)+:(ACCW*DOTW)];
        accum_out_arranged[((g+2)*ACCW*DOTW)+:(ACCW*DOTW)] = accum_res[(2*ACCW*NDPE)+(g/3*ACCW*DOTW)+:(ACCW*DOTW)];
    end
end

assign valid_out = valid[BRAM_LATENCY];
assign accum_out = accum_out_arranged;

`ifdef DISPLAY_MVU
    // Simulation trace of lane 0's read address and entry count.
    always @(posedge clk) begin
        if (accum_ctrl[0][3+ACCIDW-1]) begin
            $display("[%0t][ACCUM] addr: %d, size: %d",
                $time,
                accum_rd_addr[0],
                accum_ctrl[0][ACCIDW-1:0]);
        end
    end
`endif

endmodule
-------------------------------------------------------------------------------- /rtl/daisy_chain_interconnect.sv: --------------------------------------------------------------------------------

// This module implements a daisy chain interconnect from a source to multiple
// sinks with parameterizable latency per hop.
//
// The input is pushed through a register chain of LATENCY_PER_HOP*END_POINTS
// stages; sink i taps stage (i+1)*LATENCY_PER_HOP-1, i.e. it sees the input
// delayed by (i+1)*LATENCY_PER_HOP clock cycles. The chain has no reset
// (rst is not used).
(* altera_attribute = "-name AUTO_SHIFT_REGISTER_RECOGNITION OFF; -name DONT_MERGE_REGISTER ON" *)
module daisy_chain_interconnect # (
    parameter DATAW           = 32,
    parameter END_POINTS      = 4,
    parameter LATENCY_PER_HOP = 2
) (
    input              clk,
    input              rst,
    input  [DATAW-1:0] i_daisy_chain_in,
    output [DATAW-1:0] o_daisy_chain_out [0:END_POINTS-1]
);

reg [DATAW-1:0] pipeline [0:LATENCY_PER_HOP*END_POINTS-1];

integer t;
always @ (posedge clk) begin
    // Set the input to the first pipeline stage
    pipeline[0] <= i_daisy_chain_in;
    // Progress the pipeline
    for (t = 1; t < LATENCY_PER_HOP*END_POINTS; t = t + 1) begin
        pipeline[t] <= pipeline[t-1];
    end
end

// Hook up outputs: one tap at the end of every hop
genvar i;
generate
    for(i = 0; i < END_POINTS; i = i + 1) begin: gen_outputs
        assign o_daisy_chain_out[i] = pipeline[(LATENCY_PER_HOP-1)+i*LATENCY_PER_HOP];
    end
endgenerate

endmodule
-------------------------------------------------------------------------------- /rtl/dma_buffer.v: --------------------------------------------------------------------------------
`timescale 1ns / 1ps

// Simple dual-port buffer around altera_syncram: port A writes, port B reads.
// The RAM output (q_b) is captured in one extra register stage before being
// driven on rdata.
//
// NOTE: ren and wben are accepted but not used. The read register updates
// every cycle (the ren gate is commented out below), and the RAM is
// configured with a 1-bit byte enable tied high, so every write stores the
// full WIDTH-bit word.
module dma_buffer # (
    parameter WIDTH = 512,
    parameter DEPTH = 8192,
    parameter ADDRW = $clog2(DEPTH),
    parameter BYENW = WIDTH / 8
)(
    input              clk,
    input              ren,
    input  [ADDRW-1:0] raddr,
    output [WIDTH-1:0] rdata,
    input              wen,
    input  [ADDRW-1:0] waddr,
    input  [BYENW-1:0] wben,
    input  [WIDTH-1:0] wdata
);

reg  [WIDTH-1:0] readdata;
wire [WIDTH-1:0] readdata_ram;

always @(posedge clk) begin
    //if (ren) begin
    readdata <= readdata_ram;
    //end
end

assign rdata = readdata;

altera_syncram altera_syncram_component (
    .address_a      (waddr),
    .address_b      (raddr),
    .byteena_a      (1'b1),
    .clock0         (clk),
    .data_a         (wdata),
    .wren_a         (wen),
    .q_b            (readdata_ram),
    .aclr0          (1'b0),
    .aclr1          (1'b0),
    .address2_a     (1'b1),
    .address2_b     (1'b1),
    .addressstall_a (1'b0),
    .addressstall_b (1'b0),
    .byteena_b      (1'b1),
    .clock1         (1'b1),
    .clocken0       (1'b1),
    .clocken1       (1'b1),
    .clocken2       (1'b1),
    .clocken3       (1'b1),
    // Unused write port B: tie-off sized from WIDTH instead of a literal
    // 512 so the module stays consistent when WIDTH is overridden.
    .data_b         ({WIDTH{1'b1}}),
    .eccencbypass   (1'b0),
    .eccencparity   (8'b0),
    .eccstatus      (),
    .q_a            (),
    .rden_a         (1'b1),
    .rden_b         (1'b1),
    .sclr           (1'b0),
    .wren_b         (1'b0)
);
defparam
    altera_syncram_component.address_aclr_b = "NONE",
    altera_syncram_component.address_reg_b = "CLOCK0",
    altera_syncram_component.byte_size = 8,
    altera_syncram_component.clock_enable_input_a = "BYPASS",
    altera_syncram_component.clock_enable_input_b = "BYPASS",
    altera_syncram_component.clock_enable_output_b = "BYPASS",
    altera_syncram_component.intended_device_family = "Stratix 10",
    altera_syncram_component.lpm_type = "altera_syncram",
    altera_syncram_component.numwords_a = DEPTH,
    altera_syncram_component.numwords_b = DEPTH,
    altera_syncram_component.operation_mode = "DUAL_PORT",
    altera_syncram_component.outdata_aclr_b = "NONE",
    altera_syncram_component.outdata_sclr_b = "NONE",
    altera_syncram_component.outdata_reg_b = "UNREGISTERED",
    altera_syncram_component.power_up_uninitialized = "FALSE",
    altera_syncram_component.read_during_write_mode_mixed_ports = "DONT_CARE",
    altera_syncram_component.widthad_a = ADDRW,
    altera_syncram_component.widthad_b = ADDRW,
    altera_syncram_component.width_a = WIDTH,
    altera_syncram_component.width_b = WIDTH,
    altera_syncram_component.width_byteena_a = 1;

endmodule
-------------------------------------------------------------------------------- /rtl/dpe_mrf.sv: -------------------------------------------------------------------------------- 1 | `include "npu.vh" 2 | 3 | module dpe_mrf # ( 4 | parameter MODULE_ID = "", 5 | parameter ID = 0, 6 | parameter DW = 32, 7 | parameter DEPTH = 512, 8 | parameter AW = 9, 9 | parameter EW = `EW, 10 | parameter DOTW = `DOTW, 11 | parameter NUM_DSP = DOTW/10 12 | )( 13 | input wr_en, 14 | input [AW-1:0] wr_addr, 15 | input [AW-1:0] rd_addr, 16 | input [DW-1:0] wr_data, 17 | output [DW-1:0] rd_data, 18 | input clk, 19 | input rst 20 | ); 21 | 22 | 23 | reg [AW-1:0] rd_addr_balance [0:(2*(NUM_DSP-1))-1]; 24 | 25 | integer c; 26 | always @ (posedge clk) begin 27 | rd_addr_balance[0] <= rd_addr; 28 | for(c = 1; c < 2*(NUM_DSP-1); c = c + 1) begin 29 | rd_addr_balance[c] <= rd_addr_balance[c-1]; 30 | end 31 | end 32 | 33 | genvar ram_id; 34 | generate 35 | for (ram_id = 0; ram_id < NUM_DSP; ram_id = ram_id + 1) begin: gen_ram 36 | if (ram_id == 0) begin 37 | mrf_ram #( 38 | .ID(ID), 39 | .DW(EW*10), 40 | .AW(AW), 41 | .DEPTH(DEPTH), 42 | .MODULE_ID("mvu-mrf"), 43 | .RAM_ID(ram_id) 44 | ) ram_0 ( 45 | .wr_en (wr_en), 46 | .wr_addr (wr_addr), 47 | .wr_data (wr_data[(NUM_DSP-ram_id)*EW*10-1 : (NUM_DSP-(ram_id+1))*EW*10]), 48 | .rd_addr (rd_addr), 49 | .rd_data (rd_data[(NUM_DSP-ram_id)*EW*10-1 : (NUM_DSP-(ram_id+1))*EW*10]), 50 | .clk (clk), 51 | .rst (rst) 52 | ); 53 | end else begin 54 | mrf_ram #( 55 | .ID(ID), 56 | .DW(EW*10), 57 | .AW(AW), 58 | .DEPTH(DEPTH), 59 | .MODULE_ID("mvu-mrf"), 60 | .RAM_ID(ram_id) 61 | ) ram_i ( 62 | .wr_en (wr_en), 63 | .wr_addr (wr_addr), 64 | .wr_data (wr_data[(NUM_DSP-ram_id)*EW*10-1 : (NUM_DSP-(ram_id+1))*EW*10]), 65 | .rd_addr (rd_addr_balance[2*ram_id-1]), 66 | .rd_data (rd_data[(NUM_DSP-ram_id)*EW*10-1 : (NUM_DSP-(ram_id+1))*EW*10]), 67 | .clk (clk), 68 | .rst (rst) 69 | ); 70 | end 71 | end
72 | endgenerate 73 | 74 | endmodule 75 | 76 |
-------------------------------------------------------------------------------- /rtl/inst_fifo.sv: --------------------------------------------------------------------------------
`include "npu.vh"

// Two-level instruction FIFO with tag look-ahead.
//
// Writes land in a deep FIFO (m20k_fifo). Whenever that FIFO has data and the
// small front FIFO (mlab_fifo) can accept a word, one entry is moved across.
// In parallel, the tag field of every entry moved into the front FIFO is
// stored in tag_lookahead at the same position, so that rd_ok can be derived
// by comparing current_tag with a stored tag instead of decoding rd_data.
// Slots that hold no entry contain the all-ones tag.
module inst_fifo # (
    parameter DW              = 64,                       // FIFO data width
    parameter DEPTH           = 512,                      // FIFO depth
    parameter ID              = 0,                        // Unique FIFO ID (used for debugging)
    parameter TARGET_FPGA     = `TARGET_FPGA,
    parameter AW              = $clog2(DEPTH),
    parameter MLAB_FIFO_DEPTH = 7,
    parameter MLAB_FIFO_ADDRW = $clog2(MLAB_FIFO_DEPTH),
    parameter NTAG            = `NTAG,
    parameter NTAGW           = `NTAGW,
    parameter MODULE          = "evrf"                    // selects which tag macro is applied
) (
    input              clk,
    input              rst,
    input              wr_en,
    input  [DW-1:0]    wr_data,
    output             wr_ok,
    input              rd_en,
    output [DW-1:0]    rd_data,
    output             rd_ok,
    input  [NTAGW-1:0] current_tag
);

// Tag value stored in slots that hold no instruction.
localparam [NTAGW-1:0] TAG_EMPTY = {(NTAGW){1'b1}};

wire          m20k_fifo_rd_ok, mlab_fifo_wr_ok, mlab_fifo_rd_ok;
wire [DW-1:0] m20k_fifo_rd_data;

// Transfer strobe and its one-cycle delayed copy: the deep FIFO is popped
// with rd_from_m20k and the front FIFO is pushed one cycle later.
reg rd_from_m20k, r_rd_from_m20k;

reg [NTAGW-1:0]           tag_lookahead [0:MLAB_FIFO_DEPTH-1];
reg [MLAB_FIFO_ADDRW-1:0] rd_ptr, wr_ptr;

reg inst_rd_ok;

normal_fifo # (
    .DW          (DW),
    .DEPTH       (DEPTH),
    .ID          (ID),
    .TARGET_FPGA (TARGET_FPGA),
    .AW          (AW)
) m20k_fifo (
    .clk     (clk),
    .rst     (rst),
    .wr_en   (wr_en),
    .wr_data (wr_data),
    .wr_ok   (wr_ok),
    .rd_en   (rd_from_m20k),
    .rd_data (m20k_fifo_rd_data),
    .rd_ok   (m20k_fifo_rd_ok)
);

mlab_fifo # (
    .DW          (DW),
    .DEPTH       (MLAB_FIFO_DEPTH),
    .ID          (ID),
    .TARGET_FPGA (TARGET_FPGA),
    .AW          (MLAB_FIFO_ADDRW)
) mlab_fifo (
    .clk     (clk),
    .rst     (rst),
    .wr_en   (r_rd_from_m20k),
    .wr_data (m20k_fifo_rd_data),
    .wr_ok   (mlab_fifo_wr_ok),
    .rd_en   (rd_en),
    .rd_data (rd_data),
    .rd_ok   (mlab_fifo_rd_ok)
);

// Look-ahead comparisons: state_tm1 checks the slot at rd_ptr, state_t the
// slot that follows it (with wrap-around). Only state_t feeds rd_ok.
// NOTE(review): rd_ok is always taken from the slot after rd_ptr, also in
// cycles without rd_en - confirm against the schedulers that drive rd_en.
wire state_t, state_tm1;
assign state_tm1 = (current_tag >= tag_lookahead[rd_ptr]);
assign state_t   = (rd_ptr == MLAB_FIFO_DEPTH-1)? (current_tag >= tag_lookahead[0]) : (current_tag >= tag_lookahead[rd_ptr+1]);

// Single-bit control registers.
always @ (posedge clk) begin
    if (rst) begin
        rd_from_m20k   <= 1'b0;
        r_rd_from_m20k <= 1'b0;
        inst_rd_ok     <= 1'b0;
    end else begin
        rd_from_m20k   <= m20k_fifo_rd_ok && mlab_fifo_wr_ok;
        r_rd_from_m20k <= rd_from_m20k;
        inst_rd_ok     <= state_t;
    end
end

// Tag shadow of the front FIFO and its circular read/write pointers.
integer slot;
always @ (posedge clk) begin
    if (rst) begin
        rd_ptr <= 'd0;
        wr_ptr <= 'd0;
        for (slot = 0; slot < MLAB_FIFO_DEPTH; slot = slot + 1) begin
            tag_lookahead[slot] <= TAG_EMPTY;
        end
    end else begin
        // Entry arriving in the front FIFO: record its tag.
        if (r_rd_from_m20k) begin
            if (MODULE == "evrf") begin
                tag_lookahead[wr_ptr] <= `evrf_uinst_tag(m20k_fifo_rd_data);
            end else if (MODULE == "mfu") begin
                tag_lookahead[wr_ptr] <= `mfu_uinst_tag(m20k_fifo_rd_data);
            end
            wr_ptr <= (wr_ptr == MLAB_FIFO_DEPTH-1)?
                'd0: MLAB_FIFO_ADDRW'(wr_ptr + 1'b1);
        end

        // Entry leaving the front FIFO: mark its slot empty again. This
        // stays after the write above so that it takes priority if both
        // ever address the same slot.
        if (rd_en) begin
            tag_lookahead[rd_ptr] <= TAG_EMPTY;
            rd_ptr <= (rd_ptr == MLAB_FIFO_DEPTH-1)?
                'd0: MLAB_FIFO_ADDRW'(rd_ptr + 1'b1);
        end
    end
end

assign rd_ok = inst_rd_ok;

endmodule
-------------------------------------------------------------------------------- /rtl/inst_ram.sv: -------------------------------------------------------------------------------- 1 | `include "npu.vh" 2 | 3 | module inst_ram # ( 4 | parameter MODULE_ID = "", 5 | parameter OUTREG = "CLOCK0", 6 | parameter ID = 0, 7 | parameter ID_UNITS = (ID%10) + 8'h30, 8 | parameter ID_TENS = (ID/10 == 0)? "": ((ID/10)%10) + 8'h30, 9 | parameter ID_HUNDREDS = (ID/100 == 0)? "": (ID/100) + 8'h30, 10 | parameter DW = 32, 11 | parameter DEPTH = 512, 12 | parameter AW = 9, 13 | parameter RTL_DIR = `RTL_DIR, 14 | parameter TARGET_FPGA = `TARGET_FPGA 15 | )( 16 | input wr_en, 17 | input [AW-1:0] wr_addr, 18 | input [AW-1:0] rd_addr, 19 | input [DW-1:0] wr_data, 20 | output [DW-1:0] rd_data, 21 | input clk, 22 | input rst 23 | ); 24 | 25 | wire [DW-1:0] sub_wire0; 26 | assign rd_data = sub_wire0[DW-1:0]; 27 | 28 | 29 | //localparam RAM_SRC = {RTL_DIR, "mif_files/top_sched.mif"}; 30 | 31 | altera_syncram altera_syncram_component ( 32 | .address_a (wr_addr), 33 | .address_b (rd_addr), 34 | .clock0 (clk), 35 | .data_a (wr_data), 36 | .wren_a (wr_en), 37 | .q_b (sub_wire0), 38 | .aclr0 (1'b0), 39 | .aclr1 (1'b0), 40 | .address2_a (1'b1), 41 | .address2_b (1'b1), 42 | .addressstall_a(1'b0), 43 | .addressstall_b(1'b0), 44 | .byteena_a (1'b1), 45 | .byteena_b (1'b1), 46 | .clock1 (1'b1), 47 | .clocken0 (1'b1), 48 | .clocken1 (1'b1), 49 | .clocken2 (1'b1), 50 | .clocken3 (1'b1), 51 | .data_b ({(DW){1'b1}}), 52 | .eccencbypass (1'b0), 53 | .eccencparity (8'b0), 54 | .eccstatus (), 55 | .q_a (), 56 | .rden_a (1'b1), 57 | .rden_b (1'b1), 58 | .sclr (1'b0), 59 | .wren_b (1'b0) 60 | ); 61 | 62 | defparam 63 | altera_syncram_component.address_aclr_b = "NONE", 64 | altera_syncram_component.address_reg_b = "CLOCK0", 65 | altera_syncram_component.clock_enable_input_a = "BYPASS", 66 | altera_syncram_component.clock_enable_input_b = "BYPASS", 67 | altera_syncram_component.clock_enable_output_b = "BYPASS", 68 | /*`ifdef DEPLOY 69 | altera_syncram_component.init_file = RAM_SRC, 70 | `endif*/ 71 | altera_syncram_component.enable_ecc = "FALSE", 72 | altera_syncram_component.intended_device_family = TARGET_FPGA, 73 | altera_syncram_component.lpm_type = "altera_syncram", 74 | altera_syncram_component.numwords_a = DEPTH, 75 | altera_syncram_component.numwords_b = DEPTH, 76 | altera_syncram_component.operation_mode = "DUAL_PORT", 77 |
altera_syncram_component.outdata_aclr_b = "NONE", 78 | altera_syncram_component.outdata_sclr_b = "NONE", 79 | altera_syncram_component.outdata_reg_b = OUTREG, 80 | altera_syncram_component.power_up_uninitialized = "FALSE", 81 | altera_syncram_component.ram_block_type = "M20K", 82 | altera_syncram_component.read_during_write_mode_mixed_ports = "DONT_CARE", 83 | altera_syncram_component.widthad_a = AW, 84 | altera_syncram_component.widthad_b = AW, 85 | altera_syncram_component.width_a = DW, 86 | altera_syncram_component.width_b = DW, 87 | altera_syncram_component.width_byteena_a = 1; 88 | 89 | 90 | // Debug 91 | // always @ (posedge clk) begin 92 | // if(wr_en) 93 | // $display("[%0t][%s] wr ram%d[%d] = %x(%d,%d,%d,%d)", 94 | // $time, `__FILE__, ID, wr_addr, wr_data, wr_data[7:0], wr_data[15:8], 95 | // wr_data[23:16], wr_data[31:24]); 96 | // end 97 | 98 | endmodule 99 | 100 |
-------------------------------------------------------------------------------- /rtl/mrf_ram.sv: --------------------------------------------------------------------------------
`include "npu.vh"

// One simple dual-port M20K RAM (write on port A, read on port B), wrapped
// around altera_syncram and initialised from a memory-initialisation file.
//
// The init file name is assembled from the parameters as
//   <RTL_DIR>mif_files/<MODULE_ID><hundreds><tens><units>_<RAM_ID>.mif
// where the three ID digits and RAM_ID are turned into characters by adding
// 8'h30 (ASCII '0'). rst is not used.
module mrf_ram # (
    parameter MODULE_ID   = "",
    parameter OUTREG      = "CLOCK0",
    parameter ID          = 0,
    parameter ID_UNITS    = (ID%10) + 8'h30,
    parameter ID_TENS     = ((ID/10)%10) + 8'h30,
    parameter ID_HUNDREDS = (ID/100) + 8'h30,
    parameter RAM_ID      = 0,
    parameter DW          = 32,
    parameter DEPTH       = 512,
    parameter AW          = 9,
    parameter RTL_DIR     = `RTL_DIR,
    parameter TARGET_FPGA = `TARGET_FPGA
)(
    input           wr_en,
    input  [AW-1:0] wr_addr,
    input  [AW-1:0] rd_addr,
    input  [DW-1:0] wr_data,
    output [DW-1:0] rd_data,
    input           clk,
    input           rst
);

localparam RAM_SRC = {RTL_DIR, "mif_files/", MODULE_ID, ID_HUNDREDS, ID_TENS, ID_UNITS, "_", RAM_ID+8'h30, ".mif"};

// Port-B read data straight from the RAM primitive.
wire [DW-1:0] ram_q;
assign rd_data = ram_q[DW-1:0];

// Configuration is passed as a named parameter override on the instance
// (same names and values as the former defparam list).
altera_syncram # (
    .address_aclr_b                     ("NONE"),
    .address_reg_b                      ("CLOCK0"),
    .clock_enable_input_a               ("BYPASS"),
    .clock_enable_input_b               ("BYPASS"),
    .clock_enable_output_b              ("BYPASS"),
    .init_file                          (RAM_SRC),
    .enable_ecc                         ("FALSE"),
    .intended_device_family             (TARGET_FPGA),
    .lpm_type                           ("altera_syncram"),
    .numwords_a                         (DEPTH),
    .numwords_b                         (DEPTH),
    .operation_mode                     ("DUAL_PORT"),
    .outdata_aclr_b                     ("NONE"),
    .outdata_sclr_b                     ("NONE"),
    .outdata_reg_b                      (OUTREG),
    .power_up_uninitialized             ("FALSE"),
    .ram_block_type                     ("M20K"),
    .read_during_write_mode_mixed_ports ("DONT_CARE"),
    .widthad_a                          (AW),
    .widthad_b                          (AW),
    .width_a                            (DW),
    .width_b                            (DW),
    .width_byteena_a                    (1)
) altera_syncram_component (
    // Port A: write side
    .address_a      (wr_addr),
    .data_a         (wr_data),
    .wren_a         (wr_en),
    .byteena_a      (1'b1),
    .rden_a         (1'b1),
    .q_a            (),
    // Port B: read side
    .address_b      (rd_addr),
    .q_b            (ram_q),
    .data_b         ({(DW){1'b1}}),
    .wren_b         (1'b0),
    .byteena_b      (1'b1),
    .rden_b         (1'b1),
    // Clocking: single clock, all enables tied active
    .clock0         (clk),
    .clock1         (1'b1),
    .clocken0       (1'b1),
    .clocken1       (1'b1),
    .clocken2       (1'b1),
    .clocken3       (1'b1),
    // Unused control / ECC pins
    .aclr0          (1'b0),
    .aclr1          (1'b0),
    .sclr           (1'b0),
    .address2_a     (1'b1),
    .address2_b     (1'b1),
    .addressstall_a (1'b0),
    .addressstall_b (1'b0),
    .eccencbypass   (1'b0),
    .eccencparity   (8'b0),
    .eccstatus      ()
);

// Debug
// always @ (posedge clk) begin
//     if(wr_en)
//         $display("[%0t][%s] wr ram%d[%d] = %x(%d,%d,%d,%d)",
//             $time, `__FILE__, ID, wr_addr, wr_data, wr_data[7:0], wr_data[15:8],
//             wr_data[23:16], wr_data[31:24]);
// end

endmodule
-------------------------------------------------------------------------------- /rtl/mvu_vrf.sv: -------------------------------------------------------------------------------- 1 | `include "npu.vh" 2 | 3 | module mvu_vrf # ( 4 | parameter MODULE_ID = "", 5 | parameter OUTREG = "CLOCK0", 6 | parameter ID = 0, 7 | parameter DW = 32, 8 | parameter DEPTH = 512, 9 | parameter AW = 9, 10 | parameter RTL_DIR = `RTL_DIR, 11 | parameter TARGET_FPGA = `TARGET_FPGA, 12 | parameter EW = `EW, 13 | parameter DEVICE = (TARGET_FPGA == "S10-Prime")? "Stratix 10": TARGET_FPGA, 14 | parameter PRIME_DOTW = `PRIME_DOTW, 15 | parameter DOTW = `DOTW, 16 | parameter NUM_DSP = `NUM_DSP, 17 | parameter NUM_RAM = (TARGET_FPGA == "S10-Prime")? NUM_DSP: 1, 18 | parameter RW = DW / NUM_RAM, 19 | parameter VRFIDW = `VRFIDW, 20 | parameter MVU_TILE = 0 21 | )( 22 | input wr_en, 23 | input [AW-1:0] wr_addr, 24 | input [AW-1:0] rd_addr, 25 | input [DW-1:0] wr_data, 26 | input [VRFIDW-1:0] rd_id, 27 | input rd_en, 28 | output [RW-1:0] rd_data, 29 | input clk, 30 | input rst 31 | ); 32 | 33 | wire [RW-1:0] rdata [0:NUM_RAM-1]; 34 | reg [VRFIDW-1:0] id [0:1]; 35 | reg rd [0:1]; 36 | 37 | always @ (posedge clk) begin 38 | if(rst)begin 39 | id[0] <= 'd0; 40 | id[1] <= 'd0; 41 | rd[0] <= 0; 42 | rd[1] <= 0; 43 | end else begin 44 | id[0] <= rd_id; 45 | id[1] <= id[0]; 46 | 47 | rd[0] <= rd_en; 48 | rd[1] <= rd[0]; 49 | end 50 | end 51 | 52 | genvar i; 53 | generate 54 | for(i = 0; i < NUM_RAM; i = i + 1) begin: gen_mvu_vrf_ram 55 | altera_syncram altera_syncram_component ( 56 | .address_a (wr_addr), 57 | .address_b (rd_addr), 58 | .clock0 (clk), 59 | .data_a (wr_data[i*RW +: RW]), 60 | .wren_a (wr_en), 61 | .q_b (rdata[i]), 62 | .aclr0 (1'b0), 63 | .aclr1 (1'b0), 64 | .address2_a (1'b1),
65 | .address2_b (1'b1), 66 | .addressstall_a(1'b0), 67 | .addressstall_b(1'b0), 68 | .byteena_a (1'b1), 69 | .byteena_b (1'b1), 70 | .clock1 (1'b1), 71 | .clocken0 (1'b1), 72 | .clocken1 (1'b1), 73 | .clocken2 (1'b1), 74 | .clocken3 (1'b1), 75 | .data_b ({(RW){1'b1}}), 76 | .eccencbypass (1'b0), 77 | .eccencparity (8'b0), 78 | .eccstatus (), 79 | .q_a (), 80 | .rden_a (1'b1), 81 | .rden_b (1'b1), 82 | .sclr (1'b0), 83 | .wren_b (1'b0) 84 | ); 85 | 86 | defparam 87 | altera_syncram_component.address_aclr_b = "NONE", 88 | altera_syncram_component.address_reg_b = "CLOCK0", 89 | altera_syncram_component.clock_enable_input_a = "BYPASS", 90 | altera_syncram_component.clock_enable_input_b = "BYPASS", 91 | altera_syncram_component.clock_enable_output_b = "BYPASS", 92 | altera_syncram_component.enable_ecc = "FALSE", 93 | altera_syncram_component.intended_device_family = DEVICE, 94 | altera_syncram_component.lpm_type = "altera_syncram", 95 | altera_syncram_component.numwords_a = DEPTH, 96 | altera_syncram_component.numwords_b = DEPTH, 97 | altera_syncram_component.operation_mode = "DUAL_PORT", 98 | altera_syncram_component.outdata_aclr_b = "NONE", 99 | altera_syncram_component.outdata_sclr_b = "NONE", 100 | altera_syncram_component.outdata_reg_b = OUTREG, 101 | altera_syncram_component.power_up_uninitialized = "FALSE", 102 | altera_syncram_component.ram_block_type = "M20K", 103 | altera_syncram_component.read_during_write_mode_mixed_ports = "DONT_CARE", 104 | altera_syncram_component.widthad_a = AW, 105 | altera_syncram_component.widthad_b = AW, 106 | altera_syncram_component.width_a = RW, 107 | altera_syncram_component.width_b = RW, 108 | altera_syncram_component.width_byteena_a = 1; 109 | end 110 | 111 | endgenerate 112 | 113 | assign rd_data = rdata[id[1]]; 114 | 115 | 116 | `ifdef DISPLAY_MVU 117 | always @(posedge clk) begin 118 | if(wr_en && MVU_TILE == 0) begin 119 | $display("[%0t][MVU-VRF] wr_addr: %d, wr_data: %d %d %d %d %d %d %d %d %d %d", 120 | $time, 121 | 
wr_addr, 122 | $signed(wr_data[7:0]), 123 | $signed(wr_data[15:8]), 124 | $signed(wr_data[23:16]), 125 | $signed(wr_data[31:24]), 126 | $signed(wr_data[39:32]), 127 | $signed(wr_data[47:40]), 128 | $signed(wr_data[55:48]), 129 | $signed(wr_data[63:56]), 130 | $signed(wr_data[71:64]), 131 | $signed(wr_data[79:72])); 132 | end 133 | 134 | if(rd_en && MVU_TILE == 0) begin 135 | $display("[%0t][MVU-VRF] rd_addr: %d %b", $time, rd_addr, rd_addr); 136 | end 137 | 138 | if(rd[1] && MVU_TILE == 0) begin 139 | $display("[%0t][MVU-VRF] vrf_id: %d, rd_data: %d %d %d %d %d %d %d %d %d %d", 140 | $time, 141 | id[1], 142 | $signed(rd_data[7:0]), 143 | $signed(rd_data[15:8]), 144 | $signed(rd_data[23:16]), 145 | $signed(rd_data[31:24]), 146 | $signed(rd_data[39:32]), 147 | $signed(rd_data[47:40]), 148 | $signed(rd_data[55:48]), 149 | $signed(rd_data[63:56]), 150 | $signed(rd_data[71:64]), 151 | $signed(rd_data[79:72])); 152 | end 153 | end 154 | `endif 155 | 156 | endmodule -------------------------------------------------------------------------------- /rtl/nx_axbs.sv: -------------------------------------------------------------------------------- 1 | // Copyright 2020 Intel Corporation. 2 | // 3 | // This reference design file is subject licensed to you by the terms and 4 | // conditions of the applicable License Terms and Conditions for Hardware 5 | // Reference Designs and/or Design Examples (either as signed by you or 6 | // found at https://www.altera.com/common/legal/leg-license_agreement.html ). 7 | // 8 | // As stated in the license, you agree to only use this reference design 9 | // solely in conjunction with Intel FPGAs or Intel CPLDs. 10 | // 11 | // THE REFERENCE DESIGN IS PROVIDED "AS IS" WITHOUT ANY EXPRESS OR IMPLIED 12 | // WARRANTY OF ANY KIND INCLUDING WARRANTIES OF MERCHANTABILITY, 13 | // NONINFRINGEMENT, OR FITNESS FOR A PARTICULAR PURPOSE. 
Intel does not 14 | // warrant or assume responsibility for the accuracy or completeness of any 15 | // information, links or other items within the Reference Design and any 16 | // accompanying materials. 17 | // 18 | // In the event that you do not agree with such terms and conditions, do not 19 | // use the reference design file. 20 | ///////////////////////////////////////////////////////////////////////////// 21 | 22 | module nx_axbs #( // signed SIZE_A x SIZE_B multiplier wrapper: instantiates axbs or nx_axbs_core depending on operand width 23 | parameter SIZE_A = 32, 24 | parameter SIZE_B = 32 25 | )( 26 | input clk, 27 | input signed [SIZE_A-1:0] din_a, 28 | input signed [SIZE_B-1:0] din_b, 29 | output signed [SIZE_A+SIZE_B-1:0] dout 30 | ); 31 | 32 | generate 33 | 34 | if ((SIZE_A > 512) && (SIZE_B > 512)) begin 35 | initial begin 36 | $fatal(1, "Error: %0dx%0d multiplier is not supported", SIZE_A, SIZE_B); // the first $fatal argument is the finish_number; the format string comes second 37 | end 38 | end 39 | 40 | localparam NUM_A = (SIZE_A - 2) / 7 + 1; // din_a is padded to NUM_A*7+1 bits below 41 | localparam NUM_B = (SIZE_B - 2) / 7 + 1; // din_b is padded to NUM_B*7+1 bits below 42 | 43 | if ((NUM_A == 1) || (NUM_B == 1)) begin 44 | 45 | axbs #(.SIZE_A(SIZE_A), .SIZE_B(SIZE_B)) mult ( 46 | .clk(clk), 47 | .din_a(din_a), 48 | .din_b(din_b), 49 | .dout(dout) 50 | ); 51 | 52 | end 53 | else 54 | begin 55 | localparam SIZE_A_PRIME = NUM_A * 7 + 1; 56 | localparam SIZE_B_PRIME = NUM_B * 7 + 1; 57 | 58 | wire signed [SIZE_A_PRIME-1:0] din_a_prime; 59 | wire signed [SIZE_B_PRIME-1:0] din_b_prime; 60 | 61 | assign din_a_prime = {din_a, {(SIZE_A_PRIME - SIZE_A){1'b0}}}; // left-align: zero padding goes at the LSB end 62 | assign din_b_prime = {din_b, {(SIZE_B_PRIME - SIZE_B){1'b0}}}; 63 | 64 | wire signed [SIZE_A_PRIME+SIZE_B_PRIME-1:0] dout_prime; 65 | 66 | nx_axbs_core #(.SIZE_A(SIZE_A_PRIME), .SIZE_B(SIZE_B_PRIME)) mult ( 67 | .clk(clk), 68 | .din_a(din_a_prime), 69 | .din_b(din_b_prime), 70 | .dout(dout_prime) 71 | ); 72 | 73 | assign dout = dout_prime[SIZE_A_PRIME+SIZE_B_PRIME-1:SIZE_A_PRIME+SIZE_B_PRIME-SIZE_A-SIZE_B]; // keep the top SIZE_A+SIZE_B bits, dropping the low bits introduced by the LSB padding 74 | end 75 | endgenerate 76 | 77 | endmodule 78 | -------------------------------------------------------------------------------- /rtl/nx_axbs_core.sv: 
-------------------------------------------------------------------------------- 1 | // Copyright 2020 Intel Corporation. 2 | // 3 | // This reference design file is subject licensed to you by the terms and 4 | // conditions of the applicable License Terms and Conditions for Hardware 5 | // Reference Designs and/or Design Examples (either as signed by you or 6 | // found at https://www.altera.com/common/legal/leg-license_agreement.html ). 7 | // 8 | // As stated in the license, you agree to only use this reference design 9 | // solely in conjunction with Intel FPGAs or Intel CPLDs. 10 | // 11 | // THE REFERENCE DESIGN IS PROVIDED "AS IS" WITHOUT ANY EXPRESS OR IMPLIED 12 | // WARRANTY OF ANY KIND INCLUDING WARRANTIES OF MERCHANTABILITY, 13 | // NONINFRINGEMENT, OR FITNESS FOR A PARTICULAR PURPOSE. Intel does not 14 | // warrant or assume responsibility for the accuracy or completeness of any 15 | // information, links or other items within the Reference Design and any 16 | // accompanying materials. 17 | // 18 | // In the event that you do not agree with such terms and conditions, do not 19 | // use the reference design file. 20 | ///////////////////////////////////////////////////////////////////////////// 21 | 22 | module nx_axbs_core #( 23 | parameter SIZE_A = 15, 24 | parameter SIZE_B = 15, 25 | parameter SIZE_O = SIZE_A + SIZE_B 26 | )( 27 | input clk, 28 | input signed [SIZE_A-1:0] din_a, 29 | input signed [SIZE_B-1:0] din_b, 30 | output reg signed [SIZE_A+SIZE_B-1:0] dout 31 | ); 32 | localparam NUM_A = (SIZE_A - 2) / 7 + 1; 33 | localparam NUM_B = (SIZE_B - 2) / 7 + 1; 34 | 35 | genvar i, j; 36 | generate 37 | 38 | localparam LATENCY = (NUM_A <= NUM_B) ? 
4+$clog2((NUM_A-1)/6+1) : 4+$clog2((NUM_B-1)/6+1); 39 | 40 | wire signed [7:0] din_a_w[0:NUM_A-1]; 41 | wire signed [7:0] din_b_w[0:NUM_B-1]; 42 | 43 | 44 | for (i = 0; i < NUM_A; i=i+1) begin : assign_a 45 | if (i < NUM_A - 1) 46 | assign din_a_w[i] = {1'b0, din_a[7*i+6:7*i]}; 47 | else 48 | assign din_a_w[i] = din_a[7*i+7:7*i]; 49 | end 50 | 51 | for (i = 0; i < NUM_B; i=i+1) begin : assign_b 52 | if (i < NUM_B - 1) 53 | assign din_b_w[i] = {1'b0, din_b[7*i+6:7*i]}; 54 | else 55 | assign din_b_w[i] = din_b[7*i+7:7*i]; 56 | end 57 | 58 | 59 | wire signed [20:0] dot_product_out[0:NUM_A+NUM_B-1]; 60 | 61 | wire signed [7*(NUM_A+NUM_B)-1:0] dout1; 62 | wire signed [7*(NUM_A+NUM_B):0] dout2; 63 | wire signed [7*(NUM_A+NUM_B+1)-1:0] dout3; 64 | wire signed [7*(NUM_A+NUM_B+1):0] dout4; 65 | 66 | assign dout2[6:0] = 0; 67 | assign dout3[13:0] = 0; 68 | assign dout4[14:0] = 0; 69 | 70 | if (7*(NUM_A+NUM_B)-1 >= 7*(NUM_A+NUM_B-2)+7) 71 | assign dout1[7*(NUM_A+NUM_B)-1:7*(NUM_A+NUM_B-2)+7] = 0; 72 | 73 | assign dout2[7*(NUM_A+NUM_B)] = 0; 74 | 75 | for (i = 0; i < NUM_A+NUM_B-1; i=i+1) begin: loopa 76 | 77 | nx_axbs_slice #(.NUM_A(NUM_A), .NUM_B(NUM_B), .INDEX(i), .SIZE_OUT(21), .LATENCY(LATENCY)) dot_product( 78 | .clk(clk), 79 | .din_a(din_a_w), 80 | .din_b(din_b_w), 81 | .dout(dot_product_out[i]) 82 | ); 83 | 84 | assign {dout3[7*i+20:7*i+14], dout2[7*i+13:7*i+7], dout1[7*i+6:7*i]} = dot_product_out[i]; 85 | assign {dout4[7*i+21:7*i+15]} = {dot_product_out[i][20], 6'b0}; 86 | end 87 | 88 | reg signed [SIZE_A+SIZE_B-1:0] dout12; 89 | reg signed [SIZE_A+SIZE_B-1:0] dout31; 90 | 91 | always @(posedge clk) begin 92 | 93 | dout12 <= SIZE_O'(dout1 + dout2); 94 | dout31 <= SIZE_O'(dout3 - dout4); 95 | dout <= dout12 + dout31; 96 | end 97 | 98 | endgenerate 99 | 100 | endmodule 101 | -------------------------------------------------------------------------------- /rtl/nx_axbs_slice.sv: -------------------------------------------------------------------------------- 1 | // 
Copyright 2020 Intel Corporation. 2 | // 3 | // This reference design file is subject licensed to you by the terms and 4 | // conditions of the applicable License Terms and Conditions for Hardware 5 | // Reference Designs and/or Design Examples (either as signed by you or 6 | // found at https://www.altera.com/common/legal/leg-license_agreement.html ). 7 | // 8 | // As stated in the license, you agree to only use this reference design 9 | // solely in conjunction with Intel FPGAs or Intel CPLDs. 10 | // 11 | // THE REFERENCE DESIGN IS PROVIDED "AS IS" WITHOUT ANY EXPRESS OR IMPLIED 12 | // WARRANTY OF ANY KIND INCLUDING WARRANTIES OF MERCHANTABILITY, 13 | // NONINFRINGEMENT, OR FITNESS FOR A PARTICULAR PURPOSE. Intel does not 14 | // warrant or assume responsibility for the accuracy or completeness of any 15 | // information, links or other items within the Reference Design and any 16 | // accompanying materials. 17 | // 18 | // In the event that you do not agree with such terms and conditions, do not 19 | // use the reference design file. 20 | ///////////////////////////////////////////////////////////////////////////// 21 | 22 | module nx_axbs_slice #( 23 | parameter NUM_A = 2, 24 | parameter NUM_B = 2, 25 | parameter INDEX = 0, 26 | parameter SIZE_OUT = 16, 27 | parameter LATENCY = 4 28 | )( 29 | input clk, 30 | input signed [7:0] din_a[0:NUM_A-1], 31 | input signed [7:0] din_b[0:NUM_B-1], 32 | 33 | output signed [SIZE_OUT-1:0] dout 34 | ); 35 | 36 | localparam MIN_A = (INDEX < NUM_B) ? 0 : (INDEX - NUM_B + 1); 37 | localparam MAX_A = (INDEX < NUM_A) ? INDEX : NUM_A - 1; 38 | localparam MAX_B = (INDEX < NUM_B) ? 
INDEX : NUM_B - 1; 39 | localparam LOCAL_NUM = MAX_A - MIN_A + 1; 40 | 41 | wire signed [7:0] din_a_local_w[0:LOCAL_NUM-1]; 42 | wire signed [7:0] din_b_local_w[0:LOCAL_NUM-1]; 43 | 44 | genvar j; 45 | generate 46 | for (j = 0; j < LOCAL_NUM; j=j+1) begin: loopb 47 | assign din_a_local_w[j] = {1'b0, din_a[j+MIN_A]}; 48 | assign din_b_local_w[j] = {1'b0, din_b[MAX_B-j]}; 49 | end 50 | 51 | wire signed [15+$clog2(LOCAL_NUM):0] dout_w; 52 | 53 | nx_dot_product_int8 #(.NUM(LOCAL_NUM), .LATENCY(LATENCY)) dot_product( 54 | .clk(clk), 55 | .din_a(din_a_local_w), 56 | .din_b(din_b_local_w), 57 | .dout(dout_w[15+$clog2(LOCAL_NUM):0]) 58 | ); 59 | 60 | if (SIZE_OUT > 16+$clog2(LOCAL_NUM)) 61 | assign dout = { {(SIZE_OUT - $clog2(LOCAL_NUM) - 16) {dout_w[15+$clog2(LOCAL_NUM)]}}, dout_w}; 62 | else 63 | assign dout = dout_w; 64 | endgenerate 65 | endmodule 66 | 67 | -------------------------------------------------------------------------------- /rtl/nx_dot6_int8.sv: -------------------------------------------------------------------------------- 1 | // Copyright 2020 Intel Corporation. 2 | // 3 | // This reference design file is subject licensed to you by the terms and 4 | // conditions of the applicable License Terms and Conditions for Hardware 5 | // Reference Designs and/or Design Examples (either as signed by you or 6 | // found at https://www.altera.com/common/legal/leg-license_agreement.html ). 7 | // 8 | // As stated in the license, you agree to only use this reference design 9 | // solely in conjunction with Intel FPGAs or Intel CPLDs. 10 | // 11 | // THE REFERENCE DESIGN IS PROVIDED "AS IS" WITHOUT ANY EXPRESS OR IMPLIED 12 | // WARRANTY OF ANY KIND INCLUDING WARRANTIES OF MERCHANTABILITY, 13 | // NONINFRINGEMENT, OR FITNESS FOR A PARTICULAR PURPOSE. Intel does not 14 | // warrant or assume responsibility for the accuracy or completeness of any 15 | // information, links or other items within the Reference Design and any 16 | // accompanying materials. 
17 | // 18 | // In the event that you do not agree with such terms and conditions, do not 19 | // use the reference design file. 20 | ///////////////////////////////////////////////////////////////////////////// 21 | 22 | module nx_dot6_int8 ( 23 | input clk, 24 | input signed [7:0] din_a1, 25 | input signed [7:0] din_b1, 26 | input signed [7:0] din_a2, 27 | input signed [7:0] din_b2, 28 | input signed [7:0] din_a3, 29 | input signed [7:0] din_b3, 30 | input signed [7:0] din_a4, 31 | input signed [7:0] din_b4, 32 | input signed [7:0] din_a5, 33 | input signed [7:0] din_b5, 34 | input signed [7:0] din_a6, 35 | input signed [7:0] din_b6, 36 | output reg signed [18:0] dout 37 | ); 38 | 39 | wire signed [18:0] dout_w; 40 | wire [5:0] tmp; 41 | fourteennm_dsp_prime #( 42 | .dsp_mode("vector_fxp"), 43 | .dsp_sel_int4("select_int8"), 44 | .dsp_fp32_sub_en("float_sub_disabled"), 45 | .dsp_cascade("cascade_disabled") 46 | ) 47 | dsp_prime_wys0 ( 48 | .ena(1'b1), 49 | .clk(clk), 50 | .data_in({din_b6,din_a6,din_b5,din_a5,din_b4,din_a4,din_b3,din_a3,din_b2,din_a2,din_b1,din_a1}), 51 | .clr({1'b0,1'b0}), 52 | .result_l({tmp,dout_w}), 53 | 54 | .load_buf_sel(1'b0), 55 | .mode_switch(1'b0), 56 | .load_bb_one(1'b0), 57 | .load_bb_two(1'b0), 58 | .feed_sel(2'b0), 59 | .zero_en(1'b0), 60 | .shared_exponent(8'h0), 61 | .cascade_weight_in(88'h0), 62 | .cascade_data_in(96'h0), 63 | .acc_en(1'b0), 64 | 65 | .cascade_weight_out(), 66 | .cascade_data_out() 67 | ); 68 | always @(posedge clk) begin 69 | dout <= dout_w; 70 | end 71 | 72 | endmodule 73 | 74 | -------------------------------------------------------------------------------- /rtl/nx_dot_product_int8.sv: -------------------------------------------------------------------------------- 1 | // Copyright 2020 Intel Corporation. 
2 | // 3 | // This reference design file is subject licensed to you by the terms and 4 | // conditions of the applicable License Terms and Conditions for Hardware 5 | // Reference Designs and/or Design Examples (either as signed by you or 6 | // found at https://www.altera.com/common/legal/leg-license_agreement.html ). 7 | // 8 | // As stated in the license, you agree to only use this reference design 9 | // solely in conjunction with Intel FPGAs or Intel CPLDs. 10 | // 11 | // THE REFERENCE DESIGN IS PROVIDED "AS IS" WITHOUT ANY EXPRESS OR IMPLIED 12 | // WARRANTY OF ANY KIND INCLUDING WARRANTIES OF MERCHANTABILITY, 13 | // NONINFRINGEMENT, OR FITNESS FOR A PARTICULAR PURPOSE. Intel does not 14 | // warrant or assume responsibility for the accuracy or completeness of any 15 | // information, links or other items within the Reference Design and any 16 | // accompanying materials. 17 | // 18 | // In the event that you do not agree with such terms and conditions, do not 19 | // use the reference design file. 
20 | ///////////////////////////////////////////////////////////////////////////// 21 | 22 | module nx_dot_product_int8 #( 23 | parameter NUM = 16, 24 | parameter LATENCY = 4+$clog2((NUM-1)/6+1) 25 | )( 26 | input clk, 27 | input signed [7:0] din_a[0:NUM-1], 28 | input signed [7:0] din_b[0:NUM-1], 29 | output signed [15+$clog2(NUM):0] dout 30 | ); 31 | 32 | localparam MIN_LATENCY = 4+$clog2((NUM-1)/6+1); 33 | localparam DSP_NUM = (NUM - 1) / 6 + 1; 34 | 35 | localparam NUM_6 = DSP_NUM * 6; 36 | 37 | wire signed [7:0] din_a_w[0:NUM_6-1]; 38 | wire signed [7:0] din_b_w[0:NUM_6-1]; 39 | 40 | wire signed [18:0] dsp_out[0:DSP_NUM-1]; 41 | genvar i; 42 | generate 43 | 44 | for (i = 0; i < NUM_6; i=i+1) begin: loop1 45 | if (i < NUM) begin 46 | assign din_a_w[i] = din_a[i]; 47 | assign din_b_w[i] = din_b[i]; 48 | end 49 | else 50 | begin 51 | assign din_a_w[i] = 0; 52 | assign din_b_w[i] = 0; 53 | end 54 | 55 | end 56 | 57 | for (i = 0; i < DSP_NUM; i=i+1) begin: loop2 58 | if ((i < DSP_NUM-1) || (NUM_6 - NUM != 5)) begin 59 | nx_dot6_int8 dot ( 60 | .clk(clk), 61 | .din_a1(din_a_w[6*i]), 62 | .din_b1(din_b_w[6*i]), 63 | .din_a2(din_a_w[6*i+1]), 64 | .din_b2(din_b_w[6*i+1]), 65 | .din_a3(din_a_w[6*i+2]), 66 | .din_b3(din_b_w[6*i+2]), 67 | .din_a4(din_a_w[6*i+3]), 68 | .din_b4(din_b_w[6*i+3]), 69 | .din_a5(din_a_w[6*i+4]), 70 | .din_b5(din_b_w[6*i+4]), 71 | .din_a6(din_a_w[6*i+5]), 72 | .din_b6(din_b_w[6*i+5]), 73 | .dout(dsp_out[i]) 74 | ); 75 | end else begin 76 | axbs #(.SIZE_A(8), .SIZE_B(8)) mult ( 77 | .clk(clk), 78 | .din_a(din_a_w[6*i]), 79 | .din_b(din_b_w[6*i]), 80 | .dout(dsp_out[i][15:0]) 81 | ); 82 | assign dsp_out[i][18:16] = {3{dsp_out[i][15]}}; 83 | end 84 | end 85 | 86 | wire signed [15+$clog2(NUM):0] dout_ww; 87 | 88 | if (DSP_NUM > 1) begin 89 | wire signed [18+$clog2(DSP_NUM):0] dout_w; 90 | adder_tree #(.SIZE(19), .NUM(DSP_NUM)) adder_tree_inst ( 91 | .clk(clk), 92 | .din(dsp_out), 93 | .dout(dout_w) 94 | ); 95 | assign dout_ww = 
dout_w[15+$clog2(NUM):0]; 96 | end else begin 97 | assign dout_ww = dsp_out[0][15+$clog2(NUM):0]; 98 | end 99 | 100 | integer j; 101 | if (LATENCY < MIN_LATENCY) begin 102 | initial begin 103 | $fatal(1, "Specified latency %0d is too small", LATENCY); // the first $fatal argument is the finish_number; the format string comes second 104 | end 105 | end else if (LATENCY == MIN_LATENCY) begin // chained with else so the delay-line branch below is not also elaborated (with a negative array bound) when LATENCY is too small 106 | assign dout = dout_ww; 107 | end else begin 108 | reg signed [15+$clog2(NUM):0] dout_r[0:LATENCY-MIN_LATENCY-1]; // LATENCY-MIN_LATENCY extra register stages pad the result out to the requested latency 109 | always @(posedge clk) begin 110 | dout_r[0] <= dout_ww; 111 | for (j = 1; j < LATENCY-MIN_LATENCY; j=j+1) begin 112 | dout_r[j] <= dout_r[j-1]; 113 | end 114 | end 115 | assign dout = dout_r[LATENCY-MIN_LATENCY-1]; 116 | end 117 | 118 | 119 | endgenerate 120 | 121 | endmodule 122 | 123 | -------------------------------------------------------------------------------- /rtl/pipeline_interconnect.sv: -------------------------------------------------------------------------------- 1 | 2 | // This module implements a simple pipelined interconnect from a source to a single sink. 
3 | (* altera_attribute = "-name AUTO_SHIFT_REGISTER_RECOGNITION OFF; -name DONT_MERGE_REGISTER ON" *) module pipeline_interconnect # ( 4 | parameter DATAW = 32, // width in bits of the word carried through the pipeline 5 | parameter LATENCY = 2 // number of register stages; the array and output index below need LATENCY >= 1 6 | ) ( 7 | input clk, 8 | input rst, // not referenced in this module: the pipeline registers have no reset 9 | input [DATAW-1:0] i_pipe_in, 10 | output [DATAW-1:0] o_pipe_out 11 | ); 12 | 13 | reg [DATAW-1:0] pipeline [0:LATENCY-1]; // one DATAW-bit register per stage 14 | 15 | integer t; 16 | always @ (posedge clk) begin 17 | // Set the input to the first pipeline stage 18 | pipeline[0] <= i_pipe_in; 19 | 20 | // Progress the pipeline 21 | for (t = 1; t < LATENCY; t = t + 1) begin 22 | pipeline[t] <= pipeline[t-1]; 23 | end 24 | end 25 | 26 | // Hook up outputs 27 | assign o_pipe_out = pipeline[LATENCY-1]; // last stage, i.e. the input delayed by LATENCY clock edges 28 | 29 | endmodule -------------------------------------------------------------------------------- /rtl/ram.sv: -------------------------------------------------------------------------------- 1 | `include "npu.vh" 2 | 3 | module ram # ( 4 | parameter MODULE_ID = "", 5 | parameter OUTREG = "CLOCK0", 6 | parameter ID = 0, 7 | parameter ID_UNITS = (ID%10) + 8'h30, 8 | parameter ID_TENS = (ID/10 == 0)? "": ((ID/10)%10) + 8'h30, 9 | parameter ID_HUNDREDS = (ID/100 == 0)? 
"": (ID/100) + 8'h30, 10 | parameter DW = 32, 11 | parameter DEPTH = 512, 12 | parameter AW = 9, 13 | parameter RTL_DIR = `RTL_DIR, 14 | parameter TARGET_FPGA = `TARGET_FPGA 15 | )( 16 | input wr_en, 17 | input [AW-1:0] wr_addr, 18 | input [AW-1:0] rd_addr, 19 | input [DW-1:0] wr_data, 20 | output [DW-1:0] rd_data, 21 | input clk, 22 | input rst 23 | ); 24 | 25 | wire [DW-1:0] sub_wire0; 26 | assign rd_data = sub_wire0[DW-1:0]; 27 | 28 | 29 | localparam RAM_SRC = {RTL_DIR, "mif_files/", MODULE_ID, ID_HUNDREDS, ID_TENS, ID_UNITS, ".mif"}; 30 | 31 | altera_syncram altera_syncram_component ( 32 | .address_a (wr_addr), 33 | .address_b (rd_addr), 34 | .clock0 (clk), 35 | .data_a (wr_data), 36 | .wren_a (wr_en), 37 | .q_b (sub_wire0), 38 | .aclr0 (1'b0), 39 | .aclr1 (1'b0), 40 | .address2_a (1'b1), 41 | .address2_b (1'b1), 42 | .addressstall_a(1'b0), 43 | .addressstall_b(1'b0), 44 | .byteena_a (1'b1), 45 | .byteena_b (1'b1), 46 | .clock1 (1'b1), 47 | .clocken0 (1'b1), 48 | .clocken1 (1'b1), 49 | .clocken2 (1'b1), 50 | .clocken3 (1'b1), 51 | .data_b ({(DW){1'b1}}), 52 | .eccencbypass (1'b0), 53 | .eccencparity (8'b0), 54 | .eccstatus (), 55 | .q_a (), 56 | .rden_a (1'b1), 57 | .rden_b (1'b1), 58 | .sclr (1'b0), 59 | .wren_b (1'b0) 60 | ); 61 | 62 | defparam 63 | altera_syncram_component.address_aclr_b = "NONE", 64 | altera_syncram_component.address_reg_b = "CLOCK0", 65 | altera_syncram_component.clock_enable_input_a = "BYPASS", 66 | altera_syncram_component.clock_enable_input_b = "BYPASS", 67 | altera_syncram_component.clock_enable_output_b = "BYPASS", 68 | //altera_syncram_component.init_file = RAM_SRC, 69 | altera_syncram_component.enable_ecc = "FALSE", 70 | altera_syncram_component.intended_device_family = TARGET_FPGA, 71 | altera_syncram_component.lpm_type = "altera_syncram", 72 | altera_syncram_component.numwords_a = DEPTH, 73 | altera_syncram_component.numwords_b = DEPTH, 74 | altera_syncram_component.operation_mode = "DUAL_PORT", 75 | 
altera_syncram_component.outdata_aclr_b = "NONE", 76 | altera_syncram_component.outdata_sclr_b = "NONE", 77 | altera_syncram_component.outdata_reg_b = OUTREG, 78 | altera_syncram_component.power_up_uninitialized = "FALSE", 79 | altera_syncram_component.ram_block_type = "M20K", 80 | altera_syncram_component.read_during_write_mode_mixed_ports = "DONT_CARE", 81 | altera_syncram_component.widthad_a = AW, 82 | altera_syncram_component.widthad_b = AW, 83 | altera_syncram_component.width_a = DW, 84 | altera_syncram_component.width_b = DW, 85 | altera_syncram_component.width_byteena_a = 1; 86 | 87 | 88 | // Debug 89 | // always @ (posedge clk) begin 90 | // if(wr_en) 91 | // $display("[%0t][%s] wr ram%d[%d] = %x(%d,%d,%d,%d)", 92 | // $time, `__FILE__, ID, wr_addr, wr_data, wr_data[7:0], wr_data[15:8], 93 | // wr_data[23:16], wr_data[31:24]); 94 | // end 95 | 96 | endmodule 97 | 98 | -------------------------------------------------------------------------------- /rtl/run_sim.sh: -------------------------------------------------------------------------------- 1 | QSYS_SIMDIR="." 
2 | QUARTUS_INSTALL_DIR=$QUARTUS_ROOTDIR 3 | USER_DEFINED_ELAB_OPTIONS="+vcs+lic+wait -debug_access+pp npu_tb.sv" 4 | SKIP_SIM=1 5 | TOP_LEVEL_NAME=npu_tb 6 | 7 | sh setup.sh QUARTUS_INSTALL_DIR=$QUARTUS_INSTALL_DIR USER_DEFINED_ELAB_OPTIONS="\"$USER_DEFINED_ELAB_OPTIONS\"" SKIP_SIM=$SKIP_SIM TOP_LEVEL_NAME=$TOP_LEVEL_NAME > rtl_sim_log 8 | 9 | #./simv +vcs+lic+wait 10 | -------------------------------------------------------------------------------- /rtl/self_tester_tb.v: -------------------------------------------------------------------------------- 1 | `timescale 1 ps / 1 ps 2 | 3 | module self_tester_tb; // testbench: drives clk and reset into self_tester_shim and exposes its status outputs as wires 4 | reg clk, reset; 5 | wire [2:0] status; 6 | wire [31:0] count; 7 | wire [31:0] perf_counter; 8 | wire done; 9 | 10 | self_tester_shim uut ( 11 | .clk(clk), 12 | .reset(reset), 13 | .o_test_status(status), 14 | .o_result_count(count), 15 | .o_perf_counter(perf_counter), 16 | .o_test_done(done) 17 | ); 18 | 19 | initial begin 20 | clk = 0; 21 | reset = 1; // same time-0 value as the reset block below: writing 0 here raced with that block and could leave reset never asserted 22 | end 23 | 24 | always 25 | #5 clk = !clk; // toggles every 5 time units, giving a 10-unit clock period 26 | 27 | initial begin 28 | reset = 1; #20 29 | reset = 0; // released after 20 time units, so reset spans the rising edges at t=5 and t=15 30 | end 31 | 32 | endmodule 33 | -------------------------------------------------------------------------------- /rtl/setup.sh: -------------------------------------------------------------------------------- 1 | # initialize variables 2 | TOP_LEVEL_NAME="npu_tb" 3 | 4 | QUARTUS_INSTALL_DIR=$QUARTUS_ROOTDIR 5 | SKIP_SIM=1 6 | #USER_DEFINED_ELAB_OPTIONS="+vcs+lic+wait -debug_access+pp" 7 | USER_DEFINED_ELAB_OPTIONS="+vcs+lic+wait" 8 | USER_DEFINED_ELAB_OPTIONS_APPEND="" 9 | USER_DEFINED_SIM_OPTIONS="" 10 | 11 | # ---------------------------------------- 12 | # overwrite variables - DO NOT MODIFY! 13 | # This block evaluates each command line argument, typically used for 14 | # overwriting variables. An example usage: 15 | # sh _setup.sh SKIP_SIM=1 16 | for expression in "$@"; do 17 | eval $expression 18 | if [ $? 
-ne 0 ]; then 19 | echo "Error: This command line argument, \"$expression\", is/has an invalid expression." >&2 20 | exit $? 21 | fi 22 | done 23 | 24 | #------------------------------------------- 25 | # check tclsh version no earlier than 8.5 26 | version=$(echo "puts [package vcompare [info tclversion] 8.5]; exit" | tclsh) 27 | if [ $version -eq -1 ]; then 28 | echo "Error: Minimum required tcl package version is 8.5." >&2 29 | exit 1 30 | fi 31 | 32 | ELAB_OPTIONS="" 33 | 34 | design_files="*.sv *.v" 35 | 36 | vcs -lca -timescale=1ps/1ps -sverilog +verilog2001ext+.v $USER_DEFINED_ELAB_OPTIONS \ 37 | -v $QUARTUS_INSTALL_DIR/eda/sim_lib/altera_primitives.v \ 38 | -v $QUARTUS_INSTALL_DIR/eda/sim_lib/220model.v \ 39 | -v $QUARTUS_INSTALL_DIR/eda/sim_lib/sgate.v \ 40 | -v $QUARTUS_INSTALL_DIR/eda/sim_lib/altera_mf.v \ 41 | $QUARTUS_INSTALL_DIR/eda/sim_lib/fourteennm_atoms.sv \ 42 | $QUARTUS_INSTALL_DIR/eda/sim_lib/synopsys/fourteennm_atoms_ncrypt.sv \ 43 | $design_files \ 44 | $USER_DEFINED_ELAB_OPTIONS_APPEND \ 45 | -top $TOP_LEVEL_NAME -R #-gui & 46 | 47 | # ---------------------------------------- 48 | # simulate 49 | if [ $SKIP_SIM -eq 0 ]; then 50 | ./simv $SIM_OPTIONS $USER_DEFINED_SIM_OPTIONS 51 | fi 52 | 53 | #$QUARTUS_INSTALL_DIR/eda/sim_lib/altera_lnsim.sv \ 54 | #$QUARTUS_INSTALL_DIR/eda/sim_lib/ct1_hssi_atoms.sv \ 55 | #$QUARTUS_INSTALL_DIR/eda/sim_lib/synopsys/ct1_hssi_atoms_ncrypt.sv \ 56 | #$QUARTUS_INSTALL_DIR/eda/sim_lib/synopsys/cr3v0_serdes_models_ncrypt.sv \ 57 | #$QUARTUS_INSTALL_DIR/eda/sim_lib/ct1_hip_atoms.sv \ 58 | #$QUARTUS_INSTALL_DIR/eda/sim_lib/synopsys/ct1_hip_atoms_ncrypt.sv \ 59 | -------------------------------------------------------------------------------- /rtl/sigmoid.sv: -------------------------------------------------------------------------------- 1 | `include "npu.vh" 2 | 3 | module sigmoid # ( 4 | parameter DW = 32, 5 | parameter IF = 19, 6 | parameter OF = 19, 7 | parameter SAMPLES = 512, 8 | parameter AW = 
$clog2(SAMPLES), 9 | parameter RTL_DIR = `RTL_DIR, 10 | parameter TARGET_FPGA = `TARGET_FPGA 11 | ) ( 12 | input clk, 13 | input rst, 14 | input [DW-1:0] x, 15 | output [DW-1:0] result 16 | ); 17 | 18 | reg [AW-1:0] index; 19 | reg [DW-1:0] abs_x; 20 | reg is_neg, is_neg_reg, is_big; 21 | reg [DW-1:0] res; 22 | wire [DW-1:0] lookup; 23 | 24 | always @ (posedge clk) begin 25 | if(rst) begin 26 | abs_x <= 0; 27 | is_neg <= 0; 28 | is_big <= 0; 29 | res <= 0; 30 | index <= 0; 31 | //lookup <= 0; 32 | end else begin 33 | //Cycle 1: Get abs x 34 | if(x[DW-1]) begin 35 | abs_x <= -x; 36 | is_neg <= 1'b1; 37 | end else begin 38 | abs_x <= x; 39 | is_neg <= 1'b0; 40 | end 41 | 42 | //Cycle 2: Get index & do comparisons 43 | if(abs_x > 4194304) begin 44 | is_big <= 1'b1; 45 | end else begin 46 | is_big <= 1'b0; 47 | end 48 | index <= AW'(abs_x[DW-6:DW-16] + abs_x[DW-17]); 49 | //lookup <= sigmoid_LUT[index]; 50 | is_neg_reg <= is_neg; 51 | 52 | //Cycle 3: Choose output 53 | case({is_neg_reg, is_big}) 54 | 2'b01: res <= {{(DW-OF-1){1'b0}}, 1'b1, {(OF){1'b0}}}; 55 | 2'b11: res <= {(DW){1'b0}}; 56 | 2'b00: res <= lookup; 57 | 2'b10: res <= {{(DW-OF-1){1'b0}}, 1'b1, {(OF){1'b0}}} - lookup; 58 | default: res <= 0; 59 | endcase 60 | end 61 | end 62 | 63 | /*reg [AW-1:0] index_2; 64 | reg [DW-1:0] abs_x_2; 65 | reg is_neg_2, is_neg_reg_2, is_big_2; 66 | reg [DW-1:0] res_2; 67 | wire [DW-1:0] lookup_2; 68 | 69 | always @ (posedge clk) begin 70 | if(rst) begin 71 | abs_x_2 <= 0; 72 | is_neg_2 <= 0; 73 | is_big_2 <= 0; 74 | res_2 <= 0; 75 | index_2 <= 0; 76 | //lookup <= 0; 77 | end else begin 78 | //Cycle 1: Get abs x 79 | if(x_2[DW-1]) begin 80 | abs_x_2 <= -x_2; 81 | is_neg_2 <= 1'b1; 82 | end else begin 83 | abs_x_2 <= x_2; 84 | is_neg_2 <= 1'b0; 85 | end 86 | 87 | //Cycle 2: Get index & do comparisons 88 | if(abs_x_2 > 4194304) begin 89 | is_big_2 <= 1'b1; 90 | end else begin 91 | is_big_2 <= 1'b0; 92 | end 93 | index_2 <= abs_x_2[DW-6:DW-16] + abs_x_2[DW-17]; 94 | //lookup <= 
sigmoid_LUT[index]; 95 | is_neg_reg_2 <= is_neg_2; 96 | 97 | //Cycle 3: Choose output 98 | case({is_neg_reg_2, is_big_2}) 99 | 2'b01: res_2 <= {{(DW-OF-1){1'b0}}, 1'b1, {(OF){1'b0}}}; 100 | 2'b11: res_2 <= {(DW){1'b0}}; 101 | 2'b00: res_2 <= lookup_2; 102 | 2'b10: res_2 <= {{(DW-OF-1){1'b0}}, 1'b1, {(OF){1'b0}}} - lookup_2; 103 | default: res_2 <= 0; 104 | endcase 105 | end 106 | end*/ 107 | 108 | assign result = res; 109 | /*assign result_2 = res_2; 110 | 111 | altera_syncram altera_syncram_component ( 112 | .address_a (index), 113 | .address_b (index_2), 114 | .clock0 (clk), 115 | .data_a ({(DW){1'b1}}), 116 | .data_b ({(DW){1'b1}}), 117 | .wren_a (1'b0), 118 | .wren_b (1'b0), 119 | .q_a (lookup), 120 | .q_b (lookup_2), 121 | .aclr0 (), 122 | .aclr1 (), 123 | .address2_a (1'b1), 124 | .address2_b (1'b1), 125 | .addressstall_a (1'b0), 126 | .addressstall_b (1'b0), 127 | .byteena_a (1'b1), 128 | .byteena_b (1'b1), 129 | .clock1 (1'b1), 130 | .clocken0 (1'b1), 131 | .clocken1 (1'b1), 132 | .clocken2 (1'b1), 133 | .clocken3 (1'b1), 134 | .eccencbypass (1'b0), 135 | .eccencparity (8'b0), 136 | .eccstatus (), 137 | .rden_a (1'b1), 138 | .rden_b (1'b1), 139 | .sclr (1'b0) 140 | ); 141 | defparam 142 | altera_syncram_component.address_reg_b = "CLOCK0", 143 | altera_syncram_component.clock_enable_input_a = "BYPASS", 144 | altera_syncram_component.clock_enable_input_b = "BYPASS", 145 | altera_syncram_component.clock_enable_output_a = "BYPASS", 146 | altera_syncram_component.clock_enable_output_b = "BYPASS", 147 | altera_syncram_component.indata_reg_b = "CLOCK0", 148 | altera_syncram_component.init_file = "sigmoid.mif", 149 | altera_syncram_component.intended_device_family = "Stratix 10", 150 | altera_syncram_component.lpm_type = "altera_syncram", 151 | altera_syncram_component.numwords_a = SAMPLES, 152 | altera_syncram_component.numwords_b = SAMPLES, 153 | altera_syncram_component.operation_mode = "BIDIR_DUAL_PORT", 154 | altera_syncram_component.outdata_aclr_a = "NONE", 
155 | altera_syncram_component.outdata_aclr_b = "NONE", 156 | altera_syncram_component.outdata_sclr_a = "NONE", 157 | altera_syncram_component.outdata_sclr_b = "NONE", 158 | altera_syncram_component.outdata_reg_a = "CLOCK0", 159 | altera_syncram_component.outdata_reg_b = "CLOCK0", 160 | altera_syncram_component.enable_force_to_zero = "TRUE", 161 | altera_syncram_component.power_up_uninitialized = "FALSE", 162 | altera_syncram_component.ram_block_type = "M20K", 163 | altera_syncram_component.widthad_a = AW, 164 | altera_syncram_component.widthad_b = AW, 165 | altera_syncram_component.width_a = DW, 166 | altera_syncram_component.width_b = DW, 167 | altera_syncram_component.width_byteena_a = 1, 168 | altera_syncram_component.width_byteena_b = 1;*/ 169 | 170 | altera_syncram altera_syncram_component ( 171 | .address_a (index), 172 | .clock0 (clk), 173 | .q_a (lookup), 174 | .aclr0 (1'b0), 175 | .aclr1 (1'b0), 176 | .address2_a (1'b1), 177 | .address2_b (1'b1), 178 | .address_b (1'b1), 179 | .addressstall_a (1'b0), 180 | .addressstall_b (1'b0), 181 | .byteena_a (1'b1), 182 | .byteena_b (1'b1), 183 | .clock1 (1'b1), 184 | .clocken0 (1'b1), 185 | .clocken1 (1'b1), 186 | .clocken2 (1'b1), 187 | .clocken3 (1'b1), 188 | .data_a ({(DW){1'b1}}), 189 | .data_b (1'b1), 190 | .eccencbypass (1'b0), 191 | .eccencparity (8'b0), 192 | .eccstatus ( ), 193 | .q_b ( ), 194 | .rden_a (1'b1), 195 | .rden_b (1'b1), 196 | .sclr (1'b0), 197 | .wren_a (1'b0), 198 | .wren_b (1'b0) 199 | ); 200 | defparam 201 | altera_syncram_component.address_aclr_a = "NONE", 202 | altera_syncram_component.clock_enable_input_a = "BYPASS", 203 | altera_syncram_component.clock_enable_output_a = "BYPASS", 204 | altera_syncram_component.init_file = {RTL_DIR, "sigmoid.mif"}, 205 | altera_syncram_component.intended_device_family = TARGET_FPGA, 206 | altera_syncram_component.lpm_hint = "ENABLE_RUNTIME_MOD=NO", 207 | altera_syncram_component.lpm_type = "altera_syncram", 208 | altera_syncram_component.numwords_a = 

// This module implements a star-shaped interconnect from a source to multiple sinks with distinct pipelining registers.
//
// Parameters:
//   END_POINTS - number of sinks; each sink gets its own copy of the pipeline.
//   DATAW      - width in bits of the broadcast word.
//   LATENCY    - number of register stages between i_star_in and each output.
//
// The synthesis attribute below turns off shift-register recognition and register
// merging for this module. Presumably this keeps the per-endpoint copies as
// separate physical registers (they all hold the same value) - TODO confirm intent.
(* altera_attribute = "-name AUTO_SHIFT_REGISTER_RECOGNITION OFF; -name DONT_MERGE_REGISTER ON" *) module star_interconnect # (
	parameter END_POINTS = 4,
	parameter DATAW = 32,
	parameter LATENCY = 2
) (
	input clk,
	input rst,
	input [DATAW-1:0] i_star_in,
	output [DATAW-1:0] o_star_out [0:END_POINTS-1]
);

// pipeline[t][d] is stage t of the register chain feeding endpoint d.
reg [DATAW-1:0] pipeline [0:LATENCY-1][0:END_POINTS-1];

// t iterates over pipeline stages, d over endpoints.
integer t, d;
always @ (posedge clk) begin
	if (rst) begin
		// Reset the first stage of the pipeline
		// (later stages are not cleared here; they keep their previous
		// contents while rst is asserted).
		for (d = 0; d < END_POINTS; d = d + 1) begin
			pipeline[0][d] <= 'd0;
		end
	end else begin
		// Set the input to the first pipeline stage
		// (the same input word is copied into every endpoint's chain).
		for (d = 0; d < END_POINTS; d = d + 1) begin
			pipeline[0][d] <= i_star_in;
		end

		// Progress the pipeline
		// (with LATENCY == 1 the inner loop body never executes).
		for (d = 0; d < END_POINTS; d = d + 1) begin
			for (t = 1; t < LATENCY; t = t + 1) begin
				pipeline[t][d] <= pipeline[t-1][d];
			end
		end
	end
end

// Hook up outputs
// (the last stage of every endpoint chain drives the matching output).
assign o_star_out = pipeline[LATENCY-1];

endmodule
`include "npu.vh"

// Pipelined fixed-point tanh approximation.
//
// The input is folded to its absolute value, classified into one of three
// regions (saturated, linear, table lookup) and the sign is re-applied at the
// output. The lookup table is a ROM initialised from {RTL_DIR, "tanh.mif"}.
//
// Parameters:
//   DW          - data width of x and result.
//   IF / OF     - used below as the number of fraction bits of the input /
//                 output fixed-point formats.
//   SAMPLES     - number of words in the lookup ROM.
//   AW          - ROM address width, derived from SAMPLES.
//   RTL_DIR     - prefix prepended to the .mif file name (from npu.vh macro).
//   TARGET_FPGA - device family string passed to the ROM (from npu.vh macro).
//
// NOTE(review): the region thresholds 2097152 (2^21) and 524288 (2^19) are
// literal constants; they equal 4.0 and 1.0 only for the default IF = 19.
// NOTE(review): the linear region forwards abs_x_reg unchanged, which matches
// the output format only when IF == OF - confirm before changing either.
module tanh # (
	parameter DW = 32,
	parameter IF = 19,
	parameter OF = 19,
	parameter SAMPLES = 512,
	parameter AW = $clog2(SAMPLES),
	parameter RTL_DIR = `RTL_DIR,
	parameter TARGET_FPGA = `TARGET_FPGA
) (
	input clk,
	input rst,
	input [DW-1:0] x,
	output [DW-1:0] result
);

reg [AW-1:0] index;                       // ROM address derived from abs_x
reg [DW-1:0] abs_x, abs_x_reg;            // |x| and a one-cycle delayed copy
reg is_neg, is_neg_reg, is_lin, is_big;   // sign and region flags
reg [DW-1:0] res;                         // registered output value
wire [DW-1:0] lookup;                     // ROM read data

always @ (posedge clk) begin
	if(rst) begin
		// abs_x_reg and is_neg_reg are not part of this reset list.
		abs_x <= 0;
		is_neg <= 0;
		is_lin <= 0;
		is_big <= 0;
		res <= 0;
		index <= 0;
	end else begin
		//Cycle 1: Get abs x
		// Two's-complement negate when the sign bit is set and remember the sign.
		if(x[DW-1]) begin
			abs_x <= -x;
			is_neg <= 1'b1;
		end else begin
			abs_x <= x;
			is_neg <= 1'b0;
		end

		//Cycle 2: Get index & do comparisons
		// Saturation region: |x| above 2^21.
		if(abs_x > 2097152) begin
			is_big <= 1'b1;
		end else begin
			is_big <= 1'b0;
		end
		// Linear region: |x| below 2^19 and its low IF bits no larger than
		// a 1 followed by (IF-4) zeros; the input is passed through as the result.
		if((abs_x < 524288) && (abs_x[IF-1:0] <= {4'b0001, {(IF-4){1'b0}}})) begin
			is_lin <= 1'b1;
		end else begin
			is_lin <= 1'b0;
		end
		// Table index: bits [DW-7:DW-17] of |x|, incremented by bit DW-18
		// (round to nearest), then cast to AW bits.
		index <= AW'(abs_x[DW-7:DW-17] + abs_x[DW-18]);
		// Delay |x| and the sign so they line up with the region flags.
		abs_x_reg <= abs_x;
		is_neg_reg <= is_neg;

		//Cycle 3: Choose output
		// NOTE(review): lookup comes out of the ROM below, which is configured
		// with a registered output (outdata_reg_a = "CLOCK0"). Confirm that its
		// read latency lines up with is_neg_reg / is_big / is_lin for the same x.
		case({is_neg_reg, is_big, is_lin})
			3'b010: res <= {{(DW-OF-1){1'b0}}, 1'b1, {(OF){1'b0}}};  // +2^OF, i.e. +1.0 with OF fraction bits
			3'b110: res <= {{(DW-OF){1'b1}}, {(OF){1'b0}}};           // -2^OF, i.e. -1.0 with OF fraction bits
			3'b001: res <= abs_x_reg;                                 // linear region, positive input
			3'b101: res <= -abs_x_reg;                                // linear region, negative input
			3'b000: res <= lookup;                                    // table value, positive input
			3'b100: res <= -lookup;                                   // table value negated for negative input
			default: res <= 0;                                        // is_big and is_lin both set
		endcase
	end
end

// A second, commented-out copy of the pipeline above (signals suffixed _2)
// used to live here together with a BIDIR_DUAL_PORT altera_syncram that served
// both lookups from one memory. It was dead code and is summarised by this
// note; only the single-port ROM version below is active.

assign result = res;

// Single-port ROM holding the tanh samples. Port B and all write controls are
// tied off; only address_a / q_a are used.
altera_syncram altera_syncram_component (
	.address_a (index),
	.clock0 (clk),
	.q_a (lookup),
	.aclr0 (1'b0),
	.aclr1 (1'b0),
	.address2_a (1'b1),
	.address2_b (1'b1),
	.address_b (1'b1),
	.addressstall_a (1'b0),
	.addressstall_b (1'b0),
	.byteena_a (1'b1),
	.byteena_b (1'b1),
	.clock1 (1'b1),
	.clocken0 (1'b1),
	.clocken1 (1'b1),
	.clocken2 (1'b1),
	.clocken3 (1'b1),
	.data_a ({(DW){1'b1}}),
	.data_b (1'b1),
	.eccencbypass (1'b0),
	.eccencparity (8'b0),
	.eccstatus ( ),
	.q_b ( ),
	.rden_a (1'b1),
	.rden_b (1'b1),
	.sclr (1'b0),
	.wren_a (1'b0),
	.wren_b (1'b0)
);
defparam
	altera_syncram_component.address_aclr_a = "NONE",
	altera_syncram_component.clock_enable_input_a = "BYPASS",
	altera_syncram_component.clock_enable_output_a = "BYPASS",
	altera_syncram_component.init_file = {RTL_DIR, "tanh.mif"},
	altera_syncram_component.intended_device_family = TARGET_FPGA,
	altera_syncram_component.lpm_hint = "ENABLE_RUNTIME_MOD=NO",
	altera_syncram_component.lpm_type = "altera_syncram",
	altera_syncram_component.numwords_a = SAMPLES,
	altera_syncram_component.operation_mode = "ROM",
	altera_syncram_component.outdata_aclr_a = "NONE",
	altera_syncram_component.outdata_sclr_a = "NONE",
	altera_syncram_component.outdata_reg_a = "CLOCK0",
	altera_syncram_component.ram_block_type = "M20K",
	altera_syncram_component.enable_force_to_zero = "FALSE",
	altera_syncram_component.widthad_a = AW,
	altera_syncram_component.width_a = DW,
	altera_syncram_component.width_byteena_a = 1;

endmodule
altera_syncram_component ( 16 | .address_a (address), 17 | .clock0 (clock), 18 | .q_a (sub_wire0), 19 | .aclr0 (1'b0), 20 | .aclr1 (1'b0), 21 | .address2_a (1'b1), 22 | .address2_b (1'b1), 23 | .address_b (1'b1), 24 | .addressstall_a (1'b0), 25 | .addressstall_b (1'b0), 26 | .byteena_a (1'b1), 27 | .byteena_b (1'b1), 28 | .clock1 (1'b1), 29 | .clocken0 (1'b1), 30 | .clocken1 (1'b1), 31 | .clocken2 (1'b1), 32 | .clocken3 (1'b1), 33 | .data_a ({(DATAW){1'b1}}), 34 | .data_b (1'b1), 35 | .eccencbypass (1'b0), 36 | .eccencparity (8'b0), 37 | .eccstatus ( ), 38 | .q_b ( ), 39 | .rden_a (1'b1), 40 | .rden_b (1'b1), 41 | .sclr (1'b0), 42 | .wren_a (1'b0), 43 | .wren_b (1'b0)); 44 | defparam 45 | altera_syncram_component.address_aclr_a = "NONE", 46 | altera_syncram_component.clock_enable_input_a = "BYPASS", 47 | altera_syncram_component.clock_enable_output_a = "BYPASS", 48 | altera_syncram_component.init_file = MIF_FILE, 49 | altera_syncram_component.intended_device_family = "Stratix 10", 50 | altera_syncram_component.lpm_type = "altera_syncram", 51 | altera_syncram_component.numwords_a = DEPTH, 52 | altera_syncram_component.operation_mode = "ROM", 53 | altera_syncram_component.outdata_aclr_a = "NONE", 54 | altera_syncram_component.outdata_sclr_a = "NONE", 55 | altera_syncram_component.outdata_reg_a = "CLOCK0", 56 | altera_syncram_component.ram_block_type = "M20K", 57 | altera_syncram_component.enable_force_to_zero = "TRUE", 58 | altera_syncram_component.widthad_a = ADDRW, 59 | altera_syncram_component.width_a = DATAW, 60 | altera_syncram_component.width_byteena_a = 1; 61 | 62 | endmodule -------------------------------------------------------------------------------- /scripts/perf_baseline: -------------------------------------------------------------------------------- 1 | 01_gemv_512x512 5.65 2 | 02_gemv_1024x1024 13.06 3 | 03_gemv_1152x1152 13.38 4 | 04_gemv_1536x1536 18.34 5 | 05_gemv_1792x1792 20.04 6 | 06_rnn_512_8 10.86 7 | 07_rnn_1024_8 25.11 8 | 08_rnn_1152_8 
"""Run every workload through the C++ performance simulator and report TOPS.

Intended to be launched from the ``scripts`` directory. Each workload in
``./workloads/`` is copied into ``../compiler``, executed with ``-perfsim``,
and its report is written to ``../scripts/reports/<workload>_perf.rpt``. The
reported throughput is compared against ``../scripts/perf_baseline``.

Usage: python perf_tests.py [--run_test KEYWORD]
"""
import os
from os import listdir, chdir
from os.path import isfile, join
import sys
import subprocess


# Define colors for printing
class colors:
    PASS = '\x1b[42m'
    FAIL = '\x1b[41m'
    BOLD = '\033[1m'
    RESET = '\033[0;0m'


# Report line after which simulation results are expected.
REPORT_MARKER = 'C++ Performance Simulation'


def get_keyword(argv):
    """Return the value following ``--run_test`` in argv, or '' if absent."""
    if '--run_test' not in argv:
        return ''
    value_index = argv.index('--run_test') + 1
    if value_index >= len(argv):
        # Previously this surfaced as a bare IndexError traceback.
        sys.exit('error: --run_test requires a workload keyword')
    return argv[value_index]


def list_workloads(path, keyword):
    """Return sorted workload names (file name up to the first '.') in path.

    Only regular files whose file name contains keyword are kept.
    """
    names = [f for f in listdir(path) if isfile(join(path, f)) and keyword in f]
    names.sort()
    return [name.split('.')[0] for name in names]


def parse_baseline(lines):
    """Build {workload: baseline TOPS} from '<name> <value>' lines.

    Lines with fewer than two fields (e.g. blank lines) are skipped.
    """
    results = {}
    for line in lines:
        fields = line.split()
        if len(fields) < 2:
            continue
        results[fields[0]] = float(fields[1])
    return results


def iter_results(lines, marker=REPORT_MARKER):
    """Yield one entry per 'Running simulation ... ' line found after marker.

    The entry is the throughput field (11th whitespace-separated token) when
    the 4th token contains 'PASSED', otherwise None.
    """
    seen_marker = False
    for line in lines:
        if seen_marker and 'Running simulation ... ' in line:
            fields = line.split()
            if len(fields) > 10 and 'PASSED' in fields[3]:
                yield fields[10]
            else:
                yield None
        elif marker in line:
            seen_marker = True


def format_qor(result, baseline):
    """Format the TOPS column and the percentage change against baseline.

    result is the throughput string taken from the report; baseline is the
    reference value, or None when the workload has no baseline entry.
    """
    if not baseline:
        return ' {:>5} N/A'.format(result)
    delta = ((float(result) / baseline) - 1) * 100
    if delta >= 0:
        return ' {:>5} +{:<5.2f}'.format(result, delta) + '%'
    # The number already carries its own minus sign; adding a literal '-'
    # (as the old code did) printed '--x.xx%'.
    return ' {:>5} {:<6.2f}'.format(result, delta) + '%'


def main(argv):
    """Run all selected workloads and print one result row per workload."""
    keyword = get_keyword(argv)

    # Get list of existing workloads
    workloads = list_workloads('./workloads/', keyword)

    # Parse baseline results
    with open('../scripts/perf_baseline', 'r') as baseline:
        baseline_results = parse_baseline(baseline)

    print(colors.BOLD + '{:<35}{:<4} {:<5} {:<6}'.format('WORKLOAD', 'TEST', 'TOPS', 'QoR') + colors.RESET)

    chdir('../compiler')
    for workload in workloads:
        subprocess.call(['cp', '../scripts/workloads/'+workload+'.py', './'], shell=False)
        sys.stdout.write('{:<35}'.format(workload))
        sys.stdout.flush()
        report_path = '../scripts/reports/'+workload+'_perf.rpt'
        # Close the report before reading it back so nothing stays buffered.
        with open(report_path, 'w') as outfile:
            subprocess.call(['python', workload+'.py', '-perfsim'], stdout=outfile, shell=False)
        with open(report_path, 'r') as rptfile:
            results = list(iter_results(rptfile))
        if not results:
            # Covers both "marker never seen" and "marker seen but no result
            # line"; the latter used to print nothing at all.
            print(colors.FAIL + 'FAIL' + colors.RESET)
        for tops in results:
            if tops is None:
                print(colors.FAIL + 'FAIL' + colors.RESET)
            else:
                print(colors.PASS + 'PASS' + colors.RESET, end='')
                print(format_qor(tops, baseline_results.get(workload)))
        subprocess.call(['rm', workload+'.py'], shell=False)


if __name__ == '__main__':
    main(sys.argv)
"""Run every workload through RTL simulation and report TOPS.

Intended to be launched from the ``scripts`` directory. Each workload in
``./workloads/`` is copied into ``../compiler``, executed with ``-rtlsim``,
and its report is written to ``../scripts/reports/<workload>_rtl.rpt``. The
reported throughput is compared against ``../scripts/rtl_baseline``.

Usage: python rtl_tests.py [--run_test KEYWORD]
"""
from os import listdir, chdir
from os.path import isfile, join
import sys
import subprocess


# Define colors for printing
class colors:
    PASS = '\x1b[42m'
    FAIL = '\x1b[41m'
    BOLD = '\033[1m'
    RESET = '\033[0;0m'


# Report line after which simulation results are expected.
REPORT_MARKER = 'Launching RTL Simulation'


def get_keyword(argv):
    """Return the value following ``--run_test`` in argv, or '' if absent."""
    if '--run_test' not in argv:
        return ''
    value_index = argv.index('--run_test') + 1
    if value_index >= len(argv):
        # Previously this surfaced as a bare IndexError traceback.
        sys.exit('error: --run_test requires a workload keyword')
    return argv[value_index]


def list_workloads(path, keyword):
    """Return sorted workload names (file name up to the first '.') in path.

    Only regular files whose file name contains keyword are kept.
    """
    names = [f for f in listdir(path) if isfile(join(path, f)) and keyword in f]
    names.sort()
    return [name.split('.')[0] for name in names]


def parse_baseline(lines):
    """Build {workload: baseline TOPS} from '<name> <value>' lines.

    Lines with fewer than two fields (e.g. blank lines) are skipped.
    """
    results = {}
    for line in lines:
        fields = line.split()
        if len(fields) < 2:
            continue
        results[fields[0]] = float(fields[1])
    return results


def iter_results(lines, marker=REPORT_MARKER):
    """Yield one entry per 'Running simulation ... ' line found after marker.

    The entry is the throughput field (11th whitespace-separated token) when
    the 4th token contains 'PASSED', otherwise None.
    """
    seen_marker = False
    for line in lines:
        if seen_marker and 'Running simulation ... ' in line:
            fields = line.split()
            if len(fields) > 10 and 'PASSED' in fields[3]:
                yield fields[10]
            else:
                yield None
        elif marker in line:
            seen_marker = True


def format_qor(result, baseline):
    """Format the TOPS column and the percentage change against baseline.

    result is the throughput string taken from the report; baseline is the
    reference value, or None when the workload has no baseline entry.
    """
    if not baseline:
        return ' {:>5} N/A'.format(result)
    delta = ((float(result) / baseline) - 1) * 100
    if delta >= 0:
        return ' {:>5} +{:<5.2f}'.format(result, delta) + '%'
    # Negative values carry their own minus sign.
    return ' {:>5} {:<6.2f}'.format(result, delta) + '%'


def main(argv):
    """Run all selected workloads and print one result row per workload."""
    keyword = get_keyword(argv)

    # Get list of existing workloads
    workloads = list_workloads('./workloads/', keyword)

    # Parse baseline results
    with open('../scripts/rtl_baseline', 'r') as baseline:
        baseline_results = parse_baseline(baseline)

    chdir('../compiler')
    print(colors.BOLD + '{:<35}{:<4} {:<5} {:<6}'.format('WORKLOAD', 'TEST', 'TOPS', 'QoR') + colors.RESET)
    for workload in workloads:
        subprocess.call(['cp', '../scripts/workloads/'+workload+'.py', './'], shell=False)
        sys.stdout.write('{:<35}'.format(workload))
        sys.stdout.flush()
        report_path = '../scripts/reports/'+workload+'_rtl.rpt'
        # Close the report before reading it back so nothing stays buffered.
        with open(report_path, 'w') as outfile:
            subprocess.call(['python', workload+'.py', '-rtlsim'], stdout=outfile, shell=False)
        with open(report_path, 'r') as rptfile:
            results = list(iter_results(rptfile))
        if not results:
            # Covers both "marker never seen" and "marker seen but no result
            # line"; the latter used to print nothing at all.
            print(colors.FAIL + 'FAIL' + colors.RESET)
        for tops in results:
            if tops is None:
                print(colors.FAIL + 'FAIL' + colors.RESET)
            else:
                print(colors.PASS + 'PASS' + colors.RESET, end='')
                print(format_qor(tops, baseline_results.get(workload)))
        subprocess.call(['rm', workload+'.py'], shell=False)


if __name__ == '__main__':
    main(sys.argv)
"""Workload 02: a single 1024x1024 Dense layer compiled and run on the NPU flow."""
import os
# Lower TensorFlow's native log verbosity; set before the import below.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = "2"
# sys.argv is used below; import it explicitly instead of relying on the
# wildcard imports to leak the name.
import sys
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
#sys.path.append('../compiler/')

# initialize_npu and NPUSequential are expected to come from these modules.
from compiler import *
from npu_layers import *

###### START OF MODEL DEFINITION ######

# Define constants
INPUT_SIZE = 1024
L1_SIZE = 1024

# Define model architecture using Keras Sequential Model
model = NPUSequential([
    layers.Dense(L1_SIZE, name="layer1"),
])

# Random test inputs for different types of layers
# (6 rows of INPUT_SIZE int32 values drawn from [-128, 127)).
test_input = tf.random.uniform(shape=[6, INPUT_SIZE], dtype=tf.int32, minval=-128, maxval=127)

# Call model on example input
y = model(test_input)

# Print model summary
model.summary()

####### END OF MODEL DEFINITION #######

# Initialize NPU
npu = initialize_npu(sys.argv)
# Compile model for NPU
model.compile_for_npu(npu, test_input)
# Run NPU flow
npu.run_flow()
"""Workload 04: a single 1536x1536 Dense layer compiled and run on the NPU flow."""
import os
# Lower TensorFlow's native log verbosity; set before the import below.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = "2"
# sys.argv is used below; import it explicitly instead of relying on the
# wildcard imports to leak the name.
import sys
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
#sys.path.append('../compiler/')

# initialize_npu and NPUSequential are expected to come from these modules.
from compiler import *
from npu_layers import *

###### START OF MODEL DEFINITION ######

# Define constants
INPUT_SIZE = 1536
L1_SIZE = 1536

# Define model architecture using Keras Sequential Model
model = NPUSequential([
    layers.Dense(L1_SIZE, name="layer1"),
])

# Random test inputs for different types of layers
# (6 rows of INPUT_SIZE int32 values drawn from [-128, 127)).
test_input = tf.random.uniform(shape=[6, INPUT_SIZE], dtype=tf.int32, minval=-128, maxval=127)

# Call model on example input
y = model(test_input)

# Print model summary
model.summary()

####### END OF MODEL DEFINITION #######

# Initialize NPU
npu = initialize_npu(sys.argv)
# Compile model for NPU
model.compile_for_npu(npu, test_input)
# Run NPU flow
npu.run_flow()
"""Workload 05: a single 1792x1792 Dense layer compiled and run on the NPU flow."""
import os
# Lower TensorFlow's native log verbosity; set before the import below.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = "2"
# sys.argv is used below; import it explicitly instead of relying on the
# wildcard imports to leak the name.
import sys
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
#sys.path.append('../compiler/')

# initialize_npu and NPUSequential are expected to come from these modules.
from compiler import *
from npu_layers import *

###### START OF MODEL DEFINITION ######

# Define constants
INPUT_SIZE = 1792
L1_SIZE = 1792

# Define model architecture using Keras Sequential Model
model = NPUSequential([
    layers.Dense(L1_SIZE, name="layer1"),
])

# Random test inputs for different types of layers
# (6 rows of INPUT_SIZE int32 values drawn from [-128, 127)).
test_input = tf.random.uniform(shape=[6, INPUT_SIZE], dtype=tf.int32, minval=-128, maxval=127)

# Call model on example input
y = model(test_input)

# Print model summary
model.summary()

####### END OF MODEL DEFINITION #######

# Initialize NPU
npu = initialize_npu(sys.argv)
# Compile model for NPU
model.compile_for_npu(npu, test_input)
# Run NPU flow
npu.run_flow()
"""Workload 07: a SimpleRNN with 1024 hidden units over 8 time steps on the NPU flow."""
import os
# Lower TensorFlow's native log verbosity; set before the import below.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = "2"
# sys.argv is used below; import it explicitly instead of relying on the
# wildcard imports to leak the name.
import sys
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
#sys.path.append('../compiler/')

# initialize_npu and NPUSequential are expected to come from these modules.
from compiler import *
from npu_layers import *

###### START OF MODEL DEFINITION ######

# Define constants
INPUT_SIZE = 1024
HIDDEN_UNITS = 1024
TIME_STEPS = 8

# Define model architecture using Keras Sequential Model
model = NPUSequential([
    layers.SimpleRNN(HIDDEN_UNITS, name="layer1"),
])

# Random test inputs for different types of layers
# NOTE(review): stock Keras recurrent layers read inputs as
# (batch, timesteps, features); confirm [TIME_STEPS, 6, INPUT_SIZE] is the
# layout NPUSequential expects.
test_input = tf.random.uniform(shape=[TIME_STEPS, 6, INPUT_SIZE], minval=-128, maxval=127)

# Call model on example input
y = model(test_input)

# Print model summary
model.summary()

####### END OF MODEL DEFINITION #######

# Initialize NPU
npu = initialize_npu(sys.argv)
# Compile model for NPU
model.compile_for_npu(npu, test_input)
# Run NPU flow
npu.run_flow()
4 | from tensorflow import keras 5 | from tensorflow.keras import layers 6 | #import sys 7 | #sys.path.append('../compiler/') 8 | 9 | from compiler import * 10 | from npu_layers import * 11 | 12 | ###### START OF MODEL DEFINITION ###### 13 | 14 | # Define constants 15 | INPUT_SIZE = 1152 16 | HIDDEN_UNITS = 1152 17 | TIME_STEPS = 8 18 | 19 | # Define model architecture using Keras Sequential Model 20 | model = NPUSequential([ 21 | layers.SimpleRNN(HIDDEN_UNITS, name="layer1"), 22 | ]) 23 | 24 | # Random test inputs for different types of layers 25 | test_input = tf.random.uniform(shape=[TIME_STEPS, 6, INPUT_SIZE], minval=-128, maxval=127) 26 | 27 | # Call model on example input 28 | y = model(test_input) 29 | 30 | # Print model summary 31 | model.summary() 32 | 33 | ####### END OF MODEL DEFINITION ####### 34 | 35 | # Initialize NPU 36 | npu = initialize_npu(sys.argv) 37 | # Compile model for NPU 38 | model.compile_for_npu(npu, test_input) 39 | # Run NPU flow 40 | npu.run_flow() 41 | -------------------------------------------------------------------------------- /scripts/workloads/09_rnn_1536_8.py: -------------------------------------------------------------------------------- 1 | import os 2 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = "2" 3 | import tensorflow as tf 4 | from tensorflow import keras 5 | from tensorflow.keras import layers 6 | #import sys 7 | #sys.path.append('../compiler/') 8 | 9 | from compiler import * 10 | from npu_layers import * 11 | 12 | ###### START OF MODEL DEFINITION ###### 13 | 14 | # Define constants 15 | INPUT_SIZE = 1536 16 | HIDDEN_UNITS = 1536 17 | TIME_STEPS = 8 18 | 19 | # Define model architecture using Keras Sequential Model 20 | model = NPUSequential([ 21 | layers.SimpleRNN(HIDDEN_UNITS, name="layer1"), 22 | ]) 23 | 24 | # Random test inputs for different types of layers 25 | test_input = tf.random.uniform(shape=[TIME_STEPS, 6, INPUT_SIZE], minval=-128, maxval=127) 26 | 27 | # Call model on example input 28 | y = model(test_input) 
"""Workload 10: a SimpleRNN with 1792 hidden units over 8 time steps on the NPU flow."""
import os
# Lower TensorFlow's native log verbosity; set before the import below.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = "2"
# sys.argv is used below; import it explicitly instead of relying on the
# wildcard imports to leak the name.
import sys
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
#sys.path.append('../compiler/')

# initialize_npu and NPUSequential are expected to come from these modules.
from compiler import *
from npu_layers import *

###### START OF MODEL DEFINITION ######

# Define constants
INPUT_SIZE = 1792
HIDDEN_UNITS = 1792
TIME_STEPS = 8

# Define model architecture using Keras Sequential Model
model = NPUSequential([
    layers.SimpleRNN(HIDDEN_UNITS, name="layer1"),
])

# Random test inputs for different types of layers
# NOTE(review): stock Keras recurrent layers read inputs as
# (batch, timesteps, features); confirm [TIME_STEPS, 6, INPUT_SIZE] is the
# layout NPUSequential expects.
test_input = tf.random.uniform(shape=[TIME_STEPS, 6, INPUT_SIZE], minval=-128, maxval=127)

# Call model on example input
y = model(test_input)

# Print model summary
model.summary()

####### END OF MODEL DEFINITION #######

# Initialize NPU
npu = initialize_npu(sys.argv)
# Compile model for NPU
model.compile_for_npu(npu, test_input)
# Run NPU flow
npu.run_flow()
compiler import * 10 | from npu_layers import * 11 | 12 | ###### START OF MODEL DEFINITION ###### 13 | 14 | # Define constants 15 | INPUT_SIZE = 512 16 | HIDDEN_UNITS = 512 17 | TIME_STEPS = 8 18 | 19 | # Define model architecture using Keras Sequential Model 20 | model = NPUSequential([ 21 | layers.GRU(HIDDEN_UNITS, name="layer1"), 22 | ]) 23 | 24 | # Random test inputs for different types of layers 25 | test_input = tf.random.uniform(shape=[TIME_STEPS, 6, INPUT_SIZE], minval=-128, maxval=127) 26 | 27 | # Call model on example input 28 | y = model(test_input) 29 | 30 | # Print model summary 31 | model.summary() 32 | 33 | ####### END OF MODEL DEFINITION ####### 34 | 35 | # Initialize NPU 36 | npu = initialize_npu(sys.argv) 37 | # Compile model for NPU 38 | model.compile_for_npu(npu, test_input) 39 | # Run NPU flow 40 | npu.run_flow() 41 | -------------------------------------------------------------------------------- /scripts/workloads/12_gru_1024_8.py: -------------------------------------------------------------------------------- 1 | import os 2 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = "2" 3 | import tensorflow as tf 4 | from tensorflow import keras 5 | from tensorflow.keras import layers 6 | #import sys 7 | #sys.path.append('../compiler/') 8 | 9 | from compiler import * 10 | from npu_layers import * 11 | 12 | ###### START OF MODEL DEFINITION ###### 13 | 14 | # Define constants 15 | INPUT_SIZE = 1024 16 | HIDDEN_UNITS = 1024 17 | TIME_STEPS = 8 18 | 19 | # Define model architecture using Keras Sequential Model 20 | model = NPUSequential([ 21 | layers.GRU(HIDDEN_UNITS, name="layer1"), 22 | ]) 23 | 24 | # Random test inputs for different types of layers 25 | test_input = tf.random.uniform(shape=[TIME_STEPS, 6, INPUT_SIZE], minval=-128, maxval=127) 26 | 27 | # Call model on example input 28 | y = model(test_input) 29 | 30 | # Print model summary 31 | model.summary() 32 | 33 | ####### END OF MODEL DEFINITION ####### 34 | 35 | # Initialize NPU 36 | npu = 
initialize_npu(sys.argv) 37 | # Compile model for NPU 38 | model.compile_for_npu(npu, test_input) 39 | # Run NPU flow 40 | npu.run_flow() 41 | -------------------------------------------------------------------------------- /scripts/workloads/13_gru_1152_8.py: -------------------------------------------------------------------------------- 1 | import os 2 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = "2" 3 | import tensorflow as tf 4 | from tensorflow import keras 5 | from tensorflow.keras import layers 6 | #import sys 7 | #sys.path.append('../compiler/') 8 | 9 | from compiler import * 10 | from npu_layers import * 11 | 12 | ###### START OF MODEL DEFINITION ###### 13 | 14 | # Define constants 15 | INPUT_SIZE = 1152 16 | HIDDEN_UNITS = 1152 17 | TIME_STEPS = 8 18 | 19 | # Define model architecture using Keras Sequential Model 20 | model = NPUSequential([ 21 | layers.GRU(HIDDEN_UNITS, name="layer1"), 22 | ]) 23 | 24 | # Random test inputs for different types of layers 25 | test_input = tf.random.uniform(shape=[TIME_STEPS, 6, INPUT_SIZE], minval=-128, maxval=127) 26 | 27 | # Call model on example input 28 | y = model(test_input) 29 | 30 | # Print model summary 31 | model.summary() 32 | 33 | ####### END OF MODEL DEFINITION ####### 34 | 35 | # Initialize NPU 36 | npu = initialize_npu(sys.argv) 37 | # Compile model for NPU 38 | model.compile_for_npu(npu, test_input) 39 | # Run NPU flow 40 | npu.run_flow() 41 | -------------------------------------------------------------------------------- /scripts/workloads/14_lstm_512_8.py: -------------------------------------------------------------------------------- 1 | import os 2 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = "2" 3 | import tensorflow as tf 4 | from tensorflow import keras 5 | from tensorflow.keras import layers 6 | #import sys 7 | #sys.path.append('../compiler/') 8 | 9 | from compiler import * 10 | from npu_layers import * 11 | 12 | ###### START OF MODEL DEFINITION ###### 13 | 14 | # Define constants 15 | INPUT_SIZE = 512 16 | 
HIDDEN_UNITS = 512 17 | TIME_STEPS = 8 18 | 19 | # Define model architecture using Keras Sequential Model 20 | model = NPUSequential([ 21 | layers.LSTM(HIDDEN_UNITS, name="layer1"), 22 | ]) 23 | 24 | # Random test inputs for different types of layers 25 | test_input = tf.random.uniform(shape=[TIME_STEPS, 6, INPUT_SIZE], minval=-128, maxval=127) 26 | 27 | # Call model on example input 28 | y = model(test_input) 29 | 30 | # Print model summary 31 | model.summary() 32 | 33 | ####### END OF MODEL DEFINITION ####### 34 | 35 | # Initialize NPU 36 | npu = initialize_npu(sys.argv) 37 | # Compile model for NPU 38 | model.compile_for_npu(npu, test_input) 39 | # Run NPU flow 40 | npu.run_flow() 41 | -------------------------------------------------------------------------------- /scripts/workloads/15_lstm_1024_8.py: -------------------------------------------------------------------------------- 1 | import os 2 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = "2" 3 | import tensorflow as tf 4 | from tensorflow import keras 5 | from tensorflow.keras import layers 6 | #import sys 7 | #sys.path.append('../compiler/') 8 | 9 | from compiler import * 10 | from npu_layers import * 11 | 12 | ###### START OF MODEL DEFINITION ###### 13 | 14 | # Define constants 15 | INPUT_SIZE = 1024 16 | HIDDEN_UNITS = 1024 17 | TIME_STEPS = 8 18 | 19 | # Define model architecture using Keras Sequential Model 20 | model = NPUSequential([ 21 | layers.LSTM(HIDDEN_UNITS, name="layer1"), 22 | ]) 23 | 24 | # Random test inputs for different types of layers 25 | test_input = tf.random.uniform(shape=[TIME_STEPS, 6, INPUT_SIZE], minval=-128, maxval=127) 26 | 27 | # Call model on example input 28 | y = model(test_input) 29 | 30 | # Print model summary 31 | model.summary() 32 | 33 | ####### END OF MODEL DEFINITION ####### 34 | 35 | # Initialize NPU 36 | npu = initialize_npu(sys.argv) 37 | # Compile model for NPU 38 | model.compile_for_npu(npu, test_input) 39 | # Run NPU flow 40 | npu.run_flow() 41 | 
-------------------------------------------------------------------------------- /scripts/workloads/16_mlp5_512.py: -------------------------------------------------------------------------------- 1 | import os 2 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = "2" 3 | import tensorflow as tf 4 | from tensorflow import keras 5 | from tensorflow.keras import layers 6 | #import sys 7 | #sys.path.append('../compiler/') 8 | 9 | from compiler import * 10 | from npu_layers import * 11 | 12 | ###### START OF MODEL DEFINITION ###### 13 | 14 | # Define constants 15 | INPUT_SIZE = 512 16 | DENSE_SIZE = 512 17 | 18 | # Define model architecture using Keras Sequential Model 19 | model = NPUSequential([ 20 | layers.Dense(DENSE_SIZE, name="layer1"), 21 | layers.Dense(DENSE_SIZE, name="layer2"), 22 | layers.Dense(DENSE_SIZE, name="layer3"), 23 | ]) 24 | 25 | # Random test inputs for different types of layers 26 | test_input = tf.random.uniform(shape=[6, INPUT_SIZE], minval=-128, maxval=127) 27 | 28 | # Call model on example input 29 | y = model(test_input) 30 | 31 | # Print model summary 32 | model.summary() 33 | 34 | ####### END OF MODEL DEFINITION ####### 35 | 36 | # Initialize NPU 37 | npu = initialize_npu(sys.argv) 38 | # Compile model for NPU 39 | model.compile_for_npu(npu, test_input) 40 | # Run NPU flow 41 | npu.run_flow() 42 | -------------------------------------------------------------------------------- /scripts/workloads/17_mlp5_1024.py: -------------------------------------------------------------------------------- 1 | import os 2 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = "2" 3 | import tensorflow as tf 4 | from tensorflow import keras 5 | from tensorflow.keras import layers 6 | #import sys 7 | #sys.path.append('../compiler/') 8 | 9 | from compiler import * 10 | from npu_layers import * 11 | 12 | ###### START OF MODEL DEFINITION ###### 13 | 14 | # Define constants 15 | INPUT_SIZE = 1024 16 | DENSE_SIZE = 1024 17 | 18 | # Define model architecture using Keras Sequential Model 19 
| model = NPUSequential([ 20 | layers.Dense(DENSE_SIZE, name="layer1"), 21 | layers.Dense(DENSE_SIZE, name="layer2"), 22 | layers.Dense(DENSE_SIZE, name="layer3"), 23 | ]) 24 | 25 | # Random test inputs for different types of layers 26 | test_input = tf.random.uniform(shape=[6, INPUT_SIZE], minval=-128, maxval=127) 27 | 28 | # Call model on example input 29 | y = model(test_input) 30 | 31 | # Print model summary 32 | model.summary() 33 | 34 | ####### END OF MODEL DEFINITION ####### 35 | 36 | # Initialize NPU 37 | npu = initialize_npu(sys.argv) 38 | # Compile model for NPU 39 | model.compile_for_npu(npu, test_input) 40 | # Run NPU flow 41 | npu.run_flow() 42 | -------------------------------------------------------------------------------- /scripts/workloads/18_mlp3_1024_512_256_256.py: -------------------------------------------------------------------------------- 1 | import os 2 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = "2" 3 | import tensorflow as tf 4 | from tensorflow import keras 5 | from tensorflow.keras import layers 6 | 7 | from compiler import * 8 | from npu_layers import * 9 | 10 | ###### START OF MODEL DEFINITION ###### 11 | 12 | # Define constants 13 | INPUT_VEC_SIZE = 1024 14 | DENSE_L1_SIZE = 512 15 | DENSE_L2_SIZE = 256 16 | DENSE_L3_SIZE = 256 17 | 18 | # Define model architecture using Keras Sequential Model 19 | model = NPUSequential([ 20 | layers.Dense(DENSE_L1_SIZE, activation="relu", name="layer1"), 21 | layers.Dense(DENSE_L2_SIZE, activation="relu", name="layer2"), 22 | layers.Dense(DENSE_L3_SIZE, activation="relu", name="layer3"), 23 | ]) 24 | 25 | # Random test inputs for different types of layers 26 | test_input = tf.random.uniform(shape=[6, INPUT_VEC_SIZE], minval=-128, maxval=127) 27 | 28 | # Call model on example input 29 | y = model(test_input) 30 | 31 | # Print model summary 32 | model.summary() 33 | 34 | ####### END OF MODEL DEFINITION ####### 35 | 36 | # Initialize NPU 37 | npu = initialize_npu(sys.argv) 38 | # Compile model for NPU 39 
| model.compile_for_npu(npu, test_input) 40 | # Run NPU flow 41 | npu.run_flow() 42 | -------------------------------------------------------------------------------- /scripts/workloads/19_mlp3_1024_512_256_256_batched.py: -------------------------------------------------------------------------------- 1 | import os 2 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = "2" 3 | import tensorflow as tf 4 | from tensorflow import keras 5 | from tensorflow.keras import layers 6 | 7 | from compiler import * 8 | from npu_layers import * 9 | 10 | ###### START OF MODEL DEFINITION ###### 11 | 12 | # Define constants 13 | INPUT_VEC_SIZE = 1024 14 | DENSE_L1_SIZE = 512 15 | DENSE_L2_SIZE = 256 16 | DENSE_L3_SIZE = 256 17 | 18 | # Define model architecture using Keras Sequential Model 19 | model = NPUSequential([ 20 | layers.Dense(DENSE_L1_SIZE, activation="relu", name="layer1"), 21 | layers.Dense(DENSE_L2_SIZE, activation="relu", name="layer2"), 22 | layers.Dense(DENSE_L3_SIZE, activation="relu", name="layer3"), 23 | ]) 24 | 25 | # Random test inputs for different types of layers 26 | test_input = tf.random.uniform(shape=[18, INPUT_VEC_SIZE], minval=-128, maxval=127) 27 | 28 | # Call model on example input 29 | y = model(test_input) 30 | 31 | # Print model summary 32 | model.summary() 33 | 34 | ####### END OF MODEL DEFINITION ####### 35 | 36 | # Initialize NPU 37 | npu = initialize_npu(sys.argv) 38 | # Compile model for NPU 39 | model.compile_for_npu(npu, test_input) 40 | # Run NPU flow 41 | npu.run_flow() 42 | -------------------------------------------------------------------------------- /simulator/Makefile: -------------------------------------------------------------------------------- 1 | CC := g++ 2 | HEADER := inc/ 3 | CFLAGS := -c -g -std=c++11 -Wall -Wextra 4 | INCLUDES := -I ./inc/ 5 | OBJ_DIR := ./src/obj/ 6 | SIM_DIR := ./main/obj/ 7 | EXE := npu_sim 8 | 9 | OBJ := \ 10 | $(OBJ_DIR)port.o \ 11 | $(OBJ_DIR)input.o \ 12 | $(OBJ_DIR)output.o \ 13 | $(OBJ_DIR)channel.o \ 14 | 
$(OBJ_DIR)dpe.o \ 15 | $(OBJ_DIR)mvu_vrf.o \ 16 | $(OBJ_DIR)accumulator.o \ 17 | $(OBJ_DIR)register_file.o \ 18 | $(OBJ_DIR)tile.o \ 19 | $(OBJ_DIR)mvu.o \ 20 | $(OBJ_DIR)evrf.o \ 21 | $(OBJ_DIR)mfu.o \ 22 | $(OBJ_DIR)loader.o \ 23 | $(OBJ_DIR)datapath.o \ 24 | $(OBJ_DIR)decoder.o \ 25 | $(OBJ_DIR)npu.o \ 26 | $(OBJ_DIR)utils.o 27 | 28 | all: $(EXE) 29 | 30 | %: $(SIM_DIR)%.o $(OBJ) 31 | $(CC) -g $(OBJ) $< -o $@ 32 | 33 | $(SIM_DIR)%.o: main/%.cpp 34 | $(CC) $(INCLUDES) $(CFLAGS) $< -o $@ 35 | 36 | $(OBJ_DIR)%.o: src/%.cpp 37 | $(CC) $(INCLUDES) $(CFLAGS) $< -o $@ 38 | 39 | clean: # -f so that clean succeeds even when the outputs are already absent 40 | rm -f $(OBJ) $(EXE) 41 | 42 | .PRECIOUS: $(OBJ) 43 | .PHONY: all clean 44 | -------------------------------------------------------------------------------- /simulator/inc/accumulator.h: -------------------------------------------------------------------------------- 1 | #ifndef PRIME_ACCUMULATOR_H 2 | #define PRIME_ACCUMULATOR_H 3 | 4 | #include 5 | #include 6 | #include "input.h" 7 | #include "output.h" 8 | #include "inst.h" 9 | #include "defines.h" 10 | #include "utils.h" 11 | 12 | /* 13 | * This class implements the MVU accumulation of the 3 results computed by the dot product engines 14 | * based on the Stratix 10 NX tensor block. 
15 | * Input Ports: 16 | * - 3 data inputs (from inter-tile reduction) 17 | * - uOP (from decoder) 18 | * - reconfigurable accumulator size (from decoder) 19 | * Output Ports: 20 | * - 3 accumulation results (to MVU output) 21 | */ 22 | class Accumulator : public Module { 23 | public: 24 | // Constructor 25 | Accumulator (std::string t_name, unsigned int t_accum_id); 26 | // Clock function 27 | void clock(); 28 | // Getter functions 29 | std::string getName(); 30 | unsigned int getId(); 31 | Input *getPortInput(unsigned int i); 32 | Input *getPortuOP(); 33 | Input *getPortSize(); 34 | Output *getPortRes(unsigned int i); 35 | // Helper functions 36 | void reset(); 37 | // Destructor 38 | ~Accumulator(); 39 | 40 | private: 41 | // Module name 42 | std::string name; 43 | // Input and Output ports 44 | Input* input0; 45 | Input* input1; 46 | Input* input2; 47 | Input* uOP; 48 | Input* size; 49 | Output* result0; 50 | Output* result1; 51 | Output* result2; 52 | // Local variables 53 | unsigned int accum_id; 54 | std::vector accum0_values; 55 | std::vector accum1_values; 56 | std::vector accum2_values; 57 | unsigned int channel_full_count; 58 | unsigned int num_accum_values = 2 * 3 * (LANES/10); 59 | unsigned int accum_idx; 60 | }; 61 | 62 | #endif 63 | -------------------------------------------------------------------------------- /simulator/inc/channel.h: -------------------------------------------------------------------------------- 1 | #ifndef CHANNEL_H_ 2 | #define CHANNEL_H_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include "defines.h" 10 | #include "inst.h" 11 | 12 | /* 13 | * This class implements a communication channel. Each channel has a capacity and latency 14 | * parameters. 
By setting these two parameters, a channel can be used to model: 15 | * - wire: capacity 1 and latency 0 16 | * - register: capacity 1 and latency 1 17 | * - pipeline: capacity N and latency N 18 | * - FIFO: capacity N and latency 1 19 | */ 20 | template 21 | class Channel { 22 | public: 23 | // Constructor 24 | Channel (std::string t_name, unsigned int t_size, unsigned int t_latency); 25 | // Clock function 26 | void clock(); 27 | // Helper functions 28 | void write(T t_value); 29 | T read(); 30 | T peek(); 31 | T at(unsigned int idx); 32 | bool isEmpty(); 33 | bool isFull(); 34 | // Getter functions 35 | std::string getName(); 36 | unsigned int getSize(); 37 | 38 | private: 39 | // Module name 40 | std::string name; 41 | // Local variables 42 | std::queue> buffer; 43 | unsigned int size; 44 | unsigned int latency; 45 | }; 46 | #endif 47 | -------------------------------------------------------------------------------- /simulator/inc/datapath.h: -------------------------------------------------------------------------------- 1 | #ifndef DATAPATH_H 2 | #define DATAPATH_H 3 | 4 | #include 5 | #include 6 | 7 | #include "input.h" 8 | #include "output.h" 9 | #include "mvu.h" 10 | #include "mfu.h" 11 | #include "evrf.h" 12 | #include "loader.h" 13 | #include "inst.h" 14 | #include "defines.h" 15 | 16 | /* 17 | * This class implements the NPU datapath. It consists of 5 main pipeline stages (MVU, eVRF, MFU0, 18 | * MFU1, and Loader). 
19 | * Input Ports: 20 | * - MVU uOP (from NPU decoders) 21 | * - eVRF uOP (from NPU decoders) 22 | * - MFU0 uOP (from NPU decoders) 23 | * - MFU1 uOP (from NPU decoders) 24 | * - Loader uOP (from NPU decoders) 25 | * Output Ports: 26 | * - Final NPU output (to tester) 27 | */ 28 | class Datapath : public Module { 29 | public: 30 | // Constructor 31 | Datapath (std::string t_name); 32 | // Clock function 33 | void clock(unsigned int &cycle_count); 34 | // Getter functions 35 | Input* getPortMVUuOP(); 36 | Input* getPortEVRFuOP(); 37 | Input* getPortMFU0uOP(); 38 | Input* getPortMFU1uOP(); 39 | Input* getPortLDuOP(); 40 | Output>* getPortOutput(); 41 | // Destructor 42 | ~Datapath(); 43 | 44 | private: 45 | // Module name 46 | std::string name; 47 | // Input and Output ports 48 | Input* mvu_uOP_port; 49 | Input* evrf_uOP_port; 50 | Input* mfu0_uOP_port; 51 | Input* mfu1_uOP_port; 52 | Input* ld_uOP_port; 53 | Output>* datapath_output; 54 | // Internal modules 55 | MVU* mvu; 56 | EVRF* evrf; 57 | MFU* mfu0; 58 | MFU* mfu1; 59 | LD* ld; 60 | // Internal channels 61 | Channel>* mvu_to_evrf_channel; 62 | Channel>* evrf_to_mfu0_channel; 63 | Channel>* mfu0_to_mfu1_channel; 64 | Channel>* mfu1_to_ld_channel; 65 | // Loader to MVU channels 66 | std::vector>*> ld_to_mvu_wdata_channels; 67 | std::vector*> ld_to_mvu_waddr_channels; 68 | Channel* ld_to_mvu_update_channel; 69 | // Loader to eVRF Channels 70 | Channel>* ld_to_evrf_wdata_channel; 71 | Channel* ld_to_evrf_waddr_channel; 72 | Channel* ld_to_evrf_update_channel; 73 | // Loader to MFU0 Channels 74 | Channel>* ld_to_mfu0_vrf0_wdata_channel; 75 | Channel>* ld_to_mfu0_vrf1_wdata_channel; 76 | Channel* ld_to_mfu0_vrf0_waddr_channel; 77 | Channel* ld_to_mfu0_vrf1_waddr_channel; 78 | Channel* ld_to_mfu0_update_channel; 79 | // Loader to MFU1 Channels 80 | Channel>* ld_to_mfu1_vrf0_wdata_channel; 81 | Channel>* ld_to_mfu1_vrf1_wdata_channel; 82 | Channel* ld_to_mfu1_vrf0_waddr_channel; 83 | Channel* 
ld_to_mfu1_vrf1_waddr_channel; 84 | Channel* ld_to_mfu1_update_channel; 85 | }; 86 | 87 | #endif 88 | -------------------------------------------------------------------------------- /simulator/inc/decoder.h: -------------------------------------------------------------------------------- 1 | #ifndef DECODER_H 2 | #define DECODER_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include "input.h" 8 | #include "output.h" 9 | #include "channel.h" 10 | #include "inst.h" 11 | #include "defines.h" 12 | #include "utils.h" 13 | 14 | /* 15 | * This class implements the NPU instruction decoders that translate an NPU VLIW instruction 16 | * (5 chained mOPs) into a sequence of uOPs for each of the 5 NPU pipeline stages. 17 | * Input Ports: 18 | * - VLIW NPU instructions (from NPU top-level module) 19 | * Output Ports: 20 | * - MVU uOP (to NPU datapath) 21 | * - eVRF uOP (to NPU datapath) 22 | * - MFU0 uOP (to NPU datapath) 23 | * - MFU1 uOP (to NPU datapath) 24 | * - Loader uOP (to NPU datapath) 25 | */ 26 | class Decoder : public Module { 27 | public: 28 | // Constructor 29 | Decoder (std::string t_name); 30 | // Clock function 31 | void clock(unsigned int &cycle_count); 32 | // Getter functions 33 | std::string getName(); 34 | Input* getPortInputVLIW(); 35 | Output* getPortMVUuOP(); 36 | Output* getPortEVRFuOP(); 37 | Output* getPortMFU0uOP(); 38 | Output* getPortMFU1uOP(); 39 | Output* getPortLDuOP(); 40 | // Destructor 41 | ~Decoder(); 42 | 43 | private: 44 | // Module name 45 | std::string name; 46 | // Input and Output ports 47 | Input* vliw; 48 | Output* mvu_uOP_port; 49 | Output* evrf_uOP_port; 50 | Output* mfu0_uOP_port; 51 | Output* mfu1_uOP_port; 52 | Output* ld_uOP_port; 53 | // Internal channels 54 | Channel* mvu_mOP_channel; 55 | Channel* evrf_mOP_channel; 56 | Channel* mfu0_mOP_channel; 57 | Channel* mfu1_mOP_channel; 58 | Channel* ld_mOP_channel; 59 | // Local variables for decoding logic 60 | unsigned int mvu_counter; 61 | unsigned int mvu_pipeline_counter; 
62 | unsigned int mvu_chunk_counter; 63 | unsigned int reg_sel_flag; 64 | int remaining_rows; 65 | unsigned int acc_size; 66 | unsigned int evrf_counter; 67 | unsigned int evrf_batch_counter; 68 | unsigned int mfu0_counter; 69 | unsigned int mfu0_batch_counter; 70 | unsigned int mfu1_counter; 71 | unsigned int mfu1_batch_counter; 72 | unsigned int ld_counter; 73 | unsigned int ld_batch_counter; 74 | bool decoding_mvu; 75 | bool decoding_evrf; 76 | bool decoding_mfu0; 77 | bool decoding_mfu1; 78 | bool decoding_ld; 79 | npu_instruction inst; 80 | mvu_uOP u1; evrf_uOP u2; mfu_uOP u3; mfu_uOP u4; ld_uOP u5; 81 | mvu_mOP m1; evrf_mOP m2; mfu_mOP m3; mfu_mOP m4; ld_mOP m5; 82 | unsigned int row_count; unsigned int col_count; 83 | unsigned int tile_id; unsigned int pue_id; 84 | unsigned int x_size; unsigned int y_size; unsigned int chunks_per_tile; 85 | }; 86 | 87 | #endif 88 | -------------------------------------------------------------------------------- /simulator/inc/defines.h: -------------------------------------------------------------------------------- 1 | #ifndef DEFINES_H_ 2 | #define DEFINES_H_ 3 | 4 | #include 5 | #include 6 | 7 | // Debug Messages 8 | #define VERBOSE_OP 1 9 | #define VERBOSE_MVU 1 10 | #define VERBOSE_LD_OUT 0 11 | 12 | // Architecture Parameters 13 | #define TILES 7 14 | #define DPES 40 15 | #define LANES 40 16 | #define MVU_VRF_DEPTH 512 17 | #define MVU_MRF_DEPTH 1024 18 | #define EVRF_DEPTH 512 19 | #define MFU_VRF0_DEPTH 512 20 | #define MFU_VRF1_DEPTH 512 21 | #define FIFO_DEPTH 512 22 | 23 | // Latency Parameters 24 | #define DPE_MULT_LATENCY 2 25 | #define DPE_ADDER_LATENCY 1 26 | #define RF_WRITE_LATENCY 1 27 | #define RF_READ_LATENCY 1 28 | #define MRF_TO_DPE_LATENCY 8 29 | #define VRF_TO_DPE_LATENCY 8 30 | #define MVU_ACCUM_LATENCY 4 31 | #define MVU_REDUCTION_LATENCY (unsigned int)(ceil(log2(TILES))+5) 32 | #define MFU_ACT_LATENCY 3 33 | #define MFU_ADD_LATENCY 3 34 | #define MFU_MUL_LATENCY 3 35 | #define MFU_LATENCY 
MFU_ACT_LATENCY+MFU_ADD_LATENCY+MFU_MUL_LATENCY 36 | #define LD_WB_LATENCY 5 37 | 38 | // Precision 39 | #define TYPE int 40 | #define INPUT_PRECISION 8 41 | #define MASK_TRUNCATE 0x000000FF 42 | #define MASK_SIGN_EXTEND 0xFFFFFF00 43 | #define MASK_SIGN_CHECK 0x00000080 44 | 45 | #define LOG(module_name, msg) do { \ 46 | std::cout << "[" << module_name << " @ " << cycle_count << "]: " << msg << std::endl; \ 47 | } while (0) 48 | 49 | #endif 50 | -------------------------------------------------------------------------------- /simulator/inc/dpe.h: -------------------------------------------------------------------------------- 1 | #ifndef DPE_H_ 2 | #define DPE_H_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include "module.h" 11 | #include "input.h" 12 | #include "output.h" 13 | #include "defines.h" 14 | #include "utils.h" 15 | 16 | /* 17 | * This class implements the MVU dot product engine (DPE) based on the Stratix 10 NX tensor blocks. 18 | * Each DPE implements a batch-3 dot product operation (i.e. 1 shared vector multiplied by 3 other 19 | * input vectors). 
20 | * Input Ports: 21 | * - shared input vector (vBroadcast) 22 | * - sequentially loaded input vectors (vSeq) 23 | * - control signals (reg_sel, vrf_en) 24 | * Output Ports: 25 | * - 3 dot product results (dpe_res0, dpe_res1, dpe_res2) 26 | */ 27 | class DPE : public Module { 28 | public: 29 | // Constructor 30 | DPE (std::string t_name, unsigned int t_dpe_id, unsigned int t_tile_id); 31 | // Clock function 32 | void clock(); 33 | // Getter functions 34 | std::string getName(); 35 | Input> *getPortVSeq(); 36 | Input> *getPortVBroadcast(); 37 | Input *getPortRegSel(); 38 | Input *getPortVrfEn(); 39 | Output *getPortDPERes(unsigned int i); 40 | // Destructor 41 | ~DPE(); 42 | 43 | private: 44 | // Module name 45 | std::string name; 46 | // Input and Output ports 47 | Input>* vSeq; 48 | Input>* vBroadcast; 49 | Input* reg_sel; 50 | Input* vrf_en; 51 | Output* dpe_res0; 52 | Output* dpe_res1; 53 | Output* dpe_res2; 54 | // Internal channels 55 | Channel* dpe_result0_channel; 56 | Channel* dpe_result1_channel; 57 | Channel* dpe_result2_channel; 58 | Channel>* pingpong0; 59 | Channel>* pingpong1; 60 | Channel>* broadcast_delay; 61 | Channel* input_sel_delay; 62 | Channel* reg_sel_delay; 63 | Channel* vrf_en_delay; 64 | // Local variables 65 | unsigned int dpe_id; 66 | unsigned int tile_id; 67 | // Local latency variables 68 | unsigned int num_prime_dsps = (unsigned int) ceil(1.0 * LANES / 10.0); 69 | unsigned int dpe_result_latency = (unsigned int) 2 + (ceil(log2(num_prime_dsps)) * 70 | DPE_ADDER_LATENCY); 71 | unsigned int pingpong_length = 3 * (1 + num_prime_dsps); 72 | int accum_val = 0; 73 | }; 74 | 75 | #endif 76 | 77 | -------------------------------------------------------------------------------- /simulator/inc/evrf.h: -------------------------------------------------------------------------------- 1 | #ifndef EVRF_H 2 | #define EVRF_H 3 | 4 | #include 5 | #include 6 | #include "input.h" 7 | #include "output.h" 8 | #include "register_file.h" 9 | #include 
"inst.h" 10 | #include "defines.h" 11 | 12 | /* 13 | * This class implements the external VRF (eVRF) module which is used to skip the MVU if an 14 | * instruction chain does not have an MVU operation. 15 | * Input Ports: 16 | * - eVRF input (from previous block in pipeline -- MVU) 17 | * - eVRF uOP (from decoder) 18 | * - eVRF write data (from Loader) 19 | * - eVRF write address (from Loader) 20 | * - update tag (from Loader) 21 | * Output Ports: 22 | * - eVRF output (to next block in pipeline -- MFU0) 23 | */ 24 | class EVRF : public Module { 25 | public: 26 | // Constructor 27 | EVRF (std::string t_name); 28 | // Clock function 29 | void clock(unsigned int &cycle_count); 30 | // Getter functions 31 | std::string getName(); 32 | Input>* getPortInput(); 33 | Input* getPortuOP(); 34 | Input>* getPortEvrfWdata(); 35 | Input* getPortEvrfWaddr(); 36 | Input* getPortUpdateTag(); 37 | Output>* getPortRes(); 38 | // Destructor 39 | ~EVRF(); 40 | 41 | private: 42 | // Module name 43 | std::string name; 44 | // Input and Output ports 45 | Input>* evrf_input; 46 | Input* uOP; 47 | Input>* evrf_wdata; 48 | Input* evrf_waddr; 49 | Input* update_tag; 50 | Output>* evrf_result; 51 | // Internal modules 52 | RegisterFile>* evrf; 53 | // Internal channels 54 | Channel>* mvu_channel; 55 | Channel* evrf_raddr; 56 | Channel>* evrf_rdata; 57 | // Local variables 58 | unsigned int current_tag; 59 | }; 60 | 61 | #endif 62 | -------------------------------------------------------------------------------- /simulator/inc/input.h: -------------------------------------------------------------------------------- 1 | #ifndef INPUT_H_ 2 | #define INPUT_H_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include "port.h" 8 | #include "module.h" 9 | #include "defines.h" 10 | #include "inst.h" 11 | #include "channel.h" 12 | 13 | /* 14 | * This class implements an input port for a module. Each input port is connected to a channel. 
15 | */ 16 | template 17 | class Input : public Port 18 | { 19 | public: 20 | // Constructor 21 | Input(std::string t_name, Module *t_module); 22 | // Helper functions 23 | void connectTo(Channel *t_channel); 24 | T readFromChannel(); 25 | T peekChannel(); 26 | bool isChannelEmpty(); 27 | // Destructor 28 | ~Input(); 29 | 30 | private: 31 | Channel* channel; 32 | }; 33 | 34 | #endif -------------------------------------------------------------------------------- /simulator/inc/loader.h: -------------------------------------------------------------------------------- 1 | #ifndef LOADER_H 2 | #define LOADER_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include "input.h" 8 | #include "output.h" 9 | #include "inst.h" 10 | #include "utils.h" 11 | #include "defines.h" 12 | 13 | /* 14 | * This class implements the loader module which writes the datapath results back to one of the 15 | * NPU architectural states (VRFs). 16 | * Input Ports: 17 | * - Loader input (from previous block in pipeline -- MFU1) 18 | * - Loader uOP (from decoder) 19 | * Output Ports: 20 | * - MVU VRFs write data (to MVU) 21 | * - MVU VRFs write address (to MVU) 22 | * - MVU tag update (to MVU) 23 | * - eVRF write data (to eVRF) 24 | * - eVRF write address (to eVRF) 25 | * - eVRF tag update (to eVRF) 26 | * - MFU0 VRF0 write data (to MFU0) 27 | * - MFU0 VRF0 write address (to MFU0) 28 | * - MFU0 VRF1 write data (to MFU0) 29 | * - MFU0 VRF1 write address (to MFU0) 30 | * - MFU0 tag update (to MFU0) 31 | * - MFU1 VRF0 write data (to MFU1) 32 | * - MFU1 VRF0 write address (to MFU1) 33 | * - MFU1 VRF1 write data (to MFU1) 34 | * - MFU1 VRF1 write address (to MFU1) 35 | * - MFU1 tag update (to MFU1) 36 | * - Loader output port (to tester) 37 | */ 38 | class LD : public Module { 39 | public: 40 | // Constructor 41 | LD (std::string t_name); 42 | // Clock function 43 | void clock(unsigned int &cycle_count); 44 | // Getter functions 45 | std::string getName(); 46 | Input* getPortuOP(); 47 | Input>* 
getPortInput(); 48 | Output>* getPortMVUWdata(unsigned int idx); 49 | Output* getPortMVUWaddr(unsigned int idx); 50 | Output>* getPortEvrfWdata(); 51 | Output* getPortEvrfWaddr(); 52 | Output>* getPortMFU0Vrf0Wdata(); 53 | Output* getPortMFU0Vrf0Waddr(); 54 | Output>* getPortMFU0Vrf1Wdata(); 55 | Output* getPortMFU0Vrf1Waddr(); 56 | Output>* getPortMFU1Vrf0Wdata(); 57 | Output* getPortMFU1Vrf0Waddr(); 58 | Output>* getPortMFU1Vrf1Wdata(); 59 | Output* getPortMFU1Vrf1Waddr(); 60 | Output* getPortUpdateMVU(); 61 | Output* getPortUpdateEvrf(); 62 | Output* getPortUpdateMFU0(); 63 | Output* getPortUpdateMFU1(); 64 | Output>* getPortOutput(); 65 | // Destructor 66 | ~LD(); 67 | 68 | private: 69 | // Module name 70 | std::string name; 71 | // Input and Output ports 72 | Input* uOP; 73 | Input>* ld_input; 74 | std::vector>*> mvu_vrfs_wdata; 75 | std::vector*> mvu_vrfs_waddr; 76 | Output>* evrf_wdata; 77 | Output* evrf_waddr; 78 | Output>* mfu0_vrf0_wdata; 79 | Output* mfu0_vrf0_waddr; 80 | Output>* mfu0_vrf1_wdata; 81 | Output* mfu0_vrf1_waddr; 82 | Output>* mfu1_vrf0_wdata; 83 | Output* mfu1_vrf0_waddr; 84 | Output>* mfu1_vrf1_wdata; 85 | Output* mfu1_vrf1_waddr; 86 | Output *update_tag_mvu; 87 | Output *update_tag_evrf; 88 | Output *update_tag_mfu0; 89 | Output *update_tag_mfu1; 90 | Output>* ld_output; 91 | // Local variables 92 | std::queue> input_fifo; 93 | }; 94 | 95 | #endif 96 | -------------------------------------------------------------------------------- /simulator/inc/mfu.h: -------------------------------------------------------------------------------- 1 | #ifndef MFU_H 2 | #define MFU_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "input.h" 10 | #include "output.h" 11 | #include "register_file.h" 12 | #include "inst.h" 13 | #include "defines.h" 14 | #include "utils.h" 15 | 16 | /* 17 | * This class implements the Multi-Function Unit (MFU) which performs vector element-wise 18 | * operations: activations {tanh, sigmoid, relu}, 
addition {add, sub_ab, sub_ba, max}, and 19 | * multiplication {mult}. 20 | * Input Ports: 21 | * - MFU input (from previous block in pipeline -- eVRF for MFU0 or MFU0 for MFU1) 22 | * - MFU uOP (from decoder) 23 | * - VRF0 write data (from Loader) 24 | * - VRF0 write address (from Loader) 25 | * - VRF1 write data (from Loader) 26 | * - VRF1 write address (from Loader) 27 | * - Tag update (from Loader) 28 | * Output Ports: 29 | * - MFU output (to next block in pipeline -- MFU1 for MFU0 or Loader for MFU1) 30 | */ 31 | class MFU : public Module { 32 | public: 33 | // Constructor 34 | MFU (std::string t_name); 35 | // Clock function 36 | void clock(unsigned int &cycle_count); 37 | // Getter functions 38 | std::string getName(); 39 | Input>* getPortInput(); 40 | Input* getPortuOP(); 41 | Output>* getPortRes(); 42 | Input>* getPortVrf0Wdata(); 43 | Input>* getPortVrf1Wdata(); 44 | Input* getPortVrf0Waddr(); 45 | Input* getPortVrf1Waddr(); 46 | Input* getPortUpdateTag(); 47 | // Destructor 48 | ~MFU(); 49 | 50 | private: 51 | // Module name 52 | std::string name; 53 | // Input and Output port 54 | Input>* mfu_input; 55 | Input* uOP; 56 | Input>* vrf0_wdata; 57 | Input* vrf0_waddr; 58 | Input>* vrf1_wdata; 59 | Input* vrf1_waddr; 60 | Input* update_tag; 61 | Output>* mfu_result; 62 | // Internal modules 63 | RegisterFile> *vrf0; 64 | RegisterFile> *vrf1; 65 | // Internal channels 66 | Channel>* mfu_channel; 67 | Channel>* vrf0_rdata_channel; 68 | Channel* vrf0_raddr_channel; 69 | Channel>* vrf1_rdata_channel; 70 | Channel* vrf1_raddr_channel; 71 | Channel* uOP_channel; 72 | Channel>* act_out_channel; 73 | Channel>* add_out_channel; 74 | Channel* uOP_pipeline; 75 | // Local variables 76 | unsigned int current_tag; 77 | }; 78 | 79 | #endif 80 | -------------------------------------------------------------------------------- /simulator/inc/module.h: -------------------------------------------------------------------------------- 1 | #ifndef MODULE_H_ 2 | #define MODULE_H_ 3 
| 4 | #include 5 | 6 | /* 7 | * This header file defines the module abstract class. Any other module in the simulated 8 | * architecture inherits this class and has to implement the clock() function 9 | */ 10 | class Module { 11 | public: 12 | // Constructor 13 | Module(std::string t_name) { name = t_name; } 14 | virtual ~Module() {} 15 | //Getter functions 16 | std::string getName() { return name; } 17 | // Defines what happens in this module every clock cycle (analogous to always block) 18 | virtual void clock() { } 19 | 20 | private: 21 | // Module name 22 | std::string name; 23 | }; 24 | 25 | #endif -------------------------------------------------------------------------------- /simulator/inc/mvu.h: -------------------------------------------------------------------------------- 1 | #ifndef MVU_H 2 | #define MVU_H 3 | 4 | #include 5 | #include 6 | #include "input.h" 7 | #include "output.h" 8 | #include "tile.h" 9 | #include "inst.h" 10 | #include "utils.h" 11 | #include "defines.h" 12 | 13 | /* 14 | * This class implements the matrix-vector multiplication unit (MVU). 
15 | * Input Ports: 16 | * - VRFs write data (from Loader) 17 | * - VRFs write address (from Loader) 18 | * - MVU uOP (from decoder) 19 | * - update tag (from Loader) 20 | * Output Ports: 21 | * - MVU output (to next block in pipeline -- eVRF) 22 | */ 23 | class MVU : public Module { 24 | public: 25 | // Constructor 26 | MVU (std::string t_name); 27 | // Clock function 28 | void clock(unsigned int &cycle_count); 29 | // Getter functions 30 | std::string getName(); 31 | Input>* getPortVrfWdata(unsigned int idx); 32 | Input* getPortVrfWaddr(unsigned int idx); 33 | Input* getPortuOP(); 34 | Input* getPortUpdateTag(); 35 | Output>* getPortRes(); 36 | // Destructor 37 | ~MVU(); 38 | 39 | private: 40 | // Module name 41 | std::string name; 42 | // Input and Output ports 43 | std::vector>*> vrfs_wdata; 44 | std::vector*> vrfs_waddr; 45 | Input* uOP; 46 | Input* update_tag; 47 | Output>* mvu_results; 48 | // Internal modules 49 | std::vector mvu_tiles; 50 | // Internal channels 51 | std::vector*> uOP_channels; 52 | std::vector*>> tile_results0; 53 | std::vector*>> tile_results1; 54 | std::vector*>> tile_results2; 55 | Channel>* reduction_channel0; 56 | Channel>* reduction_channel1; 57 | Channel>* reduction_channel2; 58 | // Local variables 59 | unsigned int current_tag; 60 | }; 61 | 62 | #endif 63 | -------------------------------------------------------------------------------- /simulator/inc/mvu_vrf.h: -------------------------------------------------------------------------------- 1 | #ifndef MVU_VRF_H 2 | #define MVU_VRF_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include "input.h" 11 | #include "output.h" 12 | #include "register_file.h" 13 | #include "utils.h" 14 | #include "defines.h" 15 | 16 | /* 17 | * This class implements the MVU vector register file (VRF). 
This module has the same interface as 18 | * a conventional register file, but supplies the batch-3 inputs in sequence to be compatible with 19 | * the Stratix 10 NX DPE. For a conventional DPE, a conventional register file would have been used. 20 | * Input Ports: 21 | * - VRF write data (from Loader) 22 | * - VRF write address (from Loader) 23 | * - VRF read address (from MVU uOP) 24 | * - VRF select control signal (from MVU uOP) 25 | * Output Ports: 26 | * - VRF read data (to DPEs) 27 | */ 28 | class MVUVRF : public Module { 29 | public: 30 | // Constructor 31 | MVUVRF (std::string t_name, unsigned int t_tile_id); 32 | // Clock function 33 | void clock(); 34 | // Getters and setters 35 | Input> *getPortVrfWdata(); 36 | Input *getPortVrfWaddr(); 37 | Output> *getPortVrfRdata(); 38 | Input *getPortVrfRaddr(); 39 | Input *getPortVrfSel(); 40 | // Destructor 41 | ~MVUVRF(); 42 | 43 | private: 44 | // Module name 45 | std::string name; 46 | // Input and Output ports 47 | Input>* vrf_wdata; 48 | Input* vrf_waddr; 49 | Output>* vrf_rdata; 50 | Input* vrf_raddr; 51 | Input* vrf_sel; 52 | // Internal modules 53 | std::vector>*> vrfs; 54 | // Internal channels 55 | std::vector*> vrf_raddr_channel; 56 | std::vector>*> vrf_rdata_channel; 57 | std::vector*> vrf_waddr_channel; 58 | std::vector>*> vrf_wdata_channel; 59 | // Local variables 60 | unsigned int tile_id; 61 | unsigned int num_vrfs = LANES / 10; 62 | }; 63 | 64 | #endif -------------------------------------------------------------------------------- /simulator/inc/npu.h: -------------------------------------------------------------------------------- 1 | #ifndef NPU_H 2 | #define NPU_H 3 | 4 | #include 5 | #include "input.h" 6 | #include "output.h" 7 | #include "datapath.h" 8 | #include "decoder.h" 9 | #include "inst.h" 10 | #include "defines.h" 11 | 12 | /* 13 | * This class implements the NPU top-level module consisting of datapath and instruction decoders. 
14 | * Input Ports: 15 | * - VLIW NPU instructions (from tester) 16 | * Output Ports: 17 | * - NPU final outputs (to tester) 18 | */ 19 | class NPU : public Module { 20 | public: 21 | // Constructor 22 | NPU (std::string t_name); 23 | // Clock function 24 | void clock(unsigned int &cycle_count); 25 | // Getter functions 26 | std::string getName(); 27 | Input* getPortInst(); 28 | Output>* getPortOutput(); 29 | // Destructor 30 | ~NPU(); 31 | 32 | private: 33 | // Module name 34 | std::string name; 35 | // Input and Output ports 36 | Input* npu_inst; 37 | Output>* npu_output; 38 | // Internal modules 39 | Datapath* npu_datapath; 40 | Decoder* npu_decoders; 41 | // Internal channels 42 | Channel* mvu_uOP_channel; 43 | Channel* evrf_uOP_channel; 44 | Channel* mfu0_uOP_channel; 45 | Channel* mfu1_uOP_channel; 46 | Channel* ld_uOP_channel; 47 | }; 48 | 49 | #endif 50 | -------------------------------------------------------------------------------- /simulator/inc/output.h: -------------------------------------------------------------------------------- 1 | #ifndef OUTPUT_H_ 2 | #define OUTPUT_H_ 3 | 4 | #include 5 | #include 6 | #include "defines.h" 7 | #include "inst.h" 8 | #include "port.h" 9 | #include "module.h" 10 | #include "channel.h" 11 | 12 | /* 13 | * This class implements an output port for a module. Each output port is connected to one or more 14 | * outgoing channel(s). 
15 | */ 16 | template 17 | class Output : public Port 18 | { 19 | public: 20 | // Constructor 21 | Output(std::string t_name, Module *t_module); 22 | // Helper functions 23 | void connectTo(Channel *t_channel); 24 | void writeToChannel(T t_data); 25 | bool isChannelFull(); 26 | // Destructor 27 | ~Output(); 28 | 29 | private: 30 | std::vector*> channels; 31 | }; 32 | 33 | #endif -------------------------------------------------------------------------------- /simulator/inc/port.h: -------------------------------------------------------------------------------- 1 | #ifndef PORT_H_ 2 | #define PORT_H_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "inst.h" 9 | #include "utils.h" 10 | #include "module.h" 11 | #include "channel.h" 12 | 13 | /* 14 | * This class implements a module port. This class is not instantiated directly in the implementation of the 15 | * simulator. Both Input and Output port classes inherit from it. 16 | */ 17 | template 18 | class Port { 19 | public: 20 | // Constructor 21 | Port (std::string t_name, Module *t_module); 22 | // Getter functions 23 | std::string getName(); 24 | Module* getModule(); 25 | virtual ~Port() {}; 26 | 27 | protected: 28 | // Port name 29 | std::string name; 30 | // Module the port belongs to 31 | Module* module; 32 | }; 33 | #endif 34 | 35 | -------------------------------------------------------------------------------- /simulator/inc/register_file.h: -------------------------------------------------------------------------------- 1 | #ifndef REGISTER_FILE_H_ 2 | #define REGISTER_FILE_H_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include "module.h" 11 | #include "input.h" 12 | #include "output.h" 13 | #include "channel.h" 14 | #include "utils.h" 15 | #include "defines.h" 16 | 17 | /* 18 | * This class implements a simple dual-port register file (1 read and 1 write ports) that is used 19 | * in different modules of the NPU. 
20 | * Input Ports: 21 | * - VRF write data 22 | * - VRF write address 23 | * - VRF read address 24 | * Output Ports: 25 | * - VRF read data 26 | */ 27 | template 28 | class RegisterFile : public Module { 29 | public: 30 | // Constructor 31 | RegisterFile(std::string t_name, unsigned int t_depth, std::string *t_file_name = nullptr); 32 | // Clock function 33 | void clock(); 34 | // Getter functions 35 | Input* getPortRaddr(); 36 | Output* getPortRdata(); 37 | Input* getPortWaddr(); 38 | Input* getPortWdata(); 39 | // Helper functions 40 | void write(); 41 | void read(); 42 | void print(); 43 | // Destructor 44 | ~RegisterFile(); 45 | 46 | private: 47 | // Input and Output ports 48 | Input* raddr; 49 | Output* rdata; 50 | Input* waddr; 51 | Input* wdata; 52 | // Local variables 53 | std::vector register_file; 54 | unsigned int depth; 55 | std::queue> read_pipeline; 56 | unsigned int reads_in_flight; 57 | std::queue> write_pipeline; 58 | unsigned int writes_in_flight; 59 | }; 60 | 61 | #endif 62 | 63 | -------------------------------------------------------------------------------- /simulator/inc/tile.h: -------------------------------------------------------------------------------- 1 | #ifndef TILE_H 2 | #define TILE_H 3 | 4 | #include 5 | #include 6 | #include "input.h" 7 | #include "output.h" 8 | #include "dpe.h" 9 | #include "mvu_vrf.h" 10 | #include "register_file.h" 11 | #include "accumulator.h" 12 | #include "inst.h" 13 | #include "defines.h" 14 | 15 | /* 16 | * This class implements the matrix-vector multiplication unit (MVU) tile. 
17 | * Input Ports: 18 | * - VRFs write data (from Loader) 19 | * - VRFs write address (from Loader) 20 | * - MVU uOP (from decoder) 21 | * Output Ports: 22 | * - MVU tile output 0 (to MVU reduction) 23 | * - MVU tile output 1 (to MVU reduction) 24 | * - MVU tile output 2 (to MVU reduction) 25 | */ 26 | class Tile : public Module { 27 | public: 28 | // Constructor 29 | Tile (std::string t_name, unsigned int t_tile_id); 30 | // Clock function 31 | void clock(); 32 | // Getter functions 33 | Input> *getPortVrfWdata(); 34 | Input *getPortVrfWaddr(); 35 | Input *getPortuOP(); 36 | Output *getPortResults(unsigned int accum, unsigned int idx); 37 | // Destructor 38 | ~Tile(); 39 | 40 | private: 41 | // Module name 42 | std::string name; 43 | // Input and Output ports 44 | Input>* vrf_wdata; 45 | Input* vrf_waddr; 46 | Input* uOP; 47 | std::vector*> accum0_results; 48 | std::vector*> accum1_results; 49 | std::vector*> accum2_results; 50 | // Internal modules 51 | MVUVRF* vrf; 52 | std::vector>*> mrfs; 53 | std::vector dpes; 54 | std::vector accums; 55 | // Internal channels 56 | Channel* vrf_raddr; 57 | Channel* vrf_sel; 58 | std::vector*> mrf_raddr; 59 | std::vector>*> mrf_rdata; 60 | std::vector*> mrf_waddr; 61 | std::vector>*> mrf_wdata; 62 | std::vector>*> vrf_to_dpe_channels; 63 | std::vector>*> mrf_to_dpe_channels; 64 | std::vector*> dpe_reg_sel_channels; 65 | std::vector*> dpe_vrf_en_channels; 66 | std::vector*> accum_uOP; 67 | std::vector*> accum_size; 68 | std::vector*> accum0_channels; 69 | std::vector*> accum1_channels; 70 | std::vector*> accum2_channels; 71 | // Local variables 72 | unsigned int tile_id; 73 | unsigned int accum_latency; 74 | unsigned int reg_sel_latency; 75 | }; 76 | 77 | #endif 78 | -------------------------------------------------------------------------------- /simulator/inc/utils.h: -------------------------------------------------------------------------------- 1 | #ifndef UTILS_H 2 | #define UTILS_H 3 | 4 | #include 5 | #include 6 | 
#include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include "defines.h" 13 | 14 | /* 15 | * This header file declares several utility functions used throughout the simulator 16 | */ 17 | 18 | // Operator overload for printing a vector 19 | template 20 | std::ostream& operator<< (std::ostream& out, const std::vector& v); 21 | 22 | // Used for populating vector register file contents from a file 23 | void readVectorFile(std::string &file_name, std::vector> &vec_data); 24 | 25 | // Used for populating vector FIFO contents from a file 26 | void readVectorFile(std::string &file_name, std::queue> &que_data); 27 | 28 | // Operator overload for adding two vectors 29 | std::vector operator+ (const std::vector &v1, const std::vector &v2); 30 | 31 | // Used for reading simulating golden outputs 32 | template 33 | void readGoldenOutput(std::string &file_name, std::vector &vec_data, int v_size); 34 | 35 | #endif 36 | -------------------------------------------------------------------------------- /simulator/main/obj/README.md: -------------------------------------------------------------------------------- 1 | Directory for object files produced from simulation -------------------------------------------------------------------------------- /simulator/register_files/README.md: -------------------------------------------------------------------------------- 1 | Directory for register files content created by compiler -------------------------------------------------------------------------------- /simulator/src/accumulator.cpp: -------------------------------------------------------------------------------- 1 | #include "accumulator.h" 2 | 3 | // Reset helper function to set accumulated values to zeros 4 | void Accumulator::reset(){ 5 | accum0_values.erase(accum0_values.begin(), accum0_values.end()); 6 | accum1_values.erase(accum1_values.begin(), accum1_values.end()); 7 | accum2_values.erase(accum2_values.begin(), accum2_values.end()); 8 | 
for(unsigned int i = 0; i < num_accum_values; i++){ 9 | accum0_values.push_back(0); 10 | accum1_values.push_back(0); 11 | accum2_values.push_back(0); 12 | } 13 | } 14 | 15 | // Accumulator Constructor 16 | Accumulator::Accumulator(std::string t_name, unsigned int t_accum_id) : Module (t_name) { 17 | // Initialize local variables 18 | accum_id = t_accum_id; 19 | this->reset(); 20 | channel_full_count = 0; 21 | accum_idx = 0; 22 | // Create Input and Output ports 23 | input0 = new Input(t_name + "_input0", this); 24 | input1 = new Input(t_name + "_input1", this); 25 | input2 = new Input(t_name + "_input2", this); 26 | uOP = new Input(t_name + "_uOP", this); 27 | size = new Input(t_name + "_size", this); 28 | result0 = new Output(t_name + "_output0", this); 29 | result1 = new Output(t_name + "_output1", this); 30 | result2 = new Output(t_name + "_output2", this); 31 | } 32 | 33 | // Clock cycle update function 34 | void Accumulator::clock(){ 35 | // If any of the three data inputs, the size or the uOP is not ready, abort (all five channels are read below) 36 | if (input0->isChannelEmpty() || input1->isChannelEmpty() || input2->isChannelEmpty() || uOP->isChannelEmpty() || size->isChannelEmpty()) return; 37 | 38 | // Peek uOP and size to decide how to proceed 39 | unsigned int temp_uOP = uOP->peekChannel(); 40 | unsigned int temp_size = size->peekChannel(); 41 | 42 | // Accumulate input values 43 | TYPE input0_data = input0->readFromChannel(); 44 | TYPE input1_data = input1->readFromChannel(); 45 | TYPE input2_data = input2->readFromChannel(); 46 | temp_uOP = uOP->readFromChannel(); 47 | temp_size = size->readFromChannel(); 48 | accum0_values[accum_idx] = accum0_values[accum_idx] + input0_data; 49 | accum1_values[accum_idx] = accum1_values[accum_idx] + input1_data; 50 | accum2_values[accum_idx] = accum2_values[accum_idx] + input2_data; 51 | 52 | // Write out the final result & reset the accumulator 53 | if (temp_uOP) { 54 | result0->writeToChannel(accum0_values[accum_idx]); 55 | result1->writeToChannel(accum1_values[accum_idx]); 56 | result2->writeToChannel(accum2_values[accum_idx]); 57 | 
accum0_values[accum_idx] = 0; 58 | accum1_values[accum_idx] = 0; 59 | accum2_values[accum_idx] = 0; 60 | channel_full_count = 0; 61 | } 62 | 63 | // Update accumulator index 64 | if(accum_idx == temp_size-1) 65 | accum_idx = 0; 66 | else 67 | accum_idx++; 68 | } 69 | 70 | // Getter function for name (the constructor stores the name in the Module base only, so delegate to it) 71 | std::string Accumulator::getName() { 72 | return Module::getName(); 73 | } 74 | 75 | // Getter function for ID 76 | unsigned int Accumulator::getId() { 77 | return accum_id; 78 | } 79 | 80 | // Getter function for input ports 81 | Input* Accumulator::getPortInput(unsigned int i) { 82 | if(i == 0) 83 | return input0; 84 | else if (i == 1) 85 | return input1; 86 | else 87 | return input2; 88 | } 89 | 90 | // Getter function for uOP input port 91 | Input* Accumulator::getPortuOP() { 92 | return uOP; 93 | } 94 | 95 | // Getter function for port size 96 | Input* Accumulator::getPortSize() { 97 | return size; 98 | } 99 | 100 | // Getter function for output ports 101 | Output* Accumulator::getPortRes(unsigned int i) { 102 | if(i == 0) 103 | return result0; 104 | else if (i == 1) 105 | return result1; 106 | else 107 | return result2; 108 | } 109 | 110 | // Destructor 111 | Accumulator::~Accumulator(){ 112 | delete input0; 113 | delete input1; 114 | delete input2; 115 | delete uOP; 116 | delete size; 117 | delete result0; 118 | delete result1; 119 | delete result2; 120 | } -------------------------------------------------------------------------------- /simulator/src/channel.cpp: -------------------------------------------------------------------------------- 1 | #include "channel.h" 2 | 3 | // Channel constructor 4 | template 5 | Channel::Channel(std::string t_name, unsigned int t_size, unsigned int t_latency){ 6 | name = t_name; 7 | size = t_size; 8 | latency = t_latency; 9 | } 10 | 11 | // Helper function to write to a channel 12 | template 13 | void Channel::write(T t_value){ 14 | if (this->isFull()) 15 | std::cerr << "Channel "<< name <<" buffer size "<< 16 | buffer.size() << " out 
of " << size << std::endl; 17 | assert((!this->isFull()) && "Writing to a full channel"); 18 | buffer.push(std::make_tuple(t_value, latency)); 19 | } 20 | 21 | // Helper function to read from a channel (asserts the exact negation of isEmpty(): an element exists and its latency has elapsed) 22 | template 23 | T Channel::read(){ 24 | assert((!buffer.empty() && (std::get<1>(buffer.front()) == 0)) && "Reading from empty channel"); 25 | T temp = std::get<0>(buffer.front()); 26 | buffer.pop(); 27 | return temp; 28 | }; 29 | 30 | // Helper function to peek a channel (look at the next element in the channel; same readiness assert as read()) 31 | template 32 | T Channel::peek(){ 33 | assert((!buffer.empty() && (std::get<1>(buffer.front()) == 0)) && "Peeking an empty channel"); 34 | T temp = std::get<0>(buffer.front()); 35 | return temp; 36 | }; 37 | 38 | // Helper function to get the element at a specific location in the channel 39 | template 40 | T Channel::at(unsigned int idx){ 41 | assert((buffer.size() > idx) && "Channel size is less that accessed index"); 42 | unsigned int i = 0; 43 | T temp; 44 | std::tuple temp_tuple; 45 | for(unsigned int itr = 0; itr < buffer.size(); itr++){ 46 | temp_tuple = buffer.front(); 47 | buffer.pop(); 48 | if(i == idx) 49 | temp = std::get<0>(temp_tuple); 50 | buffer.push(temp_tuple); 51 | i++; 52 | } 53 | return temp; 54 | }; 55 | 56 | // Helper function to check if channel is empty 57 | template 58 | bool Channel::isEmpty(){ 59 | return buffer.empty() || (std::get<1>(buffer.front()) != 0); 60 | } 61 | 62 | // Helper function to check if channel is full 63 | template 64 | bool Channel::isFull(){ 65 | return !(buffer.size() <= size); 66 | } 67 | 68 | // Clock cycle update function 69 | template 70 | void Channel::clock(){ 71 | if(!buffer.empty()){ 72 | for(unsigned int i = 0; i < buffer.size(); i++){ 73 | std::tuple temp = buffer.front(); 74 | buffer.pop(); 75 | if(std::get<1>(temp) > 0){ 76 | std::get<1>(temp)--; 77 | } 78 | buffer.push(temp); 79 | } 80 | } 81 | } 82 | 83 | // Getter function for name 84 | template 85 | std::string Channel::getName () { 86 | 
return name; 87 | } 88 | 89 | // Getter function for size 90 | template 91 | unsigned int Channel::getSize() { 92 | return buffer.size(); 93 | } 94 | 95 | template class Channel; 96 | template class Channel>; 97 | template class Channel; 98 | template class Channel; 99 | template class Channel; 100 | template class Channel; 101 | template class Channel; 102 | template class Channel; 103 | template class Channel; 104 | template class Channel; 105 | template class Channel; 106 | template class Channel; 107 | template class Channel; -------------------------------------------------------------------------------- /simulator/src/dpe.cpp: -------------------------------------------------------------------------------- 1 | #include "dpe.h" 2 | 3 | // DPE Constructor 4 | DPE::DPE (std::string t_name, unsigned int t_dpe_id, unsigned int t_tile_id) : Module(t_name) { 5 | // Create Input and Output ports 6 | vSeq = new Input>(t_name + "_vSeq", this); 7 | vBroadcast = new Input>(t_name + "_vBroadcast", this); 8 | reg_sel = new Input(t_name + "_reg_sel", this); 9 | vrf_en = new Input(t_name + "_vrf_en", this); 10 | dpe_res0 = new Output(t_name + "_dpe_res0", this); 11 | dpe_res1 = new Output(t_name + "_dpe_res1", this); 12 | dpe_res2 = new Output(t_name + "_dpe_res2", this); 13 | // Create internal channels 14 | dpe_result0_channel = new Channel(t_name + "_dpe_result0_channel", dpe_result_latency, 15 | dpe_result_latency); 16 | dpe_result1_channel = new Channel(t_name + "_dpe_result1_channel", dpe_result_latency, 17 | dpe_result_latency); 18 | dpe_result2_channel = new Channel(t_name + "_dpe_result2_channel", dpe_result_latency, 19 | dpe_result_latency); 20 | pingpong0 = new Channel>(t_name + "_pingpong0_channel", pingpong_length, 21 | pingpong_length); 22 | pingpong1 = new Channel>(t_name + "_pingpong1_channel", pingpong_length, 23 | pingpong_length); 24 | broadcast_delay = new Channel>(t_name + "_broadcast_delay_channel", 25 | pingpong_length, pingpong_length); 26 | 
input_sel_delay = new Channel(t_name + "_input_sel_delay_channel", 27 | pingpong_length, pingpong_length); 28 | reg_sel_delay = new Channel(t_name + "_input_reg_delay_channel", 3, 3); 29 | vrf_en_delay = new Channel(t_name + "_vrf_en_delay_channel", 3, 3); 30 | // Initialize local variables 31 | dpe_id = t_dpe_id; 32 | tile_id = t_tile_id; 33 | } 34 | 35 | // Dot product helper function 36 | TYPE dot_product(std::vector &v1, std::vector &v2){ 37 | TYPE result = 0; 38 | for(unsigned int i = 0; i < LANES; i++){ 39 | result += (v1[i] * v2[i]); 40 | } 41 | return result; 42 | } 43 | 44 | // Clock cycle update function 45 | void DPE::clock() { 46 | std::vector temp_vSeq, temp_vBroadcast; 47 | TYPE dpe_result0, dpe_result1, dpe_result2; 48 | // Write output results when ready 49 | if(!dpe_result0_channel->isEmpty() && !dpe_res0->isChannelFull()){ 50 | dpe_res0->writeToChannel(dpe_result0_channel->read()); 51 | dpe_res1->writeToChannel(dpe_result1_channel->read()); 52 | dpe_res2->writeToChannel(dpe_result2_channel->read()); 53 | } 54 | // Prepare operands 55 | if(!broadcast_delay->isEmpty() && !dpe_result0_channel->isFull()){ 56 | std::vector v0, v1, v2, temp, vb; 57 | unsigned int input_sel = input_sel_delay->read(); 58 | if(input_sel == 0){ 59 | for(unsigned int i = 0; i < (LANES/10 * 3); i++){ 60 | temp = pingpong0->at(i); 61 | if(i % 3 == 0){ 62 | v0.insert(v0.end(), temp.begin(), temp.end()); 63 | } else if (i % 3 == 1) { 64 | v1.insert(v1.end(), temp.begin(), temp.end()); 65 | } else { 66 | v2.insert(v2.end(), temp.begin(), temp.end()); 67 | } 68 | } 69 | } else { 70 | for(unsigned int i = 0; i < (LANES/10 * 3); i++){ 71 | temp = pingpong1->at(i); 72 | if(i % 3 == 0){ 73 | v0.insert(v0.end(), temp.begin(), temp.end()); 74 | } else if (i % 3 == 1) { 75 | v1.insert(v1.end(), temp.begin(), temp.end()); 76 | } else { 77 | v2.insert(v2.end(), temp.begin(), temp.end()); 78 | } 79 | } 80 | } 81 | vb = broadcast_delay->read(); 82 | // Perform computation 83 | dpe_result0 = 
dot_product(vb, v0); 84 | dpe_result1 = dot_product(vb, v1); 85 | dpe_result2 = dot_product(vb, v2); 86 | // Write dot product results to delay channels 87 | dpe_result0_channel->write(dpe_result0); 88 | dpe_result1_channel->write(dpe_result1); 89 | dpe_result2_channel->write(dpe_result2); 90 | } 91 | 92 | // Accept new inputs 93 | if(!vSeq->isChannelEmpty() && !vBroadcast->isChannelEmpty() 94 | && !reg_sel->isChannelEmpty() && !broadcast_delay->isFull()) { 95 | 96 | temp_vSeq = vSeq->readFromChannel(); 97 | temp_vBroadcast = vBroadcast->readFromChannel(); 98 | unsigned int temp_reg_sel = reg_sel->readFromChannel(); 99 | unsigned int temp_vrf_en = vrf_en->readFromChannel(); 100 | unsigned int delayed_reg_sel; 101 | unsigned int delayed_vrf_en; 102 | if(!reg_sel_delay->isEmpty()){ 103 | delayed_reg_sel = reg_sel_delay->read(); 104 | delayed_vrf_en = vrf_en_delay->read(); 105 | } else { 106 | delayed_reg_sel = temp_reg_sel; 107 | delayed_vrf_en = temp_vrf_en; 108 | } 109 | 110 | if(!pingpong0->isEmpty() && delayed_reg_sel == 0 && (delayed_vrf_en == 1)) 111 | pingpong0->read(); 112 | if(!pingpong1->isEmpty() && delayed_reg_sel == 1 && (delayed_vrf_en == 1)) 113 | pingpong1->read(); 114 | 115 | if((temp_reg_sel == 0) && (temp_vrf_en == 1)){ 116 | pingpong0->write(temp_vSeq); 117 | } else if ((temp_reg_sel == 1) && (temp_vrf_en == 1)){ 118 | pingpong1->write(temp_vSeq); 119 | } 120 | 121 | if(((temp_reg_sel == 0) && (temp_vrf_en == 1)) || (delayed_reg_sel == 0 && 122 | delayed_vrf_en == 1)) 123 | pingpong0->clock(); 124 | 125 | if(((temp_reg_sel == 1) && (temp_vrf_en == 1)) || (delayed_reg_sel == 1 && 126 | delayed_vrf_en == 1)) 127 | pingpong1->clock(); 128 | 129 | broadcast_delay->write(temp_vBroadcast); 130 | input_sel_delay->write(temp_reg_sel); 131 | reg_sel_delay->write(temp_reg_sel); 132 | vrf_en_delay->write(temp_vrf_en); 133 | } else if(!reg_sel_delay->isEmpty()){ 134 | unsigned int delayed_reg_sel = reg_sel_delay->read(); 135 | unsigned int delayed_vrf_en = 
vrf_en_delay->read(); 136 | 137 | if(!pingpong0->isEmpty() && delayed_reg_sel == 0 && delayed_vrf_en == 1) 138 | pingpong0->read(); 139 | if(!pingpong1->isEmpty() && delayed_reg_sel == 1 && delayed_vrf_en == 1) 140 | pingpong1->read(); 141 | 142 | if(delayed_reg_sel == 0 && delayed_vrf_en == 1) 143 | pingpong0->clock(); 144 | 145 | if(delayed_reg_sel == 1 && delayed_vrf_en == 1) 146 | pingpong1->clock(); 147 | } 148 | 149 | // Clock internal channels 150 | broadcast_delay->clock(); 151 | input_sel_delay->clock(); 152 | reg_sel_delay->clock(); 153 | vrf_en_delay->clock(); 154 | dpe_result0_channel->clock(); 155 | dpe_result1_channel->clock(); 156 | dpe_result2_channel->clock(); 157 | } 158 | 159 | // Getter function for name (the constructor stores the name in the Module base only, so delegate to it) 160 | std::string DPE::getName() { 161 | return Module::getName(); 162 | } 163 | 164 | // Getter function for sequentially loaded input port 165 | Input>* DPE::getPortVSeq() { 166 | return vSeq; 167 | } 168 | 169 | // Getter function for broadcast input port 170 | Input>* DPE::getPortVBroadcast() { 171 | return vBroadcast; 172 | } 173 | 174 | // Getter function for register select input port 175 | Input* DPE::getPortRegSel() { 176 | return reg_sel; 177 | } 178 | 179 | // Getter function for VRF enable input port 180 | Input* DPE::getPortVrfEn() { 181 | return vrf_en; 182 | } 183 | 184 | // Getter function for DPE output ports 185 | Output* DPE::getPortDPERes(unsigned int i) { 186 | if(i == 0) 187 | return dpe_res0; 188 | else if (i == 1) 189 | return dpe_res1; 190 | else 191 | return dpe_res2; 192 | } 193 | 194 | DPE::~DPE() { 195 | delete vSeq; 196 | delete vBroadcast; 197 | delete reg_sel; 198 | delete vrf_en; 199 | delete dpe_res0; 200 | delete dpe_res1; 201 | delete dpe_res2; 202 | delete dpe_result0_channel; 203 | delete dpe_result1_channel; 204 | delete dpe_result2_channel; 205 | delete pingpong0; 206 | delete pingpong1; 207 | delete broadcast_delay; 208 | delete input_sel_delay; 209 | delete reg_sel_delay; 210 | delete vrf_en_delay; 211 | } 
-------------------------------------------------------------------------------- /simulator/src/evrf.cpp: -------------------------------------------------------------------------------- 1 | #include "evrf.h" 2 | 3 | // eVRF Constructor 4 | EVRF::EVRF(std::string t_name) : Module (t_name) { 5 | // Create Input and Output ports 6 | evrf_input = new Input>(t_name + "_input", this); 7 | uOP = new Input(t_name + "_uOP", this); 8 | update_tag = new Input(t_name + "_update_tag", this); 9 | evrf_result = new Output>(t_name + "_result", this); 10 | 11 | // Create internal modules 12 | evrf = new RegisterFile>(t_name, EVRF_DEPTH); 13 | evrf_wdata = evrf->getPortWdata(); 14 | evrf_waddr = evrf->getPortWaddr(); 15 | 16 | // Create internal channels 17 | mvu_channel = new Channel>(t_name + "_mvu_channel", 18 | RF_READ_LATENCY + 1, RF_READ_LATENCY + 1); 19 | evrf_raddr = new Channel(t_name + "_evrf_raddr", 1, 0); 20 | evrf->getPortRaddr()->connectTo(evrf_raddr); 21 | evrf_rdata = new Channel>(t_name + "_evrf_rdata", 1, 0); 22 | evrf->getPortRdata()->connectTo(evrf_rdata); 23 | 24 | // Initialize local variables 25 | current_tag = 0; 26 | } 27 | 28 | // Clock cycle update function 29 | void EVRF::clock(unsigned int &cycle_count){ 30 | // If uOP is ready to dispatch 31 | if(!uOP->isChannelEmpty()){ 32 | // Peek ready uOP to decide how to proceed 33 | evrf_uOP temp = uOP->peekChannel(); 34 | // If ready operation is NOP, read and ignore 35 | if (temp.op == 0) { 36 | temp = uOP->readFromChannel(); 37 | LOG(this->getName(), "NOP"); 38 | 39 | // If ready operation is flush 40 | } else if (temp.op == 2 && !evrf_input->isChannelEmpty() && temp.tag <= current_tag) { 41 | evrf_input->readFromChannel(); 42 | 43 | // If ready operation is bypass 44 | } else if(!temp.src && !mvu_channel->isFull() && !evrf_input->isChannelEmpty() && 45 | temp.tag <= current_tag){ 46 | mvu_channel->write(evrf_input->readFromChannel()); 47 | temp = uOP->readFromChannel(); 48 | if(temp.first_flag) { 49 | 
LOG(this->getName(), "Issued first uOP " + std::to_string(temp.first_flag)); 50 | } 51 | 52 | // If ready operation is read eVRF 53 | } else if (temp.src && !evrf_rdata->isFull() && temp.tag <= current_tag) { 54 | evrf_raddr->write(temp.vrf_addr); 55 | temp = uOP->readFromChannel(); 56 | if(temp.first_flag) { 57 | LOG(this->getName(), "Issued first uOP " + std::to_string(temp.first_flag)); 58 | } 59 | } 60 | } 61 | 62 | // Write eVRF output when ready 63 | if(!mvu_channel->isEmpty()){ 64 | evrf_result->writeToChannel(mvu_channel->read()); 65 | LOG(this->getName(), "Produced Output"); 66 | } else if (!evrf_rdata->isEmpty()){ 67 | evrf_result->writeToChannel(evrf_rdata->read()); 68 | LOG(this->getName(), "Produced Output"); 69 | } 70 | 71 | // Update local instruction tag (if required) 72 | if(!update_tag->isChannelEmpty()){ 73 | update_tag->readFromChannel(); 74 | current_tag++; 75 | } 76 | 77 | // Clock internal modules 78 | evrf->clock(); 79 | // Clock internal channels 80 | evrf_raddr->clock(); 81 | evrf_rdata->clock(); 82 | mvu_channel->clock(); 83 | } 84 | 85 | // Getter function for name (the constructor stores the name in the Module base only, so delegate to it; used by the LOG calls in clock()) 86 | std::string EVRF::getName() { 87 | return Module::getName(); 88 | } 89 | 90 | // Getter function for eVRF input port 91 | Input>* EVRF::getPortInput() { 92 | return evrf_input; 93 | } 94 | 95 | // Getter function for uOP input port 96 | Input* EVRF::getPortuOP() { 97 | return uOP; 98 | } 99 | 100 | // Getter function for eVRF write data input port 101 | Input>* EVRF::getPortEvrfWdata() { 102 | return evrf_wdata; 103 | } 104 | 105 | // Getter function for eVRF write address input port 106 | Input* EVRF::getPortEvrfWaddr() { 107 | return evrf_waddr; 108 | } 109 | 110 | // Getter function for update tag input port 111 | Input* EVRF::getPortUpdateTag() { 112 | return update_tag; 113 | } 114 | 115 | // Getter function for eVRF output port 116 | Output>* EVRF::getPortRes() { 117 | return evrf_result; 118 | } 119 | 120 | EVRF::~EVRF() { 121 | delete evrf_input; 122 | delete uOP; 123 | 
delete update_tag; 124 | delete evrf_result; 125 | delete evrf; 126 | delete mvu_channel; 127 | delete evrf_raddr; 128 | delete evrf_rdata; 129 | } -------------------------------------------------------------------------------- /simulator/src/input.cpp: -------------------------------------------------------------------------------- 1 | #include "input.h" 2 | 3 | // Input Port Constructor (starts unconnected so the "no channel" asserts below test a defined pointer) 4 | template 5 | Input::Input(std::string t_name, Module *t_module) : Port(t_name, t_module) { channel = NULL; } 6 | 7 | // Helper function for connecting an Input port to incoming channel 8 | template 9 | void Input::connectTo(Channel *t_channel) { 10 | channel = t_channel; 11 | } 12 | 13 | // Helper function for reading from the incoming channel connected to this port 14 | template 15 | T Input::readFromChannel() { 16 | assert((channel) && "no channel for input"); return channel->read(); 17 | } 18 | 19 | // Helper function for peeking the contents of a channel connected to this port 20 | template 21 | T Input::peekChannel() { 22 | assert((channel) && "no channel for input"); return channel->peek(); 23 | } 24 | 25 | // Helper function for checking if the channel connected to this port is empty 26 | template 27 | bool Input::isChannelEmpty(){ 28 | assert((channel) && "no channel for input"); 29 | return channel->isEmpty(); 30 | } 31 | 32 | template 33 | Input::~Input(){ channel = NULL; } 34 | 35 | template class Input; 36 | template class Input>; 37 | template class Input; 38 | template class Input; 39 | template class Input; 40 | template class Input; 41 | template class Input; 42 | template class Input; 43 | template class Input; -------------------------------------------------------------------------------- /simulator/src/mvu.cpp: -------------------------------------------------------------------------------- 1 | #include "mvu.h" 2 | 3 | // MVU Constructor 4 | MVU::MVU(std::string t_name) : Module (t_name) { 5 | // Create Input and Output ports 6 | update_tag = new Input(t_name + "_update_tag", this); 7 | uOP = new Input(t_name + "_uOP", this); 8 | mvu_results = new 
Output>(t_name + "_results", this); 9 | 10 | // Create internal modules 11 | for(unsigned int i = 0; i < TILES; i++){ 12 | Tile* mvu_tile = new Tile(t_name+"_tile"+std::to_string(i), i); 13 | mvu_tiles.push_back(mvu_tile); 14 | vrfs_wdata.push_back(mvu_tile->getPortVrfWdata()); 15 | vrfs_waddr.push_back(mvu_tile->getPortVrfWaddr()); 16 | 17 | Channel* uOP_channel = new Channel(t_name+"_uOP" + std::to_string(i), 18 | 1, 0); 19 | uOP_channels.push_back(uOP_channel); 20 | mvu_tile->getPortuOP()->connectTo(uOP_channels[i]); 21 | 22 | std::vector*> temp_tile_results0, temp_tile_results1, temp_tile_results2; 23 | for(unsigned int j = 0; j < DPES; j++){ 24 | Channel *temp0 = new Channel(t_name + "_tile" + std::to_string(i) + 25 | "_result0", 1, 0); 26 | Channel *temp1 = new Channel(t_name + "_tile" + std::to_string(i) + 27 | "_result1", 1, 0); 28 | Channel *temp2 = new Channel(t_name + "_tile" + std::to_string(i) + 29 | "_result2", 1, 0); 30 | mvu_tile->getPortResults(0, j)->connectTo(temp0); 31 | mvu_tile->getPortResults(1, j)->connectTo(temp1); 32 | mvu_tile->getPortResults(2, j)->connectTo(temp2); 33 | temp_tile_results0.push_back(temp0); 34 | temp_tile_results1.push_back(temp1); 35 | temp_tile_results2.push_back(temp2); 36 | } 37 | tile_results0.push_back(temp_tile_results0); 38 | tile_results1.push_back(temp_tile_results1); 39 | tile_results2.push_back(temp_tile_results2); 40 | } 41 | 42 | // Create internal channels 43 | reduction_channel0 = new Channel>(t_name + "_reduction0", 44 | MVU_REDUCTION_LATENCY, MVU_REDUCTION_LATENCY-1); 45 | reduction_channel1 = new Channel>(t_name + "_reduction1", 46 | MVU_REDUCTION_LATENCY, MVU_REDUCTION_LATENCY-1); 47 | reduction_channel2 = new Channel>(t_name + "_reduction2", 48 | MVU_REDUCTION_LATENCY, MVU_REDUCTION_LATENCY-1); 49 | 50 | // Initialize local variables 51 | current_tag = 0; 52 | } 53 | 54 | // Clock cycle update function 55 | void MVU::clock(unsigned int &cycle_count){ 56 | // If uOP is ready to dispatch 57 | 
if(!uOP->isChannelEmpty()) { 58 | // Peek ready uOP to decide how to proceed 59 | mvu_uOP temp = uOP->peekChannel(); 60 | 61 | // If ready operation is NOP, read and ignore 62 | if (temp.op == 0) { 63 | temp = uOP->readFromChannel(); 64 | LOG(this->getName(), "NOP"); 65 | 66 | // If ready operation is not NOP, read and dispatch 67 | } else if (!uOP->isChannelEmpty() && temp.tag <= current_tag && 68 | !uOP_channels[0]->isFull()) { 69 | temp = uOP->readFromChannel(); 70 | if(temp.first_flag){ 71 | LOG(this->getName(), "Issued first uOP " + std::to_string(temp.first_flag/3)); 72 | } 73 | for (unsigned int i = 0; i < TILES; i++) { 74 | uOP_channels[i]->write(temp); 75 | } 76 | } 77 | } 78 | 79 | // Perform reduction of corresponding DPEs from different tiles 80 | if((!tile_results0[0][0]->isEmpty()) && (!reduction_channel0->isFull())){ 81 | std::vector partial_results0(DPES); 82 | std::vector partial_results1(DPES); 83 | std::vector partial_results2(DPES); 84 | for(unsigned int i = 0; i < TILES; i++){ 85 | for(unsigned int j = 0; j < DPES; j++){ 86 | partial_results0[j] += tile_results0[i][j]->read(); 87 | partial_results1[j] += tile_results1[i][j]->read(); 88 | partial_results2[j] += tile_results2[i][j]->read(); 89 | } 90 | } 91 | reduction_channel0->write(partial_results0); 92 | reduction_channel1->write(partial_results1); 93 | reduction_channel2->write(partial_results2); 94 | } 95 | 96 | // Write MVU output when ready 97 | if((!reduction_channel0->isEmpty()) && (!mvu_results->isChannelFull())){ 98 | // Read reduction result 99 | std::vector mvu_res_vec0 = reduction_channel0->read(); 100 | std::vector mvu_res_vec1 = reduction_channel1->read(); 101 | std::vector mvu_res_vec2 = reduction_channel2->read(); 102 | // Reshape it from vectors of length DPES to vectors of length LANES (Asymmetric FIFO) 103 | for(unsigned int i = 0; i < (DPES/LANES); i++){ 104 | std::vector mvu_res_part0, mvu_res_part1, mvu_res_part2; 105 | for(unsigned int j = 0; j < LANES; j++){ 106 | 
mvu_res_part0.push_back(mvu_res_vec0[0]); 107 | mvu_res_part1.push_back(mvu_res_vec1[0]); 108 | mvu_res_part2.push_back(mvu_res_vec2[0]); 109 | mvu_res_vec0.erase(mvu_res_vec0.begin()); 110 | mvu_res_vec1.erase(mvu_res_vec1.begin()); 111 | mvu_res_vec2.erase(mvu_res_vec2.begin()); 112 | } 113 | mvu_results->writeToChannel(mvu_res_part0); 114 | mvu_results->writeToChannel(mvu_res_part1); 115 | mvu_results->writeToChannel(mvu_res_part2); 116 | LOG(this->getName(), "Produced Output"); 117 | #if(VERBOSE_MVU) 118 | std::cout << "MVU OUTPUT0: " << mvu_res_part0 << std::endl; 119 | std::cout << "MVU OUTPUT1: " << mvu_res_part1 << std::endl; 120 | std::cout << "MVU OUTPUT2: " << mvu_res_part2 << std::endl; 121 | #endif 122 | } 123 | } 124 | 125 | // Update local instruction tag (if required) 126 | if(!update_tag->isChannelEmpty()){ 127 | update_tag->readFromChannel(); 128 | current_tag++; 129 | } 130 | 131 | // Clock internal modules 132 | for(unsigned int i = 0; i < TILES; i++){ 133 | mvu_tiles[i]->clock(); 134 | } 135 | // Clock internal channels 136 | for(unsigned int i = 0; i < TILES; i++){ 137 | uOP_channels[i]->clock(); 138 | } 139 | reduction_channel0->clock(); 140 | reduction_channel1->clock(); 141 | reduction_channel2->clock(); 142 | } 143 | 144 | // Getter function for name 145 | std::string MVU::getName() { 146 | return name; 147 | } 148 | 149 | // Getter function for VRF write data input port 150 | Input>* MVU::getPortVrfWdata(unsigned int idx) { 151 | return vrfs_wdata[idx]; 152 | } 153 | 154 | // Getter function for VRF write address input port 155 | Input* MVU::getPortVrfWaddr(unsigned int idx) { 156 | return vrfs_waddr[idx]; 157 | } 158 | 159 | // Getter function for uOP input port 160 | Input* MVU::getPortuOP() { 161 | return uOP; 162 | } 163 | 164 | // Getter function for update tag input port 165 | Input* MVU::getPortUpdateTag() { 166 | return update_tag; 167 | } 168 | 169 | // Getter function for MVU output port 170 | Output>* MVU::getPortRes() { 171 | 
return mvu_results; 172 | } 173 | 174 | MVU::~MVU() { 175 | delete uOP; 176 | delete update_tag; 177 | delete mvu_results; 178 | for (unsigned int i = 0; i < TILES; i++) { 179 | delete mvu_tiles[i]; 180 | delete uOP_channels[i]; 181 | for (unsigned int j = 0; j < DPES; j++) { 182 | delete tile_results0[i][j]; 183 | delete tile_results1[i][j]; 184 | delete tile_results2[i][j]; 185 | } 186 | } 187 | delete reduction_channel0; 188 | delete reduction_channel1; 189 | delete reduction_channel2; 190 | } -------------------------------------------------------------------------------- /simulator/src/mvu_vrf.cpp: -------------------------------------------------------------------------------- 1 | #include "mvu_vrf.h" 2 | 3 | // MVU VRF Constructor 4 | MVUVRF::MVUVRF (std::string t_name, unsigned int t_tile_id) : Module(t_name) { 5 | // Create Input and Output ports 6 | vrf_wdata = new Input>(t_name + "_vrf_wdata", this); 7 | vrf_waddr = new Input(t_name + "_vrf_waddr", this); 8 | vrf_rdata = new Output>(t_name + "_vrf_rdata", this); 9 | vrf_raddr = new Input(t_name + "_vrf_raddr", this); 10 | vrf_sel = new Input(t_name + "_vrf_sel", this); 11 | // Create internal modules and channels 12 | for(unsigned int i = 0; i < num_vrfs; i++){ 13 | RegisterFile>* ivrf = new RegisterFile>(t_name + 14 | "_vrf_" + std::to_string(i), MVU_VRF_DEPTH); 15 | Channel* ivrf_raddr = new Channel(t_name + "_vrf_raddr_" + 16 | std::to_string(i), 1, 0); 17 | Channel>* ivrf_rdata = new Channel>(t_name + 18 | "_vrf_rdata_" + std::to_string(i), 1, 0); 19 | Channel* ivrf_waddr = new Channel(t_name + "_vrf_waddr_" + 20 | std::to_string(i), 1, 0); 21 | Channel>* ivrf_wdata = new Channel>(t_name + 22 | "_vrf_wdata_" + std::to_string(i), 1, 0); 23 | ivrf->getPortRaddr()->connectTo(ivrf_raddr); 24 | ivrf->getPortRdata()->connectTo(ivrf_rdata); 25 | ivrf->getPortWaddr()->connectTo(ivrf_waddr); 26 | ivrf->getPortWdata()->connectTo(ivrf_wdata); 27 | vrfs.push_back(ivrf); 28 | 
vrf_raddr_channel.push_back(ivrf_raddr); 29 | vrf_rdata_channel.push_back(ivrf_rdata); 30 | vrf_waddr_channel.push_back(ivrf_waddr); 31 | vrf_wdata_channel.push_back(ivrf_wdata); 32 | } 33 | // Initialize local variables 34 | tile_id = t_tile_id; 35 | } 36 | 37 | // Clock cycle update function 38 | void MVUVRF::clock() { 39 | // Parallel write to all VRFs 40 | if(!vrf_wdata->isChannelEmpty() && !vrf_waddr->isChannelEmpty()){ 41 | std::vector wdata = vrf_wdata->readFromChannel(); 42 | unsigned int waddr = vrf_waddr->readFromChannel(); 43 | for(unsigned int i = 0; i < num_vrfs; i++){ 44 | vrf_waddr_channel[i]->write(waddr); 45 | std::vector vrf_wdata; 46 | vrf_wdata.insert(vrf_wdata.end(), wdata.begin()+(i*10), wdata.begin()+(i*10)+10); 47 | assert(vrf_wdata.size() == 10); 48 | vrf_wdata_channel[i]->write(vrf_wdata); 49 | } 50 | } 51 | 52 | // Read from a single VRF 53 | if(!vrf_sel->isChannelEmpty() && !vrf_raddr->isChannelEmpty()){ 54 | unsigned int temp_vrf_sel = vrf_sel->readFromChannel(); 55 | unsigned int temp_vrf_raddr = vrf_raddr->readFromChannel(); 56 | vrf_raddr_channel[temp_vrf_sel]->write(temp_vrf_raddr); 57 | } 58 | 59 | // Write output to ports & clock all internal VRFs and channels 60 | for(unsigned int i = 0; i < num_vrfs; i++){ 61 | if(!vrf_rdata_channel[i]->isEmpty() && !vrf_rdata->isChannelFull()) 62 | vrf_rdata->writeToChannel(vrf_rdata_channel[i]->read()); 63 | vrfs[i]->clock(); 64 | vrf_raddr_channel[i]->clock(); 65 | vrf_rdata_channel[i]->clock(); 66 | vrf_waddr_channel[i]->clock(); 67 | vrf_wdata_channel[i]->clock(); 68 | } 69 | } 70 | 71 | // Getter function for VRF write data input port 72 | Input>* MVUVRF::getPortVrfWdata() { 73 | return vrf_wdata; 74 | } 75 | 76 | // Getter function for VRF write address input port 77 | Input* MVUVRF::getPortVrfWaddr() { 78 | return vrf_waddr; 79 | } 80 | 81 | // Getter function for VRF read data output port 82 | Output>* MVUVRF::getPortVrfRdata() { 83 | return vrf_rdata; 84 | } 85 | 86 | // Getter 
function for VRF read address input port 87 | Input* MVUVRF::getPortVrfRaddr() { 88 | return vrf_raddr; 89 | } 90 | 91 | // Getter function for VRF select input port 92 | Input* MVUVRF::getPortVrfSel() { 93 | return vrf_sel; 94 | } 95 | 96 | MVUVRF::~MVUVRF() { 97 | delete vrf_wdata; 98 | delete vrf_waddr; 99 | delete vrf_rdata; 100 | delete vrf_raddr; 101 | delete vrf_sel; 102 | for (unsigned int i = 0; i < vrfs.size(); i++) { 103 | delete vrfs[i]; 104 | delete vrf_raddr_channel[i]; 105 | delete vrf_rdata_channel[i]; 106 | delete vrf_waddr_channel[i]; 107 | delete vrf_wdata_channel[i]; 108 | } 109 | } -------------------------------------------------------------------------------- /simulator/src/npu.cpp: -------------------------------------------------------------------------------- 1 | #include "npu.h" 2 | 3 | // NPU constructor 4 | NPU::NPU(std::string t_name) : Module(t_name) { 5 | // Create channels for the micro-OPs to connect between datapath and instruction decoders 6 | mvu_uOP_channel = new Channel(t_name+"_mvu_uOP", FIFO_DEPTH, 1); 7 | evrf_uOP_channel = new Channel(t_name+"_evrf_uOP", FIFO_DEPTH, 1); 8 | mfu0_uOP_channel = new Channel(t_name+"_mfu0_uOP", FIFO_DEPTH, 1); 9 | mfu1_uOP_channel = new Channel(t_name+"_mfu1_uOP", FIFO_DEPTH, 1); 10 | ld_uOP_channel = new Channel(t_name+"_ld_uOP", FIFO_DEPTH, 1); 11 | 12 | // Create NPU datapath and connect input uOP ports to channels 13 | npu_datapath = new Datapath(t_name); 14 | npu_datapath->getPortMVUuOP()->connectTo(mvu_uOP_channel); 15 | npu_datapath->getPortEVRFuOP()->connectTo(evrf_uOP_channel); 16 | npu_datapath->getPortMFU0uOP()->connectTo(mfu0_uOP_channel); 17 | npu_datapath->getPortMFU1uOP()->connectTo(mfu1_uOP_channel); 18 | npu_datapath->getPortLDuOP()->connectTo(ld_uOP_channel); 19 | 20 | // Create NPU instruction decoder and connect output uOP ports to channels 21 | npu_decoders = new Decoder(t_name+"_decoder"); 22 | npu_decoders->getPortMVUuOP()->connectTo(mvu_uOP_channel); 23 |
npu_decoders->getPortEVRFuOP()->connectTo(evrf_uOP_channel); 24 | npu_decoders->getPortMFU0uOP()->connectTo(mfu0_uOP_channel); 25 | npu_decoders->getPortMFU1uOP()->connectTo(mfu1_uOP_channel); 26 | npu_decoders->getPortLDuOP()->connectTo(ld_uOP_channel); 27 | 28 | // Expose the decoder's VLIW input port and the datapath's output port as the NPU's own input and output ports 29 | npu_inst = npu_decoders->getPortInputVLIW(); 30 | npu_output = npu_datapath->getPortOutput(); 31 | } 32 | 33 | // Clock cycle update function: clocks the datapath, then the decoders, then the five uOP channels 34 | void NPU::clock(unsigned int &cycle_count) { 35 | npu_datapath->clock(cycle_count); 36 | npu_decoders->clock(cycle_count); 37 | 38 | mvu_uOP_channel->clock(); 39 | evrf_uOP_channel->clock(); 40 | mfu0_uOP_channel->clock(); 41 | mfu1_uOP_channel->clock(); 42 | ld_uOP_channel->clock(); 43 | } 44 | 45 | // Getter function for name 46 | std::string NPU::getName() { 47 | return name; 48 | } 49 | 50 | // Getter function for instruction port 51 | Input* NPU::getPortInst() { 52 | return npu_inst; 53 | } 54 | 55 | // Getter function for output port 56 | Output>* NPU::getPortOutput() { 57 | return npu_output; 58 | } 59 | 60 | NPU::~NPU(){ 61 | delete npu_datapath; 62 | delete npu_decoders; 63 | delete mvu_uOP_channel; 64 | delete evrf_uOP_channel; 65 | delete mfu0_uOP_channel; 66 | delete mfu1_uOP_channel; 67 | delete ld_uOP_channel; 68 | } -------------------------------------------------------------------------------- /simulator/src/obj/README.md: -------------------------------------------------------------------------------- 1 | Directory for object files produced from simulation -------------------------------------------------------------------------------- /simulator/src/output.cpp: -------------------------------------------------------------------------------- 1 | #include "output.h" 2 | 3 | // Output Port Constructor 4 | template 5 | Output::Output(std::string t_name, Module *t_module): Port(t_name, 
t_module) { } 6 | 7 | // Helper function for connecting an output port to an outgoing channel (each call appends one more channel to the port's fan-out list) 8 | template 9 | void
Output::connectTo(Channel *t_channel) { 10 | channels.push_back(t_channel); 11 | } 12 | 13 | // Helper function for writing to all the channels connected to this output port 14 | template 15 | void Output::writeToChannel(T t_data) { 16 | for(unsigned int i = 0; i < channels.size(); i++){ 17 | channels[i]->write(t_data); 18 | } 19 | } 20 | 21 | // Helper function for checking if any channel connected to this port is full (returns false when no channel is connected) 22 | template 23 | bool Output::isChannelFull() { 24 | bool full = false; 25 | for(unsigned int i = 0; i < channels.size(); i++){ 26 | full = full || channels[i]->isFull(); 27 | } 28 | return full; 29 | } 30 | 31 | template 32 | Output::~Output(){ 33 | for (unsigned int i = 0; i < channels.size(); i++){ 34 | channels[i] = NULL; 35 | } 36 | } 37 | 38 | template class Output; 39 | template class Output>; 40 | template class Output; 41 | template class Output; 42 | template class Output; 43 | template class Output; 44 | template class Output; 45 | template class Output; 46 | -------------------------------------------------------------------------------- /simulator/src/port.cpp: -------------------------------------------------------------------------------- 1 | #include "port.h" 2 | 3 | // Port Constructor 4 | template 5 | Port::Port (std::string t_name, Module *t_module) { 6 | name = t_name; 7 | module = t_module; 8 | } 9 | 10 | // Getter function for name 11 | template 12 | std::string Port::getName() { 13 | return name; 14 | } 15 | 16 | // Getter function for port module 17 | template 18 | Module* Port::getModule() { 19 | return module; 20 | } 21 | 22 | template class Port; 23 | template class Port>; 24 | template class Port; 25 | template class Port; 26 | template class Port; 27 | template class Port; 28 | template class Port; 29 | template class Port; 30 | template class Port; -------------------------------------------------------------------------------- /simulator/src/register_file.cpp:
-------------------------------------------------------------------------------- 1 | #include "register_file.h" 2 | 3 | // Helper function for initializing vector register files 4 | void init_rf(std::vector> &rf, unsigned int depth){ 5 | for (unsigned int i = rf.size(); i < depth; i++) { 6 | std::vector zeros; 7 | for (int j = 0; j < LANES; j++) { 8 | zeros.push_back(0); 9 | } 10 | rf.push_back(zeros); 11 | } 12 | } 13 | 14 | // Register File Constructor 15 | template 16 | RegisterFile::RegisterFile (std::string t_name, unsigned int t_depth, 17 | std::string *t_file_name): Module(t_name) { 18 | // Create Input and Output ports 19 | raddr = new Input (t_name + "_raddr", this); 20 | rdata = new Output (t_name + "_rdata", this); 21 | waddr = new Input (t_name + "_waddr", this); 22 | wdata = new Input (t_name + "_wdata", this); 23 | // Initialize local variables 24 | depth = t_depth; 25 | reads_in_flight = 0; 26 | writes_in_flight = 0; 27 | // Initialize register file contents 28 | if (t_file_name) 29 | readVectorFile(*t_file_name, register_file); 30 | init_rf(register_file, t_depth); 31 | } 32 | 33 | // Helper function for read operation 34 | template 35 | void RegisterFile::read(){ 36 | // Advance the pipeline if there is any data in it 37 | if(read_pipeline.size() > 0){ 38 | bool retire = false; 39 | for (unsigned int i = 0; i < reads_in_flight; i++){ 40 | std::tuple temp = read_pipeline.front(); 41 | if((std::get<1>(temp) == 0) && (!rdata->isChannelFull())){ 42 | read_pipeline.pop(); 43 | assert(std::get<0>(temp) < depth && "Read address out of bound"); 44 | rdata->writeToChannel(register_file[std::get<0>(temp)]); 45 | retire = true; 46 | } else if (reads_in_flight <= RF_READ_LATENCY) { 47 | read_pipeline.pop(); 48 | if(std::get<1>(temp) > 0){ 49 | std::get<1>(temp)--; 50 | } 51 | read_pipeline.push(temp); 52 | } 53 | } 54 | reads_in_flight = (retire)? reads_in_flight-1: reads_in_flight; 55 | } 56 | 57 | // Read in new address if the pipeline is not stalled (i.e. 
reads in flight 58 | // less than the pipeline depth/latency) 59 | if(!raddr->isChannelEmpty() && reads_in_flight <= RF_READ_LATENCY){ 60 | unsigned int temp_raddr = raddr->readFromChannel(); 61 | read_pipeline.push(std::make_tuple(temp_raddr, RF_READ_LATENCY-1)); 62 | reads_in_flight++; 63 | } 64 | } 65 | 66 | // Helper function for write operation 67 | template 68 | void RegisterFile::write(){ 69 | // Advance the pipeline if there is any data in it 70 | if(write_pipeline.size() > 0){ 71 | bool retire = false; 72 | for (unsigned int i = 0; i < writes_in_flight; i++){ 73 | std::tuple temp = write_pipeline.front(); 74 | if((std::get<2>(temp) == 0)){ 75 | write_pipeline.pop(); 76 | assert(std::get<0>(temp) < depth && "Write address out of bound"); 77 | register_file[std::get<0>(temp)] = std::get<1>(temp); 78 | retire = true; 79 | } else if (writes_in_flight <= RF_WRITE_LATENCY) { 80 | write_pipeline.pop(); 81 | if(std::get<2>(temp) > 0) { 82 | std::get<2>(temp)--; 83 | } 84 | write_pipeline.push(temp); 85 | } 86 | } 87 | writes_in_flight = (retire)? writes_in_flight-1: writes_in_flight; 88 | } 89 | 90 | // Read in new address and data if the pipeline is not stalled (i.e. 
reads 91 | // in flight less than the pipeline depth/latency) 92 | if((!waddr->isChannelEmpty()) && (!wdata->isChannelEmpty()) && 93 | (writes_in_flight <= RF_WRITE_LATENCY)){ 94 | unsigned int temp_waddr = waddr->readFromChannel(); 95 | T temp_wdata = wdata->readFromChannel(); 96 | write_pipeline.push(std::make_tuple(temp_waddr, temp_wdata, RF_WRITE_LATENCY-1)); 97 | writes_in_flight++; 98 | } 99 | } 100 | 101 | // Clock cycle update function 102 | template 103 | void RegisterFile::clock() { 104 | this->write(); 105 | this->read(); 106 | } 107 | 108 | // Helper function for printing out the contents of a register file (used for debugging) 109 | template 110 | void RegisterFile::print() { 111 | std::cout<<"Register file elements: "; 112 | for (unsigned int i = 0; i < depth; i++) 113 | std::cout<< register_file.at(i) << ", "; 114 | std::cout << std::endl; 115 | } 116 | 117 | // Getter function for read address input port 118 | template 119 | Input* RegisterFile::getPortRaddr() { 120 | return raddr; 121 | } 122 | 123 | // Getter function for read data output port 124 | template 125 | Output* RegisterFile::getPortRdata() { 126 | return rdata; 127 | } 128 | 129 | // Getter function for write address input port 130 | template 131 | Input* RegisterFile::getPortWaddr() { 132 | return waddr; 133 | } 134 | 135 | // Getter function for write data input port 136 | template 137 | Input* RegisterFile::getPortWdata() { 138 | return wdata; 139 | } 140 | 141 | template 142 | RegisterFile::~RegisterFile() { 143 | delete raddr; 144 | delete rdata; 145 | delete waddr; 146 | delete wdata; 147 | } 148 | 149 | template class RegisterFile>; -------------------------------------------------------------------------------- /simulator/src/utils.cpp: -------------------------------------------------------------------------------- 1 | #include "utils.h" 2 | 3 | // Operator overload for printing a vector 4 | template 5 | std::ostream& operator<< (std::ostream& out, const std::vector& v) { 6 | 
out << "{"; 7 | size_t last = v.size() - 1; 8 | for(size_t i = 0; i < v.size(); ++i) { 9 | out << v[i]; 10 | if (i != last) 11 | out << ", "; 12 | } 13 | out << "}"; 14 | return out; 15 | } 16 | 17 | // Used for populating vector register file contents from a file 18 | void readVectorFile(std::string &file_name, std::vector> &vec_data) { 19 | std::ifstream in(file_name); 20 | 21 | if (!in) assert(0 && "file not open"); 22 | std::string line; 23 | while(std::getline(in, line)) { 24 | std::stringstream line_stream(line); 25 | std::vector data; 26 | TYPE temp; 27 | while (line_stream >> temp) { 28 | data.push_back(temp); 29 | } 30 | vec_data.push_back(data); 31 | } 32 | } 33 | 34 | // Used for populating vector FIFO contents from a file 35 | void readVectorFile(std::string &file_name, std::queue> &que_data) { 36 | std::ifstream in(file_name); 37 | if (!in) assert(0 && "file not open"); 38 | std::string line; 39 | while(std::getline(in, line)) { 40 | std::stringstream line_stream(line); 41 | std::vector data; 42 | TYPE temp; 43 | while (line_stream >> temp) { 44 | data.push_back(temp); 45 | } 46 | que_data.push(data); 47 | } 48 | } 49 | 50 | // Operator overload for adding two vectors 51 | std::vector operator+ (const std::vector &v1, const std::vector &v2){ 52 | assert(v1.size() == v2.size() && "The two vectors have different lengths"); 53 | std::vector result; 54 | for(unsigned int i = 0; i < v1.size(); i++){ 55 | result.push_back(v1[i] + v2[i]); 56 | } 57 | return result; 58 | } 59 | 60 | // Used for reading simulating golden outputs 61 | template 62 | void readGoldenOutput(std::string &file_name, std::vector &vec_data, int v_size) { 63 | std::ifstream in(file_name); 64 | 65 | if (!in) assert(0 && "file not open"); 66 | std::string line; 67 | while(std::getline(in, line)) { 68 | std::stringstream line_stream(line); 69 | std::vector data; 70 | TYPE temp; 71 | int count = 0; 72 | while (line_stream >> temp) { 73 | data.push_back(temp); 74 | count++; 75 | if(count == 
v_size){ 76 | count = 0; 77 | vec_data.push_back(data); 78 | data.erase(data.begin(), data.end()); 79 | } 80 | } 81 | } 82 | } 83 | 84 | template std::ostream& operator<< (std::ostream& out, const std::vector& v); 85 | template void readGoldenOutput(std::string &file_name, std::vector> &vec_data, 86 | int v_size); --------------------------------------------------------------------------------