├── README.md
├── compiler
    ├── compiler.py
    ├── driver.py
    ├── fsim.py
    ├── npu_layers.py
    ├── pac_dump
    │   └── README.md
    ├── pcie_dump
    │   └── README.md
    └── perf_sim.sh
├── how_to_run_cpu_fpga_system_demo.pdf
├── how_to_run_npu_on_fpga.pdf
├── npu_readme.pdf
├── patch
    ├── kernel
    │   ├── Makefile
    │   ├── intel_fpga_pcie_chr.c
    │   ├── intel_fpga_pcie_dma.c
    │   ├── intel_fpga_pcie_ioctl.c
    │   └── intel_fpga_pcie_setup.c
    ├── npu_test
    │   ├── Makefile
    │   ├── dma_test.hpp
    │   ├── gemv256_batch1026
    │   │   ├── golden_data
    │   │   ├── input_data
    │   │   ├── inputs.dat
    │   │   ├── inst_data
    │   │   ├── instructions.dat
    │   │   ├── mrf_data
    │   │   ├── mrfs.dat
    │   │   └── outputs.dat
    │   ├── gemv256_batch6
    │   │   ├── inputs.dat
    │   │   ├── instructions.dat
    │   │   ├── mrfs.dat
    │   │   └── outputs.dat
    │   ├── gemv256_batch768
    │   │   ├── inputs.dat
    │   │   ├── instructions.dat
    │   │   ├── mrfs.dat
    │   │   └── outputs.dat
    │   ├── gen_input.py
    │   ├── mlp_batch2052
    │   │   ├── inputs.dat
    │   │   ├── instructions.dat
    │   │   ├── mrfs.dat
    │   │   └── outputs.dat
    │   ├── mlp_batch4104
    │   │   ├── inputs.dat
    │   │   ├── instructions.dat
    │   │   ├── mrfs.dat
    │   │   └── outputs.dat
    │   └── real_npu_test.cpp
    ├── pcie_ed_MEM.v
    ├── setup.sh
    └── user
    │   ├── intel_fpga_pcie_api.hpp
    │   └── intel_fpga_pcie_api_linux.cpp
├── rtl
    ├── altera_syncram.sv
    ├── asymmetric_fifo.sv
    ├── axbs.sv
    ├── bram_accum.sv
    ├── daisy_chain_interconnect.sv
    ├── dma_buffer.v
    ├── dpe.sv
    ├── dpe_mrf.sv
    ├── evrf.sv
    ├── evrf_sched.sv
    ├── fifo.sv
    ├── inst_fifo.sv
    ├── inst_ram.sv
    ├── ld.sv
    ├── ld_sched.sv
    ├── mfu.sv
    ├── mfu_sched.sv
    ├── mrf_ram.sv
    ├── mvu.sv
    ├── mvu_sched.sv
    ├── mvu_tile.sv
    ├── mvu_vrf.sv
    ├── npu.sv
    ├── npu.vh
    ├── npu_tb.sv
    ├── nx_axbs.sv
    ├── nx_axbs_core.sv
    ├── nx_axbs_slice.sv
    ├── nx_dot6_int8.sv
    ├── nx_dot_product_int8.sv
    ├── pipeline_interconnect.sv
    ├── prime_dsp_tensor_int8.sv
    ├── ram.sv
    ├── run_sim.sh
    ├── self_tester_shim.sv
    ├── self_tester_tb.v
    ├── setup.sh
    ├── shim.sv
    ├── sigmoid.mif
    ├── sigmoid.sv
    ├── sigmoid.ver
    ├── star_interconnect.sv
    ├── tanh.mif
    ├── tanh.sv
    ├── tanh.ver
    ├── tester_rom.sv
    └── top_sched.sv
├── scripts
    ├── perf_baseline
    ├── perf_tests.py
    ├── reports
    │   └── README.md
    ├── rtl_baseline
    ├── rtl_tests.py
    └── workloads
    │   ├── 01_gemv_512x512.py
    │   ├── 02_gemv_1024x1024.py
    │   ├── 03_gemv_1152x1152.py
    │   ├── 04_gemv_1536x1536.py
    │   ├── 05_gemv_1792x1792.py
    │   ├── 06_rnn_512_8.py
    │   ├── 07_rnn_1024_8.py
    │   ├── 08_rnn_1152_8.py
    │   ├── 09_rnn_1536_8.py
    │   ├── 10_rnn_1792_8.py
    │   ├── 11_gru_512_8.py
    │   ├── 12_gru_1024_8.py
    │   ├── 13_gru_1152_8.py
    │   ├── 14_lstm_512_8.py
    │   ├── 15_lstm_1024_8.py
    │   ├── 16_mlp5_512.py
    │   ├── 17_mlp5_1024.py
    │   ├── 18_mlp3_1024_512_256_256.py
    │   └── 19_mlp3_1024_512_256_256_batched.py
└── simulator
    ├── Makefile
    ├── inc
        ├── accumulator.h
        ├── channel.h
        ├── datapath.h
        ├── decoder.h
        ├── defines.h
        ├── dpe.h
        ├── evrf.h
        ├── input.h
        ├── inst.h
        ├── loader.h
        ├── mfu.h
        ├── module.h
        ├── mvu.h
        ├── mvu_vrf.h
        ├── npu.h
        ├── output.h
        ├── port.h
        ├── register_file.h
        ├── tile.h
        └── utils.h
    ├── main
        ├── npu_sim.cpp
        └── obj
        │   └── README.md
    ├── perf_sim_log
    ├── register_files
        └── README.md
    └── src
        ├── accumulator.cpp
        ├── channel.cpp
        ├── datapath.cpp
        ├── decoder.cpp
        ├── dpe.cpp
        ├── evrf.cpp
        ├── input.cpp
        ├── loader.cpp
        ├── mfu.cpp
        ├── mvu.cpp
        ├── mvu_vrf.cpp
        ├── npu.cpp
        ├── obj
            └── README.md
        ├── output.cpp
        ├── port.cpp
        ├── register_file.cpp
        ├── tile.cpp
        └── utils.cpp


/README.md:
--------------------------------------------------------------------------------
 1 | # DISCONTINUATION OF PROJECT #  
 2 | This project will no longer be maintained by Intel.  
 3 | Intel has ceased development and contributions including, but not limited to, maintenance, bug fixes, new releases, or updates, to this project.  
 4 | Intel no longer accepts patches to this project.  
 5 |  If you have an ongoing need to use this project, are interested in independently developing it, or would like to maintain patches for the open source software community, please create your own fork of this project.  
 6 |   
 7 | # The Neural Processing Unit (NPU)
 8 | 
 9 | ## Introduction
10 | The Neural Processing Unit (NPU) is an FPGA soft processor (i.e., overlay) architecture for low latency, low batch AI inference. It adopts the "persistent AI" approach, in which all model weights are kept persistent in the on-chip SRAM memory of one or more network-connected FPGAs to eliminate the expensive off-chip memory accesses. The NPU is a domain-specific software-programmable processor. Therefore, once the NPU bitstream is compiled and deployed on an FPGA, users can rapidly program it to run different AI workloads using a high-level domain-specific language or a deep learning framework (e.g. TensorFlow Keras) purely in software. This approach enables AI application developers to use FPGAs for AI inference acceleration without the need for FPGA design expertise or suffering from the long runtime of FPGA CAD tools.
11 | 
12 | ## License
13 | Copyright 2022 Intel Corporation
14 | 
15 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
16 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
17 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
18 | 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
19 | 
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
21 | 
22 | ## Citation
23 | If you use the NPU code in this repo for your research, please cite the following paper:
24 | * A. Boutros, E. Nurvitadhi, R. Ma, S. Gribok, Z. Zhao, J. Hoe, V. Betz, and M. Langhammer. "Beyond Peak Performance: Comparing the Real Performance of AI-Optimized FPGAs and GPUs". In the IEEE International Conference on Field-Programmable Technology (FPT), 2020.
25 | 
26 | You can use the following BibTeX entry:
27 | ```plaintext
28 | @inproceedings{npu_s10_nx,
29 |   title={{Beyond Peak Performance: Comparing the Real Performance of AI-Optimized FPGAs and GPUs}},
30 |   author={Boutros, Andrew and others},
31 |   booktitle={IEEE International Conference on Field-Programmable Technology (ICFPT)},
32 |   year={2020}
33 | }
34 | ```
35 | 


--------------------------------------------------------------------------------
/compiler/driver.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = "2"
 3 | import tensorflow as tf
 4 | from tensorflow import keras
 5 | from tensorflow.keras import layers
 6 | 
 7 | from compiler import *
 8 | from npu_layers import *
 9 | 
10 | ###### START OF MODEL DEFINITION ######
11 | 
12 | # Define constants
13 | INPUT_VEC_SIZE = 256
14 | DENSE_L1_SIZE = 256
15 | DENSE_L2_SIZE = 256
16 | DENSE_L3_SIZE = 256
17 | 
18 | # Define model architecture using Keras Sequential Model
19 | model = NPUSequential([
20 | 	layers.Dense(DENSE_L1_SIZE, activation="relu", name="layer1"),
21 | 	#layers.Dense(DENSE_L2_SIZE, activation="relu", name="layer2"),
22 | 	#layers.Dense(DENSE_L3_SIZE, activation="relu", name="layer3"),
23 | ])
24 | 
25 | # Random test inputs for different types of layers
26 | test_input = tf.random.uniform(shape=[6, INPUT_VEC_SIZE], minval=-128, maxval=127)
27 | 
28 | # Call model on example input
29 | y = model(test_input)
30 | 
31 | # Print model summary
32 | model.summary()
33 | 
34 | ####### END OF MODEL DEFINITION #######
35 | 
36 | # Initialize NPU
37 | npu = initialize_npu(sys.argv)
38 | # Compile model for NPU
39 | model.compile_for_npu(npu, test_input)
40 | # Run NPU flow
41 | npu.run_flow()
42 | 


--------------------------------------------------------------------------------
/compiler/pac_dump/README.md:
--------------------------------------------------------------------------------
1 | Directory for dumping MIF files for RTL simulation
2 | 


--------------------------------------------------------------------------------
/compiler/pcie_dump/README.md:
--------------------------------------------------------------------------------
1 | Directory for storing PCIe .dat files
2 | 


--------------------------------------------------------------------------------
/compiler/perf_sim.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Build the simulator, run npu_sim, then clean the build; each step's output is captured in a log file.
3 | cd ../simulator || exit 1  # path is relative to the caller's directory; abort so make/make clean never run elsewhere
4 | make &> make_log
5 | ./npu_sim &> perf_sim_log
6 | make clean &> make_clean_log
7 | 


--------------------------------------------------------------------------------
/how_to_run_cpu_fpga_system_demo.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/intel/fpga-npu/6fead512b112e0a687b7aac6e551c0b8390c7c75/how_to_run_cpu_fpga_system_demo.pdf


--------------------------------------------------------------------------------
/how_to_run_npu_on_fpga.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/intel/fpga-npu/6fead512b112e0a687b7aac6e551c0b8390c7c75/how_to_run_npu_on_fpga.pdf


--------------------------------------------------------------------------------
/npu_readme.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/intel/fpga-npu/6fead512b112e0a687b7aac6e551c0b8390c7c75/npu_readme.pdf


--------------------------------------------------------------------------------
/patch/kernel/Makefile:
--------------------------------------------------------------------------------
 1 | MODULE_NAME := intel_fpga_pcie_drv
 2 | obj-m += $(MODULE_NAME).o
 3 | $(MODULE_NAME)-y := intel_fpga_pcie_chr.o intel_fpga_pcie_dma.o intel_fpga_pcie_setup.o intel_fpga_pcie_ioctl.o
 4 | USE_AVX ?= 1
 5 | # Out-of-tree module build: 'all' and 'clean' delegate to the kernel build tree in KDIR (KDIR and USE_AVX can be overridden).
 6 | PWD       := $(shell pwd)
 7 | KDIR ?= /lib/modules/$(shell uname -r)/build
 8 | CPPFLAGS += -include $(KDIR)/include/generated/autoconf.h
 9 | EXTRA_CFLAGS += -Wall
10 | # make keeps trailing blanks in variable values, so strip USE_AVX before comparing; otherwise "1 " never equals "1".
11 | ifeq ($(strip $(USE_AVX)),1)
12 | 	# Enable wide accesses up to 32B
13 | 	EXTRA_CFLAGS += -mavx -mpreferred-stack-boundary=4
14 | endif
15 | 
16 | all:
17 | 	$(MAKE) -C $(KDIR) M=$(PWD) modules
18 | 
19 | clean:
20 | 	$(MAKE) -C $(KDIR) M=$(PWD) clean
21 | 


--------------------------------------------------------------------------------
/patch/npu_test/Makefile:
--------------------------------------------------------------------------------
1 | # Extremely simple makefile: 'all' compiles real_npu_test with g++ in one step, 'clean' deletes the binary.
2 | # NOTE(review): ../api and ../api/linux are not in this directory; presumably they exist once this folder sits inside the PCIe example design's software tree - confirm.
3 | all:
4 | 	g++ -Wno-sign-compare -Wno-unused -std=c++0x -Wall -I ../api -I ../api/linux ../api/linux/intel_fpga_pcie_api_linux.cpp real_npu_test.cpp -o real_npu_test
5 | clean:
6 | 	rm -f ./real_npu_test
7 | 


--------------------------------------------------------------------------------
/patch/npu_test/dma_test.hpp:
--------------------------------------------------------------------------------
 1 | #ifndef DMA_TEST_HPP
 2 | #define DMA_TEST_HPP
 3 | // Constants for the NPU test program: version numbers, option codes, NPU_* address values and FILL_* selectors.
 4 | const int version_major = 2;
 5 | const int version_minor = 0;
 6 | // Defined without a value, so it can only serve as a presence flag - presumably gates printing in the test program; confirm.
 7 | #define NPU_PRINT
 8 | // Option codes; WELCOME_OPT_MAXNR equals the highest option value defined here (1).
 9 | #define WELCOME_OPT_AUTO        0
10 | #define WELCOME_OPT_MANUAL      1
11 | #define WELCOME_OPT_MAXNR       1
12 | // NOTE(review): judging by the names, these are address offsets for data/commands sent to the NPU - confirm against the test program.
13 | #define NPU_INPUT   0x4000
14 | #define NPU_INPUT_1 0x4100
15 | #define NPU_RAM1    0x4040
16 | #define NPU_RAM2    0x4140
17 | #define NPU_IN_FIFO 0x4080
18 | #define NPU_START   0x4240 
19 | // NOTE(review): NPU_DONE equals NPU_RAM1 and NPU_OUT_FIFO_0/1 equal NPU_INPUT/NPU_INPUT_1; presumably read side vs write side - confirm.
20 | #define NPU_DONE    0x4040
21 | #define NPU_OUT_DEQ 0x40c0  
22 | #define NPU_OUT_FIFO_0 0x4000
23 | #define NPU_OUT_FIFO_1 0x4100
24 | #define NPU_OUT_FIFO_2 0x4200
25 | #define NPU_OUT_FIFO_3 0x4300
26 | #define NPU_OUT_FIFO_4 0x4400
27 | // These two values lie in a different range (0x80xxx) from the NPU_* values above; their exact meaning is not visible here - TODO confirm.
28 | #define POLL_RAM_STATUS 0x80100
29 | #define NPU_SOFT_RST    0x80200
30 | // Delimiter string plus three FILL_* selectors (zero / random / incrementing, per the names) - presumably buffer fill modes; confirm.
31 | #define SEL_MENU_DELIMS "*********************************************************"
32 | #define FILL_ZERO 0
33 | #define FILL_RAND 1
34 | #define FILL_INCR 2
35 | 
36 | #endif /* DMA_TEST_HPP */
37 | 


--------------------------------------------------------------------------------
/patch/npu_test/gemv256_batch1026/instructions.dat:
--------------------------------------------------------------------------------
 1 | 17 48
 2 | 92 0 0 0 0 40 64 96 64 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
 3 | 92 0 0 0 0 40 64 96 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
 4 | 92 0 0 0 0 40 64 96 0 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
 5 | 92 0 0 0 0 40 64 96 0 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
 6 | 92 0 0 0 0 40 64 96 0 64 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
 7 | 92 0 0 0 0 40 64 96 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
 8 | 92 0 0 0 0 40 64 96 0 0 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
 9 | 0 0 0 0 0 0 0 0 0 0 0 192 64 0 12 0 0 0 0 0 0 0 3 1 48 0 0 0 0 0 0 0 252 0 6 0 0 0 240 0 3 12 0 8 160 0 129 1 
10 | 125 0 0 0 0 16 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
11 | 125 0 0 0 0 16 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
12 | 125 0 0 0 0 16 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
13 | 125 0 0 0 0 16 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
14 | 125 0 0 0 0 16 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
15 | 125 0 0 0 0 16 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
16 | 127 0 0 0 0 16 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
17 | 124 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
18 | 171 0 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 
19 | 


--------------------------------------------------------------------------------
/patch/npu_test/gemv256_batch6/inputs.dat:
--------------------------------------------------------------------------------
 1 | 42 40 42 40 42
 2 | 1 3 -29 91 -107 10 -31 126 22 73 -125 81 82 8 -123 93 -66 36 18 -121 -125 -1 -126 -99 50 -3 125 56 65 -55 -3 39 97 -9 -74 -45 8 -35 109 -105 
 3 | 1 3 -29 91 -107 10 -31 126 22 73 -125 81 82 8 -123 93 -66 36 18 -121 -125 -1 -126 -99 50 -3 125 56 65 -55 -3 39 97 -9 -74 -45 8 -35 109 -105 
 4 | 60 107 83 37 -102 -10 4 -75 92 -78 -77 -41 5 -11 42 -54 18 -43 1 95 60 -11 -76 7 22 -68 -121 40 16 -14 -64 33 -97 -118 120 -74 -3 9 49 86 
 5 | 60 107 83 37 -102 -10 4 -75 92 -78 -77 -41 5 -11 42 -54 18 -43 1 95 60 -11 -76 7 22 -68 -121 40 16 -14 -64 33 -97 -118 120 -74 -3 9 49 86 
 6 | 50 -121 -116 24 -64 71 29 -55 -106 87 49 110 84 31 115 -92 78 3 -18 96 -113 89 -82 -1 20 115 68 -97 54 81 -60 -112 19 -5 -105 85 55 -40 15 -34 
 7 | 50 -121 -116 24 -64 71 29 -55 -106 87 49 110 84 31 115 -92 78 3 -18 96 -113 89 -82 -1 20 115 68 -97 54 81 -60 -112 19 -5 -105 85 55 -40 15 -34 
 8 | 76 -31 -65 -8 11 -11 -53 4 -127 43 -100 -67 -23 -102 110 78 2 19 57 -119 12 -116 123 -107 46 -14 120 -31 108 -50 113 -42 35 -20 66 -40 54 75 -41 -67 
 9 | 76 -31 -65 -8 11 -11 -53 4 -127 43 -100 -67 -23 -102 110 78 2 19 57 -119 12 -116 123 -107 46 -14 120 -31 108 -50 113 -42 35 -20 66 -40 54 75 -41 -67 
10 | -114 2 109 -105 0 28 -123 44 -20 -15 68 -94 99 74 74 -68 126 47 -66 -55 25 64 -46 -71 41 -1 45 -32 -46 121 43 15 -82 -127 12 84 -81 23 75 70 
11 | -114 2 109 -105 0 28 -123 44 -20 -15 68 -94 99 74 74 -68 126 47 -66 -55 25 64 -46 -71 41 -1 45 -32 -46 121 43 15 -82 -127 12 84 -81 23 75 70 
12 | 29 11 -111 -112 63 -107 -30 -45 118 58 116 47 -88 -118 90 101 -26 91 116 -94 -5 40 -43 -69 -25 -114 -104 2 -107 -13 -29 11 83 84 46 37 86 -83 111 -17 
13 | 29 11 -111 -112 63 -107 -30 -45 118 58 116 47 -88 -118 90 101 -26 91 116 -94 -5 40 -43 -69 -25 -114 -104 2 -107 -13 -29 11 83 84 46 37 86 -83 111 -17 
14 | 32 5 56 13 86 111 -84 -92 34 -125 3 -104 -51 106 -2 -113 96 96 117 -19 -41 103 0 100 121 -20 126 -63 15 -115 -14 -122 -100 -104 -34 -33 104 -80 -88 51 
15 | 32 5 56 13 86 111 -84 -92 34 -125 3 -104 -51 106 -2 -113 96 96 117 -19 -41 103 0 100 121 -20 126 -63 15 -115 -14 -122 -100 -104 -34 -33 104 -80 -88 51 
16 | -88 -108 -3 -117 -18 123 18 69 99 -65 -123 76 -115 -91 -121 -28 -59 -12 48 57 -24 81 90 -19 -53 -44 -117 -79 -75 69 -60 -11 77 86 -107 -114 -16 -94 -32 19 
17 | -88 -108 -3 -117 -18 123 18 69 99 -65 -123 76 -115 -91 -121 -28 -59 -12 48 57 -24 81 90 -19 -53 -44 -117 -79 -75 69 -60 -11 77 86 -107 -114 -16 -94 -32 19 
18 | 37 -90 -1 126 -116 -108 38 125 87 -56 16 50 -77 -21 -103 -87 45 -16 -95 62 -121 45 -13 94 52 96 77 -5 -46 32 10 66 111 101 -62 111 28 -10 -50 -71 
19 | 37 -90 -1 126 -116 -108 38 125 87 -56 16 50 -77 -21 -103 -87 45 -16 -95 62 -121 45 -13 94 52 96 77 -5 -46 32 10 66 111 101 -62 111 28 -10 -50 -71 
20 | -21 22 -62 -53 33 71 -45 123 69 80 -17 114 83 -27 -66 -96 110 103 125 -51 -62 -116 -124 108 17 125 -12 -81 -78 -60 44 118 -10 -62 -72 58 79 76 -50 83 
21 | -21 22 -62 -53 33 71 -45 123 69 80 -17 114 83 -27 -66 -96 110 103 125 -51 -62 -116 -124 108 17 125 -12 -81 -78 -60 44 118 -10 -62 -72 58 79 76 -50 83 
22 | -39 95 83 57 113 -123 -59 -61 69 -26 28 57 65 52 69 51 13 -81 -13 -98 -108 66 -121 108 -13 -85 37 33 103 -31 -82 -67 -64 -12 -97 113 -73 126 -82 -111 
23 | -39 95 83 57 113 -123 -59 -61 69 -26 28 57 65 52 69 51 13 -81 -13 -98 -108 66 -121 108 -13 -85 37 33 103 -31 -82 -67 -64 -12 -97 113 -73 126 -82 -111 
24 | -65 32 -90 54 -17 76 101 25 -97 -51 -117 28 125 -41 87 -74 76 -62 -14 101 126 -123 22 -109 78 117 -89 -119 26 48 25 -26 -93 -87 -103 -108 44 57 75 -50 
25 | -65 32 -90 54 -17 76 101 25 -97 -51 -117 28 125 -41 87 -74 76 -62 -14 101 126 -123 22 -109 78 117 -89 -119 26 48 25 -26 -93 -87 -103 -108 44 57 75 -50 
26 | -126 1 -77 115 -75 -31 -32 51 -100 74 99 -121 -1 18 33 57 67 102 -81 -79 35 61 0 104 -70 120 91 114 14 40 119 -40 -1 -6 91 -41 100 73 -83 24 
27 | -126 1 -77 115 -75 -31 -32 51 -100 74 99 -121 -1 18 33 57 67 102 -81 -79 35 61 0 104 -70 120 91 114 14 40 119 -40 -1 -6 91 -41 100 73 -83 24 
28 | 28 4 53 27 124 -103 -8 87 -81 64 117 102 77 -36 -103 49 -13 51 76 -61 -122 3 55 101 -127 28 -17 73 74 22 37 -17 51 37 68 -65 59 -115 -67 -110 
29 | 28 4 53 27 124 -103 -8 87 -81 64 117 102 77 -36 -103 49 -13 51 76 -61 -122 3 55 101 -127 28 -17 73 74 22 37 -17 51 37 68 -65 59 -115 -67 -110 
30 | -78 -86 126 19 119 -89 39 48 120 109 -121 -68 55 34 72 -75 -80 -98 78 61 -120 22 57 17 -86 24 123 102 -12 21 46 40 31 25 79 -73 93 -45 107 -61 
31 | -78 -86 126 19 119 -89 39 48 120 109 -121 -68 55 34 72 -75 -80 -98 78 61 -120 22 57 17 -86 24 123 102 -12 21 46 40 31 25 79 -73 93 -45 107 -61 
32 | -55 -113 122 80 -71 72 -46 73 -99 66 -104 87 13 -83 -44 90 24 2 -4 -39 6 75 -5 -25 66 -53 44 -67 124 25 40 -6 1 -59 -14 -35 -89 105 41 -17 
33 | -55 -113 122 80 -71 72 -46 73 -99 66 -104 87 13 -83 -44 90 24 2 -4 -39 6 75 -5 -25 66 -53 44 -67 124 25 40 -6 1 -59 -14 -35 -89 105 41 -17 
34 | -3 59 -68 36 102 -64 -23 54 -81 -100 100 -78 -21 92 52 101 -30 -94 -58 53 75 119 -81 -102 84 -48 96 71 71 -1 108 -40 14 88 42 100 106 54 -10 86 
35 | -3 59 -68 36 102 -64 -23 54 -81 -100 100 -78 -21 92 52 101 -30 -94 -58 53 75 119 -81 -102 84 -48 96 71 71 -1 108 -40 14 88 42 100 106 54 -10 86 
36 | -8 18 -90 -128 -97 21 43 -119 -2 63 29 -120 38 -89 -120 -79 124 -110 -17 -4 -14 110 106 -103 -19 -124 -55 -42 1 44 95 126 101 -69 34 -40 -51 -113 85 92 
37 | -8 18 -90 -128 -97 21 43 -119 -2 63 29 -120 38 -89 -120 -79 124 -110 -17 -4 -14 110 106 -103 -19 -124 -55 -42 1 44 95 126 101 -69 34 -40 -51 -113 85 92 
38 | -113 113 -65 -56 122 -57 6 71 -97 15 -97 13 -32 -7 97 -126 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
39 | -113 113 -65 -56 122 -57 6 71 -97 15 -97 13 -32 -7 97 -126 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
40 | 23 -75 -4 -78 29 93 -75 -112 -5 99 125 -86 59 98 -60 72 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
41 | 23 -75 -4 -78 29 93 -75 -112 -5 99 125 -86 59 98 -60 72 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
42 | -52 -57 85 -2 -67 -71 -29 37 84 -32 19 13 115 -92 -109 -60 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
43 | -52 -57 85 -2 -67 -71 -29 37 84 -32 19 13 115 -92 -109 -60 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
44 | 


--------------------------------------------------------------------------------
/patch/npu_test/gemv256_batch6/instructions.dat:
--------------------------------------------------------------------------------
 1 | 17 48
 2 | 92 0 0 0 0 40 64 96 64 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
 3 | 92 0 0 0 0 40 64 96 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
 4 | 92 0 0 0 0 40 64 96 0 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
 5 | 92 0 0 0 0 40 64 96 0 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
 6 | 92 0 0 0 0 40 64 96 0 64 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
 7 | 92 0 0 0 0 40 64 96 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
 8 | 92 0 0 0 0 40 64 96 0 0 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
 9 | 0 0 0 0 0 0 0 0 0 0 0 192 64 0 12 0 0 0 0 0 0 0 3 1 48 0 0 0 0 0 0 0 252 0 6 0 0 0 240 0 3 12 0 8 160 0 129 1 
10 | 125 0 0 0 0 16 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
11 | 125 0 0 0 0 16 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
12 | 125 0 0 0 0 16 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
13 | 125 0 0 0 0 16 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
14 | 125 0 0 0 0 16 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
15 | 125 0 0 0 0 16 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
16 | 127 0 0 0 0 16 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
17 | 124 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
18 | 1 0 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 
19 | 


--------------------------------------------------------------------------------
/patch/npu_test/gemv256_batch6/outputs.dat:
--------------------------------------------------------------------------------
 1 | 12 222 163 45 31 113 131 226 161 64 203 167 245 230 5 15 239 225 250 79 120 165 224 25 243 169 168 141 206 177 94 58 155 217 30 24 46 103 84 170 
 2 | 12 222 163 45 31 113 131 226 161 64 203 167 245 230 5 15 239 225 250 79 120 165 224 25 243 169 168 141 206 177 94 58 155 217 30 24 46 103 84 170 
 3 | 65 59 214 221 247 215 119 167 42 203 69 112 241 164 186 55 242 103 138 168 185 9 199 209 38 242 104 56 154 40 110 181 209 225 20 201 147 126 136 28 
 4 | 65 59 214 221 247 215 119 167 42 203 69 112 241 164 186 55 242 103 138 168 185 9 199 209 38 242 104 56 154 40 110 181 209 225 20 201 147 126 136 28 
 5 | 95 187 188 41 12 206 149 145 14 22 198 76 14 206 187 164 139 218 127 31 226 57 90 43 117 94 159 41 229 231 195 209 128 41 82 9 4 67 217 67 
 6 | 95 187 188 41 12 206 149 145 14 22 198 76 14 206 187 164 139 218 127 31 226 57 90 43 117 94 159 41 229 231 195 209 128 41 82 9 4 67 217 67 
 7 | 146 255 130 147 65 22 0 115 36 46 230 155 42 177 37 29 245 73 222 131 72 233 82 143 172 185 248 141 166 96 160 150 5 130 68 97 134 38 35 102 
 8 | 146 255 130 147 65 22 0 115 36 46 230 155 42 177 37 29 245 73 222 131 72 233 82 143 172 185 248 141 166 96 160 150 5 130 68 97 134 38 35 102 
 9 | 110 252 146 28 23 89 112 167 89 46 207 113 57 157 191 88 156 34 252 10 157 242 94 190 206 63 77 13 246 137 47 50 225 68 85 248 215 51 201 251 
10 | 110 252 146 28 23 89 112 167 89 46 207 113 57 157 191 88 156 34 252 10 157 242 94 190 206 63 77 13 246 137 47 50 225 68 85 248 215 51 201 251 
11 | 200 3 254 248 7 121 2 160 195 164 79 232 5 31 39 82 185 245 175 143 97 10 34 242 168 200 28 251 105 172 180 181 90 217 140 156 255 253 27 2 
12 | 200 3 254 248 7 121 2 160 195 164 79 232 5 31 39 82 185 245 175 143 97 10 34 242 168 200 28 251 105 172 180 181 90 217 140 156 255 253 27 2 
13 | 210 221 77 13 93 251 251 164 68 173 140 199 29 243 197 164 67 143 252 126 48 107 73 214 195 220 4 188 188 128 26 231 74 191 174 7 183 38 217 182 
14 | 210 221 77 13 93 251 251 164 68 173 140 199 29 243 197 164 67 143 252 126 48 107 73 214 195 220 4 188 188 128 26 231 74 191 174 7 183 38 217 182 
15 | 117 220 65 202 42 93 126 185 65 239 122 209 182 78 139 202 251 102 155 236 16 96 1 45 211 0 219 51 73 61 252 240 174 160 89 166 254 124 251 138 
16 | 117 220 65 202 42 93 126 185 65 239 122 209 182 78 139 202 251 102 155 236 16 96 1 45 211 0 219 51 73 61 252 240 174 160 89 166 254 124 251 138 
17 | 100 247 101 113 102 107 232 220 0 117 36 252 248 65 41 49 198 86 37 250 193 134 7 130 18 52 97 225 48 224 38 192 243 35 202 149 103 47 154 56 
18 | 100 247 101 113 102 107 232 220 0 117 36 252 248 65 41 49 198 86 37 250 193 134 7 130 18 52 97 225 48 224 38 192 243 35 202 149 103 47 154 56 
19 | 207 116 61 228 164 37 104 252 63 180 233 4 202 54 125 117 202 58 10 17 170 55 97 44 45 252 6 60 232 44 176 49 124 188 133 26 208 79 173 110 
20 | 207 116 61 228 164 37 104 252 63 180 233 4 202 54 125 117 202 58 10 17 170 55 97 44 45 252 6 60 232 44 176 49 124 188 133 26 208 79 173 110 
21 | 253 101 232 47 82 117 217 220 179 54 5 12 117 12 173 24 216 102 65 138 119 59 222 32 102 249 141 121 75 249 4 163 233 141 156 134 25 98 138 221 
22 | 253 101 232 47 82 117 217 220 179 54 5 12 117 12 173 24 216 102 65 138 119 59 222 32 102 249 141 121 75 249 4 163 233 141 156 134 25 98 138 221 
23 | 44 158 197 70 236 80 175 108 33 234 26 142 84 21 176 115 92 187 207 31 62 21 214 218 203 152 123 237 201 45 9 127 99 50 15 79 123 192 96 127 
24 | 44 158 197 70 236 80 175 108 33 234 26 142 84 21 176 115 92 187 207 31 62 21 214 218 203 152 123 237 201 45 9 127 99 50 15 79 123 192 96 127 
25 | 148 86 168 241 77 229 114 63 229 230 80 154 200 165 36 163 60 45 89 172 219 47 135 175 39 128 191 79 43 217 150 237 115 2 64 71 54 214 76 235 
26 | 148 86 168 241 77 229 114 63 229 230 80 154 200 165 36 163 60 45 89 172 219 47 135 175 39 128 191 79 43 217 150 237 115 2 64 71 54 214 76 235 
27 | 133 82 62 193 192 64 172 196 197 187 199 187 147 57 72 70 122 19 231 204 73 169 104 228 249 122 238 232 112 55 234 85 88 164 40 220 16 137 165 81 
28 | 133 82 62 193 192 64 172 196 197 187 199 187 147 57 72 70 122 19 231 204 73 169 104 228 249 122 238 232 112 55 234 85 88 164 40 220 16 137 165 81 
29 | 0 21 202 162 226 121 151 48 214 51 155 72 242 67 34 132 180 26 221 49 35 242 7 149 38 64 2 237 30 102 65 49 118 18 165 1 74 223 23 39 
30 | 0 21 202 162 226 121 151 48 214 51 155 72 242 67 34 132 180 26 221 49 35 242 7 149 38 64 2 237 30 102 65 49 118 18 165 1 74 223 23 39 
31 | 187 98 66 153 106 41 34 33 143 174 210 189 254 120 159 170 102 33 142 75 56 158 231 174 172 30 12 169 6 28 207 116 153 133 87 141 103 74 43 254 
32 | 187 98 66 153 106 41 34 33 143 174 210 189 254 120 159 170 102 33 142 75 56 158 231 174 172 30 12 169 6 28 207 116 153 133 87 141 103 74 43 254 
33 | 98 240 139 147 254 203 75 35 90 236 30 83 91 11 41 229 134 104 7 56 218 41 38 41 229 217 128 139 222 25 219 145 52 228 39 160 158 205 163 162 
34 | 98 240 139 147 254 203 75 35 90 236 30 83 91 11 41 229 134 104 7 56 218 41 38 41 229 217 128 139 222 25 219 145 52 228 39 160 158 205 163 162 
35 | 195 197 15 161 98 158 38 189 114 87 133 107 244 225 85 138 178 72 8 8 23 116 60 68 87 190 220 219 254 166 140 15 243 5 242 68 123 206 213 168 
36 | 195 197 15 161 98 158 38 189 114 87 133 107 244 225 85 138 178 72 8 8 23 116 60 68 87 190 220 219 254 166 140 15 243 5 242 68 123 206 213 168 
37 | 251 42 192 178 140 198 105 57 131 254 232 72 47 192 111 169 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
38 | 251 42 192 178 140 198 105 57 131 254 232 72 47 192 111 169 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
39 | 172 74 129 99 144 151 9 225 175 96 147 247 252 175 77 84 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
40 | 172 74 129 99 144 151 9 225 175 96 147 247 252 175 77 84 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
41 | 191 145 223 204 87 232 178 180 99 46 118 185 233 142 135 82 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
42 | 191 145 223 204 87 232 178 180 99 46 118 185 233 142 135 82 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
43 | 


--------------------------------------------------------------------------------
/patch/npu_test/gemv256_batch768/instructions.dat:
--------------------------------------------------------------------------------
 1 | 17 48
 2 | 92 0 0 0 0 40 64 96 64 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
 3 | 92 0 0 0 0 40 64 96 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
 4 | 92 0 0 0 0 40 64 96 0 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
 5 | 92 0 0 0 0 40 64 96 0 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
 6 | 92 0 0 0 0 40 64 96 0 64 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
 7 | 92 0 0 0 0 40 64 96 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
 8 | 92 0 0 0 0 40 64 96 0 0 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
 9 | 0 0 0 0 0 0 0 0 0 0 0 192 64 0 12 0 0 0 0 0 0 0 3 1 48 0 0 0 0 0 0 0 252 0 6 0 0 0 240 0 3 12 0 8 160 0 129 1 
10 | 125 0 0 0 0 16 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
11 | 125 0 0 0 0 16 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
12 | 125 0 0 0 0 16 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
13 | 125 0 0 0 0 16 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
14 | 125 0 0 0 0 16 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
15 | 125 0 0 0 0 16 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
16 | 127 0 0 0 0 16 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
17 | 124 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
18 | 128 0 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 
19 | 


--------------------------------------------------------------------------------
/patch/npu_test/gen_input.py:
--------------------------------------------------------------------------------
1 | length = 8192
2 | with open('test_input.dat', 'w') as f:
3 | 	f.write(str(length)+" 40\n")
4 | 	for i in range(0, length):
5 | 		for j in range(0, 40):
6 | 			f.write(str((i % 256) - 128) + " ")
7 | 		f.write("\n")
8 | 


--------------------------------------------------------------------------------
/patch/npu_test/mlp_batch2052/instructions.dat:
--------------------------------------------------------------------------------
 1 | 34 48
 2 | 28 1 0 0 0 112 160 192 64 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
 3 | 28 1 0 0 0 112 160 192 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
 4 | 28 1 0 0 0 112 160 192 0 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
 5 | 28 1 0 0 0 112 160 192 0 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
 6 | 28 1 0 0 0 112 160 192 0 64 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
 7 | 28 1 0 0 0 112 160 192 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
 8 | 28 1 0 0 0 112 160 192 0 0 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
 9 | 0 0 0 0 0 0 0 0 0 0 0 192 64 0 13 0 0 0 0 0 0 0 3 1 52 0 0 0 0 0 0 0 252 128 6 0 0 0 240 64 3 52 0 32 192 129 2 3 
10 | 188 0 0 0 0 32 32 0 64 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
11 | 188 0 0 0 0 32 32 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
12 | 188 0 0 0 0 32 32 0 0 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
13 | 188 0 0 0 0 32 32 0 0 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
14 | 188 0 0 0 0 32 32 0 0 64 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
15 | 188 0 0 0 0 32 32 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
16 | 124 0 0 0 0 32 32 0 0 0 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
17 | 0 0 0 0 0 0 0 0 0 0 0 192 64 0 12 0 0 0 0 0 0 0 3 1 48 0 0 0 0 0 0 0 220 1 6 0 0 0 208 1 3 24 208 16 128 128 0 0 
18 | 124 0 0 0 0 160 48 65 66 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
19 | 124 0 0 0 0 160 48 65 2 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
20 | 124 0 0 0 0 160 48 65 2 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
21 | 124 0 0 0 0 160 48 65 2 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
22 | 124 0 0 0 0 160 48 65 2 64 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
23 | 124 0 0 0 0 160 48 65 2 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
24 | 124 0 0 0 0 160 48 65 2 0 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
25 | 124 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
26 | 0 0 0 0 0 0 0 0 0 0 0 192 64 0 12 0 0 0 0 0 0 0 3 1 48 0 0 0 0 0 0 0 188 2 6 0 0 0 176 2 3 12 48 9 128 194 4 9 
27 | 125 0 0 0 0 184 96 161 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
28 | 125 0 0 0 0 184 96 161 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
29 | 125 0 0 0 0 184 96 161 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
30 | 125 0 0 0 0 184 96 161 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
31 | 125 0 0 0 0 184 96 161 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
32 | 125 0 0 0 0 184 96 161 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
33 | 127 0 0 0 0 184 96 161 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
34 | 124 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
35 | 86 1 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 
36 | 


--------------------------------------------------------------------------------
/patch/npu_test/mlp_batch4104/instructions.dat:
--------------------------------------------------------------------------------
 1 | 34 48
 2 | 28 1 0 0 0 112 160 192 64 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
 3 | 28 1 0 0 0 112 160 192 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
 4 | 28 1 0 0 0 112 160 192 0 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
 5 | 28 1 0 0 0 112 160 192 0 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
 6 | 28 1 0 0 0 112 160 192 0 64 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
 7 | 28 1 0 0 0 112 160 192 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
 8 | 28 1 0 0 0 112 160 192 0 0 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
 9 | 0 0 0 0 0 0 0 0 0 0 0 192 64 0 13 0 0 0 0 0 0 0 3 1 52 0 0 0 0 0 0 0 252 128 6 0 0 0 240 64 3 52 0 32 192 129 2 3 
10 | 188 0 0 0 0 32 32 0 64 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
11 | 188 0 0 0 0 32 32 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
12 | 188 0 0 0 0 32 32 0 0 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
13 | 188 0 0 0 0 32 32 0 0 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
14 | 188 0 0 0 0 32 32 0 0 64 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
15 | 188 0 0 0 0 32 32 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
16 | 124 0 0 0 0 32 32 0 0 0 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
17 | 0 0 0 0 0 0 0 0 0 0 0 192 64 0 12 0 0 0 0 0 0 0 3 1 48 0 0 0 0 0 0 0 220 1 6 0 0 0 208 1 3 24 208 16 128 128 0 0 
18 | 124 0 0 0 0 160 48 65 66 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
19 | 124 0 0 0 0 160 48 65 2 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
20 | 124 0 0 0 0 160 48 65 2 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
21 | 124 0 0 0 0 160 48 65 2 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
22 | 124 0 0 0 0 160 48 65 2 64 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
23 | 124 0 0 0 0 160 48 65 2 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
24 | 124 0 0 0 0 160 48 65 2 0 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
25 | 124 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
26 | 0 0 0 0 0 0 0 0 0 0 0 192 64 0 12 0 0 0 0 0 0 0 3 1 48 0 0 0 0 0 0 0 188 2 6 0 0 0 176 2 3 12 48 9 128 194 4 9 
27 | 125 0 0 0 0 184 96 161 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
28 | 125 0 0 0 0 184 96 161 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
29 | 125 0 0 0 0 184 96 161 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
30 | 125 0 0 0 0 184 96 161 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
31 | 125 0 0 0 0 184 96 161 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
32 | 125 0 0 0 0 184 96 161 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
33 | 127 0 0 0 0 184 96 161 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
34 | 124 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
35 | 172 2 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 
36 | 


--------------------------------------------------------------------------------
/patch/setup.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #
 3 | # Patches a generated PCIe example design (DUT_example_design) to host the NPU:
 4 | #   1. rewrites the on-chip memory IP's synthesis file in place
 5 | #      ([ 13: 0] -> [ 12: 0], 16384 -> 8192, 14 -> 13),
 6 | #   2. installs the DMA double buffer controller (pcie_ed_MEM.v),
 7 | #   3. points npu.vh at this checkout's rtl/ directory,
 8 | #   4. adds the NPU sources and settings to the Quartus project (pcie_ed.qsf),
 9 | #   5. copies the patched kernel driver, user API and npu_test host program.
10 | #
11 | # All paths are relative, so run it from a directory that sits next to
12 | # DUT_example_design (such as patch/).
13 | # NOTE(review): step 4 appends to pcie_ed.qsf, so running the script twice
14 | # duplicates those assignments.
15 | 
16 | # Stop at the first failing command: a failed cd would otherwise let the
17 | # following sed -i / cp commands run in the wrong directory.
18 | set -e
19 | 
20 | cd ../DUT_example_design/ip/pcie_ed/pcie_ed_MEM/altera_avalon_onchip_memory2_1920/synth
21 | 
22 | FILE=pcie_ed_MEM_altera_avalon_onchip_memory2_1920_2xhjmhi.v
23 | if test -f "$FILE"; then
24 | 	sed -i "s/\[ 13\: 0\]/\[ 12\: 0\]/g" "$FILE"
25 | 	sed -i "s/16384/8192/g" "$FILE"
26 | 	# NOTE(review): this replaces every "14" in the file, not only a specific
27 | 	# parameter - confirm no unrelated text in the generated file contains "14".
28 | 	sed -i "s/14/13/g" "$FILE"
29 | else
30 | 	echo "IP not configured correctly."
31 | 	exit 1
32 | fi
33 | 
34 | # DMA double buffer controller
35 | cd ../../synth
36 | cp ../../../../../patch/pcie_ed_MEM.v ./
37 | 
38 | 
39 | # modify npu.vh path name
40 | cd ../../../../../rtl/
41 | cwd=$(pwd)
42 | # The pattern ends with '/', while pwd prints no trailing slash, so the slash is
43 | # added back to keep the replaced path a directory prefix.
44 | sed -i "s~/nfs/sc/disks/swuser_work_aboutros/npu_demo/npu-s10-nx/rtl/~$cwd/~" npu.vh
45 | 
46 | # add npu src files to Quartus prj
47 | cd ../DUT_example_design/
48 | for f in ../rtl/*.sv
49 | do
50 | 	# These three files are deliberately left out of the project.
51 | 	if [[ $f != "../rtl/altera_syncram.sv" ]] && [[ $f != "../rtl/self_tester_shim.sv" ]] && [[ $f != "../rtl/tester_rom.sv" ]]
52 | 	then
53 | 		echo "set_global_assignment -name SYSTEMVERILOG_FILE $f" >> pcie_ed.qsf
54 | 	fi
55 | done
56 | 
57 | echo "set_global_assignment -name VERILOG_FILE ../rtl/dma_buffer.v" >> pcie_ed.qsf
58 | echo "set_global_assignment -name VERILOG_INCLUDE_FILE ../rtl/npu.vh" >> pcie_ed.qsf
59 | echo "set_global_assignment -name MIF_FILE ../rtl/tanh.mif" >> pcie_ed.qsf
60 | echo "set_global_assignment -name MIF_FILE ../rtl/sigmoid.mif" >> pcie_ed.qsf
61 | echo "set_global_assignment -name OPTIMIZATION_MODE \"SUPERIOR PERFORMANCE WITH MAXIMUM PLACEMENT EFFORT\"" >> pcie_ed.qsf
62 | 
63 | # host program
64 | cd ./software
65 | cp ../../patch/kernel/* ./kernel/linux/
66 | cp ../../patch/user/*.hpp ./user/api/
67 | cp ../../patch/user/*.cpp ./user/api/linux/
68 | cp -r ../../patch/npu_test ./user/


--------------------------------------------------------------------------------
/rtl/asymmetric_fifo.sv:
--------------------------------------------------------------------------------
  1 | `include "npu.vh"
  2 | 
  3 | // Width-converting FIFO built from three parallel `fifo` lanes.
  4 | //
  5 | // Each write stores one IDW-bit word as three ODW-bit slices, one slice per
  6 | // lane (wr_data[ODW-1:0] goes to fifo0, the next slice to fifo1, the top slice
  7 | // to fifo2); the slicing assumes IDW == 3*ODW. Each read returns one ODW-bit
  8 | // slice and rotates the one-hot lane selector `sel`, so slices come out in the
  9 | // order fifo0 -> fifo1 -> fifo2 -> fifo0 ...
 10 | //
 11 | // Ports:
 12 | //   wr_en, wr_data : write strobe and IDW-bit input word (written to all lanes).
 13 | //   wr_ok          : wr_ok of lane 0; the lanes are always written together.
 14 | //   rd_en          : read strobe; reads the selected lane and rotates `sel`.
 15 | //   rd_data, rd_ok : rd_data / rd_ok of the currently selected lane.
 16 | //   usedw          : usedw of the currently selected lane.
 17 | module asymmetric_fifo # (
 18 | 	parameter IDW   = 3*`ACCW,
 19 | 	parameter ODW   = `ACCW,
 20 | 	parameter DEPTH = `QDEPTH,
 21 | 	parameter ID    = 0,
 22 | 	parameter AW    = $clog2(DEPTH)
 23 | ) (
 24 | 	input            clk,
 25 | 	input            rst,
 26 | 	input            wr_en,
 27 | 	input  [IDW-1:0] wr_data,
 28 | 	output           wr_ok,
 29 | 	input            rd_en,
 30 | 	output [ODW-1:0] rd_data,
 31 | 	output           rd_ok,
 32 | 	output [AW-1:0]  usedw
 33 | );
 34 | 
 35 | 	// Per-lane status and data coming back from the three underlying FIFOs.
 36 | 	wire [2:0] fifo_wr_ok;
 37 | 	wire [2:0] fifo_rd_ok;
 38 | 	wire [ODW-1:0] fifo_rd_data [0:2];
 39 | 	wire [AW-1:0] fifo_usedw [0:2];
 40 | 
 41 | 	// One-hot read lane selector: 3'b001 -> fifo0, 3'b010 -> fifo1, 3'b100 -> fifo2.
 42 | 	reg [2:0] sel;
 43 | 
 44 | 	// Lane 0: least-significant slice of wr_data.
 45 | 	fifo # (
 46 | 		.ID       (ID),
 47 | 		.DW       (ODW),
 48 | 		.DEPTH    (DEPTH)
 49 | 	) fifo0 (
 50 | 		.clk      (clk),
 51 | 		.rst      (rst),
 52 | 		.wr_en    (wr_en),
 53 | 		.wr_data  (wr_data[ODW-1:0]),
 54 | 		.wr_ok    (fifo_wr_ok[0]),
 55 | 		.rd_ok    (fifo_rd_ok[0]),
 56 | 		.rd_data  (fifo_rd_data[0]),
 57 | 		.rd_en    (rd_en && sel[0]),
 58 | 		.usedw    (fifo_usedw[0])
 59 | 	);
 60 | 
 61 | 	// Lane 1: middle slice of wr_data.
 62 | 	fifo # (
 63 | 		.ID       (ID),
 64 | 		.DW       (ODW),
 65 | 		.DEPTH    (DEPTH)
 66 | 	) fifo1 (
 67 | 		.clk      (clk),
 68 | 		.rst      (rst),
 69 | 		.wr_en    (wr_en),
 70 | 		.wr_data  (wr_data[2*ODW-1:ODW]),
 71 | 		.wr_ok    (fifo_wr_ok[1]),
 72 | 		.rd_ok    (fifo_rd_ok[1]),
 73 | 		.rd_data  (fifo_rd_data[1]),
 74 | 		.rd_en    (rd_en && sel[1]),
 75 | 		.usedw    (fifo_usedw[1])
 76 | 	);
 77 | 
 78 | 	// Lane 2: most-significant slice of wr_data.
 79 | 	fifo # (
 80 | 		.ID       (ID),
 81 | 		.DW       (ODW),
 82 | 		.DEPTH    (DEPTH)
 83 | 	) fifo2 (
 84 | 		.clk      (clk),
 85 | 		.rst      (rst),
 86 | 		.wr_en    (wr_en),
 87 | 		.wr_data  (wr_data[3*ODW-1:2*ODW]),
 88 | 		.wr_ok    (fifo_wr_ok[2]),
 89 | 		.rd_ok    (fifo_rd_ok[2]),
 90 | 		.rd_data  (fifo_rd_data[2]),
 91 | 		.rd_en    (rd_en && sel[2]),
 92 | 		.usedw    (fifo_usedw[2])
 93 | 	);
 94 | 
 95 | 	// Rotate the lane selector after every read; reset selects lane 0.
 96 | 	always @ (posedge clk) begin
 97 | 		if (rst) begin
 98 | 			sel <= 3'b001;
 99 | 		end else begin
100 | 			if (rd_en) begin
101 | 				sel <= (sel == 3'b100)? 3'b001: (sel << 1);
102 | 			end
103 | 		end
104 | 	end
105 | 
106 | 	// Read-side mux: forward the selected lane's status and data.
107 | 	// Blocking assignments are used because this block is purely combinational;
108 | 	// any selector value other than 3'b001 / 3'b010 falls through to lane 2.
109 | 	reg rd_ok_out;
110 | 	reg [ODW-1:0] rd_data_out;
111 | 	reg [AW-1:0] usedw_out;
112 | 	always @ (*) begin
113 | 		case (sel)
114 | 			3'b001: begin
115 | 				rd_ok_out   = fifo_rd_ok[0];
116 | 				rd_data_out = fifo_rd_data[0];
117 | 				usedw_out   = fifo_usedw[0];
118 | 			end
119 | 			3'b010: begin
120 | 				rd_ok_out   = fifo_rd_ok[1];
121 | 				rd_data_out = fifo_rd_data[1];
122 | 				usedw_out   = fifo_usedw[1];
123 | 			end
124 | 			default: begin
125 | 				rd_ok_out   = fifo_rd_ok[2];
126 | 				rd_data_out = fifo_rd_data[2];
127 | 				usedw_out   = fifo_usedw[2];
128 | 			end
129 | 		endcase
130 | 	end
131 | 
132 | 	assign rd_ok = rd_ok_out;
133 | 	assign rd_data = rd_data_out;
134 | 	assign usedw = usedw_out;
135 | 	// All lanes see the same write strobes, so lane 0 stands in for all of them.
136 | 	assign wr_ok = fifo_wr_ok[0];
137 | 
138 | endmodule


--------------------------------------------------------------------------------
/rtl/axbs.sv:
--------------------------------------------------------------------------------
 1 | // Copyright 2020 Intel Corporation.
 2 | //
 3 | // This reference design file is subject licensed to you by the terms and
 4 | // conditions of the applicable License Terms and Conditions for Hardware
 5 | // Reference Designs and/or Design Examples (either as signed by you or
 6 | // found at https://www.altera.com/common/legal/leg-license_agreement.html ).
 7 | //
 8 | // As stated in the license, you agree to only use this reference design
 9 | // solely in conjunction with Intel FPGAs or Intel CPLDs.
10 | //
11 | // THE REFERENCE DESIGN IS PROVIDED "AS IS" WITHOUT ANY EXPRESS OR IMPLIED
12 | // WARRANTY OF ANY KIND INCLUDING WARRANTIES OF MERCHANTABILITY,
13 | // NONINFRINGEMENT, OR FITNESS FOR A PARTICULAR PURPOSE. Intel does not
14 | // warrant or assume responsibility for the accuracy or completeness of any
15 | // information, links or other items within the Reference Design and any
16 | // accompanying materials.
17 | //
18 | // In the event that you do not agree with such terms and conditions, do not
19 | // use the reference design file.
20 | /////////////////////////////////////////////////////////////////////////////
21 | 
22 | // Pipelined signed multiplier: the product din_a * din_b is captured in a first
23 | // register and then passes through three more registers, the last being dout.
24 | (* altera_attribute = "-name FRACTAL_SYNTHESIS ON; -name SYNCHRONIZER_IDENTIFICATION OFF" *)
25 | module axbs #(
26 | 	parameter SIZE_A = 27,
27 | 	parameter SIZE_B = 27
28 | ) (
29 | 	input clk,
30 | 	input signed [SIZE_A-1:0] din_a,
31 | 	input signed [SIZE_B-1:0] din_b,
32 | 	output reg signed [SIZE_A+SIZE_B-1:0] dout
33 | );
34 | 
35 | // Width of the full-precision product.
36 | localparam PROD_W = SIZE_A + SIZE_B;
37 | 
38 | // Pipeline registers between the multiplier and dout.
39 | reg signed [PROD_W-1:0] prod_s1;
40 | reg signed [PROD_W-1:0] prod_s2;
41 | reg signed [PROD_W-1:0] prod_s3;
42 | 
43 | always @(posedge clk) begin
44 | 	// Non-blocking assignments, so statement order does not matter: each
45 | 	// register takes the value its predecessor held before the clock edge.
46 | 	dout    <= prod_s3;
47 | 	prod_s3 <= prod_s2;
48 | 	prod_s2 <= prod_s1;
49 | 	prod_s1 <= din_a * din_b;
50 | end
51 | endmodule
52 | 


--------------------------------------------------------------------------------
/rtl/bram_accum.sv:
--------------------------------------------------------------------------------
  1 | `include "npu.vh"
  2 | 
  3 | // Block-RAM backed accumulators.
  4 | //
  5 | // The module holds 3*NDPE independent accumulator lanes. Lane a owns one `ram`
  6 | // instance with ACCUM_DEPTH entries, takes its operand from
  7 | // accum_in[a*ACCW +: ACCW] and is steered by accum_ctrl[a], whose fields are
  8 | // used below as:
  9 | //   [ACCIDW+2]        valid : advances the lane's address; delayed, it is the RAM write enable
 10 | //   [ACCIDW+1:ACCIDW] op    : one of the ACC_OP_* codes
 11 | //   [ACCIDW-1:0]      size  : the lane's address wraps to 0 after reaching size-1
 12 | //
 13 | // For SET and SET_AND_WB the operand is stored as-is; for the other op codes it
 14 | // is added to the value read from the RAM at the lane's current address. The
 15 | // result is written back to the same address and also driven (re-ordered, see
 16 | // accum_out_arranged) onto accum_out. valid_out[d] flags a WB or SET_AND_WB
 17 | // request seen on accum_ctrl[3*d], delayed to line up with accum_out.
 18 | module bram_accum # (
 19 | 	parameter ACCW = `ACCW,
 20 | 	parameter NDPE = `NDPE,
 21 | 	parameter DOTW = `DOTW,
 22 | 	parameter PRIME_DOTW = `PRIME_DOTW,
 23 | 	parameter DOT_PER_DSP = `DOT_PER_DSP,
 24 | 	parameter NUM_DSP = `NUM_DSP,
 25 | 	parameter NUM_CHUNKS = NDPE/DOTW,
 26 | 	parameter NUM_ACCUM = `NUM_ACCUM,
 27 | 	parameter ACCIDW = `ACCIDW
 28 | )(
 29 | 	input  clk,
 30 | 	input  rst,
 31 | 	input  [3+ACCIDW-1:0] accum_ctrl [0:3*NDPE-1], // [ACCIDW+2] valid, [ACCIDW+1:ACCIDW] op, [ACCIDW-1:0] size
 32 | 	input  [3*ACCW*NDPE-1:0] accum_in,
 33 | 	output [NDPE-1:0] valid_out,
 34 | 	output [3*ACCW*NDPE-1:0] accum_out
 35 | );
 36 | 
 37 | localparam ACCUM_DEPTH = NUM_ACCUM*2;
 38 | localparam ACCUM_ADDRW = $clog2(ACCUM_DEPTH);
 39 | // Number of register stages used to line the control/input pipeline up with
 40 | // the RAM read data (assumes the `ram` read takes this many cycles - TODO confirm).
 41 | localparam BRAM_LATENCY = 2;
 42 | localparam [1:0] ACC_OP_SET = 0, ACC_OP_UPD = 1, ACC_OP_WB  = 2, ACC_OP_SET_AND_WB = 3;
 43 | 
 44 | // Current read address and read data of every lane.
 45 | reg [ACCUM_ADDRW-1:0] accum_rd_addr [0:3*NDPE-1];
 46 | wire [ACCW-1:0] accum_rd_data [0:3*NDPE-1];
 47 | 
 48 | // Delayed copies of the inputs (index = number of extra register stages).
 49 | reg [3*ACCW*NDPE-1:0] r_accum_in [0:BRAM_LATENCY];
 50 | reg [ACCUM_ADDRW-1:0] r_accum_rd_addr [0:BRAM_LATENCY][0:3*NDPE-1];
 51 | reg [3+ACCIDW-1:0] r_accum_ctrl [0:BRAM_LATENCY][0:3*NDPE-1];
 52 | // Registered accumulation result of every lane (RAM write data and output).
 53 | reg [ACCW-1:0] accum_wr_data [0:3*NDPE-1];
 54 | reg [NDPE-1:0] valid [0:BRAM_LATENCY];
 55 | 
 56 | wire [3*ACCW*NDPE-1:0] accum_res;
 57 | 
 58 | // Loop indices of the clocked block only; the combinational block further down
 59 | // uses its own index so that no variable is written by two processes.
 60 | integer a, p;
 61 | always @ (posedge clk) begin
 62 | 	if (rst) begin
 63 | 		// Only the addresses and the valid pipeline are reset.
 64 | 		for(a = 0; a < 3*NDPE; a = a + 1) begin
 65 | 			accum_rd_addr[a] <= 0;
 66 | 		end
 67 | 		for(p = 0; p < BRAM_LATENCY+1; p = p + 1) begin
 68 | 			valid[p] <= 'd0;
 69 | 		end
 70 | 	end else begin
 71 | 		// Input operand pipeline; it is shared by all lanes, so it is advanced
 72 | 		// once here instead of once per lane.
 73 | 		r_accum_in[0] <= accum_in;
 74 | 		for(p = 1; p < BRAM_LATENCY+1; p = p + 1) begin
 75 | 			r_accum_in[p] <= r_accum_in[p-1];
 76 | 		end
 77 | 
 78 | 		for(a = 0; a < 3*NDPE; a = a + 1) begin
 79 | 			// If valid input, increment read address
 80 | 			if(accum_ctrl[a][3+ACCIDW-1]) begin
 81 | 				if(accum_rd_addr[a] == accum_ctrl[a][ACCIDW-1:0]-1) begin
 82 | 					accum_rd_addr[a] <= 0;
 83 | 				end else begin
 84 | 					accum_rd_addr[a] <= ACCUM_ADDRW'(accum_rd_addr[a] + 1'b1);
 85 | 				end
 86 | 			end
 87 | 
 88 | 			// Pipeline ctrl and address to align with read value (then an extra pipeline for addition)
 89 | 			r_accum_rd_addr[0][a] <= accum_rd_addr[a];
 90 | 			r_accum_ctrl[0][a] <= accum_ctrl[a];
 91 | 			for(p = 1; p < BRAM_LATENCY+1; p = p + 1) begin
 92 | 				r_accum_rd_addr[p][a] <= r_accum_rd_addr[p-1][a];
 93 | 				r_accum_ctrl[p][a] <= r_accum_ctrl[p-1][a];
 94 | 			end
 95 | 
 96 | 			// Perform addition: SET / SET_AND_WB overwrite, every other op accumulates
 97 | 			accum_wr_data[a] <= ((r_accum_ctrl[BRAM_LATENCY-1][a][ACCIDW+:2] == ACC_OP_SET)
 98 | 				|| (r_accum_ctrl[BRAM_LATENCY-1][a][ACCIDW+:2] == ACC_OP_SET_AND_WB))?
 99 | 				r_accum_in[BRAM_LATENCY-1][a*ACCW+:ACCW]:
100 | 				r_accum_in[BRAM_LATENCY-1][a*ACCW+:ACCW] + accum_rd_data[a];
101 | 		end
102 | 
103 | 		// Valid pipeline: one bit per group of three lanes, taken from lane 3*a
104 | 		for(a = 0; a < NDPE; a = a + 1) begin
105 | 			valid[0][a] <= ((accum_ctrl[a*3][ACCIDW+:2] == ACC_OP_WB) || (accum_ctrl[a*3][ACCIDW+:2] == ACC_OP_SET_AND_WB))
106 | 				&& (accum_ctrl[a*3][3+ACCIDW-1]);
107 | 			for(p = 1; p < BRAM_LATENCY+1; p = p + 1) begin
108 | 				valid[p][a] <= valid[p-1][a];
109 | 			end
110 | 		end
111 | 	end
112 | end
113 | 
114 | // One RAM per lane. The write port uses the control word and address delayed
115 | // by BRAM_LATENCY+1 registers, which lines them up with accum_wr_data.
116 | genvar accum_id;
117 | generate
118 | 	for(accum_id = 0; accum_id < 3*NDPE; accum_id = accum_id + 1) begin: gen_accum_ram
119 | 		ram #(
120 | 			.ID 		(accum_id),
121 | 			.DW 		(ACCW),
122 | 			.AW 		(ACCUM_ADDRW),
123 | 			.DEPTH 		(ACCUM_DEPTH),
124 | 			.MODULE_ID 	("accum")
125 | 		) accum_ram (
126 | 			.wr_en   	(r_accum_ctrl[BRAM_LATENCY][accum_id][3+ACCIDW-1]),
127 | 			.wr_addr 	(r_accum_rd_addr[BRAM_LATENCY][accum_id]),
128 | 			.wr_data 	(accum_wr_data[accum_id]),
129 | 			.rd_addr 	(accum_rd_addr[accum_id]),
130 | 			.rd_data 	(accum_rd_data[accum_id]),
131 | 			.clk 		(clk),
132 | 			.rst 		(rst)
133 | 		);
134 | 
135 | 		assign accum_res[accum_id*ACCW+:ACCW] = accum_wr_data[accum_id];
136 | 	end
137 | endgenerate
138 | 
139 | // Output re-ordering. accum_res is viewed as three groups of NDPE lanes, each
140 | // split into NUM_CHUNKS chunks of DOTW lanes. The output interleaves them as
141 | // group0.chunk0, group1.chunk0, group2.chunk0, group0.chunk1, ...
142 | reg [3*ACCW*NDPE-1:0] accum_out_arranged;
143 | integer c;
144 | always @(*) begin
145 | 	for (c = 0; c < NUM_CHUNKS*3; c = c + 3) begin
146 | 		accum_out_arranged[(c*ACCW*DOTW)+:(ACCW*DOTW)] = accum_res[(c/3*ACCW*DOTW)+:(ACCW*DOTW)];
147 | 		accum_out_arranged[((c+1)*ACCW*DOTW)+:(ACCW*DOTW)] = accum_res[(ACCW*NDPE)+(c/3*ACCW*DOTW)+:(ACCW*DOTW)];
148 | 		accum_out_arranged[((c+2)*ACCW*DOTW)+:(ACCW*DOTW)] = accum_res[(2*ACCW*NDPE)+(c/3*ACCW*DOTW)+:(ACCW*DOTW)];
149 | 	end
150 | end
151 | 
152 | assign valid_out = valid[BRAM_LATENCY];
153 | assign accum_out = accum_out_arranged;
154 | 
155 | // Optional simulation trace of lane 0's address and size field.
156 | `ifdef DISPLAY_MVU
157 | always @(posedge clk) begin
158 |   if (accum_ctrl[0][3+ACCIDW-1]) begin
159 |     $display("[%0t][ACCUM] addr: %d, size: %d",
160 |     	$time,
161 |     	accum_rd_addr[0],
162 |     	accum_ctrl[0][ACCIDW-1:0]);
163 |   end
164 | end
165 | `endif
166 | 
167 | endmodule


--------------------------------------------------------------------------------
/rtl/daisy_chain_interconnect.sv:
--------------------------------------------------------------------------------
 1 | 
 2 | // Daisy-chain interconnect from one source to END_POINTS sinks.
 3 | // The input is shifted through LATENCY_PER_HOP*END_POINTS registers and sink i
 4 | // taps the chain after (i+1)*LATENCY_PER_HOP registers, so every hop adds
 5 | // LATENCY_PER_HOP clock cycles. `rst` is not used inside this module.
 6 | (* altera_attribute = "-name AUTO_SHIFT_REGISTER_RECOGNITION OFF; -name DONT_MERGE_REGISTER ON" *) module daisy_chain_interconnect # (
 7 | 	parameter DATAW = 32,
 8 | 	parameter END_POINTS = 4,
 9 | 	parameter LATENCY_PER_HOP = 2
10 | ) (
11 | 	input 				clk,
12 | 	input 				rst,
13 | 	input  [DATAW-1:0] 	i_daisy_chain_in,
14 | 	output [DATAW-1:0] 	o_daisy_chain_out [0:END_POINTS-1]
15 | );
16 | 
17 | // Total number of registers in the chain.
18 | localparam CHAIN_LEN = LATENCY_PER_HOP*END_POINTS;
19 | 
20 | reg [DATAW-1:0] pipeline [0:CHAIN_LEN-1];
21 | 
22 | // Advance the chain by one register per clock; stage 0 samples the input.
23 | integer stage;
24 | always @ (posedge clk) begin
25 | 	for (stage = CHAIN_LEN-1; stage > 0; stage = stage - 1) begin
26 | 		pipeline[stage] <= pipeline[stage-1];
27 | 	end
28 | 	pipeline[0] <= i_daisy_chain_in;
29 | end
30 | 
31 | // Each sink taps the last register of its own hop.
32 | genvar sink;
33 | generate
34 | 	for(sink = 0; sink < END_POINTS; sink = sink + 1) begin: gen_outputs
35 | 		assign o_daisy_chain_out[sink] = pipeline[(sink+1)*LATENCY_PER_HOP-1];
36 | 	end
37 | endgenerate
38 | 
39 | endmodule


--------------------------------------------------------------------------------
/rtl/dma_buffer.v:
--------------------------------------------------------------------------------
 1 | `timescale 1ns / 1ps
 2 | 
 3 | // Single-clock buffer with one write port and one read port, built on an
 4 | // altera_syncram in "DUAL_PORT" mode. The RAM output is registered once more
 5 | // in this module before it is driven onto rdata.
 6 | //
 7 | // Ports:
 8 | //   wen, waddr, wdata : write port (port A of the RAM).
 9 | //   raddr, rdata      : read port (port B of the RAM).
10 | //   ren               : currently unused; the read enable below is commented
11 | //                       out, so the output register is updated every cycle.
12 | //   wben              : currently unused; byteena_a is tied to 1'b1.
13 | module dma_buffer # (
14 | 	parameter WIDTH = 512,
15 | 	parameter DEPTH = 8192,
16 | 	parameter ADDRW = $clog2(DEPTH),
17 | 	parameter BYENW = WIDTH / 8
18 | )(
19 | 	input  clk,
20 | 	input  ren,
21 | 	input  [ADDRW-1:0] raddr,
22 | 	output [WIDTH-1:0] rdata,
23 | 	input  wen,
24 | 	input  [ADDRW-1:0] waddr,
25 | 	input  [BYENW-1:0] wben,
26 | 	input  [WIDTH-1: 0] wdata
27 | );
28 | 
29 | // Output register and the raw RAM read data feeding it.
30 | reg [WIDTH-1:0] readdata;
31 | wire [WIDTH-1:0] readdata_ram;
32 | 
33 | always @(posedge clk) begin
34 | 	//if (ren) begin
35 | 		readdata <= readdata_ram;
36 | 	//end
37 | end
38 | 
39 | assign rdata = readdata;
40 | 
41 | altera_syncram altera_syncram_component (
42 | 	.address_a (waddr),
43 | 	.address_b (raddr),
44 | 	.byteena_a (1'b1),
45 | 	.clock0 (clk),
46 | 	.data_a (wdata),
47 | 	.wren_a (wen),
48 | 	.q_b (readdata_ram),
49 | 	.aclr0 (1'b0),
50 | 	.aclr1 (1'b0),
51 | 	.address2_a (1'b1),
52 | 	.address2_b (1'b1),
53 | 	.addressstall_a (1'b0),
54 | 	.addressstall_b (1'b0),
55 | 	.byteena_b (1'b1),
56 | 	.clock1 (1'b1),
57 | 	.clocken0 (1'b1),
58 | 	.clocken1 (1'b1),
59 | 	.clocken2 (1'b1),
60 | 	.clocken3 (1'b1),
61 | 	// Port B is never written (wren_b = 0); the tie-off follows WIDTH so that
62 | 	// non-default widths stay consistent with width_b.
63 | 	.data_b ({WIDTH{1'b1}}),
64 | 	.eccencbypass (1'b0),
65 | 	.eccencparity (8'b0),
66 | 	.eccstatus (),
67 | 	.q_a (),
68 | 	.rden_a (1'b1),
69 | 	.rden_b (1'b1),
70 | 	.sclr (1'b0),
71 | 	.wren_b (1'b0)
72 | );
73 | defparam
74 | 	altera_syncram_component.address_aclr_b  = "NONE",
75 | 	altera_syncram_component.address_reg_b  = "CLOCK0",
76 | 	altera_syncram_component.byte_size  = 8,
77 | 	altera_syncram_component.clock_enable_input_a  = "BYPASS",
78 | 	altera_syncram_component.clock_enable_input_b  = "BYPASS",
79 | 	altera_syncram_component.clock_enable_output_b  = "BYPASS",
80 | 	altera_syncram_component.intended_device_family  = "Stratix 10",
81 | 	altera_syncram_component.lpm_type  = "altera_syncram",
82 | 	altera_syncram_component.numwords_a  = DEPTH,
83 | 	altera_syncram_component.numwords_b  = DEPTH,
84 | 	altera_syncram_component.operation_mode  = "DUAL_PORT",
85 | 	altera_syncram_component.outdata_aclr_b  = "NONE",
86 | 	altera_syncram_component.outdata_sclr_b  = "NONE",
87 | 	altera_syncram_component.outdata_reg_b  = "UNREGISTERED",
88 | 	altera_syncram_component.power_up_uninitialized  = "FALSE",
89 | 	altera_syncram_component.read_during_write_mode_mixed_ports  = "DONT_CARE",
90 | 	altera_syncram_component.widthad_a  = ADDRW,
91 | 	altera_syncram_component.widthad_b  = ADDRW,
92 | 	altera_syncram_component.width_a  = WIDTH,
93 | 	altera_syncram_component.width_b  = WIDTH,
94 | 	altera_syncram_component.width_byteena_a  = 1;
95 | 
96 | endmodule


--------------------------------------------------------------------------------
/rtl/dpe_mrf.sv:
--------------------------------------------------------------------------------
 1 | `include "npu.vh"
 2 | 
 3 | // Register-file storage split across NUM_DSP `mrf_ram` instances, each holding
 4 | // an EW*10-bit slice of the data word. RAM 0 holds the most-significant slice
 5 | // and is read with rd_addr directly; RAM k (k > 0) holds the next lower slice
 6 | // and is read with rd_addr delayed by 2*k clock cycles. All RAMs share wr_en
 7 | // and wr_addr, so writes are not staggered.
 8 | module dpe_mrf # (
 9 | 	parameter MODULE_ID = "",
10 | 	parameter ID = 0,
11 | 	parameter DW = 32,
12 | 	parameter DEPTH = 512,
13 | 	parameter AW = 9,
14 | 	parameter EW = `EW,
15 | 	parameter DOTW = `DOTW,
16 | 	parameter NUM_DSP = DOTW/10
17 | )(
18 | 	input           wr_en, 
19 | 	input  [AW-1:0] wr_addr, 
20 | 	input  [AW-1:0] rd_addr,
21 | 	input  [DW-1:0] wr_data,
22 | 	output [DW-1:0] rd_data,
23 | 	input 			clk, 
24 | 	input 			rst
25 | );
26 | 
27 | // Width of the data slice stored in one RAM.
28 | localparam SLICE_W = EW*10;
29 | // Length of the read-address delay line (two registers per additional RAM).
30 | localparam BALANCE_STAGES = 2*(NUM_DSP-1);
31 | 
32 | reg  [AW-1:0] rd_addr_balance [0:BALANCE_STAGES-1];
33 | 
34 | // Read-address delay line: entry j holds rd_addr delayed by j+1 cycles.
35 | integer stage;
36 | always @ (posedge clk) begin
37 | 	rd_addr_balance[0] <= rd_addr;
38 | 	for(stage = 1; stage < BALANCE_STAGES; stage = stage + 1) begin
39 | 		rd_addr_balance[stage] <= rd_addr_balance[stage-1];
40 | 	end
41 | end
42 | 
43 | genvar ram_id;
44 | generate
45 | 	for (ram_id = 0; ram_id < NUM_DSP; ram_id = ram_id + 1) begin: gen_ram
46 | 		// Least-significant bit of this RAM's slice; slices are laid out from
47 | 		// the top of the word downwards as ram_id increases.
48 | 		localparam SLICE_LSB = (NUM_DSP-(ram_id+1))*SLICE_W;
49 | 		if (ram_id == 0) begin
50 | 			// First RAM: undelayed read address.
51 | 			mrf_ram #(
52 | 				.ID(ID), 
53 | 				.DW(SLICE_W), 
54 | 				.AW(AW), 
55 | 				.DEPTH(DEPTH),
56 | 				.MODULE_ID("mvu-mrf"),
57 | 				.RAM_ID(ram_id)
58 | 			) ram_0 (
59 | 				.wr_en   (wr_en),
60 | 				.wr_addr (wr_addr),
61 | 				.wr_data (wr_data[SLICE_LSB +: SLICE_W]),
62 | 				.rd_addr (rd_addr),
63 | 				.rd_data (rd_data[SLICE_LSB +: SLICE_W]),
64 | 				.clk 	 (clk), 
65 | 				.rst 	 (rst)
66 | 			);
67 | 		end else begin
68 | 			// Remaining RAMs: read address taken from the delay line.
69 | 			mrf_ram #(
70 | 				.ID(ID), 
71 | 				.DW(SLICE_W), 
72 | 				.AW(AW), 
73 | 				.DEPTH(DEPTH),
74 | 				.MODULE_ID("mvu-mrf"),
75 | 				.RAM_ID(ram_id)
76 | 			) ram_i (
77 | 				.wr_en   (wr_en),
78 | 				.wr_addr (wr_addr),
79 | 				.wr_data (wr_data[SLICE_LSB +: SLICE_W]),
80 | 				.rd_addr (rd_addr_balance[2*ram_id-1]),
81 | 				.rd_data (rd_data[SLICE_LSB +: SLICE_W]),
82 | 				.clk 	 (clk), 
83 | 				.rst 	 (rst)
84 | 			);
85 | 		end
86 | 	end
87 | endgenerate
88 | 
89 | endmodule
90 | 
91 | 


--------------------------------------------------------------------------------
/rtl/inst_fifo.sv:
--------------------------------------------------------------------------------
  1 | `include "npu.vh"
  2 | // Two-stage instruction FIFO: a normal_fifo (m20k_fifo) takes external writes and feeds a small mlab_fifo that serves external reads; rd_ok is derived from a per-slot tag table compared against current_tag.
  3 | module inst_fifo # (
  4 | 	parameter DW    = 64,   // FIFO data width
  5 | 	parameter DEPTH = 512,    // FIFO depth
  6 | 	parameter ID    = 0,      // Unique FIFO ID (used for debugging)
  7 | 	parameter TARGET_FPGA = `TARGET_FPGA,
  8 | 	parameter AW    = $clog2(DEPTH),
  9 | 	parameter MLAB_FIFO_DEPTH = 7, // depth of mlab_fifo and of the tag table
 10 | 	parameter MLAB_FIFO_ADDRW = $clog2(MLAB_FIFO_DEPTH),
 11 | 	parameter NTAG = `NTAG,
 12 | 	parameter NTAGW = `NTAGW,
 13 | 	parameter MODULE = "evrf" // selects the tag-extraction macro: "evrf" or "mfu"
 14 | ) (
 15 | 	input           clk,
 16 | 	input           rst,
 17 | 	input           wr_en,
 18 | 	input  [DW-1:0] wr_data,
 19 | 	output          wr_ok,
 20 | 	input           rd_en,
 21 | 	output [DW-1:0] rd_data,
 22 | 	output          rd_ok,
 23 | 	input [NTAGW-1:0] current_tag
 24 | );
 25 | 	// Handshake and data signals between the two FIFO stages (mlab_fifo_rd_ok is driven but not read in this module).
 26 | 	wire m20k_fifo_rd_ok, mlab_fifo_wr_ok, mlab_fifo_rd_ok;
 27 | 	wire [DW-1:0] m20k_fifo_rd_data;
 28 | 	reg rd_from_m20k, r_rd_from_m20k;
 29 | 	// Tag table: one NTAGW-bit tag per mlab_fifo slot, written and cleared on the same strobes that write and read mlab_fifo.
 30 | 	reg [NTAGW-1:0] tag_lookahead [0:MLAB_FIFO_DEPTH-1];
 31 | 	reg [MLAB_FIFO_ADDRW-1:0] rd_ptr, wr_ptr;
 32 | 	// Registered flag driven onto rd_ok.
 33 | 	reg inst_rd_ok;
 34 | 	// First stage: accepts the external writes and reports wr_ok to the writer.
 35 | 	normal_fifo # (
 36 | 		.DW 		(DW),
 37 | 		.DEPTH 		(DEPTH),
 38 | 		.ID 		(ID),
 39 | 		.TARGET_FPGA(TARGET_FPGA),
 40 | 		.AW 		(AW)
 41 | 	) m20k_fifo (
 42 | 		.clk 		(clk),
 43 | 		.rst 		(rst),
 44 | 		.wr_en 		(wr_en),
 45 | 		.wr_data 	(wr_data),
 46 | 		.wr_ok 		(wr_ok),
 47 | 		.rd_en 		(rd_from_m20k),
 48 | 		.rd_data 	(m20k_fifo_rd_data),
 49 | 		.rd_ok 		(m20k_fifo_rd_ok)
 50 | 	);
 51 | 	// Second stage: filled from m20k_fifo, read by the external rd_en; its data output is the module's rd_data.
 52 | 	mlab_fifo # (
 53 | 		.DW 		(DW),
 54 | 		.DEPTH 		(MLAB_FIFO_DEPTH),
 55 | 		.ID 		(ID),
 56 | 		.TARGET_FPGA(TARGET_FPGA),
 57 | 		.AW 		(MLAB_FIFO_ADDRW)
 58 | 	) mlab_fifo (
 59 | 		.clk 		(clk),
 60 | 		.rst 		(rst),
 61 | 		.wr_en 		(r_rd_from_m20k),
 62 | 		.wr_data 	(m20k_fifo_rd_data),
 63 | 		.wr_ok 		(mlab_fifo_wr_ok),
 64 | 		.rd_en 		(rd_en),
 65 | 		.rd_data 	(rd_data),
 66 | 		.rd_ok 		(mlab_fifo_rd_ok)
 67 | 	);
 68 | 	// Read m20k_fifo when it reports rd_ok and mlab_fifo reports wr_ok; the write strobe into mlab_fifo is that read strobe delayed by one cycle.
 69 | 	always @ (posedge clk) begin
 70 | 		if (rst) begin
 71 | 			rd_from_m20k <= 1'b0;
 72 | 			r_rd_from_m20k <= 1'b0;
 73 | 		end else begin
 74 | 			rd_from_m20k <= m20k_fifo_rd_ok && mlab_fifo_wr_ok;
 75 | 			r_rd_from_m20k <= rd_from_m20k;
 76 | 		end
 77 | 	end
 78 | 	// Tag table maintenance; reset sets both pointers to 0 and every slot to all ones.
 79 | 	integer i;
 80 | 	always @ (posedge clk) begin
 81 | 		if (rst) begin
 82 | 			rd_ptr <= 'd0;
 83 | 			wr_ptr <= 'd0;
 84 | 			for (i = 0; i < MLAB_FIFO_DEPTH; i = i + 1) begin
 85 | 				tag_lookahead[i] <= {(NTAGW){1'b1}};
 86 | 			end
 87 | 		end else begin
 88 | 			if (r_rd_from_m20k) begin // a word is entering mlab_fifo: record its tag at wr_ptr
 89 | 				if(MODULE == "evrf") begin 
 90 | 					tag_lookahead[wr_ptr] <= `evrf_uinst_tag(m20k_fifo_rd_data);
 91 | 				end else if (MODULE == "mfu") begin
 92 | 					tag_lookahead[wr_ptr] <= `mfu_uinst_tag(m20k_fifo_rd_data);
 93 | 				end // for any other MODULE value no tag is stored, but wr_ptr still advances
 94 | 				wr_ptr <= (wr_ptr == MLAB_FIFO_DEPTH-1)? 
 95 | 					'd0: MLAB_FIFO_ADDRW'(wr_ptr + 1'b1);
 96 | 			end
 97 | 			// On a read, reset the slot at rd_ptr to all ones and advance rd_ptr with wrap-around.
 98 | 			if (rd_en) begin
 99 | 				tag_lookahead[rd_ptr] <= {(NTAGW){1'b1}};
100 | 				rd_ptr <= (rd_ptr == MLAB_FIFO_DEPTH-1)? 
101 | 					'd0: MLAB_FIFO_ADDRW'(rd_ptr + 1'b1); 
102 | 			end
103 | 		end
104 | 	end
105 | 	// state_t compares current_tag with the slot after rd_ptr (wrapping to slot 0); state_tm1 uses the slot at rd_ptr and is not read anywhere in this module.
106 | 	wire state_t, state_tm1;
107 | 	assign state_tm1 = (current_tag >= tag_lookahead[rd_ptr]);
108 | 	assign state_t = (rd_ptr == MLAB_FIFO_DEPTH-1)? (current_tag >= tag_lookahead[0]) : (current_tag >= tag_lookahead[rd_ptr+1]);
109 | 	always @ (posedge clk) begin
110 | 		if (rst) begin
111 | 			inst_rd_ok <= 1'b0;
112 | 		end else begin
113 | 			inst_rd_ok <= state_t;
114 | 		end
115 | 	end
116 | 	// NOTE(review): rd_ok is the registered state_t, i.e. it is based on the slot after rd_ptr whether or not rd_en is asserted - confirm this matches the consumer's read timing.
117 | 	assign rd_ok = inst_rd_ok;
118 | 
119 | endmodule


--------------------------------------------------------------------------------
/rtl/inst_ram.sv:
--------------------------------------------------------------------------------
  1 | `include "npu.vh"
  2 | // inst_ram: thin wrapper around one altera_syncram instance configured as a dual-port M20K RAM (port A writes, port B reads, both on clk).
  3 | module inst_ram # (
  4 | 	parameter MODULE_ID = "", // only referenced by the commented-out code in this module
  5 | 	parameter OUTREG = "CLOCK0", // forwarded to outdata_reg_b of the RAM
  6 | 	parameter ID = 0, // feeds the ID_* digit parameters below
  7 | 	parameter ID_UNITS = (ID%10) + 8'h30, // ones digit of ID plus 8'h30 (ASCII '0')
  8 | 	parameter ID_TENS = (ID/10 == 0)? "": ((ID/10)%10) + 8'h30, // tens digit, or "" when ID < 10
  9 | 	parameter ID_HUNDREDS = (ID/100 == 0)? "": (ID/100) + 8'h30, // hundreds digit, or "" when ID < 100
 10 | 	parameter DW = 32, // data width of both ports (width_a / width_b)
 11 | 	parameter DEPTH = 512, // number of words (numwords_a / numwords_b)
 12 | 	parameter AW = 9, // address width (widthad_a / widthad_b)
 13 | 	parameter RTL_DIR = `RTL_DIR, // not used by any active code in this module
 14 | 	parameter TARGET_FPGA = `TARGET_FPGA // forwarded to intended_device_family
 15 | )(
 16 | 	input           wr_en, // write enable for port A
 17 | 	input  [AW-1:0] wr_addr, // port A (write) address
 18 | 	input  [AW-1:0] rd_addr, // port B (read) address
 19 | 	input  [DW-1:0] wr_data, // port A write data
 20 | 	output [DW-1:0] rd_data, // port B read data (q_b)
 21 | 	input 			clk, // single clock (clock0) for both ports
 22 | 	input 			rst // not connected to anything in this module
 23 | );
 24 | 
 25 | wire [DW-1:0] sub_wire0;
 26 | assign rd_data = sub_wire0[DW-1:0];
 27 | 
 28 | // Memory-initialisation file support is disabled here (see the commented-out init_file defparam below).
 29 | //localparam RAM_SRC = {RTL_DIR, "mif_files/top_sched.mif"};      
 30 | 
 31 | altera_syncram  altera_syncram_component (
 32 | 	.address_a 		(wr_addr),
 33 | 	.address_b 		(rd_addr),
 34 | 	.clock0 			(clk),
 35 | 	.data_a 			(wr_data),
 36 | 	.wren_a 			(wr_en),
 37 | 	.q_b 				(sub_wire0),
 38 | 	.aclr0 			(1'b0), // clears and address stalls are tied inactive
 39 | 	.aclr1 			(1'b0),
 40 | 	.address2_a 	(1'b1),
 41 | 	.address2_b 	(1'b1),
 42 | 	.addressstall_a(1'b0),
 43 | 	.addressstall_b(1'b0),
 44 | 	.byteena_a 		(1'b1),
 45 | 	.byteena_b 		(1'b1),
 46 | 	.clock1 			(1'b1),
 47 | 	.clocken0 		(1'b1), // all clock enables tied high
 48 | 	.clocken1 		(1'b1),
 49 | 	.clocken2 		(1'b1),
 50 | 	.clocken3 		(1'b1),
 51 | 	.data_b 			({(DW){1'b1}}), // port B never writes (wren_b is 1'b0)
 52 | 	.eccencbypass 	(1'b0),
 53 | 	.eccencparity 	(8'b0),
 54 | 	.eccstatus 		(),
 55 | 	.q_a 				(), // port A read output is left unconnected
 56 | 	.rden_a 			(1'b1),
 57 | 	.rden_b 			(1'b1), // port B read is always enabled
 58 | 	.sclr 			(1'b0),
 59 | 	.wren_b 			(1'b0)
 60 | );
 61 | // Configuration of the instance above; the defparams are bound by the instance name altera_syncram_component.
 62 | defparam
 63 | 	altera_syncram_component.address_aclr_b  = "NONE",
 64 | 	altera_syncram_component.address_reg_b  = "CLOCK0",
 65 | 	altera_syncram_component.clock_enable_input_a  = "BYPASS",
 66 | 	altera_syncram_component.clock_enable_input_b  = "BYPASS",
 67 | 	altera_syncram_component.clock_enable_output_b  = "BYPASS",
 68 | /*`ifdef DEPLOY
 69 | 	altera_syncram_component.init_file = RAM_SRC,
 70 | `endif*/
 71 | 	altera_syncram_component.enable_ecc  = "FALSE",
 72 | 	altera_syncram_component.intended_device_family  = TARGET_FPGA,
 73 | 	altera_syncram_component.lpm_type  = "altera_syncram",
 74 | 	altera_syncram_component.numwords_a  = DEPTH,
 75 | 	altera_syncram_component.numwords_b  = DEPTH,
 76 | 	altera_syncram_component.operation_mode  = "DUAL_PORT",
 77 | 	altera_syncram_component.outdata_aclr_b  = "NONE",
 78 | 	altera_syncram_component.outdata_sclr_b  = "NONE",
 79 | 	altera_syncram_component.outdata_reg_b  = OUTREG, // output register selection comes from the OUTREG parameter
 80 | 	altera_syncram_component.power_up_uninitialized  = "FALSE",
 81 | 	altera_syncram_component.ram_block_type  = "M20K",
 82 | 	altera_syncram_component.read_during_write_mode_mixed_ports  = "DONT_CARE", // same-address read-during-write result is not relied upon
 83 | 	altera_syncram_component.widthad_a  = AW,
 84 | 	altera_syncram_component.widthad_b  = AW,
 85 | 	altera_syncram_component.width_a  = DW,
 86 | 	altera_syncram_component.width_b  = DW,
 87 | 	altera_syncram_component.width_byteena_a  = 1;
 88 | 
 89 | 
 90 | // Debug
 91 | //   always @ (posedge clk) begin
 92 | //      if(wr_en) 
 93 | //          $display("[%0t][%s] wr ram%d[%d] = %x(%d,%d,%d,%d)", 
 94 | //          $time, `__FILE__, ID, wr_addr, wr_data, wr_data[7:0], wr_data[15:8],
 95 | //          wr_data[23:16], wr_data[31:24]);
 96 | //   end
 97 | 
 98 | endmodule
 99 | 
100 | 


--------------------------------------------------------------------------------
/rtl/mrf_ram.sv:
--------------------------------------------------------------------------------
 1 | `include "npu.vh"
 2 | // mrf_ram: altera_syncram dual-port M20K wrapper (port A writes, port B reads, both on clk) whose contents are initialised from a .mif file named after MODULE_ID, ID and RAM_ID.
 3 | module mrf_ram # (
 4 | 	parameter MODULE_ID = "", // string placed in the init-file name (see RAM_SRC)
 5 | 	parameter OUTREG = "CLOCK0", // forwarded to outdata_reg_b of the RAM
 6 | 	parameter ID = 0, // numeric ID, split into three digit characters below
 7 | 	parameter ID_UNITS = (ID%10) + 8'h30, // ones digit of ID plus 8'h30 (ASCII '0')
 8 | 	parameter ID_TENS = ((ID/10)%10) + 8'h30, // tens digit of ID plus 8'h30
 9 | 	parameter ID_HUNDREDS = (ID/100) + 8'h30, // hundreds digit of ID plus 8'h30
 10 | 	parameter RAM_ID = 0, // appended to the file name as RAM_ID+8'h30
 11 | 	parameter DW = 32, // data width of both ports (width_a / width_b)
 12 | 	parameter DEPTH = 512, // number of words (numwords_a / numwords_b)
 13 | 	parameter AW = 9, // address width (widthad_a / widthad_b)
 14 | 	parameter RTL_DIR = `RTL_DIR, // prefix of the init-file path
 15 | 	parameter TARGET_FPGA = `TARGET_FPGA // forwarded to intended_device_family
 16 | )(
 17 | 	input           wr_en, // write enable for port A
 18 | 	input  [AW-1:0] wr_addr, // port A (write) address
 19 | 	input  [AW-1:0] rd_addr, // port B (read) address
 20 | 	input  [DW-1:0] wr_data, // port A write data
 21 | 	output [DW-1:0] rd_data, // port B read data (q_b)
 22 | 	input 			clk, // single clock (clock0) for both ports
 23 | 	input 			rst // not connected to anything in this module
 24 | );
 25 | 
 26 | wire [DW-1:0] sub_wire0;
 27 | assign rd_data = sub_wire0[DW-1:0];
 28 | 
 29 | // Init-file path: RTL_DIR + "mif_files/" + MODULE_ID + three ID digits + "_" + RAM_ID digit + ".mif". NOTE(review): the ID_* and RAM_ID operands carry no explicit width, so they may be wider than 8 bits inside this concatenation - confirm the tools ignore the padding bytes.
 30 | localparam RAM_SRC = {RTL_DIR, "mif_files/", MODULE_ID, ID_HUNDREDS, ID_TENS, ID_UNITS, "_", RAM_ID+8'h30, ".mif"};      
 31 | 
 32 | altera_syncram  altera_syncram_component (
 33 | 	.address_a 		(wr_addr),
 34 | 	.address_b 		(rd_addr),
 35 | 	.clock0 			(clk),
 36 | 	.data_a 			(wr_data),
 37 | 	.wren_a 			(wr_en),
 38 | 	.q_b 				(sub_wire0),
 39 | 	.aclr0 			(1'b0), // clears and address stalls are tied inactive
 40 | 	.aclr1 			(1'b0),
 41 | 	.address2_a 	(1'b1),
 42 | 	.address2_b 	(1'b1),
 43 | 	.addressstall_a(1'b0),
 44 | 	.addressstall_b(1'b0),
 45 | 	.byteena_a 		(1'b1),
 46 | 	.byteena_b 		(1'b1),
 47 | 	.clock1 			(1'b1),
 48 | 	.clocken0 		(1'b1), // all clock enables tied high
 49 | 	.clocken1 		(1'b1),
 50 | 	.clocken2 		(1'b1),
 51 | 	.clocken3 		(1'b1),
 52 | 	.data_b 			({(DW){1'b1}}), // port B never writes (wren_b is 1'b0)
 53 | 	.eccencbypass 	(1'b0),
 54 | 	.eccencparity 	(8'b0),
 55 | 	.eccstatus 		(),
 56 | 	.q_a 				(), // port A read output is left unconnected
 57 | 	.rden_a 			(1'b1),
 58 | 	.rden_b 			(1'b1), // port B read is always enabled
 59 | 	.sclr 			(1'b0),
 60 | 	.wren_b 			(1'b0)
 61 | );
 62 | // Configuration of the instance above; the defparams are bound by the instance name altera_syncram_component.
 63 | defparam
 64 | 	altera_syncram_component.address_aclr_b  = "NONE",
 65 | 	altera_syncram_component.address_reg_b  = "CLOCK0",
 66 | 	altera_syncram_component.clock_enable_input_a  = "BYPASS",
 67 | 	altera_syncram_component.clock_enable_input_b  = "BYPASS",
 68 | 	altera_syncram_component.clock_enable_output_b  = "BYPASS",
 69 | 	altera_syncram_component.init_file = RAM_SRC, // initial contents come from the .mif path built above
 70 | 	altera_syncram_component.enable_ecc  = "FALSE",
 71 | 	altera_syncram_component.intended_device_family  = TARGET_FPGA,
 72 | 	altera_syncram_component.lpm_type  = "altera_syncram",
 73 | 	altera_syncram_component.numwords_a  = DEPTH,
 74 | 	altera_syncram_component.numwords_b  = DEPTH,
 75 | 	altera_syncram_component.operation_mode  = "DUAL_PORT",
 76 | 	altera_syncram_component.outdata_aclr_b  = "NONE",
 77 | 	altera_syncram_component.outdata_sclr_b  = "NONE",
 78 | 	altera_syncram_component.outdata_reg_b  = OUTREG, // output register selection comes from the OUTREG parameter
 79 | 	altera_syncram_component.power_up_uninitialized  = "FALSE",
 80 | 	altera_syncram_component.ram_block_type  = "M20K",
 81 | 	altera_syncram_component.read_during_write_mode_mixed_ports  = "DONT_CARE", // same-address read-during-write result is not relied upon
 82 | 	altera_syncram_component.widthad_a  = AW,
 83 | 	altera_syncram_component.widthad_b  = AW,
 84 | 	altera_syncram_component.width_a  = DW,
 85 | 	altera_syncram_component.width_b  = DW,
 86 | 	altera_syncram_component.width_byteena_a  = 1;
 87 | 
 88 | 
 89 | // Debug
 90 | //   always @ (posedge clk) begin
 91 | //      if(wr_en) 
 92 | //          $display("[%0t][%s] wr ram%d[%d] = %x(%d,%d,%d,%d)", 
 93 | //          $time, `__FILE__, ID, wr_addr, wr_data, wr_data[7:0], wr_data[15:8],
 94 | //          wr_data[23:16], wr_data[31:24]);
 95 | //   end
 96 | 
 97 | endmodule
98 | 
99 | 


--------------------------------------------------------------------------------
/rtl/mvu_vrf.sv:
--------------------------------------------------------------------------------
  1 | `include "npu.vh"
  2 | // mvu_vrf: NUM_RAM parallel altera_syncram dual-port RAMs, each RW bits wide, sharing one write and one read address; a delayed copy of rd_id selects which RAM output drives rd_data.
  3 | module mvu_vrf # (
  4 | 	parameter MODULE_ID = "",
  5 | 	parameter OUTREG = "CLOCK0", // forwarded to outdata_reg_b of every RAM
  6 | 	parameter ID = 0,
  7 | 	parameter DW = 32, // total write-data width, split evenly across the RAMs
  8 | 	parameter DEPTH = 512, // words per RAM
  9 | 	parameter AW = 9, // address width
 10 | 	parameter RTL_DIR = `RTL_DIR,
 11 | 	parameter TARGET_FPGA = `TARGET_FPGA,
 12 | 	parameter EW = `EW,
 13 | 	parameter DEVICE = (TARGET_FPGA == "S10-Prime")? "Stratix 10": TARGET_FPGA, // "S10-Prime" is mapped to "Stratix 10" for intended_device_family
 14 | 	parameter PRIME_DOTW = `PRIME_DOTW,
 15 | 	parameter DOTW = `DOTW,
 16 | 	parameter NUM_DSP = `NUM_DSP,
 17 | 	parameter NUM_RAM = (TARGET_FPGA == "S10-Prime")? NUM_DSP: 1, // NUM_DSP RAMs on "S10-Prime", a single RAM otherwise
 18 | 	parameter RW = DW / NUM_RAM, // width of each RAM and of rd_data
 19 | 	parameter VRFIDW = `VRFIDW, // width of the RAM-select index rd_id
 20 | 	parameter MVU_TILE = 0 // only used to gate the DISPLAY_MVU debug prints
 21 | )(
 22 | 	input            wr_en, // write enable, applied to every RAM
 23 | 	input  [AW-1:0]  wr_addr, // write address, shared by every RAM
 24 | 	input  [AW-1:0]  rd_addr, // read address, shared by every RAM
 25 | 	input  [DW-1:0]  wr_data, // RAM i stores wr_data[i*RW +: RW]
 26 | 	input  [VRFIDW-1:0] rd_id, // index of the RAM whose output is wanted
 27 | 	input rd_en, // only delayed for the debug prints; the RAMs always read
 28 | 	output [RW-1:0]  rd_data, // output of the RAM selected by the delayed rd_id
 29 | 	input 			  clk, 
 30 | 	input 			  rst // synchronous reset of the id/rd delay registers only
 31 | );
 32 | 
 33 | wire [RW-1:0] rdata [0:NUM_RAM-1];
 34 | reg [VRFIDW-1:0] id [0:1]; // rd_id delayed by one (id[0]) and two (id[1]) clk edges
 35 | reg rd [0:1]; // rd_en delayed likewise; only read by the DISPLAY_MVU block
 36 | // Two-stage delay line for rd_id and rd_en.
 37 | always @ (posedge clk) begin
 38 | 	if(rst)begin
 39 | 		id[0] <= 'd0;
 40 | 		id[1] <= 'd0;
 41 | 		rd[0] <= 0;
 42 | 		rd[1] <= 0;
 43 | 	end else begin
 44 | 		id[0] <= rd_id;
 45 | 		id[1] <= id[0];
 46 | 
 47 | 		rd[0] <= rd_en;
 48 | 		rd[1] <= rd[0];
 49 | 	end
 50 | end
 51 | // One RW-bit dual-port RAM per index; port A writes, port B reads.
 52 | genvar i;
 53 | generate
 54 | for(i = 0; i < NUM_RAM; i = i + 1) begin: gen_mvu_vrf_ram
 55 | 	altera_syncram  altera_syncram_component (
 56 | 		.address_a 		(wr_addr),
 57 | 		.address_b 		(rd_addr),
 58 | 		.clock0 			(clk),
 59 | 		.data_a 			(wr_data[i*RW +: RW]),
 60 | 		.wren_a 			(wr_en),
 61 | 		.q_b 				(rdata[i]),
 62 | 		.aclr0 			(1'b0),
 63 | 		.aclr1 			(1'b0),
 64 | 		.address2_a 	(1'b1),
 65 | 		.address2_b 	(1'b1),
 66 | 		.addressstall_a(1'b0),
 67 | 		.addressstall_b(1'b0),
 68 | 		.byteena_a 		(1'b1),
 69 | 		.byteena_b 		(1'b1),
 70 | 		.clock1 			(1'b1),
 71 | 		.clocken0 		(1'b1),
 72 | 		.clocken1 		(1'b1),
 73 | 		.clocken2 		(1'b1),
 74 | 		.clocken3 		(1'b1),
 75 | 		.data_b 			({(RW){1'b1}}), // port B never writes (wren_b is 1'b0)
 76 | 		.eccencbypass 	(1'b0),
 77 | 		.eccencparity 	(8'b0),
 78 | 		.eccstatus 		(),
 79 | 		.q_a 				(),
 80 | 		.rden_a 			(1'b1),
 81 | 		.rden_b 			(1'b1), // reads are always enabled; rd_en does not gate the RAM
 82 | 		.sclr 			(1'b0),
 83 | 		.wren_b 			(1'b0)
 84 | 	);
 85 | 
 86 | 	defparam
 87 | 		altera_syncram_component.address_aclr_b  = "NONE",
 88 | 		altera_syncram_component.address_reg_b  = "CLOCK0",
 89 | 		altera_syncram_component.clock_enable_input_a  = "BYPASS",
 90 | 		altera_syncram_component.clock_enable_input_b  = "BYPASS",
 91 | 		altera_syncram_component.clock_enable_output_b  = "BYPASS",
 92 | 		altera_syncram_component.enable_ecc  = "FALSE",
 93 | 		altera_syncram_component.intended_device_family  = DEVICE,
 94 | 		altera_syncram_component.lpm_type  = "altera_syncram",
 95 | 		altera_syncram_component.numwords_a  = DEPTH,
 96 | 		altera_syncram_component.numwords_b  = DEPTH,
 97 | 		altera_syncram_component.operation_mode  = "DUAL_PORT",
 98 | 		altera_syncram_component.outdata_aclr_b  = "NONE",
 99 | 		altera_syncram_component.outdata_sclr_b  = "NONE",
100 | 		altera_syncram_component.outdata_reg_b  = OUTREG,
101 | 		altera_syncram_component.power_up_uninitialized  = "FALSE",
102 | 		altera_syncram_component.ram_block_type  = "M20K",
103 | 		altera_syncram_component.read_during_write_mode_mixed_ports  = "DONT_CARE",
104 | 		altera_syncram_component.widthad_a  = AW,
105 | 		altera_syncram_component.widthad_b  = AW,
106 | 		altera_syncram_component.width_a  = RW,
107 | 		altera_syncram_component.width_b  = RW,
108 | 		altera_syncram_component.width_byteena_a  = 1;
109 | end
110 | 
111 | endgenerate
112 | 	// NOTE(review): the select uses rd_id delayed by two clk edges - presumably this matches the RAM read latency when the output is registered; confirm for other OUTREG settings.
113 | assign rd_data = rdata[id[1]];
114 | 
115 | // Simulation-only trace of writes, read addresses and read data, printed for tile 0 when DISPLAY_MVU is defined.
116 | `ifdef DISPLAY_MVU
117 | always @(posedge clk) begin
118 | 	if(wr_en && MVU_TILE == 0) begin
119 | 		$display("[%0t][MVU-VRF] wr_addr: %d, wr_data: %d %d %d %d %d %d %d %d %d %d", 
120 | 			$time, 
121 | 			wr_addr,
122 | 			$signed(wr_data[7:0]),
123 | 			$signed(wr_data[15:8]),
124 | 			$signed(wr_data[23:16]),
125 | 			$signed(wr_data[31:24]),
126 | 			$signed(wr_data[39:32]),
127 | 			$signed(wr_data[47:40]),
128 | 			$signed(wr_data[55:48]),
129 | 			$signed(wr_data[63:56]),
130 | 			$signed(wr_data[71:64]),
131 | 			$signed(wr_data[79:72]));
132 | 	end
133 | 
134 | 	if(rd_en && MVU_TILE == 0) begin
135 | 		$display("[%0t][MVU-VRF] rd_addr: %d %b", $time, rd_addr, rd_addr);
136 | 	end
137 | 
138 | 	if(rd[1] && MVU_TILE == 0) begin
139 | 		$display("[%0t][MVU-VRF] vrf_id: %d, rd_data: %d %d %d %d %d %d %d %d %d %d", 
140 | 			$time,
141 | 			id[1],
142 | 			$signed(rd_data[7:0]),
143 | 			$signed(rd_data[15:8]),
144 | 			$signed(rd_data[23:16]),
145 | 			$signed(rd_data[31:24]),
146 | 			$signed(rd_data[39:32]),
147 | 			$signed(rd_data[47:40]),
148 | 			$signed(rd_data[55:48]),
149 | 			$signed(rd_data[63:56]),
150 | 			$signed(rd_data[71:64]),
151 | 			$signed(rd_data[79:72]));
152 | 	end
153 | end
154 | `endif
155 | 
156 | endmodule


--------------------------------------------------------------------------------
/rtl/nx_axbs.sv:
--------------------------------------------------------------------------------
 1 | // Copyright 2020 Intel Corporation.
 2 | //
 3 | // This reference design file is subject licensed to you by the terms and
 4 | // conditions of the applicable License Terms and Conditions for Hardware
 5 | // Reference Designs and/or Design Examples (either as signed by you or
 6 | // found at https://www.altera.com/common/legal/leg-license_agreement.html ).
 7 | //
 8 | // As stated in the license, you agree to only use this reference design
 9 | // solely in conjunction with Intel FPGAs or Intel CPLDs.
10 | //
11 | // THE REFERENCE DESIGN IS PROVIDED "AS IS" WITHOUT ANY EXPRESS OR IMPLIED
12 | // WARRANTY OF ANY KIND INCLUDING WARRANTIES OF MERCHANTABILITY,
13 | // NONINFRINGEMENT, OR FITNESS FOR A PARTICULAR PURPOSE. Intel does not
14 | // warrant or assume responsibility for the accuracy or completeness of any
15 | // information, links or other items within the Reference Design and any
16 | // accompanying materials.
17 | //
18 | // In the event that you do not agree with such terms and conditions, do not
19 | // use the reference design file.
20 | /////////////////////////////////////////////////////////////////////////////
21 | // nx_axbs: signed SIZE_A x SIZE_B multiplier wrapper. Uses axbs directly when either operand fits in one chunk, otherwise pads both operands and uses nx_axbs_core.
22 | module nx_axbs #(
23 |     parameter SIZE_A = 32, // bit width of din_a
24 |     parameter SIZE_B = 32 // bit width of din_b
25 | )(
26 | 	input           clk,
27 | 	input signed [SIZE_A-1:0] din_a,
28 | 	input signed [SIZE_B-1:0] din_b,
29 | 	output signed [SIZE_A+SIZE_B-1:0] dout // full-width signed product
30 | );
31 | 
32 | generate
33 | // Reject configurations in which both operands exceed 512 bits.
34 | if ((SIZE_A > 512) && (SIZE_B > 512)) begin
35 |     initial begin
36 |         $fatal(1, "Error: %0dx%0d multiplier is not supported", SIZE_A, SIZE_B); // first argument of $fatal is the finish_number, the format string follows it
37 |     end
38 | end
39 | // Number of chunks per operand: every chunk carries 7 bits and the top chunk carries one extra bit.
40 | localparam NUM_A = (SIZE_A - 2) / 7 + 1;
41 | localparam NUM_B = (SIZE_B - 2) / 7 + 1;
42 | 
43 | if ((NUM_A == 1) || (NUM_B == 1)) begin
44 |     // At least one operand is a single chunk: use the plain axbs multiplier.
45 |     axbs #(.SIZE_A(SIZE_A), .SIZE_B(SIZE_B)) mult (
46 |         .clk(clk),
47 |         .din_a(din_a),
48 |         .din_b(din_b),
49 |         .dout(dout)
50 |     );
51 | 
52 | end
53 | else
54 | begin
55 |     localparam SIZE_A_PRIME = NUM_A * 7 + 1; // operand widths rounded up to a whole number of chunks
56 |     localparam SIZE_B_PRIME = NUM_B * 7 + 1;
57 | 
58 |     wire signed [SIZE_A_PRIME-1:0] din_a_prime;
59 |     wire signed [SIZE_B_PRIME-1:0] din_b_prime;
60 |     // Pad with zeros on the LSB side, i.e. shift each operand left by the pad width.
61 |     assign din_a_prime = {din_a, {(SIZE_A_PRIME - SIZE_A){1'b0}}};
62 |     assign din_b_prime = {din_b, {(SIZE_B_PRIME - SIZE_B){1'b0}}};
63 | 
64 |     wire signed [SIZE_A_PRIME+SIZE_B_PRIME-1:0] dout_prime;
65 | 
66 |     nx_axbs_core #(.SIZE_A(SIZE_A_PRIME), .SIZE_B(SIZE_B_PRIME)) mult (
67 |         .clk(clk),
68 |         .din_a(din_a_prime),
69 |         .din_b(din_b_prime),
70 |         .dout(dout_prime)
71 |     );
72 |     // Keep the top SIZE_A+SIZE_B bits; the low bits dropped here are the ones introduced by the LSB padding.
73 |     assign dout = dout_prime[SIZE_A_PRIME+SIZE_B_PRIME-1:SIZE_A_PRIME+SIZE_B_PRIME-SIZE_A-SIZE_B];
74 | end
75 | endgenerate
76 | 
77 | endmodule
78 | 


--------------------------------------------------------------------------------
/rtl/nx_axbs_core.sv:
--------------------------------------------------------------------------------
  1 | // Copyright 2020 Intel Corporation.
  2 | //
  3 | // This reference design file is subject licensed to you by the terms and
  4 | // conditions of the applicable License Terms and Conditions for Hardware
  5 | // Reference Designs and/or Design Examples (either as signed by you or
  6 | // found at https://www.altera.com/common/legal/leg-license_agreement.html ).
  7 | //
  8 | // As stated in the license, you agree to only use this reference design
  9 | // solely in conjunction with Intel FPGAs or Intel CPLDs.
 10 | //
 11 | // THE REFERENCE DESIGN IS PROVIDED "AS IS" WITHOUT ANY EXPRESS OR IMPLIED
 12 | // WARRANTY OF ANY KIND INCLUDING WARRANTIES OF MERCHANTABILITY,
 13 | // NONINFRINGEMENT, OR FITNESS FOR A PARTICULAR PURPOSE. Intel does not
 14 | // warrant or assume responsibility for the accuracy or completeness of any
 15 | // information, links or other items within the Reference Design and any
 16 | // accompanying materials.
 17 | //
 18 | // In the event that you do not agree with such terms and conditions, do not
 19 | // use the reference design file.
 20 | /////////////////////////////////////////////////////////////////////////////
 21 | // nx_axbs_core: signed multiplier that splits both operands into 7-bit chunks, forms one nx_axbs_slice partial sum per output position, and recombines the 21-bit partial sums with three adders.
 22 | module nx_axbs_core #(
 23 |     parameter SIZE_A = 15, // width of din_a; the chunk slicing below reads bits up to 7*(NUM_A-1)+7
 24 |     parameter SIZE_B = 15, // width of din_b; sliced the same way
 25 | 	 parameter SIZE_O = SIZE_A + SIZE_B // width used when casting the intermediate sums
 26 | )(
 27 | 	input           clk,
 28 | 	input signed [SIZE_A-1:0] din_a,
 29 | 	input signed [SIZE_B-1:0] din_b,
 30 | 	output reg signed [SIZE_A+SIZE_B-1:0] dout // registered product
 31 | );
 32 | localparam NUM_A = (SIZE_A - 2) / 7 + 1; // number of chunks taken from din_a
 33 | localparam NUM_B = (SIZE_B - 2) / 7 + 1; // number of chunks taken from din_b
 34 | 
 35 | genvar i, j;
 36 | generate
 37 | // Latency handed to every slice; derived from the smaller of the two chunk counts.
 38 | localparam LATENCY = (NUM_A <= NUM_B) ? 4+$clog2((NUM_A-1)/6+1) : 4+$clog2((NUM_B-1)/6+1);
 39 | 
 40 | wire signed [7:0] din_a_w[0:NUM_A-1];
 41 | wire signed [7:0] din_b_w[0:NUM_B-1];
 42 | 
 43 | // Lower chunks are 7 bits zero-extended to 8; the top chunk takes 8 bits straight from the operand.
 44 | for (i = 0; i < NUM_A; i=i+1) begin : assign_a
 45 |     if (i < NUM_A - 1)
 46 |         assign din_a_w[i] = {1'b0, din_a[7*i+6:7*i]};
 47 |     else 
 48 |         assign din_a_w[i] = din_a[7*i+7:7*i];
 49 | end
 50 |     
 51 | for (i = 0; i < NUM_B; i=i+1) begin : assign_b
 52 |     if (i < NUM_B - 1)
 53 |         assign din_b_w[i] = {1'b0, din_b[7*i+6:7*i]};
 54 |     else
 55 |         assign din_b_w[i] = din_b[7*i+7:7*i];
 56 | end
 57 | 
 58 | // One 21-bit partial sum per slice index (only indices 0..NUM_A+NUM_B-2 are driven by the loop below).
 59 | wire signed [20:0] dot_product_out[0:NUM_A+NUM_B-1];
 60 | // dout1..dout3 collect the low, middle and high 7-bit fields of every partial sum; dout4 collects bit 20 of every partial sum.
 61 | wire signed [7*(NUM_A+NUM_B)-1:0] dout1;
 62 | wire signed [7*(NUM_A+NUM_B):0] dout2;
 63 | wire signed [7*(NUM_A+NUM_B+1)-1:0] dout3;
 64 | wire signed [7*(NUM_A+NUM_B+1):0] dout4;
 65 |     
 66 | assign dout2[6:0] = 0; // bit ranges that no slice writes are tied to zero
 67 | assign dout3[13:0] = 0;
 68 | assign dout4[14:0] = 0;
 69 | 
 70 | if (7*(NUM_A+NUM_B)-1 >= 7*(NUM_A+NUM_B-2)+7)
 71 |     assign dout1[7*(NUM_A+NUM_B)-1:7*(NUM_A+NUM_B-2)+7] = 0; // top 7-bit field of dout1, above the last slice's field
 72 | 
 73 | assign dout2[7*(NUM_A+NUM_B)] = 0;
 74 |   
 75 | for (i = 0; i < NUM_A+NUM_B-1; i=i+1) begin: loopa
 76 |     
 77 |     nx_axbs_slice #(.NUM_A(NUM_A), .NUM_B(NUM_B), .INDEX(i), .SIZE_OUT(21), .LATENCY(LATENCY)) dot_product(
 78 |         .clk(clk),
 79 |         .din_a(din_a_w),
 80 |         .din_b(din_b_w),
 81 |         .dout(dot_product_out[i])
 82 |     );
 83 |     // Scatter partial sum i at bit offset 7*i: bits [6:0] -> dout1, [13:7] -> dout2, [20:14] -> dout3. Bit 20 also lands in dout4 at position 7*i+21 with six zeros below it (presumably a sign correction, since dout4 is subtracted - confirm).
 84 |     assign {dout3[7*i+20:7*i+14], dout2[7*i+13:7*i+7], dout1[7*i+6:7*i]} = dot_product_out[i];
 85 |     assign {dout4[7*i+21:7*i+15]} = {dot_product_out[i][20], 6'b0};
 86 | end
 87 | 
 88 | reg signed [SIZE_A+SIZE_B-1:0] dout12;
 89 | reg signed [SIZE_A+SIZE_B-1:0] dout31;
 90 | // Two register stages after the slices: dout12/dout31 first, then their sum into dout.
 91 | always @(posedge clk) begin
 92 |     
 93 |     dout12 <= SIZE_O'(dout1 + dout2);
 94 |     dout31 <= SIZE_O'(dout3 - dout4);
 95 |     dout <= dout12 + dout31;
 96 | end
 97 | 
 98 | endgenerate
 99 | 
100 | endmodule
101 | 


--------------------------------------------------------------------------------
/rtl/nx_axbs_slice.sv:
--------------------------------------------------------------------------------
 1 | // Copyright 2020 Intel Corporation.
 2 | //
 3 | // This reference design file is subject licensed to you by the terms and
 4 | // conditions of the applicable License Terms and Conditions for Hardware
 5 | // Reference Designs and/or Design Examples (either as signed by you or
 6 | // found at https://www.altera.com/common/legal/leg-license_agreement.html ).
 7 | //
 8 | // As stated in the license, you agree to only use this reference design
 9 | // solely in conjunction with Intel FPGAs or Intel CPLDs.
10 | //
11 | // THE REFERENCE DESIGN IS PROVIDED "AS IS" WITHOUT ANY EXPRESS OR IMPLIED
12 | // WARRANTY OF ANY KIND INCLUDING WARRANTIES OF MERCHANTABILITY,
13 | // NONINFRINGEMENT, OR FITNESS FOR A PARTICULAR PURPOSE. Intel does not
14 | // warrant or assume responsibility for the accuracy or completeness of any
15 | // information, links or other items within the Reference Design and any
16 | // accompanying materials.
17 | //
18 | // In the event that you do not agree with such terms and conditions, do not
19 | // use the reference design file.
20 | /////////////////////////////////////////////////////////////////////////////
21 | // nx_axbs_slice: feeds nx_dot_product_int8 with every (din_a[x], din_b[y]) chunk pair whose indices satisfy x + y == INDEX and sign-extends the result to SIZE_OUT bits.
22 | module nx_axbs_slice #(
23 |     parameter NUM_A = 2, // number of din_a chunks
24 |     parameter NUM_B = 2, // number of din_b chunks
25 |     parameter INDEX = 0, // sum of chunk indices handled by this slice
26 |     parameter SIZE_OUT = 16, // width of dout
27 |     parameter LATENCY = 4 // forwarded to nx_dot_product_int8
28 | )(
29 | 	input           clk,
30 | 	input signed  [7:0] din_a[0:NUM_A-1],
31 | 	input signed  [7:0] din_b[0:NUM_B-1],
32 |     
33 | 	output signed [SIZE_OUT-1:0] dout
34 | );
35 | // Range of din_a indices that have a partner in din_b for this INDEX; LOCAL_NUM is the number of pairs.
36 | localparam MIN_A = (INDEX < NUM_B) ? 0 : (INDEX - NUM_B + 1);
37 | localparam MAX_A = (INDEX < NUM_A) ? INDEX : NUM_A - 1;
38 | localparam MAX_B = (INDEX < NUM_B) ? INDEX : NUM_B - 1;
39 | localparam LOCAL_NUM = MAX_A - MIN_A + 1;
40 | 
41 | wire signed [7:0] din_a_local_w[0:LOCAL_NUM-1];
42 | wire signed [7:0] din_b_local_w[0:LOCAL_NUM-1];
43 | 
44 | genvar j;
45 | generate
46 | for (j = 0; j < LOCAL_NUM; j=j+1) begin: loopb
47 |     assign din_a_local_w[j] = {1'b0, din_a[j+MIN_A]}; // 9-bit concatenation assigned to an 8-bit wire: the leading 1'b0 is truncated away
48 |     assign din_b_local_w[j] = {1'b0, din_b[MAX_B-j]}; // din_b is walked downwards so that the two indices always add up to INDEX
49 | end
50 | 
51 | wire signed [15+$clog2(LOCAL_NUM):0] dout_w;
52 | 
53 | nx_dot_product_int8 #(.NUM(LOCAL_NUM), .LATENCY(LATENCY)) dot_product(
54 |     .clk(clk),
55 |     .din_a(din_a_local_w),
56 |     .din_b(din_b_local_w),
57 |     .dout(dout_w[15+$clog2(LOCAL_NUM):0])
58 | );    
59 | // Sign-extend to SIZE_OUT when the dot-product result is narrower; otherwise assign it directly.
60 | if (SIZE_OUT > 16+$clog2(LOCAL_NUM))
61 |     assign dout = { {(SIZE_OUT - $clog2(LOCAL_NUM) - 16) {dout_w[15+$clog2(LOCAL_NUM)]}}, dout_w};
62 | else
63 |     assign dout = dout_w;
64 | endgenerate
65 | endmodule
66 | 
67 | 


--------------------------------------------------------------------------------
/rtl/nx_dot6_int8.sv:
--------------------------------------------------------------------------------
 1 | // Copyright 2020 Intel Corporation.
 2 | //
 3 | // This reference design file is subject licensed to you by the terms and
 4 | // conditions of the applicable License Terms and Conditions for Hardware
 5 | // Reference Designs and/or Design Examples (either as signed by you or
 6 | // found at https://www.altera.com/common/legal/leg-license_agreement.html ).
 7 | //
 8 | // As stated in the license, you agree to only use this reference design
 9 | // solely in conjunction with Intel FPGAs or Intel CPLDs.
10 | //
11 | // THE REFERENCE DESIGN IS PROVIDED "AS IS" WITHOUT ANY EXPRESS OR IMPLIED
12 | // WARRANTY OF ANY KIND INCLUDING WARRANTIES OF MERCHANTABILITY,
13 | // NONINFRINGEMENT, OR FITNESS FOR A PARTICULAR PURPOSE. Intel does not
14 | // warrant or assume responsibility for the accuracy or completeness of any
15 | // information, links or other items within the Reference Design and any
16 | // accompanying materials.
17 | //
18 | // In the event that you do not agree with such terms and conditions, do not
19 | // use the reference design file.
20 | /////////////////////////////////////////////////////////////////////////////
21 | // nx_dot6_int8: wraps one fourteennm_dsp_prime block in "vector_fxp" / "select_int8" mode with six 8-bit operand pairs and registers the low 19 bits of its result.
22 | module nx_dot6_int8 (
23 | 	input           clk,
24 | 	input signed  [7:0] din_a1,
25 | 	input signed  [7:0]	din_b1,
26 | 	input signed  [7:0]	din_a2,
27 | 	input signed  [7:0] din_b2,
28 | 	input signed  [7:0] din_a3,
29 | 	input signed  [7:0] din_b3,
30 | 	input signed  [7:0] din_a4,
31 | 	input signed  [7:0] din_b4,
32 | 	input signed  [7:0] din_a5,
33 | 	input signed  [7:0] din_b5,
34 | 	input signed  [7:0] din_a6,
35 | 	input signed  [7:0] din_b6,
36 | 	output reg signed [18:0] dout // dout_w delayed by one clk edge
37 | );
38 | 
39 | wire signed [18:0] dout_w; // low 19 bits of result_l
40 | wire [5:0] tmp; // upper 6 bits of result_l; not used anywhere else in this module
41 | fourteennm_dsp_prime #(
42 | 	.dsp_mode("vector_fxp"),
43 | 	.dsp_sel_int4("select_int8"),
44 | 	.dsp_fp32_sub_en("float_sub_disabled"),
45 | 	.dsp_cascade("cascade_disabled")
46 | )
47 | dsp_prime_wys0 (
48 | 	.ena(1'b1),
49 | 	.clk(clk),
50 | 	.data_in({din_b6,din_a6,din_b5,din_a5,din_b4,din_a4,din_b3,din_a3,din_b2,din_a2,din_b1,din_a1}), // 96-bit bus: pair 1 in the low bytes, pair 6 in the high bytes, a below b within each pair
51 | 	.clr({1'b0,1'b0}),
52 | 	.result_l({tmp,dout_w}),
53 | 	// The remaining control, cascade and accumulate inputs are tied to zero.
54 | 	.load_buf_sel(1'b0),
55 | 	.mode_switch(1'b0),
56 | 	.load_bb_one(1'b0),
57 | 	.load_bb_two(1'b0),
58 | 	.feed_sel(2'b0),
59 | 	.zero_en(1'b0),
60 | 	.shared_exponent(8'h0),
61 | 	.cascade_weight_in(88'h0),
62 | 	.cascade_data_in(96'h0),
63 | 	.acc_en(1'b0),
64 | 	// Cascade outputs are left unconnected.
65 | 	.cascade_weight_out(),
66 | 	.cascade_data_out()
67 | );
68 | always @(posedge clk) begin
69 | 	dout <= dout_w;
70 | end
71 | 
72 | endmodule
73 | 
74 | 


--------------------------------------------------------------------------------
/rtl/nx_dot_product_int8.sv:
--------------------------------------------------------------------------------
  1 | // Copyright 2020 Intel Corporation.
  2 | //
  3 | // This reference design file is subject licensed to you by the terms and
  4 | // conditions of the applicable License Terms and Conditions for Hardware
  5 | // Reference Designs and/or Design Examples (either as signed by you or
  6 | // found at https://www.altera.com/common/legal/leg-license_agreement.html ).
  7 | //
  8 | // As stated in the license, you agree to only use this reference design
  9 | // solely in conjunction with Intel FPGAs or Intel CPLDs.
 10 | //
 11 | // THE REFERENCE DESIGN IS PROVIDED "AS IS" WITHOUT ANY EXPRESS OR IMPLIED
 12 | // WARRANTY OF ANY KIND INCLUDING WARRANTIES OF MERCHANTABILITY,
 13 | // NONINFRINGEMENT, OR FITNESS FOR A PARTICULAR PURPOSE. Intel does not
 14 | // warrant or assume responsibility for the accuracy or completeness of any
 15 | // information, links or other items within the Reference Design and any
 16 | // accompanying materials.
 17 | //
 18 | // In the event that you do not agree with such terms and conditions, do not
 19 | // use the reference design file.
 20 | /////////////////////////////////////////////////////////////////////////////
 21 | // nx_dot_product_int8: dot product of two NUM-element signed 8-bit vectors, built from 6-pair nx_dot6_int8 blocks, an adder tree and optional output delay registers.
 22 | module nx_dot_product_int8 #(
 23 |     parameter NUM = 16, // number of element pairs
 24 |     parameter LATENCY = 4+$clog2((NUM-1)/6+1) // must be >= MIN_LATENCY; each unit above it adds one output register stage
 25 | )(
 26 | 	input           clk,
 27 | 	input signed  [7:0] din_a[0:NUM-1],
 28 | 	input signed  [7:0] din_b[0:NUM-1],
 29 | 	output signed [15+$clog2(NUM):0] dout
 30 | );
 31 | 
 32 | localparam MIN_LATENCY = 4+$clog2((NUM-1)/6+1); // smallest LATENCY accepted by the check below
 33 | localparam DSP_NUM = (NUM - 1) / 6 + 1; // number of 6-pair groups (NUM/6 rounded up)
 34 | 
 35 | localparam NUM_6 = DSP_NUM * 6; // NUM rounded up to a multiple of 6
 36 | 
 37 | wire signed  [7:0] din_a_w[0:NUM_6-1];
 38 | wire signed  [7:0] din_b_w[0:NUM_6-1];
 39 | 
 40 | wire signed [18:0] dsp_out[0:DSP_NUM-1];
 41 | genvar i;
 42 | generate
 43 | // Copy the inputs and zero-fill the entries beyond NUM.
 44 | for (i = 0; i < NUM_6; i=i+1) begin: loop1
 45 |     if (i < NUM) begin
 46 |         assign din_a_w[i] = din_a[i];
 47 |         assign din_b_w[i] = din_b[i];
 48 |     end
 49 |     else
 50 |     begin
 51 |         assign din_a_w[i] = 0;
 52 |         assign din_b_w[i] = 0;
 53 |     end
 54 |     
 55 | end
 56 | // One nx_dot6_int8 per group, except that a last group holding a single real pair (NUM_6 - NUM == 5) uses an 8x8 axbs multiplier instead.
 57 | for (i = 0; i < DSP_NUM; i=i+1) begin: loop2
 58 |     if ((i < DSP_NUM-1) || (NUM_6 - NUM != 5)) begin
 59 |         nx_dot6_int8 dot (
 60 |             .clk(clk),
 61 |             .din_a1(din_a_w[6*i]),
 62 |             .din_b1(din_b_w[6*i]),
 63 |             .din_a2(din_a_w[6*i+1]),
 64 |             .din_b2(din_b_w[6*i+1]),
 65 |             .din_a3(din_a_w[6*i+2]),
 66 |             .din_b3(din_b_w[6*i+2]),
 67 |             .din_a4(din_a_w[6*i+3]),
 68 |             .din_b4(din_b_w[6*i+3]),
 69 |             .din_a5(din_a_w[6*i+4]),
 70 |             .din_b5(din_b_w[6*i+4]),
 71 |             .din_a6(din_a_w[6*i+5]),
 72 |             .din_b6(din_b_w[6*i+5]),
 73 |             .dout(dsp_out[i])
 74 |         );
 75 |     end else begin
 76 |         axbs #(.SIZE_A(8), .SIZE_B(8)) mult (
 77 |             .clk(clk),
 78 |             .din_a(din_a_w[6*i]),
 79 |             .din_b(din_b_w[6*i]),
 80 |             .dout(dsp_out[i][15:0])
 81 |         );
 82 |         assign dsp_out[i][18:16] = {3{dsp_out[i][15]}}; // sign-extend the 16-bit product to 19 bits
 83 |     end
 84 | end
 85 | 
 86 | wire signed [15+$clog2(NUM):0] dout_ww; // result before the optional delay stages
 87 | // With more than one group the partial results go through adder_tree; the sum is truncated to the output width.
 88 | if (DSP_NUM > 1) begin
 89 |     wire signed [18+$clog2(DSP_NUM):0] dout_w;
 90 |     adder_tree #(.SIZE(19), .NUM(DSP_NUM)) adder_tree_inst ( 
 91 |        .clk(clk),
 92 |        .din(dsp_out),
 93 |        .dout(dout_w)
 94 |     );
 95 |     assign dout_ww = dout_w[15+$clog2(NUM):0];
 96 | end else begin
 97 |     assign dout_ww = dsp_out[0][15+$clog2(NUM):0];
 98 | end
 99 | // Output stage: exactly one of the three branches below is elaborated.
100 | integer j;
101 | if (LATENCY < MIN_LATENCY) begin
102 |     initial begin
103 |         $fatal(1, "Specified latency %0d is too small", LATENCY); // first argument of $fatal is the finish_number, the format string follows it
104 |     end
105 | end else if (LATENCY == MIN_LATENCY) begin // chained with else so the delay line is never elaborated with a negative depth
106 |     assign dout = dout_ww;
107 | end else begin
108 |     reg signed [15+$clog2(NUM):0] dout_r[0:LATENCY-MIN_LATENCY-1]; // LATENCY-MIN_LATENCY extra register stages
109 |     always @(posedge clk) begin
110 | 		dout_r[0] <= dout_ww;
111 |         for (j = 1; j < LATENCY-MIN_LATENCY; j=j+1) begin
112 |             dout_r[j] <= dout_r[j-1];
113 |         end
114 | 	end
115 |     assign dout = dout_r[LATENCY-MIN_LATENCY-1];
116 | end
117 | 
118 | 
119 | endgenerate
120 | 
121 | endmodule
122 | 
123 | 


--------------------------------------------------------------------------------
/rtl/pipeline_interconnect.sv:
--------------------------------------------------------------------------------
 1 | 
 2 | // pipeline_interconnect: fixed LATENCY-stage register chain carrying a DATAW-bit word from i_pipe_in to o_pipe_out.
 3 | (* altera_attribute = "-name AUTO_SHIFT_REGISTER_RECOGNITION OFF; -name DONT_MERGE_REGISTER ON" *) module pipeline_interconnect # (
 4 | 	parameter DATAW = 32,
 5 | 	parameter LATENCY = 2
 6 | ) (
 7 | 	input 				clk,
 8 | 	input 				rst,
 9 | 	input  [DATAW-1:0] 	i_pipe_in,
10 | 	output [DATAW-1:0] 	o_pipe_out
11 | );
12 | 
13 | reg [DATAW-1:0] pipeline [0:LATENCY-1];
14 | 
15 | integer stage;
16 | always @ (posedge clk) begin
17 | 	// Move every stage one step toward the output, walking from the last stage back to stage 1
18 | 	for (stage = LATENCY - 1; stage > 0; stage = stage - 1) begin
19 | 		pipeline[stage] <= pipeline[stage-1];
20 | 	end
21 | 
22 | 	// Stage 0 captures the incoming word (non-blocking, so statement order does not matter)
23 | 	pipeline[0] <= i_pipe_in;
24 | end
25 | 
26 | // The last stage drives the output
27 | assign o_pipe_out = pipeline[LATENCY-1];
28 | 
29 | endmodule


--------------------------------------------------------------------------------
/rtl/ram.sv:
--------------------------------------------------------------------------------
 1 | `include "npu.vh"
 2 | 
 3 | module ram # ( // Dual-port RAM wrapper around altera_syncram: port A only writes, port B only reads, both clocked by clk
 4 | 	parameter MODULE_ID = "",
 5 | 	parameter OUTREG = "CLOCK0", // forwarded to outdata_reg_b (read-port output register setting)
 6 | 	parameter ID = 0,
 7 | 	parameter ID_UNITS = (ID%10) + 8'h30, // ASCII code of the ones digit of ID
 8 | 	parameter ID_TENS = (ID/10 == 0)? "": ((ID/10)%10) + 8'h30, // ASCII tens digit, empty string when ID < 10
 9 | 	parameter ID_HUNDREDS = (ID/100 == 0)? "": (ID/100) + 8'h30, // ASCII hundreds digit, empty string when ID < 100; no modulo, so only a digit for ID < 1000
10 | 	parameter DW = 32, // data width in bits (width_a / width_b)
11 | 	parameter DEPTH = 512, // number of words (numwords_a / numwords_b)
12 | 	parameter AW = 9, // address width in bits (widthad_a / widthad_b)
13 | 	parameter RTL_DIR = `RTL_DIR,
14 | 	parameter TARGET_FPGA = `TARGET_FPGA
15 | )(
16 | 	input           wr_en, 
17 | 	input  [AW-1:0] wr_addr, 
18 | 	input  [AW-1:0] rd_addr,
19 | 	input  [DW-1:0] wr_data,
20 | 	output [DW-1:0] rd_data,
21 | 	input 			clk, 
22 | 	input 			rst // not referenced inside this module
23 | );
24 | 
25 | wire [DW-1:0] sub_wire0;
26 | assign rd_data = sub_wire0[DW-1:0];
27 | 
28 | 
29 | localparam RAM_SRC = {RTL_DIR, "mif_files/", MODULE_ID, ID_HUNDREDS, ID_TENS, ID_UNITS, ".mif"};      // MIF path built from MODULE_ID and the ID digits; only referenced by the commented-out init_file below
30 | 
31 | altera_syncram  altera_syncram_component (
32 | 	.address_a 		(wr_addr), // port A carries the write address/data/enable
33 | 	.address_b 		(rd_addr), // port B carries the read address
34 | 	.clock0 			(clk),
35 | 	.data_a 			(wr_data),
36 | 	.wren_a 			(wr_en),
37 | 	.q_b 				(sub_wire0),
38 | 	.aclr0 			(1'b0),
39 | 	.aclr1 			(1'b0),
40 | 	.address2_a 	(1'b1),
41 | 	.address2_b 	(1'b1),
42 | 	.addressstall_a(1'b0),
43 | 	.addressstall_b(1'b0),
44 | 	.byteena_a 		(1'b1),
45 | 	.byteena_b 		(1'b1),
46 | 	.clock1 			(1'b1),
47 | 	.clocken0 		(1'b1),
48 | 	.clocken1 		(1'b1),
49 | 	.clocken2 		(1'b1),
50 | 	.clocken3 		(1'b1),
51 | 	.data_b 			({(DW){1'b1}}),
52 | 	.eccencbypass 	(1'b0),
53 | 	.eccencparity 	(8'b0),
54 | 	.eccstatus 		(),
55 | 	.q_a 				(),
56 | 	.rden_a 			(1'b1),
57 | 	.rden_b 			(1'b1),
58 | 	.sclr 			(1'b0),
59 | 	.wren_b 			(1'b0) // port B never writes
60 | );
61 | 
62 | defparam
63 | 	altera_syncram_component.address_aclr_b  = "NONE",
64 | 	altera_syncram_component.address_reg_b  = "CLOCK0",
65 | 	altera_syncram_component.clock_enable_input_a  = "BYPASS",
66 | 	altera_syncram_component.clock_enable_input_b  = "BYPASS",
67 | 	altera_syncram_component.clock_enable_output_b  = "BYPASS",
68 | 	//altera_syncram_component.init_file = RAM_SRC,
69 | 	altera_syncram_component.enable_ecc  = "FALSE",
70 | 	altera_syncram_component.intended_device_family  = TARGET_FPGA,
71 | 	altera_syncram_component.lpm_type  = "altera_syncram",
72 | 	altera_syncram_component.numwords_a  = DEPTH,
73 | 	altera_syncram_component.numwords_b  = DEPTH,
74 | 	altera_syncram_component.operation_mode  = "DUAL_PORT",
75 | 	altera_syncram_component.outdata_aclr_b  = "NONE",
76 | 	altera_syncram_component.outdata_sclr_b  = "NONE",
77 | 	altera_syncram_component.outdata_reg_b  = OUTREG,
78 | 	altera_syncram_component.power_up_uninitialized  = "FALSE",
79 | 	altera_syncram_component.ram_block_type  = "M20K",
80 | 	altera_syncram_component.read_during_write_mode_mixed_ports  = "DONT_CARE", // NOTE(review): a read of an address written in the same cycle is presumably undefined - confirm against callers
81 | 	altera_syncram_component.widthad_a  = AW,
82 | 	altera_syncram_component.widthad_b  = AW,
83 | 	altera_syncram_component.width_a  = DW,
84 | 	altera_syncram_component.width_b  = DW,
85 | 	altera_syncram_component.width_byteena_a  = 1;
86 | 
87 | 
88 | // Debug
89 | //   always @ (posedge clk) begin
90 | //      if(wr_en) 
91 | //          $display("[%0t][%s] wr ram%d[%d] = %x(%d,%d,%d,%d)", 
92 | //          $time, `__FILE__, ID, wr_addr, wr_data, wr_data[7:0], wr_data[15:8],
93 | //          wr_data[23:16], wr_data[31:24]);
94 | //   end
95 | 
96 | endmodule
97 | 
98 | 


--------------------------------------------------------------------------------
/rtl/run_sim.sh:
--------------------------------------------------------------------------------
 1 | # Drive setup.sh for the npu_tb top level; standard output is captured in rtl_sim_log.
 2 | TOP_LEVEL_NAME=npu_tb
 3 | SKIP_SIM=1
 4 | USER_DEFINED_ELAB_OPTIONS="+vcs+lic+wait -debug_access+pp npu_tb.sv"
 5 | QUARTUS_INSTALL_DIR=${QUARTUS_ROOTDIR}
 6 | QSYS_SIMDIR="."
 7 | 
 8 | # Every NAME=value argument is eval'ed by setup.sh; the escaped quotes keep the
 9 | # space-separated elaboration options together as one assignment.
10 | sh setup.sh \
11 |   QUARTUS_INSTALL_DIR=${QUARTUS_INSTALL_DIR} \
12 |   USER_DEFINED_ELAB_OPTIONS="\"${USER_DEFINED_ELAB_OPTIONS}\"" \
13 |   SKIP_SIM=${SKIP_SIM} \
14 |   TOP_LEVEL_NAME=${TOP_LEVEL_NAME} > rtl_sim_log
15 | 
16 | #./simv +vcs+lic+wait
17 | 


--------------------------------------------------------------------------------
/rtl/self_tester_tb.v:
--------------------------------------------------------------------------------
 1 | `timescale 1 ps / 1 ps
 2 | 
 3 | // Testbench for self_tester_shim: drives a free-running clock and one start-up reset pulse,
 4 | // and exposes the shim's status, result count, performance counter and done flag as wires.
 5 | module self_tester_tb; 
 6 |   reg clk, reset; 
 7 |   wire [2:0] status; 
 8 |   wire [31:0] count;
 9 |   wire [31:0] perf_counter;
10 |   wire done;
11 |     
12 | self_tester_shim uut (
13 | 	.clk(clk),
14 | 	.reset(reset),
15 | 	.o_test_status(status),
16 | 	.o_result_count(count),
17 | 	.o_perf_counter(perf_counter),
18 | 	.o_test_done(done)
19 | );
20 |     
21 | // Single initial block for all time-0 stimulus. reset used to be written at time 0 from two
22 | // separate initial blocks (one to 0, one to 1); the execution order of initial blocks is not
23 | // guaranteed, so the reset pulse could be lost depending on the simulator.
24 | initial begin
25 | 	clk = 0; 
26 | 	reset = 1; #20   // hold reset high for 20 time units (two full clock periods)
27 | 	reset = 0; 
28 | end 
29 |     
30 | // Toggle the clock every 5 time units (10-unit period)
31 | always  
32 | 	#5 clk = !clk; 
33 |     
34 | endmodule
35 | 


--------------------------------------------------------------------------------
/rtl/setup.sh:
--------------------------------------------------------------------------------
 1 | # Compiles the design files in the current directory with vcs against the Quartus
 2 | # simulation libraries. Every variable initialized below can be overridden from the
 3 | # command line with NAME=value arguments.
 4 | 
 5 | # initialize variables
 6 | TOP_LEVEL_NAME="npu_tb"
 7 | 
 8 | QUARTUS_INSTALL_DIR=$QUARTUS_ROOTDIR
 9 | SKIP_SIM=1
10 | #USER_DEFINED_ELAB_OPTIONS="+vcs+lic+wait -debug_access+pp"
11 | USER_DEFINED_ELAB_OPTIONS="+vcs+lic+wait"
12 | USER_DEFINED_ELAB_OPTIONS_APPEND=""
13 | USER_DEFINED_SIM_OPTIONS=""
14 | 
15 | # ----------------------------------------
16 | # overwrite variables
17 | # This block evaluates each command line argument, typically used for 
18 | # overwriting variables. An example usage:
19 | #   sh <simulator>_setup.sh SKIP_SIM=1
20 | for expression in "$@"; do
21 |   # Quoted so the argument reaches eval exactly as passed (no word splitting or globbing first)
22 |   eval "$expression"
23 |   # Capture eval's status immediately: after the test and the echo below, $? would be 0
24 |   # and the script would report success for an invalid argument.
25 |   status=$?
26 |   if [ $status -ne 0 ]; then
27 |     echo "Error: This command line argument, \"$expression\", is/has an invalid expression." >&2
28 |     exit $status
29 |   fi
30 | done
31 | 
32 | #-------------------------------------------
33 | # check tclsh version no earlier than 8.5 
34 | version=$(echo "puts [package vcompare [info tclversion] 8.5]; exit" | tclsh)
35 | if [ $version -eq -1 ]; then 
36 |   echo "Error: Minimum required tcl package version is 8.5." >&2 
37 |   exit 1 
38 | fi 
39 | 
40 | ELAB_OPTIONS=""
41 | 
42 | # Left unquoted at the point of use so the shell expands the globs into the file list
43 | design_files="*.sv *.v"
44 | 
45 | vcs -lca -timescale=1ps/1ps -sverilog +verilog2001ext+.v $USER_DEFINED_ELAB_OPTIONS \
46 |   -v $QUARTUS_INSTALL_DIR/eda/sim_lib/altera_primitives.v \
47 |   -v $QUARTUS_INSTALL_DIR/eda/sim_lib/220model.v \
48 |   -v $QUARTUS_INSTALL_DIR/eda/sim_lib/sgate.v \
49 |   -v $QUARTUS_INSTALL_DIR/eda/sim_lib/altera_mf.v \
50 |   $QUARTUS_INSTALL_DIR/eda/sim_lib/fourteennm_atoms.sv \
51 |   $QUARTUS_INSTALL_DIR/eda/sim_lib/synopsys/fourteennm_atoms_ncrypt.sv \
52 |   $design_files \
53 |   $USER_DEFINED_ELAB_OPTIONS_APPEND \
54 |   -top $TOP_LEVEL_NAME -R #-gui &
55 | 
56 | # ----------------------------------------
57 | # simulate
58 | # Only runs when SKIP_SIM is overridden to 0 (the default above is 1)
59 | if [ $SKIP_SIM -eq 0 ]; then
60 |   ./simv $SIM_OPTIONS $USER_DEFINED_SIM_OPTIONS
61 | fi
62 | 
63 | #$QUARTUS_INSTALL_DIR/eda/sim_lib/altera_lnsim.sv \
64 | #$QUARTUS_INSTALL_DIR/eda/sim_lib/ct1_hssi_atoms.sv \
65 | #$QUARTUS_INSTALL_DIR/eda/sim_lib/synopsys/ct1_hssi_atoms_ncrypt.sv \
66 | #$QUARTUS_INSTALL_DIR/eda/sim_lib/synopsys/cr3v0_serdes_models_ncrypt.sv \
67 | #$QUARTUS_INSTALL_DIR/eda/sim_lib/ct1_hip_atoms.sv \
68 | #$QUARTUS_INSTALL_DIR/eda/sim_lib/synopsys/ct1_hip_atoms_ncrypt.sv \
69 | 


--------------------------------------------------------------------------------
/rtl/sigmoid.sv:
--------------------------------------------------------------------------------
  1 | `include "npu.vh"
  2 | 
  3 | module sigmoid # ( // Pipelined sigmoid: registers |x|, derives a ROM index and range flags, then selects a constant, the ROM value, or (constant - ROM value)
  4 | 	parameter DW = 32,
  5 | 	parameter IF = 19, // not referenced by the live code of this module
  6 | 	parameter OF = 19, // bit position of the single 1 in the saturation constant below (presumably the number of output fraction bits - confirm)
  7 | 	parameter SAMPLES = 512, // number of ROM words (numwords_a)
  8 | 	parameter AW = $clog2(SAMPLES),
  9 | 	parameter RTL_DIR = `RTL_DIR,
 10 | 	parameter TARGET_FPGA = `TARGET_FPGA
 11 | ) (
 12 | 	input  clk,
 13 | 	input  rst, // synchronous; clears abs_x, is_neg, is_big, res and index
 14 | 	input  [DW-1:0] x,
 15 | 	output [DW-1:0] result
 16 | );
 17 | 
 18 | reg [AW-1:0] index;
 19 | reg [DW-1:0] abs_x;
 20 | reg is_neg, is_neg_reg, is_big;
 21 | reg [DW-1:0] res;
 22 | wire [DW-1:0] lookup; // q_a output of the ROM addressed by index
 23 | 
 24 | always @ (posedge clk) begin
 25 | 	if(rst) begin
 26 | 		abs_x <= 0;
 27 | 		is_neg <= 0;
 28 | 		is_big <= 0;
 29 | 		res <= 0;
 30 | 		index <= 0;
 31 | 		//lookup <= 0;
 32 | 	end else begin // note: is_neg_reg is not cleared by rst
 33 | 		//Cycle 1: Get abs x
 34 | 		if(x[DW-1]) begin // MSB set: treat x as negative and store its two's-complement negation
 35 | 			abs_x <= -x;
 36 | 			is_neg <= 1'b1;
 37 | 		end else begin
 38 | 			abs_x <= x;
 39 | 			is_neg <= 1'b0;
 40 | 		end
 41 | 		
 42 | 		//Cycle 2: Get index & do comparisons
 43 | 		if(abs_x > 4194304) begin // 4194304 = 2^22; above this the output is a constant instead of a ROM value
 44 | 			is_big <= 1'b1;
 45 | 		end else begin
 46 | 			is_big <= 1'b0;
 47 | 		end
 48 | 		index <= AW'(abs_x[DW-6:DW-16] + abs_x[DW-17]); // bits [DW-6:DW-16] plus the next lower bit as a round-up, cast to AW bits
 49 | 		//lookup <= sigmoid_LUT[index];
 50 | 		is_neg_reg <= is_neg;
 51 | 		
 52 | 		//Cycle 3: Choose output
 53 | 		case({is_neg_reg, is_big})
 54 | 			2'b01: res <= {{(DW-OF-1){1'b0}}, 1'b1, {(OF){1'b0}}}; // large positive input: constant with only bit OF set
 55 | 			2'b11: res <= {(DW){1'b0}}; // large negative input: zero
 56 | 			2'b00: res <= lookup; // in-range positive input: ROM value
 57 | 			2'b10: res <= {{(DW-OF-1){1'b0}}, 1'b1, {(OF){1'b0}}} - lookup; // in-range negative input: same constant minus the ROM value
 58 | 			default: res <= 0;
 59 | 		endcase
 60 | 	end
 61 | end
 62 | 
 63 | /*reg [AW-1:0] index_2;
 64 | reg [DW-1:0] abs_x_2;
 65 | reg is_neg_2, is_neg_reg_2, is_big_2;
 66 | reg [DW-1:0] res_2;
 67 | wire [DW-1:0] lookup_2;
 68 | 
 69 | always @ (posedge clk) begin
 70 | 	if(rst) begin
 71 | 		abs_x_2 <= 0;
 72 | 		is_neg_2 <= 0;
 73 | 		is_big_2 <= 0;
 74 | 		res_2 <= 0;
 75 | 		index_2 <= 0;
 76 | 		//lookup <= 0;
 77 | 	end else begin
 78 | 		//Cycle 1: Get abs x
 79 | 		if(x_2[DW-1]) begin
 80 | 			abs_x_2 <= -x_2;
 81 | 			is_neg_2 <= 1'b1;
 82 | 		end else begin
 83 | 			abs_x_2 <= x_2;
 84 | 			is_neg_2 <= 1'b0;
 85 | 		end
 86 | 		
 87 | 		//Cycle 2: Get index & do comparisons
 88 | 		if(abs_x_2 > 4194304) begin
 89 | 			is_big_2 <= 1'b1;
 90 | 		end else begin
 91 | 			is_big_2 <= 1'b0;
 92 | 		end
 93 | 		index_2 <= abs_x_2[DW-6:DW-16] + abs_x_2[DW-17]; 
 94 | 		//lookup <= sigmoid_LUT[index];
 95 | 		is_neg_reg_2 <= is_neg_2;
 96 | 		
 97 | 		//Cycle 3: Choose output
 98 | 		case({is_neg_reg_2, is_big_2})
 99 | 			2'b01: res_2 <= {{(DW-OF-1){1'b0}}, 1'b1, {(OF){1'b0}}};
100 | 			2'b11: res_2 <= {(DW){1'b0}};
101 | 			2'b00: res_2 <= lookup_2;
102 | 			2'b10: res_2 <= {{(DW-OF-1){1'b0}}, 1'b1, {(OF){1'b0}}} - lookup_2;
103 | 			default: res_2 <= 0;
104 | 		endcase
105 | 	end
106 | end*/
107 | 
108 | assign result = res;
109 | /*assign result_2 = res_2;
110 | 
111 | altera_syncram  altera_syncram_component (
112 |                 .address_a (index),
113 |                 .address_b (index_2),
114 |                 .clock0 (clk),
115 |                 .data_a ({(DW){1'b1}}),
116 |                 .data_b ({(DW){1'b1}}),
117 |                 .wren_a (1'b0),
118 |                 .wren_b (1'b0),
119 |                 .q_a (lookup),
120 |                 .q_b (lookup_2),
121 |                 .aclr0 (),
122 |                 .aclr1 (),
123 |                 .address2_a (1'b1),
124 |                 .address2_b (1'b1),
125 |                 .addressstall_a (1'b0),
126 |                 .addressstall_b (1'b0),
127 |                 .byteena_a (1'b1),
128 |                 .byteena_b (1'b1),
129 |                 .clock1 (1'b1),
130 |                 .clocken0 (1'b1),
131 |                 .clocken1 (1'b1),
132 |                 .clocken2 (1'b1),
133 |                 .clocken3 (1'b1),
134 |                 .eccencbypass (1'b0),
135 |                 .eccencparity (8'b0),
136 |                 .eccstatus (),
137 |                 .rden_a (1'b1),
138 |                 .rden_b (1'b1),
139 |                 .sclr (1'b0)
140 | );
141 | defparam
142 | 	altera_syncram_component.address_reg_b  = "CLOCK0",
143 | 	altera_syncram_component.clock_enable_input_a  = "BYPASS",
144 | 	altera_syncram_component.clock_enable_input_b  = "BYPASS",
145 | 	altera_syncram_component.clock_enable_output_a  = "BYPASS",
146 | 	altera_syncram_component.clock_enable_output_b  = "BYPASS",
147 | 	altera_syncram_component.indata_reg_b  = "CLOCK0",
148 | 	altera_syncram_component.init_file = "sigmoid.mif",
149 | 	altera_syncram_component.intended_device_family  = "Stratix 10",
150 | 	altera_syncram_component.lpm_type  = "altera_syncram",
151 | 	altera_syncram_component.numwords_a  = SAMPLES,
152 | 	altera_syncram_component.numwords_b  = SAMPLES,
153 | 	altera_syncram_component.operation_mode  = "BIDIR_DUAL_PORT",
154 | 	altera_syncram_component.outdata_aclr_a  = "NONE",
155 | 	altera_syncram_component.outdata_aclr_b  = "NONE",
156 | 	altera_syncram_component.outdata_sclr_a  = "NONE",
157 | 	altera_syncram_component.outdata_sclr_b  = "NONE",
158 | 	altera_syncram_component.outdata_reg_a  = "CLOCK0",
159 | 	altera_syncram_component.outdata_reg_b  = "CLOCK0",
160 | 	altera_syncram_component.enable_force_to_zero  = "TRUE",
161 | 	altera_syncram_component.power_up_uninitialized  = "FALSE",
162 | 	altera_syncram_component.ram_block_type  = "M20K",
163 | 	altera_syncram_component.widthad_a  = AW,
164 | 	altera_syncram_component.widthad_b  = AW,
165 | 	altera_syncram_component.width_a  = DW,
166 | 	altera_syncram_component.width_b  = DW,
167 | 	altera_syncram_component.width_byteena_a  = 1,
168 | 	altera_syncram_component.width_byteena_b  = 1;*/
169 | 
170 | altera_syncram  altera_syncram_component ( // single-port ROM holding the lookup table; only port A is used
171 | 	 .address_a (index),
172 | 	 .clock0 (clk),
173 | 	 .q_a (lookup),
174 | 	 .aclr0 (1'b0),
175 | 	 .aclr1 (1'b0),
176 | 	 .address2_a (1'b1),
177 | 	 .address2_b (1'b1),
178 | 	 .address_b (1'b1),
179 | 	 .addressstall_a (1'b0),
180 | 	 .addressstall_b (1'b0),
181 | 	 .byteena_a (1'b1),
182 | 	 .byteena_b (1'b1),
183 | 	 .clock1 (1'b1),
184 | 	 .clocken0 (1'b1),
185 | 	 .clocken1 (1'b1),
186 | 	 .clocken2 (1'b1),
187 | 	 .clocken3 (1'b1),
188 | 	 .data_a ({(DW){1'b1}}),
189 | 	 .data_b (1'b1),
190 | 	 .eccencbypass (1'b0),
191 | 	 .eccencparity (8'b0),
192 | 	 .eccstatus ( ),
193 | 	 .q_b ( ),
194 | 	 .rden_a (1'b1),
195 | 	 .rden_b (1'b1),
196 | 	 .sclr (1'b0),
197 | 	 .wren_a (1'b0), // never written
198 | 	 .wren_b (1'b0)
199 | );
200 | defparam
201 | 	altera_syncram_component.address_aclr_a  = "NONE",
202 | 	altera_syncram_component.clock_enable_input_a  = "BYPASS",
203 | 	altera_syncram_component.clock_enable_output_a  = "BYPASS",
204 | 	altera_syncram_component.init_file = {RTL_DIR, "sigmoid.mif"}, // table contents come from sigmoid.mif under RTL_DIR
205 | 	altera_syncram_component.intended_device_family  = TARGET_FPGA,
206 | 	altera_syncram_component.lpm_hint  = "ENABLE_RUNTIME_MOD=NO",
207 | 	altera_syncram_component.lpm_type  = "altera_syncram",
208 | 	altera_syncram_component.numwords_a  = SAMPLES,
209 | 	altera_syncram_component.operation_mode  = "ROM",
210 | 	altera_syncram_component.outdata_aclr_a  = "NONE",
211 | 	altera_syncram_component.outdata_sclr_a  = "NONE",
212 | 	altera_syncram_component.outdata_reg_a  = "CLOCK0", // NOTE(review): registered ROM output adds read latency relative to is_big/is_neg_reg - confirm pipeline alignment
213 | 	altera_syncram_component.ram_block_type  = "M20K",
214 | 	altera_syncram_component.enable_force_to_zero  = "FALSE",
215 | 	altera_syncram_component.widthad_a  = AW,
216 | 	altera_syncram_component.width_a  = DW,
217 | 	altera_syncram_component.width_byteena_a  = 1;
218 | 	
219 | endmodule
220 | 


--------------------------------------------------------------------------------
/rtl/star_interconnect.sv:
--------------------------------------------------------------------------------
 1 | 
 2 | // Star-shaped interconnect: one source fans out to END_POINTS sinks, each through its own chain of LATENCY pipeline registers.
 3 | (* altera_attribute = "-name AUTO_SHIFT_REGISTER_RECOGNITION OFF; -name DONT_MERGE_REGISTER ON" *) module star_interconnect # (
 4 | 	parameter END_POINTS = 4,
 5 | 	parameter DATAW = 32,
 6 | 	parameter LATENCY = 2
 7 | ) (
 8 | 	input 				clk,
 9 | 	input 				rst,
10 | 	input  [DATAW-1:0] 	i_star_in,
11 | 	output [DATAW-1:0] 	o_star_out [0:END_POINTS-1]
12 | );
13 | 
14 | // pipeline[stage][sink]: a separate register chain per sink
15 | reg [DATAW-1:0] pipeline [0:LATENCY-1][0:END_POINTS-1];
16 | 
17 | integer stage, sink;
18 | always @ (posedge clk) begin
19 | 	for (sink = 0; sink < END_POINTS; sink = sink + 1) begin
20 | 		if (rst) begin
21 | 			// Only the first stage is cleared on reset; later stages keep their contents
22 | 			pipeline[0][sink] <= 'd0;
23 | 		end else begin
24 | 			// Load the source word, then shift the rest of this sink's chain
25 | 			pipeline[0][sink] <= i_star_in;
26 | 			for (stage = 1; stage < LATENCY; stage = stage + 1) begin
27 | 				pipeline[stage][sink] <= pipeline[stage-1][sink];
28 | 			end
29 | 		end
30 | 	end
31 | end
32 | 
33 | // Drive every sink from the last stage of its chain
34 | assign o_star_out = pipeline[LATENCY-1];
35 | 
36 | endmodule


--------------------------------------------------------------------------------
/rtl/tanh.sv:
--------------------------------------------------------------------------------
  1 | `include "npu.vh"
  2 | 
  3 | module tanh # ( // Pipelined tanh: registers |x|, derives a ROM index and range flags, then selects a constant, |x| itself, or the ROM value, negated for negative inputs
  4 | 	parameter DW = 32,
  5 | 	parameter IF = 19, // width of the low slice of abs_x compared in the is_lin test (presumably the number of input fraction bits - confirm)
  6 | 	parameter OF = 19, // bit position used to build the two saturation constants below (presumably the number of output fraction bits - confirm)
  7 | 	parameter SAMPLES = 512, // number of ROM words (numwords_a)
  8 | 	parameter AW = $clog2(SAMPLES),
  9 | 	parameter RTL_DIR = `RTL_DIR,
 10 | 	parameter TARGET_FPGA = `TARGET_FPGA
 11 | ) (
 12 | 	input  clk,
 13 | 	input  rst, // synchronous; clears abs_x, is_neg, is_lin, is_big, res and index
 14 | 	input  [DW-1:0] x,
 15 | 	output [DW-1:0] result
 16 | );
 17 | 
 18 | reg [AW-1:0] index;
 19 | reg [DW-1:0] abs_x, abs_x_reg;
 20 | reg is_neg, is_neg_reg, is_lin, is_big;
 21 | reg [DW-1:0] res;
 22 | wire [DW-1:0] lookup; // q_a output of the ROM addressed by index
 23 | 
 24 | always @ (posedge clk) begin
 25 | 	if(rst) begin
 26 | 		abs_x <= 0;
 27 | 		is_neg <= 0;
 28 | 		is_lin <= 0;
 29 | 		is_big <= 0;
 30 | 		res <= 0;
 31 | 		index <= 0;
 32 | 		//lookup <= 0;
 33 | 	end else begin // note: abs_x_reg and is_neg_reg are not cleared by rst
 34 | 		//Cycle 1: Get abs x
 35 | 		if(x[DW-1]) begin // MSB set: treat x as negative and store its two's-complement negation
 36 | 			abs_x <= -x;
 37 | 			is_neg <= 1'b1;
 38 | 		end else begin
 39 | 			abs_x <= x;
 40 | 			is_neg <= 1'b0;
 41 | 		end
 42 | 		
 43 | 		//Cycle 2: Get index & do comparisons
 44 | 		if(abs_x > 2097152) begin // 2097152 = 2^21; above this the output is a constant instead of a ROM value
 45 | 			is_big <= 1'b1;
 46 | 		end else begin
 47 | 			is_big <= 1'b0;
 48 | 		end		
 49 | 		if((abs_x < 524288) && (abs_x[IF-1:0] <= {4'b0001, {(IF-4){1'b0}}})) begin // 524288 = 2^19; small inputs are passed through unchanged
 50 | 			is_lin <= 1'b1;
 51 | 		end else begin
 52 | 			is_lin <= 1'b0;
 53 | 		end
 54 | 		index <= AW'(abs_x[DW-7:DW-17] + abs_x[DW-18]); // bits [DW-7:DW-17] plus the next lower bit as a round-up, cast to AW bits
 55 | 		//lookup <= tanh_LUT[index];
 56 | 		abs_x_reg <= abs_x;
 57 | 		is_neg_reg <= is_neg;
 58 | 		
 59 | 		//Cycle 3: Choose output
 60 | 		case({is_neg_reg, is_big, is_lin})
 61 | 			3'b010: res <= {{(DW-OF-1){1'b0}}, 1'b1, {(OF){1'b0}}}; // large positive input: constant with only bit OF set
 62 | 			3'b110: res <= {{(DW-OF){1'b1}}, {(OF){1'b0}}}; // large negative input: two's-complement negation of that constant
 63 | 			3'b001: res <= abs_x_reg; // small positive input: output equals the input magnitude
 64 | 			3'b101: res <= -abs_x_reg; // small negative input: negated magnitude
 65 | 			3'b000: res <= lookup; // mid-range positive input: ROM value
 66 | 			3'b100: res <= -lookup; // mid-range negative input: negated ROM value
 67 | 			default: res <= 0; // is_big and is_lin both set
 68 | 		endcase
 69 | 	end
 70 | end
 71 | 
 72 | /*reg [AW-1:0] index_2;
 73 | reg [DW-1:0] abs_x_2, abs_x_reg_2;
 74 | reg is_neg_2, is_neg_reg_2, is_lin_2, is_big_2;
 75 | reg [DW-1:0] res_2;
 76 | wire [DW-1:0] lookup_2;
 77 | 
 78 | always @ (posedge clk) begin
 79 | 	if(rst) begin
 80 | 		abs_x_2 <= 0;
 81 | 		is_neg_2 <= 0;
 82 | 		is_lin_2 <= 0;
 83 | 		is_big_2 <= 0;
 84 | 		res_2 <= 0;
 85 | 		index_2 <= 0;
 86 | 		//lookup <= 0;
 87 | 	end else begin
 88 | 		//Cycle 1: Get abs x
 89 | 		if(x_2[DW-1]) begin
 90 | 			abs_x_2 <= -x_2;
 91 | 			is_neg_2 <= 1'b1;
 92 | 		end else begin
 93 | 			abs_x_2 <= x_2;
 94 | 			is_neg_2 <= 1'b0;
 95 | 		end
 96 | 		
 97 | 		//Cycle 2: Get index & do comparisons
 98 | 		if(abs_x_2 > 2097152) begin
 99 | 			is_big_2 <= 1'b1;
100 | 		end else begin
101 | 			is_big_2 <= 1'b0;
102 | 		end		
103 | 		if((abs_x_2 < 524288) && (abs_x_2[IF-1:0] <= {4'b0001, {(IF-4){1'b0}}})) begin
104 | 			is_lin_2 <= 1'b1;
105 | 		end else begin
106 | 			is_lin_2 <= 1'b0;
107 | 		end
108 | 		index_2 <= abs_x_2[DW-7:DW-17] + abs_x_2[DW-18]; 
109 | 		//lookup <= tanh_LUT[index];
110 | 		abs_x_reg_2 <= abs_x_2;
111 | 		is_neg_reg_2 <= is_neg_2;
112 | 		
113 | 		//Cycle 3: Choose output
114 | 		case({is_neg_reg_2, is_big_2, is_lin_2})
115 | 			3'b010: res_2 <= {{(DW-OF-1){1'b0}}, 1'b1, {(OF){1'b0}}};
116 | 			3'b110: res_2 <= {{(DW-OF){1'b1}}, {(OF){1'b0}}};
117 | 			3'b001: res_2 <= abs_x_reg_2;
118 | 			3'b101: res_2 <= -abs_x_reg_2;
119 | 			3'b000: res_2 <= lookup_2;
120 | 			3'b100: res_2 <= -lookup_2;
121 | 			default: res_2 <= 0;
122 | 		endcase
123 | 	end
124 | end*/
125 | 
126 | assign result = res;
127 | /*assign result_2 = res_2;
128 | 
129 | altera_syncram  altera_syncram_component (
130 |                 .address_a (index),
131 |                 .address_b (index_2),
132 |                 .clock0 (clk),
133 |                 .data_a ({(DW){1'b1}}),
134 |                 .data_b ({(DW){1'b1}}),
135 |                 .wren_a (1'b0),
136 |                 .wren_b (1'b0),
137 |                 .q_a (lookup),
138 |                 .q_b (lookup_2),
139 |                 .aclr0 (),
140 |                 .aclr1 (),
141 |                 .address2_a (1'b1),
142 |                 .address2_b (1'b1),
143 |                 .addressstall_a (1'b0),
144 |                 .addressstall_b (1'b0),
145 |                 .byteena_a (1'b1),
146 |                 .byteena_b (1'b1),
147 |                 .clock1 (1'b1),
148 |                 .clocken0 (1'b1),
149 |                 .clocken1 (1'b1),
150 |                 .clocken2 (1'b1),
151 |                 .clocken3 (1'b1),
152 |                 .eccencbypass (1'b0),
153 |                 .eccencparity (8'b0),
154 |                 .eccstatus (),
155 |                 .rden_a (1'b1),
156 |                 .rden_b (1'b1),
157 |                 .sclr (1'b0)
158 | );
159 | defparam
160 | 	altera_syncram_component.address_reg_b  = "CLOCK0",
161 | 	altera_syncram_component.clock_enable_input_a  = "BYPASS",
162 | 	altera_syncram_component.clock_enable_input_b  = "BYPASS",
163 | 	altera_syncram_component.clock_enable_output_a  = "BYPASS",
164 | 	altera_syncram_component.clock_enable_output_b  = "BYPASS",
165 | 	altera_syncram_component.indata_reg_b  = "CLOCK0",
166 | 	altera_syncram_component.init_file = "tanh.mif",
167 | 	altera_syncram_component.intended_device_family  = "Stratix 10",
168 | 	altera_syncram_component.lpm_type  = "altera_syncram",
169 | 	altera_syncram_component.numwords_a  = SAMPLES,
170 | 	altera_syncram_component.numwords_b  = SAMPLES,
171 | 	altera_syncram_component.operation_mode  = "BIDIR_DUAL_PORT",
172 | 	altera_syncram_component.outdata_aclr_a  = "NONE",
173 | 	altera_syncram_component.outdata_aclr_b  = "NONE",
174 | 	altera_syncram_component.outdata_sclr_a  = "NONE",
175 | 	altera_syncram_component.outdata_sclr_b  = "NONE",
176 | 	altera_syncram_component.outdata_reg_a  = "CLOCK0",
177 | 	altera_syncram_component.outdata_reg_b  = "CLOCK0",
178 | 	altera_syncram_component.enable_force_to_zero  = "TRUE",
179 | 	altera_syncram_component.power_up_uninitialized  = "FALSE",
180 | 	altera_syncram_component.ram_block_type  = "M20K",
181 | 	altera_syncram_component.widthad_a  = AW,
182 | 	altera_syncram_component.widthad_b  = AW,
183 | 	altera_syncram_component.width_a  = DW,
184 | 	altera_syncram_component.width_b  = DW,
185 | 	altera_syncram_component.width_byteena_a  = 1,
186 | 	altera_syncram_component.width_byteena_b  = 1;*/
187 | 	
188 | altera_syncram  altera_syncram_component ( // single-port ROM holding the lookup table; only port A is used
189 | 	 .address_a (index),
190 | 	 .clock0 (clk),
191 | 	 .q_a (lookup),
192 | 	 .aclr0 (1'b0),
193 | 	 .aclr1 (1'b0),
194 | 	 .address2_a (1'b1),
195 | 	 .address2_b (1'b1),
196 | 	 .address_b (1'b1),
197 | 	 .addressstall_a (1'b0),
198 | 	 .addressstall_b (1'b0),
199 | 	 .byteena_a (1'b1),
200 | 	 .byteena_b (1'b1),
201 | 	 .clock1 (1'b1),
202 | 	 .clocken0 (1'b1),
203 | 	 .clocken1 (1'b1),
204 | 	 .clocken2 (1'b1),
205 | 	 .clocken3 (1'b1),
206 | 	 .data_a ({(DW){1'b1}}),
207 | 	 .data_b (1'b1),
208 | 	 .eccencbypass (1'b0),
209 | 	 .eccencparity (8'b0),
210 | 	 .eccstatus ( ),
211 | 	 .q_b ( ),
212 | 	 .rden_a (1'b1),
213 | 	 .rden_b (1'b1),
214 | 	 .sclr (1'b0),
215 | 	 .wren_a (1'b0), // never written
216 | 	 .wren_b (1'b0)
217 | );
218 | defparam
219 | 	altera_syncram_component.address_aclr_a  = "NONE",
220 | 	altera_syncram_component.clock_enable_input_a  = "BYPASS",
221 | 	altera_syncram_component.clock_enable_output_a  = "BYPASS",
222 | 	altera_syncram_component.init_file = {RTL_DIR, "tanh.mif"}, // table contents come from tanh.mif under RTL_DIR
223 | 	altera_syncram_component.intended_device_family  = TARGET_FPGA,
224 | 	altera_syncram_component.lpm_hint  = "ENABLE_RUNTIME_MOD=NO",
225 | 	altera_syncram_component.lpm_type  = "altera_syncram",
226 | 	altera_syncram_component.numwords_a  = SAMPLES,
227 | 	altera_syncram_component.operation_mode  = "ROM",
228 | 	altera_syncram_component.outdata_aclr_a  = "NONE",
229 | 	altera_syncram_component.outdata_sclr_a  = "NONE",
230 | 	altera_syncram_component.outdata_reg_a  = "CLOCK0", // NOTE(review): registered ROM output adds read latency relative to the flag registers - confirm pipeline alignment
231 | 	altera_syncram_component.ram_block_type  = "M20K",
232 | 	altera_syncram_component.enable_force_to_zero  = "FALSE",
233 | 	altera_syncram_component.widthad_a  = AW,
234 | 	altera_syncram_component.width_a  = DW,
235 | 	altera_syncram_component.width_byteena_a  = 1;
236 | 
237 | endmodule
238 | 


--------------------------------------------------------------------------------
/rtl/tester_rom.sv:
--------------------------------------------------------------------------------
 1 | module test_rom # ( // Single-port ROM wrapper around altera_syncram, initialised from MIF_FILE
 2 | 	parameter DEPTH = 2, // number of words (numwords_a)
 3 | 	parameter DATAW = 16, // word width in bits (width_a)
 4 | 	parameter ADDRW = $clog2(DEPTH),
 5 | 	parameter MIF_FILE = "/nfs/site/home/aboutros/self_tester/test_vectors.mif" // NOTE(review): default is an absolute path on one machine - callers presumably override it; confirm
 6 | )(
 7 |     input [ADDRW-1:0] address,
 8 |     input clock,
 9 |     output [DATAW-1:0] q
10 | );
11 | 
12 |     wire [DATAW-1:0] sub_wire0;
13 |     assign q = sub_wire0[DATAW-1:0];
14 | 
15 |     altera_syncram  altera_syncram_component (
16 |                 .address_a (address),
17 |                 .clock0 (clock),
18 |                 .q_a (sub_wire0),
19 |                 .aclr0 (1'b0),
20 |                 .aclr1 (1'b0),
21 |                 .address2_a (1'b1),
22 |                 .address2_b (1'b1),
23 |                 .address_b (1'b1),
24 |                 .addressstall_a (1'b0),
25 |                 .addressstall_b (1'b0),
26 |                 .byteena_a (1'b1),
27 |                 .byteena_b (1'b1),
28 |                 .clock1 (1'b1),
29 |                 .clocken0 (1'b1),
30 |                 .clocken1 (1'b1),
31 |                 .clocken2 (1'b1),
32 |                 .clocken3 (1'b1),
33 |                 .data_a ({(DATAW){1'b1}}),
34 |                 .data_b (1'b1),
35 |                 .eccencbypass (1'b0),
36 |                 .eccencparity (8'b0),
37 |                 .eccstatus ( ),
38 |                 .q_b ( ),
39 |                 .rden_a (1'b1),
40 |                 .rden_b (1'b1),
41 |                 .sclr (1'b0),
42 |                 .wren_a (1'b0), // never written
43 |                 .wren_b (1'b0));
44 |     defparam
45 |         altera_syncram_component.address_aclr_a  = "NONE",
46 |         altera_syncram_component.clock_enable_input_a  = "BYPASS",
47 |         altera_syncram_component.clock_enable_output_a  = "BYPASS",
48 |         altera_syncram_component.init_file = MIF_FILE,
49 |         altera_syncram_component.intended_device_family  = "Stratix 10", // hard-coded here, unlike a TARGET_FPGA parameter
50 |         altera_syncram_component.lpm_type  = "altera_syncram",
51 |         altera_syncram_component.numwords_a  = DEPTH,
52 |         altera_syncram_component.operation_mode  = "ROM",
53 |         altera_syncram_component.outdata_aclr_a  = "NONE",
54 |         altera_syncram_component.outdata_sclr_a  = "NONE",
55 |         altera_syncram_component.outdata_reg_a  = "CLOCK0", // read data output is registered on clock
56 |         altera_syncram_component.ram_block_type  = "M20K",
57 |         altera_syncram_component.enable_force_to_zero  = "TRUE",
58 |         altera_syncram_component.widthad_a  = ADDRW,
59 |         altera_syncram_component.width_a  = DATAW,
60 |         altera_syncram_component.width_byteena_a  = 1;
61 | 
62 | endmodule


--------------------------------------------------------------------------------
/scripts/perf_baseline:
--------------------------------------------------------------------------------
 1 | 01_gemv_512x512 5.65
 2 | 02_gemv_1024x1024 13.06
 3 | 03_gemv_1152x1152 13.38
 4 | 04_gemv_1536x1536 18.34
 5 | 05_gemv_1792x1792 20.04
 6 | 06_rnn_512_8 10.86
 7 | 07_rnn_1024_8 25.11
 8 | 08_rnn_1152_8 25.59
 9 | 09_rnn_1536_8 33.38
10 | 10_rnn_1792_8 34.25
11 | 11_gru_512_8 10.89
12 | 12_gru_1024_8 24.32
13 | 13_gru_1152_8 25.29
14 | 14_lstm_512_8 15.16
15 | 15_lstm_1024_8 31.85
16 | 16_mlp5_512 6.86
17 | 17_mlp5_1024 16.29
18 | 18_mlp3_1024_512_256_256 6.02
19 | 19_mlp3_1024_512_256_256_batched 7.26
20 | 


--------------------------------------------------------------------------------
/scripts/perf_tests.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | from os import listdir, chdir
 3 | from os.path import isfile, join
 4 | import sys
 5 | import subprocess
 6 | 
 7 | # Define colors for printing
 8 | class colors:
 9 | 	PASS = '\x1b[42m'
10 | 	FAIL = '\x1b[41m'
11 | 	BOLD = '\033[1m'
12 | 	RESET = '\033[0;0m'
13 | 
14 | keyword = ''
15 | if ('--run_test' in sys.argv):
16 | 	keyword = sys.argv[sys.argv.index('--run_test')+1]
17 | 
18 | # Get list of existing workloads
19 | path = './workloads/'
20 | workloads = [f for f in listdir(path) if isfile(join(path, f))]
21 | workloads = [f for f in workloads if keyword in f]
22 | workloads.sort()
23 | for i in range(len(workloads)):
24 | 	workloads[i] = workloads[i].split('.')[0]
25 | 
26 | # Parse baseline results
27 | baseline_results = {}
28 | baseline = open('../scripts/perf_baseline', 'r')
29 | for line in baseline:
30 | 	split_line = line.split(' ')
31 | 	baseline_results[split_line[0]] = float(split_line[1])
32 | 
33 | print(colors.BOLD + '{:<35}{:<4}    {:<5}    {:<6}'.format('WORKLOAD', 'TEST', 'TOPS', 'QoR') + colors.RESET)
34 | 
35 | chdir('../compiler')
36 | for workload in workloads:
37 | 	subprocess.call(['cp', '../scripts/workloads/'+workload+'.py', './'], shell=False)
38 | 	sys.stdout.write('{:<35}'.format(workload))
39 | 	sys.stdout.flush()
40 | 	outfile = open('../scripts/reports/'+workload+'_perf.rpt', 'w')
41 | 	subprocess.call(['python', workload+'.py', '-perfsim'], stdout=outfile, shell=False)
42 | 	rptfile = open('../scripts/reports/'+workload+'_perf.rpt', 'r')
43 | 	parse_perf_res = False
44 | 	for line in rptfile:
45 | 		if (parse_perf_res and ('Running simulation ... ' in line)):
46 | 			args = line.split()
47 | 			if('PASSED' in args[3]):
48 | 				print(colors.PASS + 'PASS' + colors.RESET, end='')
49 | 				result = args[10]
50 | 				if workload in baseline_results:
51 | 					comparison_to_baseline = ((float(args[10])/baseline_results[workload])-1) * 100
52 | 					if comparison_to_baseline >= 0:
53 | 						print ('    {:>5}    +{:<5.2f}'.format(result, comparison_to_baseline) + '%')
54 | 					else:
55 | 						print ('    {:>5}    -{:<5.2f}'.format(result, comparison_to_baseline) + '%')
56 | 				else:
57 | 					print ('    {:>5}    N/A'.format(result))
58 | 			else:
59 | 				print(colors.FAIL + 'FAIL' + colors.RESET)
60 | 		elif 'C++ Performance Simulation' in line:
61 | 			parse_perf_res = True
62 | 	if(not parse_perf_res):
63 | 		print(colors.FAIL + 'FAIL' + colors.RESET)
64 | 	subprocess.call(['rm', workload+'.py'], shell=False)
65 | 
66 | 


--------------------------------------------------------------------------------
/scripts/reports/README.md:
--------------------------------------------------------------------------------
1 | Directory for script reports
2 | 


--------------------------------------------------------------------------------
/scripts/rtl_baseline:
--------------------------------------------------------------------------------
 1 | 01_gemv_512x512 3.55
 2 | 02_gemv_1024x1024 9.7
 3 | 03_gemv_1152x1152 10.39
 4 | 04_gemv_1536x1536 15.06
 5 | 05_gemv_1792x1792 16.9
 6 | 06_rnn_512_8 7.47
 7 | 07_rnn_1024_8 19.84
 8 | 08_rnn_1152_8 21.1
 9 | 09_rnn_1536_8 29.91
10 | 10_rnn_1792_8 33.12
11 | 11_gru_512_8 8.41
12 | 12_gru_1024_8 21.02
13 | 13_gru_1152_8 22.97
14 | 14_lstm_512_8 13.29
15 | 15_lstm_1024_8 30.0
16 | 16_mlp5_512 4.32
17 | 17_mlp5_1024 12.03
18 | 18_mlp3_1024_512_256_256 3.92
19 | 19_mlp3_1024_512_256_256_batched 5.32
20 | 


--------------------------------------------------------------------------------
/scripts/rtl_tests.py:
--------------------------------------------------------------------------------
 1 | from os import listdir, chdir
 2 | from os.path import isfile, join
 3 | import sys
 4 | import subprocess
 5 | 
 6 | # Define colors for printing
 7 | class colors:
 8 | 	PASS = '\x1b[42m'
 9 | 	FAIL = '\x1b[41m'
10 | 	BOLD = '\033[1m'
11 | 	RESET = '\033[0;0m'
12 | 
13 | keyword = ''
14 | if ('--run_test' in sys.argv):
15 | 	keyword = sys.argv[sys.argv.index('--run_test')+1]
16 | 
17 | # Get list of existing workloads
18 | path = './workloads/'
19 | workloads = [f for f in listdir(path) if isfile(join(path, f))]
20 | workloads = [f for f in workloads if keyword in f]
21 | workloads.sort()
22 | for i in range(len(workloads)):
23 | 	workloads[i] = workloads[i].split('.')[0]
24 | 
25 | # Parse baseline results
26 | baseline_results = {}
27 | baseline = open('../scripts/rtl_baseline', 'r')
28 | for line in baseline:
29 | 	split_line = line.split(' ')
30 | 	baseline_results[split_line[0]] = float(split_line[1])
31 | 
32 | chdir('../compiler')
33 | print(colors.BOLD + '{:<35}{:<4}    {:<5}    {:<6}'.format('WORKLOAD', 'TEST', 'TOPS', 'QoR') + colors.RESET)
34 | for workload in workloads:
35 | 	subprocess.call(['cp', '../scripts/workloads/'+workload+'.py', './'], shell=False)
36 | 	sys.stdout.write('{:<35}'.format(workload))
37 | 	sys.stdout.flush()
38 | 	outfile = open('../scripts/reports/'+workload+'_rtl.rpt', 'w')
39 | 	subprocess.call(['python', workload+'.py', '-rtlsim'], stdout=outfile, shell=False)
40 | 	rptfile = open('../scripts/reports/'+workload+'_rtl.rpt', 'r')
41 | 	parse_rtl_res = False
42 | 	for line in rptfile:
43 | 		if (parse_rtl_res and ('Running simulation ... ' in line)):
44 | 			args = line.split()
45 | 			if('PASSED' in args[3]):
46 | 				print(colors.PASS + 'PASS' + colors.RESET, end='')
47 | 				result = args[10]
48 | 				if workload in baseline_results:
49 | 					comparison_to_baseline = ((float(args[10])/baseline_results[workload])-1) * 100
50 | 					if comparison_to_baseline >= 0:
51 | 						print ('    {:>5}    +{:<5.2f}'.format(result, comparison_to_baseline) + '%')
52 | 					else:
53 | 						print ('    {:>5}    {:<6.2f}'.format(result, comparison_to_baseline) + '%')
54 | 				else:
55 | 					print ('    {:>5}    N/A'.format(result))
56 | 			else:
57 | 				print(colors.FAIL + 'FAIL' + colors.RESET)
58 | 		elif 'Launching RTL Simulation' in line:
59 | 			parse_rtl_res = True
60 | 	if(not parse_rtl_res):
61 | 		print(colors.FAIL + 'FAIL' + colors.RESET)
62 | 	subprocess.call(['rm', workload+'.py'], shell=False)
63 | 
64 | 


--------------------------------------------------------------------------------
/scripts/workloads/01_gemv_512x512.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = "2"
 3 | import tensorflow as tf
 4 | from tensorflow import keras
 5 | from tensorflow.keras import layers
 6 | #import sys
 7 | #sys.path.append('../compiler/')
 8 | 
 9 | from compiler import *
10 | from npu_layers import *
11 | 
12 | ###### START OF MODEL DEFINITION ######
13 | 
14 | # Define constants
15 | INPUT_SIZE = 512
16 | L1_SIZE = 512
17 | 
18 | # Define model architecture using Keras Sequential Model
19 | model = NPUSequential([
20 | 	layers.Dense(L1_SIZE, name="layer1"),
21 | ])
22 | 
23 | # Random test inputs for different types of layers
24 | test_input = tf.random.uniform(shape=[6, INPUT_SIZE], dtype=tf.int32, minval=-128, maxval=127)
25 | 
26 | # Call model on example input
27 | y = model(test_input)
28 | 
29 | # Print model summary
30 | model.summary()
31 | 
32 | ####### END OF MODEL DEFINITION #######
33 | 
34 | # Initialize NPU
35 | npu = initialize_npu(sys.argv)
36 | # Compile model for NPU
37 | model.compile_for_npu(npu, test_input)
38 | # Run NPU flow
39 | npu.run_flow()
40 | 


--------------------------------------------------------------------------------
/scripts/workloads/02_gemv_1024x1024.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = "2"
 3 | import tensorflow as tf
 4 | from tensorflow import keras
 5 | from tensorflow.keras import layers
 6 | #import sys
 7 | #sys.path.append('../compiler/')
 8 | 
 9 | from compiler import *
10 | from npu_layers import *
11 | 
12 | ###### START OF MODEL DEFINITION ######
13 | 
14 | # Define constants
15 | INPUT_SIZE = 1024
16 | L1_SIZE = 1024
17 | 
18 | # Define model architecture using Keras Sequential Model
19 | model = NPUSequential([
20 | 	layers.Dense(L1_SIZE, name="layer1"),
21 | ])
22 | 
23 | # Random test inputs for different types of layers
24 | test_input = tf.random.uniform(shape=[6, INPUT_SIZE], dtype=tf.int32, minval=-128, maxval=127)
25 | 
26 | # Call model on example input
27 | y = model(test_input)
28 | 
29 | # Print model summary
30 | model.summary()
31 | 
32 | ####### END OF MODEL DEFINITION #######
33 | 
34 | # Initialize NPU
35 | npu = initialize_npu(sys.argv)
36 | # Compile model for NPU
37 | model.compile_for_npu(npu, test_input)
38 | # Run NPU flow
39 | npu.run_flow()
40 | 


--------------------------------------------------------------------------------
/scripts/workloads/03_gemv_1152x1152.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = "2"
 3 | import tensorflow as tf
 4 | from tensorflow import keras
 5 | from tensorflow.keras import layers
 6 | #import sys
 7 | #sys.path.append('../compiler/')
 8 | 
 9 | from compiler import *
10 | from npu_layers import *
11 | 
12 | ###### START OF MODEL DEFINITION ######
13 | 
14 | # Define constants
15 | INPUT_SIZE = 1152
16 | L1_SIZE = 1152
17 | 
18 | # Define model architecture using Keras Sequential Model
19 | model = NPUSequential([
20 | 	layers.Dense(L1_SIZE, name="layer1"),
21 | ])
22 | 
23 | # Random test inputs for different types of layers
24 | test_input = tf.random.uniform(shape=[6, INPUT_SIZE], dtype=tf.int32, minval=-128, maxval=127)
25 | 
26 | # Call model on example input
27 | y = model(test_input)
28 | 
29 | # Print model summary
30 | model.summary()
31 | 
32 | ####### END OF MODEL DEFINITION #######
33 | 
34 | # Initialize NPU
35 | npu = initialize_npu(sys.argv)
36 | # Compile model for NPU
37 | model.compile_for_npu(npu, test_input)
38 | # Run NPU flow
39 | npu.run_flow()
40 | 


--------------------------------------------------------------------------------
/scripts/workloads/04_gemv_1536x1536.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = "2"
 3 | import tensorflow as tf
 4 | from tensorflow import keras
 5 | from tensorflow.keras import layers
 6 | #import sys
 7 | #sys.path.append('../compiler/')
 8 | 
 9 | from compiler import *
10 | from npu_layers import *
11 | 
12 | ###### START OF MODEL DEFINITION ######
13 | 
14 | # Define constants
15 | INPUT_SIZE = 1536
16 | L1_SIZE = 1536
17 | 
18 | # Define model architecture using Keras Sequential Model
19 | model = NPUSequential([
20 | 	layers.Dense(L1_SIZE, name="layer1"),
21 | ])
22 | 
23 | # Random test inputs for different types of layers
24 | test_input = tf.random.uniform(shape=[6, INPUT_SIZE], dtype=tf.int32, minval=-128, maxval=127)
25 | 
26 | # Call model on example input
27 | y = model(test_input)
28 | 
29 | # Print model summary
30 | model.summary()
31 | 
32 | ####### END OF MODEL DEFINITION #######
33 | 
34 | # Initialize NPU
35 | npu = initialize_npu(sys.argv)
36 | # Compile model for NPU
37 | model.compile_for_npu(npu, test_input)
38 | # Run NPU flow
39 | npu.run_flow()
40 | 


--------------------------------------------------------------------------------
/scripts/workloads/05_gemv_1792x1792.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = "2"
 3 | import tensorflow as tf
 4 | from tensorflow import keras
 5 | from tensorflow.keras import layers
 6 | #import sys
 7 | #sys.path.append('../compiler/')
 8 | 
 9 | from compiler import *
10 | from npu_layers import *
11 | 
12 | ###### START OF MODEL DEFINITION ######
13 | 
14 | # Define constants
15 | INPUT_SIZE = 1792
16 | L1_SIZE = 1792
17 | 
18 | # Define model architecture using Keras Sequential Model
19 | model = NPUSequential([
20 | 	layers.Dense(L1_SIZE, name="layer1"),
21 | ])
22 | 
23 | # Random test inputs for different types of layers
24 | test_input = tf.random.uniform(shape=[6, INPUT_SIZE], dtype=tf.int32, minval=-128, maxval=127)
25 | 
26 | # Call model on example input
27 | y = model(test_input)
28 | 
29 | # Print model summary
30 | model.summary()
31 | 
32 | ####### END OF MODEL DEFINITION #######
33 | 
34 | # Initialize NPU
35 | npu = initialize_npu(sys.argv)
36 | # Compile model for NPU
37 | model.compile_for_npu(npu, test_input)
38 | # Run NPU flow
39 | npu.run_flow()
40 | 


--------------------------------------------------------------------------------
/scripts/workloads/06_rnn_512_8.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = "2"
 3 | import tensorflow as tf
 4 | from tensorflow import keras
 5 | from tensorflow.keras import layers
 6 | #import sys
 7 | #sys.path.append('../compiler/')
 8 | 
 9 | from compiler import *
10 | from npu_layers import *
11 | 
12 | ###### START OF MODEL DEFINITION ######
13 | 
14 | # Define constants
15 | INPUT_SIZE = 512
16 | HIDDEN_UNITS = 512
17 | TIME_STEPS = 8
18 | 
19 | # Define model architecture using Keras Sequential Model
20 | model = NPUSequential([
21 | 	layers.SimpleRNN(HIDDEN_UNITS, name="layer1"),
22 | ])
23 | 
24 | # Random test inputs for different types of layers
25 | test_input = tf.random.uniform(shape=[TIME_STEPS, 6, INPUT_SIZE], minval=-128, maxval=127)
26 | 
27 | # Call model on example input
28 | y = model(test_input)
29 | 
30 | # Print model summary
31 | model.summary()
32 | 
33 | ####### END OF MODEL DEFINITION #######
34 | 
35 | # Initialize NPU
36 | npu = initialize_npu(sys.argv)
37 | # Compile model for NPU
38 | model.compile_for_npu(npu, test_input)
39 | # Run NPU flow
40 | npu.run_flow()
41 | 


--------------------------------------------------------------------------------
/scripts/workloads/07_rnn_1024_8.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = "2"
 3 | import tensorflow as tf
 4 | from tensorflow import keras
 5 | from tensorflow.keras import layers
 6 | #import sys
 7 | #sys.path.append('../compiler/')
 8 | 
 9 | from compiler import *
10 | from npu_layers import *
11 | 
12 | ###### START OF MODEL DEFINITION ######
13 | 
14 | # Define constants
15 | INPUT_SIZE = 1024
16 | HIDDEN_UNITS = 1024
17 | TIME_STEPS = 8
18 | 
19 | # Define model architecture using Keras Sequential Model
20 | model = NPUSequential([
21 | 	layers.SimpleRNN(HIDDEN_UNITS, name="layer1"),
22 | ])
23 | 
24 | # Random test inputs for different types of layers
25 | test_input = tf.random.uniform(shape=[TIME_STEPS, 6, INPUT_SIZE], minval=-128, maxval=127)
26 | 
27 | # Call model on example input
28 | y = model(test_input)
29 | 
30 | # Print model summary
31 | model.summary()
32 | 
33 | ####### END OF MODEL DEFINITION #######
34 | 
35 | # Initialize NPU
36 | npu = initialize_npu(sys.argv)
37 | # Compile model for NPU
38 | model.compile_for_npu(npu, test_input)
39 | # Run NPU flow
40 | npu.run_flow()
41 | 


--------------------------------------------------------------------------------
/scripts/workloads/08_rnn_1152_8.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = "2"
 3 | import tensorflow as tf
 4 | from tensorflow import keras
 5 | from tensorflow.keras import layers
 6 | #import sys
 7 | #sys.path.append('../compiler/')
 8 | 
 9 | from compiler import *
10 | from npu_layers import *
11 | 
12 | ###### START OF MODEL DEFINITION ######
13 | 
14 | # Define constants
15 | INPUT_SIZE = 1152
16 | HIDDEN_UNITS = 1152
17 | TIME_STEPS = 8
18 | 
19 | # Define model architecture using Keras Sequential Model
20 | model = NPUSequential([
21 | 	layers.SimpleRNN(HIDDEN_UNITS, name="layer1"),
22 | ])
23 | 
24 | # Random test inputs for different types of layers
25 | test_input = tf.random.uniform(shape=[TIME_STEPS, 6, INPUT_SIZE], minval=-128, maxval=127)
26 | 
27 | # Call model on example input
28 | y = model(test_input)
29 | 
30 | # Print model summary
31 | model.summary()
32 | 
33 | ####### END OF MODEL DEFINITION #######
34 | 
35 | # Initialize NPU
36 | npu = initialize_npu(sys.argv)
37 | # Compile model for NPU
38 | model.compile_for_npu(npu, test_input)
39 | # Run NPU flow
40 | npu.run_flow()
41 | 


--------------------------------------------------------------------------------
/scripts/workloads/09_rnn_1536_8.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = "2"
 3 | import tensorflow as tf
 4 | from tensorflow import keras
 5 | from tensorflow.keras import layers
 6 | #import sys
 7 | #sys.path.append('../compiler/')
 8 | 
 9 | from compiler import *
10 | from npu_layers import *
11 | 
12 | ###### START OF MODEL DEFINITION ######
13 | 
14 | # Define constants
15 | INPUT_SIZE = 1536
16 | HIDDEN_UNITS = 1536
17 | TIME_STEPS = 8
18 | 
19 | # Define model architecture using Keras Sequential Model
20 | model = NPUSequential([
21 | 	layers.SimpleRNN(HIDDEN_UNITS, name="layer1"),
22 | ])
23 | 
24 | # Random test inputs for different types of layers
25 | test_input = tf.random.uniform(shape=[TIME_STEPS, 6, INPUT_SIZE], minval=-128, maxval=127)
26 | 
27 | # Call model on example input
28 | y = model(test_input)
29 | 
30 | # Print model summary
31 | model.summary()
32 | 
33 | ####### END OF MODEL DEFINITION #######
34 | 
35 | # Initialize NPU
36 | npu = initialize_npu(sys.argv)
37 | # Compile model for NPU
38 | model.compile_for_npu(npu, test_input)
39 | # Run NPU flow
40 | npu.run_flow()
41 | 


--------------------------------------------------------------------------------
/scripts/workloads/10_rnn_1792_8.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = "2"
 3 | import tensorflow as tf
 4 | from tensorflow import keras
 5 | from tensorflow.keras import layers
 6 | #import sys
 7 | #sys.path.append('../compiler/')
 8 | 
 9 | from compiler import *
10 | from npu_layers import *
11 | 
12 | ###### START OF MODEL DEFINITION ######
13 | 
14 | # Define constants
15 | INPUT_SIZE = 1792
16 | HIDDEN_UNITS = 1792
17 | TIME_STEPS = 8
18 | 
19 | # Define model architecture using Keras Sequential Model
20 | model = NPUSequential([
21 | 	layers.SimpleRNN(HIDDEN_UNITS, name="layer1"),
22 | ])
23 | 
24 | # Random test inputs for different types of layers
25 | test_input = tf.random.uniform(shape=[TIME_STEPS, 6, INPUT_SIZE], minval=-128, maxval=127)
26 | 
27 | # Call model on example input
28 | y = model(test_input)
29 | 
30 | # Print model summary
31 | model.summary()
32 | 
33 | ####### END OF MODEL DEFINITION #######
34 | 
35 | # Initialize NPU
36 | npu = initialize_npu(sys.argv)
37 | # Compile model for NPU
38 | model.compile_for_npu(npu, test_input)
39 | # Run NPU flow
40 | npu.run_flow()
41 | 


--------------------------------------------------------------------------------
/scripts/workloads/11_gru_512_8.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = "2"
 3 | import tensorflow as tf
 4 | from tensorflow import keras
 5 | from tensorflow.keras import layers
 6 | #import sys
 7 | #sys.path.append('../compiler/')
 8 | 
 9 | from compiler import *
10 | from npu_layers import *
11 | 
12 | ###### START OF MODEL DEFINITION ######
13 | 
14 | # Define constants
15 | INPUT_SIZE = 512
16 | HIDDEN_UNITS = 512
17 | TIME_STEPS = 8
18 | 
19 | # Define model architecture using Keras Sequential Model
20 | model = NPUSequential([
21 | 	layers.GRU(HIDDEN_UNITS, name="layer1"),
22 | ])
23 | 
24 | # Random test inputs for different types of layers
25 | test_input = tf.random.uniform(shape=[TIME_STEPS, 6, INPUT_SIZE], minval=-128, maxval=127)
26 | 
27 | # Call model on example input
28 | y = model(test_input)
29 | 
30 | # Print model summary
31 | model.summary()
32 | 
33 | ####### END OF MODEL DEFINITION #######
34 | 
35 | # Initialize NPU
36 | npu = initialize_npu(sys.argv)
37 | # Compile model for NPU
38 | model.compile_for_npu(npu, test_input)
39 | # Run NPU flow
40 | npu.run_flow()
41 | 


--------------------------------------------------------------------------------
/scripts/workloads/12_gru_1024_8.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = "2"
 3 | import tensorflow as tf
 4 | from tensorflow import keras
 5 | from tensorflow.keras import layers
 6 | #import sys
 7 | #sys.path.append('../compiler/')
 8 | 
 9 | from compiler import *
10 | from npu_layers import *
11 | 
12 | ###### START OF MODEL DEFINITION ######
13 | 
14 | # Define constants
15 | INPUT_SIZE = 1024
16 | HIDDEN_UNITS = 1024
17 | TIME_STEPS = 8
18 | 
19 | # Define model architecture using Keras Sequential Model
20 | model = NPUSequential([
21 | 	layers.GRU(HIDDEN_UNITS, name="layer1"),
22 | ])
23 | 
24 | # Random test inputs for different types of layers
25 | test_input = tf.random.uniform(shape=[TIME_STEPS, 6, INPUT_SIZE], minval=-128, maxval=127)
26 | 
27 | # Call model on example input
28 | y = model(test_input)
29 | 
30 | # Print model summary
31 | model.summary()
32 | 
33 | ####### END OF MODEL DEFINITION #######
34 | 
35 | # Initialize NPU
36 | npu = initialize_npu(sys.argv)
37 | # Compile model for NPU
38 | model.compile_for_npu(npu, test_input)
39 | # Run NPU flow
40 | npu.run_flow()
41 | 


--------------------------------------------------------------------------------
/scripts/workloads/13_gru_1152_8.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = "2"
 3 | import tensorflow as tf
 4 | from tensorflow import keras
 5 | from tensorflow.keras import layers
 6 | #import sys
 7 | #sys.path.append('../compiler/')
 8 | 
 9 | from compiler import *
10 | from npu_layers import *
11 | 
12 | ###### START OF MODEL DEFINITION ######
13 | 
14 | # Define constants
15 | INPUT_SIZE = 1152
16 | HIDDEN_UNITS = 1152
17 | TIME_STEPS = 8
18 | 
19 | # Define model architecture using Keras Sequential Model
20 | model = NPUSequential([
21 | 	layers.GRU(HIDDEN_UNITS, name="layer1"),
22 | ])
23 | 
24 | # Random test inputs for different types of layers
25 | test_input = tf.random.uniform(shape=[TIME_STEPS, 6, INPUT_SIZE], minval=-128, maxval=127)
26 | 
27 | # Call model on example input
28 | y = model(test_input)
29 | 
30 | # Print model summary
31 | model.summary()
32 | 
33 | ####### END OF MODEL DEFINITION #######
34 | 
35 | # Initialize NPU
36 | npu = initialize_npu(sys.argv)
37 | # Compile model for NPU
38 | model.compile_for_npu(npu, test_input)
39 | # Run NPU flow
40 | npu.run_flow()
41 | 


--------------------------------------------------------------------------------
/scripts/workloads/14_lstm_512_8.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = "2"
 3 | import tensorflow as tf
 4 | from tensorflow import keras
 5 | from tensorflow.keras import layers
 6 | #import sys
 7 | #sys.path.append('../compiler/')
 8 | 
 9 | from compiler import *
10 | from npu_layers import *
11 | 
12 | ###### START OF MODEL DEFINITION ######
13 | 
14 | # Define constants
15 | INPUT_SIZE = 512
16 | HIDDEN_UNITS = 512
17 | TIME_STEPS = 8
18 | 
19 | # Define model architecture using Keras Sequential Model
20 | model = NPUSequential([
21 | 	layers.LSTM(HIDDEN_UNITS, name="layer1"),
22 | ])
23 | 
24 | # Random test inputs for different types of layers
25 | test_input = tf.random.uniform(shape=[TIME_STEPS, 6, INPUT_SIZE], minval=-128, maxval=127)
26 | 
27 | # Call model on example input
28 | y = model(test_input)
29 | 
30 | # Print model summary
31 | model.summary()
32 | 
33 | ####### END OF MODEL DEFINITION #######
34 | 
35 | # Initialize NPU
36 | npu = initialize_npu(sys.argv)
37 | # Compile model for NPU
38 | model.compile_for_npu(npu, test_input)
39 | # Run NPU flow
40 | npu.run_flow()
41 | 


--------------------------------------------------------------------------------
/scripts/workloads/15_lstm_1024_8.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = "2"
 3 | import tensorflow as tf
 4 | from tensorflow import keras
 5 | from tensorflow.keras import layers
 6 | #import sys
 7 | #sys.path.append('../compiler/')
 8 | 
 9 | from compiler import *
10 | from npu_layers import *
11 | 
12 | ###### START OF MODEL DEFINITION ######
13 | 
14 | # Define constants
15 | INPUT_SIZE = 1024
16 | HIDDEN_UNITS = 1024
17 | TIME_STEPS = 8
18 | 
19 | # Define model architecture using Keras Sequential Model
20 | model = NPUSequential([
21 | 	layers.LSTM(HIDDEN_UNITS, name="layer1"),
22 | ])
23 | 
24 | # Random test inputs for different types of layers
25 | test_input = tf.random.uniform(shape=[TIME_STEPS, 6, INPUT_SIZE], minval=-128, maxval=127)
26 | 
27 | # Call model on example input
28 | y = model(test_input)
29 | 
30 | # Print model summary
31 | model.summary()
32 | 
33 | ####### END OF MODEL DEFINITION #######
34 | 
35 | # Initialize NPU
36 | npu = initialize_npu(sys.argv)
37 | # Compile model for NPU
38 | model.compile_for_npu(npu, test_input)
39 | # Run NPU flow
40 | npu.run_flow()
41 | 


--------------------------------------------------------------------------------
/scripts/workloads/16_mlp5_512.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = "2"
 3 | import tensorflow as tf
 4 | from tensorflow import keras
 5 | from tensorflow.keras import layers
 6 | #import sys
 7 | #sys.path.append('../compiler/')
 8 | 
 9 | from compiler import *
10 | from npu_layers import *
11 | 
12 | ###### START OF MODEL DEFINITION ######
13 | 
14 | # Define constants
15 | INPUT_SIZE = 512
16 | DENSE_SIZE = 512
17 | 
18 | # Define model architecture using Keras Sequential Model
19 | model = NPUSequential([
20 | 	layers.Dense(DENSE_SIZE, name="layer1"),
21 | 	layers.Dense(DENSE_SIZE, name="layer2"),
22 | 	layers.Dense(DENSE_SIZE, name="layer3"),
23 | ])
24 | 
25 | # Random test inputs for different types of layers
26 | test_input = tf.random.uniform(shape=[6, INPUT_SIZE], minval=-128, maxval=127)
27 | 
28 | # Call model on example input
29 | y = model(test_input)
30 | 
31 | # Print model summary
32 | model.summary()
33 | 
34 | ####### END OF MODEL DEFINITION #######
35 | 
36 | # Initialize NPU
37 | npu = initialize_npu(sys.argv)
38 | # Compile model for NPU
39 | model.compile_for_npu(npu, test_input)
40 | # Run NPU flow
41 | npu.run_flow()
42 | 


--------------------------------------------------------------------------------
/scripts/workloads/17_mlp5_1024.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = "2"
 3 | import tensorflow as tf
 4 | from tensorflow import keras
 5 | from tensorflow.keras import layers
 6 | #import sys
 7 | #sys.path.append('../compiler/')
 8 | 
 9 | from compiler import *
10 | from npu_layers import *
11 | 
12 | ###### START OF MODEL DEFINITION ######
13 | 
14 | # Define constants
15 | INPUT_SIZE = 1024
16 | DENSE_SIZE = 1024
17 | 
18 | # Define model architecture using Keras Sequential Model
19 | model = NPUSequential([
20 | 	layers.Dense(DENSE_SIZE, name="layer1"),
21 | 	layers.Dense(DENSE_SIZE, name="layer2"),
22 | 	layers.Dense(DENSE_SIZE, name="layer3"),
23 | ])
24 | 
25 | # Random test inputs for different types of layers
26 | test_input = tf.random.uniform(shape=[6, INPUT_SIZE], minval=-128, maxval=127)
27 | 
28 | # Call model on example input
29 | y = model(test_input)
30 | 
31 | # Print model summary
32 | model.summary()
33 | 
34 | ####### END OF MODEL DEFINITION #######
35 | 
36 | # Initialize NPU
37 | npu = initialize_npu(sys.argv)
38 | # Compile model for NPU
39 | model.compile_for_npu(npu, test_input)
40 | # Run NPU flow
41 | npu.run_flow()
42 | 


--------------------------------------------------------------------------------
/scripts/workloads/18_mlp3_1024_512_256_256.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = "2"
 3 | import tensorflow as tf
 4 | from tensorflow import keras
 5 | from tensorflow.keras import layers
 6 | 
 7 | from compiler import *
 8 | from npu_layers import *
 9 | 
10 | ###### START OF MODEL DEFINITION ######
11 | 
12 | # Define constants
13 | INPUT_VEC_SIZE = 1024
14 | DENSE_L1_SIZE = 512
15 | DENSE_L2_SIZE = 256
16 | DENSE_L3_SIZE = 256
17 | 
18 | # Define model architecture using Keras Sequential Model
19 | model = NPUSequential([
20 | 	layers.Dense(DENSE_L1_SIZE, activation="relu", name="layer1"),
21 | 	layers.Dense(DENSE_L2_SIZE, activation="relu", name="layer2"),
22 | 	layers.Dense(DENSE_L3_SIZE, activation="relu", name="layer3"),
23 | ])
24 | 
25 | # Random test inputs for different types of layers
26 | test_input = tf.random.uniform(shape=[6, INPUT_VEC_SIZE], minval=-128, maxval=127)
27 | 
28 | # Call model on example input
29 | y = model(test_input)
30 | 
31 | # Print model summary
32 | model.summary()
33 | 
34 | ####### END OF MODEL DEFINITION #######
35 | 
36 | # Initialize NPU
37 | npu = initialize_npu(sys.argv)
38 | # Compile model for NPU
39 | model.compile_for_npu(npu, test_input)
40 | # Run NPU flow
41 | npu.run_flow()
42 | 


--------------------------------------------------------------------------------
/scripts/workloads/19_mlp3_1024_512_256_256_batched.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = "2"
 3 | import tensorflow as tf
 4 | from tensorflow import keras
 5 | from tensorflow.keras import layers
 6 | 
 7 | from compiler import *
 8 | from npu_layers import *
 9 | 
10 | ###### START OF MODEL DEFINITION ######
11 | 
12 | # Define constants
13 | INPUT_VEC_SIZE = 1024
14 | DENSE_L1_SIZE = 512
15 | DENSE_L2_SIZE = 256
16 | DENSE_L3_SIZE = 256
17 | 
18 | # Define model architecture using Keras Sequential Model
19 | model = NPUSequential([
20 | 	layers.Dense(DENSE_L1_SIZE, activation="relu", name="layer1"),
21 | 	layers.Dense(DENSE_L2_SIZE, activation="relu", name="layer2"),
22 | 	layers.Dense(DENSE_L3_SIZE, activation="relu", name="layer3"),
23 | ])
24 | 
25 | # Random test inputs for different types of layers
26 | test_input = tf.random.uniform(shape=[18, INPUT_VEC_SIZE], minval=-128, maxval=127)
27 | 
28 | # Call model on example input
29 | y = model(test_input)
30 | 
31 | # Print model summary
32 | model.summary()
33 | 
34 | ####### END OF MODEL DEFINITION #######
35 | 
36 | # Initialize NPU
37 | npu = initialize_npu(sys.argv)
38 | # Compile model for NPU
39 | model.compile_for_npu(npu, test_input)
40 | # Run NPU flow
41 | npu.run_flow()
42 | 


--------------------------------------------------------------------------------
/simulator/Makefile:
--------------------------------------------------------------------------------
 1 | # Build rules for the simulator executable named by EXE.
 2 | # Sources in src/ compile to src/obj/; the driver in main/ compiles to main/obj/.
 3 | CC 			:= g++
 4 | HEADER  	:= inc/
 5 | CFLAGS 		:= -c -g -std=c++11 -Wall -Wextra
 6 | INCLUDES 	:= -I ./inc/
 7 | OBJ_DIR 	:= ./src/obj/
 8 | SIM_DIR		:= ./main/obj/
 9 | # No trailing whitespace after the name: it would become part of the variable's value
10 | EXE			:= npu_sim
11 | 
12 | OBJ := \
13 | 		$(OBJ_DIR)port.o \
14 | 		$(OBJ_DIR)input.o \
15 | 		$(OBJ_DIR)output.o \
16 | 		$(OBJ_DIR)channel.o \
17 | 		$(OBJ_DIR)dpe.o \
18 | 		$(OBJ_DIR)mvu_vrf.o \
19 | 		$(OBJ_DIR)accumulator.o \
20 | 		$(OBJ_DIR)register_file.o \
21 | 		$(OBJ_DIR)tile.o \
22 | 		$(OBJ_DIR)mvu.o \
23 | 		$(OBJ_DIR)evrf.o \
24 | 		$(OBJ_DIR)mfu.o \
25 | 		$(OBJ_DIR)loader.o \
26 | 		$(OBJ_DIR)datapath.o \
27 | 		$(OBJ_DIR)decoder.o \
28 | 		$(OBJ_DIR)npu.o \
29 | 		$(OBJ_DIR)utils.o
30 | 
31 | # all and clean name actions, not files
32 | .PHONY: all clean
33 | 
34 | all: $(EXE)
35 | 
36 | # Link executable <name> from main/obj/<name>.o plus the simulator objects
37 | %: $(SIM_DIR)%.o $(OBJ)
38 | 	$(CC) -g $(OBJ) $< -o $@
39 | 
40 | # Compile a driver source; the object directory is created on demand
41 | $(SIM_DIR)%.o: main/%.cpp
42 | 	@mkdir -p $(dir $@)
43 | 	$(CC) $(INCLUDES) $(CFLAGS) $< -o $@
44 | 
45 | # Compile a simulator source; the object directory is created on demand
46 | $(OBJ_DIR)%.o: src/%.cpp
47 | 	@mkdir -p $(dir $@)
48 | 	$(CC) $(INCLUDES) $(CFLAGS) $< -o $@
49 | 
50 | # -f keeps clean from failing when nothing has been built yet
51 | clean:
52 | 	rm -f $(OBJ) $(SIM_DIR)$(EXE).o $(EXE)
53 | 
54 | # Keep the simulator objects even when make treats them as intermediate files
55 | .PRECIOUS: $(OBJ)
56 | 


--------------------------------------------------------------------------------
/simulator/inc/accumulator.h:
--------------------------------------------------------------------------------
 1 | #ifndef PRIME_ACCUMULATOR_H
 2 | #define PRIME_ACCUMULATOR_H
 3 | 
 4 | #include <vector>
 5 | #include <string>
 6 | #include "input.h"
 7 | #include "output.h"
 8 | #include "inst.h"
 9 | #include "defines.h"
10 | #include "utils.h"
11 | 
12 | /* 
13 |  * This class implements the MVU accumulation of the 3 results computed by the dot product engines
14 |  * based on the Stratix 10 NX tensor block.
15 |  * Input Ports:
16 |  * - 3 data inputs (from inter-tile reduction)
17 |  * - uOP (from decoder)
18 |  * - reconfigurable accumulator size (from decoder)
19 |  * Output Ports:
20 |  * - 3 accumulation results (to MVU output)
21 |  */
22 | class Accumulator : public Module {
23 | public:
24 |     // Constructor: takes the module name and this accumulator's numeric id
25 |     Accumulator (std::string t_name, unsigned int t_accum_id);
26 |     // Clock function (declared here only; its body lives outside this header)
27 |     void clock();
28 |     // Getter functions for the name, the id and the port objects
29 |     std::string getName();
30 |     unsigned int getId();
31 |     Input<TYPE> *getPortInput(unsigned int i); // presumably i selects input0..input2 -- TODO confirm valid range
32 |     Input<unsigned int> *getPortuOP();
33 |     Input<unsigned int> *getPortSize();
34 |     Output<TYPE> *getPortRes(unsigned int i); // presumably i selects result0..result2 -- TODO confirm valid range
35 |     // Helper functions
36 |     void reset(); // NOTE(review): presumably clears the accumulator state -- confirm in the definition
37 |     // Destructor (NOTE(review): ports are held as raw pointers; confirm they are released here)
38 |     ~Accumulator();
39 | 
40 | private:
41 |     // Module name
42 |     std::string name;
43 |     // Input and Output ports: three data inputs, the uOP and size inputs, three results
44 |     Input<TYPE>* input0;
45 |     Input<TYPE>* input1;
46 |     Input<TYPE>* input2;
47 |     Input<unsigned int>* uOP;
48 |     Input<unsigned int>* size;
49 |     Output<TYPE>* result0;
50 |     Output<TYPE>* result1;
51 |     Output<TYPE>* result2;
52 |     // Local variables: id, one value vector per data lane, and bookkeeping counters
53 |     unsigned int accum_id;
54 |     std::vector<TYPE> accum0_values;
55 |     std::vector<TYPE> accum1_values;
56 |     std::vector<TYPE> accum2_values;
57 | 	unsigned int channel_full_count; // NOTE(review): purpose is not evident from this header
58 |     unsigned int num_accum_values = 2 * 3 * (LANES/10); // NOTE(review): meaning of the factors 2, 3 and /10 is undocumented -- confirm
59 |     unsigned int accum_idx; // presumably the current position within the accumulator vectors -- TODO confirm
60 | };
61 | 
62 | #endif
63 | 


--------------------------------------------------------------------------------
/simulator/inc/channel.h:
--------------------------------------------------------------------------------
 1 | #ifndef CHANNEL_H_
 2 | #define CHANNEL_H_
 3 | 
 4 | #include <string>
 5 | #include <queue>
 6 | #include <tuple>
 7 | #include <assert.h>
 8 | #include <iostream>
 9 | #include "defines.h"
10 | #include "inst.h"
11 | 
12 | /* 
13 |  * This class implements a communication channel. Each channel has a capacity and latency
14 |  * parameters. By setting these two parameters, a channel can be used to model:
15 |  * - wire: capacity 1 and latency 0
16 |  * - register: capacity 1 and latency 1
17 |  * - pipeline: capacity N and latency N
18 |  * - FIFO: capacity N and latency 1
19 |  */
20 | template <class T>
21 | class Channel { 
22 | public:
23 |     // Constructor: name plus size and latency (presumably the capacity/latency pair described above -- confirm)
24 |     Channel (std::string t_name, unsigned int t_size, unsigned int t_latency);
25 |     // Clock function (declared here only; its body lives outside this header)
26 |     void clock();
27 |     // Helper functions: data access and occupancy queries
28 |     void write(T t_value);
29 |     T read(); // NOTE(review): presumably removes the returned element, unlike peek() -- confirm
30 |     T peek();
31 |     T at(unsigned int idx); // NOTE(review): std::queue has no indexed access; check how the definition implements this
32 |     bool isEmpty();
33 |     bool isFull();
34 |     // Getter functions
35 |     std::string getName();
36 |     unsigned int getSize();
37 | 
38 | private:
39 |     // Module name
40 |     std::string name;
41 |     // Local variables: queued (value, unsigned int) pairs and the two channel parameters
42 |     std::queue<std::tuple<T, unsigned int>> buffer; // NOTE(review): the unsigned int is presumably a per-entry latency counter -- confirm
43 |     unsigned int size;
44 |     unsigned int latency;
45 | };
46 | #endif
47 | 


--------------------------------------------------------------------------------
/simulator/inc/datapath.h:
--------------------------------------------------------------------------------
 1 | #ifndef DATAPATH_H
 2 | #define DATAPATH_H
 3 | 
 4 | #include <vector>
 5 | #include <string>
 6 | 
 7 | #include "input.h"
 8 | #include "output.h"
 9 | #include "mvu.h"
10 | #include "mfu.h"
11 | #include "evrf.h"
12 | #include "loader.h"
13 | #include "inst.h"
14 | #include "defines.h"
15 | 
16 | /* 
17 |  * This class implements the NPU datapath. It consists of 5 main pipeline stages (MVU, eVRF, MFU0,
18 |  * MFU1, and Loader).
19 |  * Input Ports:
20 |  * - MVU uOP (from NPU decoders)
21 |  * - eVRF uOP (from NPU decoders)
22 |  * - MFU0 uOP (from NPU decoders)
23 |  * - MFU1 uOP (from NPU decoders)
24 |  * - Loader uOP (from NPU decoders)
25 |  * Output Ports:
26 |  * - Final NPU output (to tester)
27 |  */
28 | class Datapath : public Module {
29 | public:
30 |     // Constructor
31 |     Datapath (std::string t_name);
32 |     // Clock function: takes the cycle counter by reference. The signature differs from the parameterless virtual Module::clock(), so it hides that function rather than overriding it
33 |     void clock(unsigned int &cycle_count);
34 |     // Getter functions: one per port declared in the private section below
35 |     Input<mvu_uOP>* getPortMVUuOP();
36 |     Input<evrf_uOP>* getPortEVRFuOP();
37 |     Input<mfu_uOP>* getPortMFU0uOP();
38 |     Input<mfu_uOP>* getPortMFU1uOP();
39 |     Input<ld_uOP>* getPortLDuOP();
40 |     Output<std::vector<TYPE>>* getPortOutput();
41 |     // Destructor
42 |     ~Datapath();
43 | 
44 | private:
45 |     // Module name (a separate member from the private name stored in Module)
46 |     std::string name;
47 |     // Input and Output ports
48 |     Input<mvu_uOP>*  mvu_uOP_port;
49 |     Input<evrf_uOP>* evrf_uOP_port;
50 |     Input<mfu_uOP>*  mfu0_uOP_port;
51 |     Input<mfu_uOP>*  mfu1_uOP_port;
52 |     Input<ld_uOP>*   ld_uOP_port;
53 |     Output<std::vector<TYPE>>* datapath_output;
54 |     // Internal modules: one raw pointer per pipeline stage
55 |     MVU* mvu;
56 |     EVRF* evrf;
57 |     MFU* mfu0;
58 |     MFU* mfu1;
59 |     LD* ld;
60 |     // Internal channels, named after the pair of stages they sit between
61 |     Channel<std::vector<TYPE>>* mvu_to_evrf_channel;
62 |     Channel<std::vector<TYPE>>* evrf_to_mfu0_channel;
63 |     Channel<std::vector<TYPE>>* mfu0_to_mfu1_channel;
64 |     Channel<std::vector<TYPE>>* mfu1_to_ld_channel;
65 |     // Loader to MVU channels (vectors of channels; presumably one entry per MVU tile -- confirm in the definition file)
66 |     std::vector<Channel<std::vector<TYPE>>*> ld_to_mvu_wdata_channels;
67 |     std::vector<Channel<unsigned int>*> ld_to_mvu_waddr_channels;
68 |     Channel<bool>* ld_to_mvu_update_channel;
69 |     // Loader to eVRF Channels
70 |     Channel<std::vector<TYPE>>* ld_to_evrf_wdata_channel;
71 |     Channel<unsigned int>* ld_to_evrf_waddr_channel;
72 |     Channel<bool>* ld_to_evrf_update_channel;
73 |     // Loader to MFU0 Channels (write data/address for VRF0 and VRF1, plus the update flag)
74 |     Channel<std::vector<TYPE>>* ld_to_mfu0_vrf0_wdata_channel;
75 |     Channel<std::vector<TYPE>>* ld_to_mfu0_vrf1_wdata_channel;
76 |     Channel<unsigned int>* ld_to_mfu0_vrf0_waddr_channel;
77 |     Channel<unsigned int>* ld_to_mfu0_vrf1_waddr_channel;
78 |     Channel<bool>* ld_to_mfu0_update_channel;
79 |     // Loader to MFU1 Channels (write data/address for VRF0 and VRF1, plus the update flag)
80 |     Channel<std::vector<TYPE>>* ld_to_mfu1_vrf0_wdata_channel;
81 |     Channel<std::vector<TYPE>>* ld_to_mfu1_vrf1_wdata_channel;
82 |     Channel<unsigned int>* ld_to_mfu1_vrf0_waddr_channel;
83 |     Channel<unsigned int>* ld_to_mfu1_vrf1_waddr_channel;
84 |     Channel<bool>* ld_to_mfu1_update_channel;
85 | };
86 | 
87 | #endif
88 | 


--------------------------------------------------------------------------------
/simulator/inc/decoder.h:
--------------------------------------------------------------------------------
 1 | #ifndef DECODER_H
 2 | #define DECODER_H
 3 | 
 4 | #include <vector>
 5 | #include <string>
 6 | #include <iostream>
 7 | #include "input.h"
 8 | #include "output.h"
 9 | #include "channel.h"
10 | #include "inst.h"
11 | #include "defines.h"
12 | #include "utils.h"
13 | 
14 | /* 
15 |  * This class implements the NPU instruction decoders that translate an NPU VLIW instruction 
16 |  * (5 chained mOPs) into a sequence of uOPs for each of the 5 NPU pipeline stages.
17 |  * Input Ports: 
18 |  * - VLIW NPU instructions (from NPU top-level module)
19 |  * Output Ports:
20 |  * - MVU uOP (to NPU datapath)
21 |  * - eVRF uOP (to NPU datapath)
22 |  * - MFU0 uOP (to NPU datapath)
23 |  * - MFU1 uOP (to NPU datapath)
24 |  * - Loader uOP (to NPU datapath)
25 |  */
26 | class Decoder : public Module {
27 | public:
28 |     // Constructor
29 |     Decoder (std::string t_name);
30 |     // Clock function: takes the cycle counter by reference. The signature differs from the parameterless virtual Module::clock(), so it hides that function rather than overriding it
31 |     void clock(unsigned int &cycle_count);
32 |     // Getter functions (getName() is redeclared here and hides the non-virtual Module::getName())
33 |     std::string getName();
34 |     Input<npu_instruction>* getPortInputVLIW();
35 |     Output<mvu_uOP>* getPortMVUuOP();
36 |     Output<evrf_uOP>* getPortEVRFuOP();
37 |     Output<mfu_uOP>* getPortMFU0uOP();
38 |     Output<mfu_uOP>* getPortMFU1uOP();
39 |     Output<ld_uOP>* getPortLDuOP();
40 |     // Destructor
41 |     ~Decoder();
42 | 
43 | private:
44 |     // Module name (a separate member from the private name stored in Module)
45 |     std::string name;
46 |     // Input and Output ports: one VLIW input, one uOP output per pipeline stage
47 |     Input<npu_instruction>* vliw;
48 |     Output<mvu_uOP>*  mvu_uOP_port;
49 |     Output<evrf_uOP>* evrf_uOP_port;
50 |     Output<mfu_uOP>*  mfu0_uOP_port;
51 |     Output<mfu_uOP>*  mfu1_uOP_port;
52 |     Output<ld_uOP>*   ld_uOP_port;
53 |     // Internal channels: one mOP channel per pipeline stage
54 |     Channel<mvu_mOP>*  mvu_mOP_channel;
55 |     Channel<evrf_mOP>* evrf_mOP_channel;
56 |     Channel<mfu_mOP>*  mfu0_mOP_channel;
57 |     Channel<mfu_mOP>*  mfu1_mOP_channel;
58 |     Channel<ld_mOP>*   ld_mOP_channel;
59 |     // Local variables for decoding logic (no in-class initializers; remaining_rows is the only signed counter)
60 |     unsigned int mvu_counter;
61 |     unsigned int mvu_pipeline_counter;
62 |     unsigned int mvu_chunk_counter;
63 |     unsigned int reg_sel_flag;
64 |     int remaining_rows;
65 |     unsigned int acc_size;
66 |     unsigned int evrf_counter;
67 |     unsigned int evrf_batch_counter;
68 |     unsigned int mfu0_counter;
69 |     unsigned int mfu0_batch_counter;
70 |     unsigned int mfu1_counter;
71 |     unsigned int mfu1_batch_counter;
72 |     unsigned int ld_counter;
73 |     unsigned int ld_batch_counter;
74 |     bool decoding_mvu;
75 |     bool decoding_evrf;
76 |     bool decoding_mfu0;
77 |     bool decoding_mfu1;
78 |     bool decoding_ld;
79 |     npu_instruction inst;
80 |     mvu_uOP u1; evrf_uOP u2; mfu_uOP u3; mfu_uOP u4; ld_uOP u5; // one uOP object per stage type (MVU, eVRF, MFU, MFU, Loader)
81 |     mvu_mOP m1; evrf_mOP m2; mfu_mOP m3; mfu_mOP m4; ld_mOP m5; // one mOP object per stage type (MVU, eVRF, MFU, MFU, Loader)
82 |     unsigned int row_count; unsigned int col_count;
83 |     unsigned int tile_id; unsigned int pue_id;
84 |     unsigned int x_size; unsigned int y_size; unsigned int chunks_per_tile;
85 | };
86 | 
87 | #endif
88 | 


--------------------------------------------------------------------------------
/simulator/inc/defines.h:
--------------------------------------------------------------------------------
 1 | #ifndef DEFINES_H_
 2 | #define DEFINES_H_
 3 | 
 4 | #include <string>
 5 | #include <iostream>
 6 | 
 7 | // Debug Messages (compile-time flags: 1 = on, 0 = off)
 8 | #define VERBOSE_OP 1
 9 | #define VERBOSE_MVU 1
10 | #define VERBOSE_LD_OUT 0
11 | 
12 | // Architecture Parameters
13 | #define TILES 7
14 | #define DPES 40
15 | #define LANES 40
16 | #define MVU_VRF_DEPTH 512
17 | #define MVU_MRF_DEPTH 1024
18 | #define EVRF_DEPTH 512
19 | #define MFU_VRF0_DEPTH 512
20 | #define MFU_VRF1_DEPTH 512
21 | #define FIFO_DEPTH 512
22 | 
23 | // Latency Parameters
24 | #define DPE_MULT_LATENCY 2
25 | #define DPE_ADDER_LATENCY 1
26 | #define RF_WRITE_LATENCY 1
27 | #define RF_READ_LATENCY 1
28 | #define MRF_TO_DPE_LATENCY 8
29 | #define VRF_TO_DPE_LATENCY 8
30 | #define MVU_ACCUM_LATENCY 4
31 | #define MVU_REDUCTION_LATENCY (unsigned int)(ceil(log2(TILES))+5) // calls ceil()/log2(); this header includes no math header, so the including file must declare them (e.g. <math.h>)
32 | #define MFU_ACT_LATENCY 3
33 | #define MFU_ADD_LATENCY 3
34 | #define MFU_MUL_LATENCY 3
35 | #define MFU_LATENCY (MFU_ACT_LATENCY+MFU_ADD_LATENCY+MFU_MUL_LATENCY) // parenthesized so the sum stays intact when the macro is used inside a larger expression
36 | #define LD_WB_LATENCY 5
37 | 
38 | // Precision (the masks select the low 8 bits, bits 8-31, and bit 7 of a 32-bit value respectively)
39 | #define TYPE int
40 | #define INPUT_PRECISION 8
41 | #define MASK_TRUNCATE 0x000000FF
42 | #define MASK_SIGN_EXTEND 0xFFFFFF00
43 | #define MASK_SIGN_CHECK 0x00000080
44 | // LOG prints "[<module_name> @ <cycle_count>]: <msg>" to std::cout; a variable named cycle_count must be in scope where the macro is expanded
45 | #define LOG(module_name, msg) do { \
46 | std::cout << "[" << module_name << " @ " << cycle_count << "]: " << msg << std::endl; \
47 | } while (0)
48 | 
49 | #endif
50 | 


--------------------------------------------------------------------------------
/simulator/inc/dpe.h:
--------------------------------------------------------------------------------
 1 | #ifndef DPE_H_
 2 | #define DPE_H_
 3 | 
 4 | #include <string>
 5 | #include <vector>
 6 | #include <tuple>
 7 | #include <math.h>
 8 | #include <iostream>
 9 | #include <assert.h>
10 | #include "module.h"
11 | #include "input.h"
12 | #include "output.h"
13 | #include "defines.h"
14 | #include "utils.h"
15 | 
16 | /* 
17 |  * This class implements the MVU dot product engine (DPE) based on the Stratix 10 NX tensor blocks.
18 |  * Each DPE implements a batch-3 dot product operation (i.e. 1 shared vector multiplied by 3 other
19 |  * input vectors).
20 |  * Input Ports:
21 |  * - shared input vector (vBroadcast)
22 |  * - sequentially loaded input vectors (vSeq)
23 |  * - control signals (reg_sel, vrf_en)
24 |  * Output Ports:
25 |  * - 3 dot product results (dpe_res0, dpe_res1, dpe_res2)
26 |  */
27 | class DPE : public Module { 
28 | public:
29 | 	// Constructor: module name plus the DPE and tile identifiers
30 | 	DPE (std::string t_name, unsigned int t_dpe_id, unsigned int t_tile_id);
31 | 	// Clock function: same signature as the virtual Module::clock(), so it overrides it (no override keyword is used)
32 | 	void clock();
33 | 	// Getter functions (getName() is redeclared here and hides the non-virtual Module::getName())
34 | 	std::string getName();
35 | 	Input<std::vector<TYPE>> *getPortVSeq();
36 | 	Input<std::vector<TYPE>> *getPortVBroadcast();
37 | 	Input<unsigned int> *getPortRegSel();
38 | 	Input<unsigned int> *getPortVrfEn();
39 | 	Output<TYPE> *getPortDPERes(unsigned int i);
40 | 	// Destructor
41 | 	~DPE();
42 | 
43 | private:
44 | 	// Module name (a separate member from the private name stored in Module)
45 | 	std::string name;
46 | 	// Input and Output ports
47 | 	Input<std::vector<TYPE>>* vSeq;
48 | 	Input<std::vector<TYPE>>* vBroadcast;
49 | 	Input<unsigned int>* reg_sel;
50 | 	Input<unsigned int>* vrf_en;
51 | 	Output<TYPE>* dpe_res0;
52 | 	Output<TYPE>* dpe_res1;
53 | 	Output<TYPE>* dpe_res2;
54 | 	// Internal channels
55 | 	Channel<TYPE>* dpe_result0_channel;
56 | 	Channel<TYPE>* dpe_result1_channel;
57 | 	Channel<TYPE>* dpe_result2_channel;
58 | 	Channel<std::vector<TYPE>>* pingpong0;
59 | 	Channel<std::vector<TYPE>>* pingpong1;
60 | 	Channel<std::vector<TYPE>>* broadcast_delay;
61 | 	Channel<unsigned int>* input_sel_delay;
62 | 	Channel<unsigned int>* reg_sel_delay;
63 | 	Channel<unsigned int>* vrf_en_delay;
64 | 	// Local variables
65 | 	unsigned int dpe_id;
66 | 	unsigned int tile_id;
67 | 	// Local latency variables, initialized in declaration order (the later initializers read num_prime_dsps)
68 | 	unsigned int num_prime_dsps = (unsigned int) ceil(1.0 * LANES / 10.0); // LANES / 10 rounded up
69 | 	unsigned int dpe_result_latency = (unsigned int) 2 + (ceil(log2(num_prime_dsps)) * 
70 | 		DPE_ADDER_LATENCY); // the cast applies only to the literal 2; the sum is evaluated as a double and converted to unsigned int on initialization
71 | 	unsigned int pingpong_length = 3 * (1 + num_prime_dsps);
72 | 	int accum_val = 0;
73 | };
74 | 
75 | #endif
76 | 		
77 | 


--------------------------------------------------------------------------------
/simulator/inc/evrf.h:
--------------------------------------------------------------------------------
 1 | #ifndef EVRF_H
 2 | #define EVRF_H
 3 | 
 4 | #include <vector>
 5 | #include <string>
 6 | #include "input.h"
 7 | #include "output.h"
 8 | #include "register_file.h"
 9 | #include "inst.h"
10 | #include "defines.h"
11 | 
12 | /* 
13 |  * This class implements the external VRF (eVRF) module which is used to skip the MVU if an
14 |  * instruction chain does not have an MVU operation.
15 |  * Input Ports:
16 |  * - eVRF input (from previous block in pipeline -- MVU)
17 |  * - eVRF uOP (from decoder)
18 |  * - eVRF write data (from Loader)
19 |  * - eVRF write address (from Loader)
20 |  * - update tag (from Loader)
21 |  * Output Ports:
22 |  * - eVRF output (to next block in pipeline -- MFU0)
23 |  */
24 | class EVRF : public Module {
25 | public:
26 |     // Constructor
27 |     EVRF (std::string t_name);
28 |     // Clock function: takes the cycle counter by reference. The signature differs from the parameterless virtual Module::clock(), so it hides that function rather than overriding it
29 |     void clock(unsigned int &cycle_count);
30 |     // Getter functions (getName() is redeclared here and hides the non-virtual Module::getName())
31 |     std::string getName();
32 |     Input<std::vector<TYPE>>* getPortInput();
33 |     Input<evrf_uOP>* getPortuOP();
34 |     Input<std::vector<TYPE>>* getPortEvrfWdata();
35 |     Input<unsigned int>* getPortEvrfWaddr();
36 |     Input<bool>* getPortUpdateTag();
37 |     Output<std::vector<TYPE>>* getPortRes();
38 |     // Destructor
39 |     ~EVRF();
40 | 
41 | private:
42 |     // Module name (a separate member from the private name stored in Module)
43 |     std::string name;
44 |     // Input and Output ports
45 |     Input<std::vector<TYPE>>* evrf_input;
46 |     Input<evrf_uOP>* uOP;
47 |     Input<std::vector<TYPE>>* evrf_wdata;
48 |     Input<unsigned int>* evrf_waddr;
49 |     Input<bool>* update_tag;
50 |     Output<std::vector<TYPE>>* evrf_result;
51 |     // Internal modules: the register file holding vector entries
52 |     RegisterFile<std::vector<TYPE>>* evrf;
53 |     // Internal channels (NOTE(review): evrf_raddr/evrf_rdata presumably connect to the register file's read ports -- confirm in the definition file)
54 |     Channel<std::vector<TYPE>>* mvu_channel;
55 |     Channel<unsigned int>* evrf_raddr;
56 |     Channel<std::vector<TYPE>>* evrf_rdata;
57 |     // Local variables (the tag is an unsigned counter, while the update_tag port carries a bool)
58 |     unsigned int current_tag;
59 | };
60 | 
61 | #endif
62 | 


--------------------------------------------------------------------------------
/simulator/inc/input.h:
--------------------------------------------------------------------------------
 1 | #ifndef INPUT_H_
 2 | #define INPUT_H_
 3 | 
 4 | #include <string>
 5 | #include <vector>
 6 | #include <assert.h>
 7 | #include "port.h"
 8 | #include "module.h"
 9 | #include "defines.h"
10 | #include "inst.h"
11 | #include "channel.h"
12 | 
13 | /* 
14 |  * This class implements an input port for a module. Each input port is connected to a single channel.
15 |  */
16 | template <class T>
17 | class Input : public Port<T> 
18 | {
19 | public: 
20 | 	// Constructor: port name and the module the port belongs to
21 | 	Input(std::string t_name, Module *t_module);
22 | 	// Helper functions (declared only; NOTE(review): these presumably forward to the connected channel's read()/peek()/isEmpty() -- confirm in the definitions)
23 | 	void connectTo(Channel<T> *t_channel);
24 | 	T readFromChannel();
25 | 	T peekChannel();
26 | 	bool isChannelEmpty();
27 | 	// Destructor
28 | 	~Input();
29 | 
30 | private:
31 | 	Channel<T>* channel; // the one channel this port reads from (raw pointer)
32 | };
33 | 
34 | #endif


--------------------------------------------------------------------------------
/simulator/inc/loader.h:
--------------------------------------------------------------------------------
 1 | #ifndef LOADER_H
 2 | #define LOADER_H
 3 | 
 4 | #include <vector>
 5 | #include <queue>
 6 | #include <string>
 7 | #include "input.h"
 8 | #include "output.h"
 9 | #include "inst.h"
10 | #include "utils.h"
11 | #include "defines.h"
12 | 
13 | /* 
14 |  * This class implements the loader module which writes the datapath results back to one of the
15 |  * NPU architectural states (VRFs).
16 |  * Input Ports:
17 |  * - Loader input (from previous block in pipeline -- MFU1)
18 |  * - Loader uOP (from decoder)
19 |  * Output Ports:
20 |  * - MVU VRFs write data (to MVU)
21 |  * - MVU VRFs write address (to MVU)
22 |  * - MVU tag update (to MVU)
23 |  * - eVRF write data (to eVRF)
24 |  * - eVRF write address (to eVRF)
25 |  * - eVRF tag update (to eVRF)
26 |  * - MFU0 VRF0 write data (to MFU0)
27 |  * - MFU0 VRF0 write address (to MFU0)
28 |  * - MFU0 VRF1 write data (to MFU0)
29 |  * - MFU0 VRF1 write address (to MFU0)
30 |  * - MFU0 tag update (to MFU0)
31 |  * - MFU1 VRF0 write data (to MFU1)
32 |  * - MFU1 VRF0 write address (to MFU1)
33 |  * - MFU1 VRF1 write data (to MFU1)
34 |  * - MFU1 VRF1 write address (to MFU1)
35 |  * - MFU1 tag update (to MFU1)
36 |  * - Loader output port (to tester)
37 |  */
38 | class LD : public Module {
39 | public:
40 |     // Constructor
41 |     LD (std::string t_name);
42 |     // Clock function: takes the cycle counter by reference. The signature differs from the parameterless virtual Module::clock(), so it hides that function rather than overriding it
43 |     void clock(unsigned int &cycle_count);
44 |     // Getter functions (the MVU write-port getters take an index because those ports are stored in vectors)
45 |     std::string getName();
46 |     Input<ld_uOP>* getPortuOP();
47 |     Input<std::vector<TYPE>>* getPortInput();
48 |     Output<std::vector<TYPE>>* getPortMVUWdata(unsigned int idx);
49 |     Output<unsigned int>* getPortMVUWaddr(unsigned int idx);
50 |     Output<std::vector<TYPE>>* getPortEvrfWdata();
51 |     Output<unsigned int>* getPortEvrfWaddr();
52 |     Output<std::vector<TYPE>>* getPortMFU0Vrf0Wdata();
53 |     Output<unsigned int>* getPortMFU0Vrf0Waddr();
54 |     Output<std::vector<TYPE>>* getPortMFU0Vrf1Wdata();
55 |     Output<unsigned int>* getPortMFU0Vrf1Waddr();
56 |     Output<std::vector<TYPE>>* getPortMFU1Vrf0Wdata();
57 |     Output<unsigned int>* getPortMFU1Vrf0Waddr();
58 |     Output<std::vector<TYPE>>* getPortMFU1Vrf1Wdata();
59 |     Output<unsigned int>* getPortMFU1Vrf1Waddr();
60 |     Output<bool>* getPortUpdateMVU();
61 |     Output<bool>* getPortUpdateEvrf();
62 |     Output<bool>* getPortUpdateMFU0();
63 |     Output<bool>* getPortUpdateMFU1();
64 |     Output<std::vector<TYPE>>* getPortOutput();
65 |     // Destructor
66 |     ~LD();
67 | 
68 | private:
69 |     // Module name (a separate member from the private name stored in Module)
70 |     std::string name;
71 |     // Input and Output ports (MVU write ports are vectors; presumably one entry per MVU tile -- confirm in the definition file)
72 |     Input<ld_uOP>* uOP;
73 |     Input<std::vector<TYPE>>* ld_input;
74 |     std::vector<Output<std::vector<TYPE>>*> mvu_vrfs_wdata;
75 |     std::vector<Output<unsigned int>*> mvu_vrfs_waddr;
76 |     Output<std::vector<TYPE>>* evrf_wdata;
77 |     Output<unsigned int>* evrf_waddr;
78 |     Output<std::vector<TYPE>>* mfu0_vrf0_wdata;
79 |     Output<unsigned int>* mfu0_vrf0_waddr;
80 |     Output<std::vector<TYPE>>* mfu0_vrf1_wdata;
81 |     Output<unsigned int>* mfu0_vrf1_waddr;
82 |     Output<std::vector<TYPE>>* mfu1_vrf0_wdata;
83 |     Output<unsigned int>* mfu1_vrf0_waddr;
84 |     Output<std::vector<TYPE>>* mfu1_vrf1_wdata;
85 |     Output<unsigned int>* mfu1_vrf1_waddr;
86 |     Output<bool> *update_tag_mvu;
87 |     Output<bool> *update_tag_evrf;
88 |     Output<bool> *update_tag_mfu0;
89 |     Output<bool> *update_tag_mfu1;
90 |     Output<std::vector<TYPE>>* ld_output;
91 |     // Local variables: a plain std::queue of vectors (not a Channel)
92 |     std::queue<std::vector<TYPE>> input_fifo;
93 | };
94 | 
95 | #endif
96 | 


--------------------------------------------------------------------------------
/simulator/inc/mfu.h:
--------------------------------------------------------------------------------
 1 | #ifndef MFU_H
 2 | #define MFU_H
 3 | 
 4 | #include <vector>
 5 | #include <string>
 6 | #include <math.h>
 7 | #include <assert.h>
 8 | 
 9 | #include "input.h"
10 | #include "output.h"
11 | #include "register_file.h"
12 | #include "inst.h"
13 | #include "defines.h"
14 | #include "utils.h"
15 | 
16 | /* 
17 |  * This class implements the Multi-Function Unit (MFU) which performs vector element-wise 
18 |  * operations: activations {tanh, sigmoid, relu}, addition {add, sub_ab, sub_ba, max}, and
19 |  * multiplication {mult}.
20 |  * Input Ports:
21 |  * - MFU input (from previous block in pipeline -- eVRF for MFU0 or MFU0 for MFU1)
22 |  * - MFU uOP (from decoder)
23 |  * - VRF0 write data (from Loader)
24 |  * - VRF0 write address (from Loader)
25 |  * - VRF1 write data (from Loader)
26 |  * - VRF1 write address (from Loader)
27 |  * - Tag update (from Loader)
28 |  * Output Ports:
29 |  * - MFU output (to next block in pipeline -- MFU1 for MFU0 or Loader for MFU1)
30 |  */
31 | class MFU : public Module {
32 | public:
33 |     // Constructor
34 |     MFU (std::string t_name);
35 |     // Clock function: takes the cycle counter by reference. The signature differs from the parameterless virtual Module::clock(), so it hides that function rather than overriding it
36 |     void clock(unsigned int &cycle_count);
37 |     // Getter functions (getName() is redeclared here and hides the non-virtual Module::getName())
38 |     std::string getName();
39 |     Input<std::vector<TYPE>>* getPortInput();
40 |     Input<mfu_uOP>* getPortuOP();
41 |     Output<std::vector<TYPE>>* getPortRes();
42 |     Input<std::vector<TYPE>>* getPortVrf0Wdata();
43 |     Input<std::vector<TYPE>>* getPortVrf1Wdata();
44 |     Input<unsigned int>* getPortVrf0Waddr();
45 |     Input<unsigned int>* getPortVrf1Waddr();
46 |     Input<bool>* getPortUpdateTag();
47 |     // Destructor
48 |     ~MFU();
49 | 
50 | private:
51 |     // Module name (a separate member from the private name stored in Module)
52 |     std::string name;
53 |     // Input and Output ports
54 |     Input<std::vector<TYPE>>* mfu_input;
55 |     Input<mfu_uOP>* uOP;
56 |     Input<std::vector<TYPE>>* vrf0_wdata;
57 |     Input<unsigned int>* vrf0_waddr;
58 |     Input<std::vector<TYPE>>* vrf1_wdata;
59 |     Input<unsigned int>* vrf1_waddr;
60 |     Input<bool>* update_tag;
61 |     Output<std::vector<TYPE>>* mfu_result;
62 |     // Internal modules: two vector register files
63 |     RegisterFile<std::vector<TYPE>> *vrf0;
64 |     RegisterFile<std::vector<TYPE>> *vrf1;
65 |     // Internal channels (NOTE(review): act_out/add_out presumably carry results between the activation, add and multiply steps, and uOP_channel/uOP_pipeline carry the uOP alongside -- confirm in the definition file)
66 |     Channel<std::vector<TYPE>>* mfu_channel;
67 |     Channel<std::vector<TYPE>>* vrf0_rdata_channel;
68 |     Channel<unsigned int>* vrf0_raddr_channel;
69 |     Channel<std::vector<TYPE>>* vrf1_rdata_channel;
70 |     Channel<unsigned int>* vrf1_raddr_channel;
71 |     Channel<mfu_uOP>* uOP_channel;
72 |     Channel<std::vector<TYPE>>* act_out_channel;
73 |     Channel<std::vector<TYPE>>* add_out_channel;
74 |     Channel<mfu_uOP>* uOP_pipeline;
75 |     // Local variables (the tag is an unsigned counter, while the update_tag port carries a bool)
76 |     unsigned int current_tag;
77 | };
78 | 
79 | #endif
80 | 


--------------------------------------------------------------------------------
/simulator/inc/module.h:
--------------------------------------------------------------------------------
 1 | #ifndef MODULE_H_
 2 | #define MODULE_H_
 3 | 
 4 | #include <string>
 5 | 
 6 | /*
 7 |  * This header file defines the Module base class. Every other module in the simulated
 8 |  * architecture inherits this class. clock() is virtual with an empty default body (it is not pure virtual).
 9 |  */
10 | class Module {
11 | public:
12 | 	// Constructor: stores the module name
13 | 	Module(std::string t_name)	{ name = t_name; }
14 | 	virtual ~Module() {}
15 | 	// Getter functions (getName() is not virtual)
16 | 	std::string getName() { return name; }
17 | 	// Defines what happens in this module every clock cycle (analogous to always block); does nothing unless overridden 
18 | 	virtual void clock() { }
19 | 	
20 | private:
21 | 	// Module name (private, so derived classes reach it only through getName())
22 | 	std::string name;
23 | };
24 | 
25 | #endif 


--------------------------------------------------------------------------------
/simulator/inc/mvu.h:
--------------------------------------------------------------------------------
 1 | #ifndef MVU_H
 2 | #define MVU_H
 3 | 
 4 | #include <vector>
 5 | #include <string>
 6 | #include "input.h"
 7 | #include "output.h"
 8 | #include "tile.h"
 9 | #include "inst.h"
10 | #include "utils.h"
11 | #include "defines.h"
12 | 
13 | /* 
14 |  * This class implements the matrix-vector multiplication unit (MVU).
15 |  * Input Ports:
16 |  * - VRFs write data (from Loader)
17 |  * - VRFs write address (from Loader)
18 |  * - MVU uOP (from decoder)
19 |  * - update tag (from Loader)
20 |  * Output Ports:
21 |  * - MVU output (to next block in pipeline -- eVRF)
22 |  */
23 | class MVU : public Module {
24 | public:
25 |     // Constructor
26 |     MVU (std::string t_name);
27 |     // Clock function: takes the cycle counter by reference. The signature differs from the parameterless virtual Module::clock(), so it hides that function rather than overriding it
28 |     void clock(unsigned int &cycle_count);
29 |     // Getter functions (the VRF write-port getters take an index because those ports are stored in vectors)
30 |     std::string getName();
31 |     Input<std::vector<TYPE>>* getPortVrfWdata(unsigned int idx);
32 |     Input<unsigned int>* getPortVrfWaddr(unsigned int idx);
33 |     Input<mvu_uOP>* getPortuOP();
34 |     Input<bool>* getPortUpdateTag();
35 |     Output<std::vector<TYPE>>* getPortRes();
36 |     // Destructor
37 |     ~MVU();
38 | 
39 | private:
40 |     // Module name (a separate member from the private name stored in Module)
41 |     std::string name;
42 |     // Input and Output ports
43 |     std::vector<Input<std::vector<TYPE>>*> vrfs_wdata;
44 |     std::vector<Input<unsigned int>*> vrfs_waddr;
45 |     Input<mvu_uOP>* uOP;
46 |     Input<bool>* update_tag;
47 |     Output<std::vector<TYPE>>* mvu_results;
48 |     // Internal modules: the MVU tiles
49 |     std::vector<Tile*> mvu_tiles;
50 |     // Internal channels (tile_results0/1/2 are 2-D; presumably indexed [tile][DPE], one set per batch-3 output -- TODO confirm in the definition file)
51 |     std::vector<Channel<mvu_uOP>*> uOP_channels;
52 |     std::vector<std::vector<Channel<TYPE>*>> tile_results0;
53 |     std::vector<std::vector<Channel<TYPE>*>> tile_results1;
54 |     std::vector<std::vector<Channel<TYPE>*>> tile_results2;
55 |     Channel<std::vector<TYPE>>* reduction_channel0;
56 |     Channel<std::vector<TYPE>>* reduction_channel1;
57 |     Channel<std::vector<TYPE>>* reduction_channel2;
58 |     // Local variables (the tag is an unsigned counter, while the update_tag port carries a bool)
59 |     unsigned int current_tag;
60 | };
61 | 
62 | #endif
63 | 


--------------------------------------------------------------------------------
/simulator/inc/mvu_vrf.h:
--------------------------------------------------------------------------------
 1 | #ifndef MVU_VRF_H
 2 | #define MVU_VRF_H
 3 | 
 4 | #include <vector>
 5 | #include <iostream>
 6 | #include <string>
 7 | #include <tuple>
 8 | #include <assert.h>
 9 | #include <math.h>
10 | #include "input.h"
11 | #include "output.h"
12 | #include "register_file.h"
13 | #include "utils.h"
14 | #include "defines.h"
15 | 
16 | /* 
17 |  * This class implements the MVU vector register file (VRF). This module has the same interface as
18 |  * a conventional register file, but supplies the batch-3 inputs in sequence to be compatible with
19 |  * the Stratix 10 NX DPE. For a conventional DPE, a conventional register file would have been used.
20 |  * Input Ports:
21 |  * - VRF write data (from Loader)
22 |  * - VRF write address (from Loader)
23 |  * - VRF read address (from MVU uOP)
24 |  * - VRF select control signal (from MVU uOP)
25 |  * Output Ports:
26 |  * - VRF read data (to DPEs)
27 |  */
28 | class MVUVRF : public Module {
29 | public:
30 |     // Constructor: module name plus the identifier of the tile this VRF belongs to
31 |     MVUVRF (std::string t_name, unsigned int t_tile_id);
32 |     // Clock function: same signature as the virtual Module::clock(), so it overrides it (no override keyword is used)
33 |     void clock();
34 |     // Getter functions (there are no setters; getName() is inherited from Module)
35 |     Input<std::vector<TYPE>> *getPortVrfWdata();
36 |     Input<unsigned int> *getPortVrfWaddr();
37 |     Output<std::vector<TYPE>> *getPortVrfRdata();
38 |     Input<unsigned int> *getPortVrfRaddr();
39 |     Input<unsigned int> *getPortVrfSel();
40 |     // Destructor
41 |     ~MVUVRF();
42 | 
43 | private:
44 |     // Module name (a separate member from the private name stored in Module)
45 |     std::string name;
46 |     // Input and Output ports
47 |     Input<std::vector<TYPE>>* vrf_wdata;
48 |     Input<unsigned int>* vrf_waddr;
49 |     Output<std::vector<TYPE>>* vrf_rdata;
50 |     Input<unsigned int>* vrf_raddr;
51 |     Input<unsigned int>* vrf_sel;
52 |     // Internal modules: a vector of register files (presumably num_vrfs of them -- confirm in the definition file)
53 |     std::vector<RegisterFile<std::vector<TYPE>>*> vrfs;
54 |     // Internal channels: read/write address and data channels, stored as vectors
55 |     std::vector<Channel<unsigned int>*> vrf_raddr_channel;
56 |     std::vector<Channel<std::vector<TYPE>>*> vrf_rdata_channel;
57 |     std::vector<Channel<unsigned int>*> vrf_waddr_channel;
58 |     std::vector<Channel<std::vector<TYPE>>*> vrf_wdata_channel;
59 |     // Local variables
60 |     unsigned int tile_id;
61 |     unsigned int num_vrfs = LANES / 10; // integer division: truncates if LANES is not a multiple of 10
62 | };
63 | 
64 | #endif


--------------------------------------------------------------------------------
/simulator/inc/npu.h:
--------------------------------------------------------------------------------
 1 | #ifndef NPU_H
 2 | #define NPU_H
 3 | 
 4 | #include <vector>
 5 | #include "input.h"
 6 | #include "output.h"
 7 | #include "datapath.h"
 8 | #include "decoder.h"
 9 | #include "inst.h"
10 | #include "defines.h"
11 | 
12 | /* 
13 |  * This class implements the NPU top-level module consisting of datapath and instruction decoders.
14 |  * Input Ports: 
15 |  * - VLIW NPU instructions (from tester)
16 |  * Output Ports:
17 |  * - NPU final outputs (to tester) 
18 |  */
19 | class NPU : public Module {
20 | public:
21 |     // Constructor
22 |     NPU (std::string t_name);
23 |     // Clock function: takes the cycle counter by reference. The signature differs from the parameterless virtual Module::clock(), so it hides that function rather than overriding it
24 |     void clock(unsigned int &cycle_count);
25 |     // Getter functions (getName() is redeclared here and hides the non-virtual Module::getName())
26 |     std::string getName();
27 |     Input<npu_instruction>* getPortInst();
28 |     Output<std::vector<TYPE>>* getPortOutput();
29 |     // Destructor
30 |     ~NPU();
31 | 
32 | private:
33 |     // Module name (a separate member from the private name stored in Module)
34 |     std::string name;
35 |     // Input and Output ports
36 |     Input<npu_instruction>* npu_inst;
37 |     Output<std::vector<TYPE>>* npu_output;
38 |     // Internal modules: the datapath and the decoders, held by raw pointer
39 |     Datapath* npu_datapath;
40 |     Decoder* npu_decoders;
41 |     // Internal channels: one uOP channel per pipeline stage (presumably linking the decoder outputs to the datapath inputs -- confirm in the definition file)
42 |     Channel<mvu_uOP>* mvu_uOP_channel;
43 |     Channel<evrf_uOP>* evrf_uOP_channel;
44 |     Channel<mfu_uOP>* mfu0_uOP_channel;
45 |     Channel<mfu_uOP>* mfu1_uOP_channel;
46 |     Channel<ld_uOP>* ld_uOP_channel;
47 | };
48 | 
49 | #endif
50 | 


--------------------------------------------------------------------------------
/simulator/inc/output.h:
--------------------------------------------------------------------------------
 1 | #ifndef OUTPUT_H_
 2 | #define OUTPUT_H_
 3 | 
 4 | #include <string>
 5 | #include <vector> 
 6 | #include "defines.h"
 7 | #include "inst.h"
 8 | #include "port.h"
 9 | #include "module.h"
10 | #include "channel.h"
11 | 
12 | /* 
13 |  * This class implements an output port for a module. Each output port is connected to one or more
14 |  * outgoing channel(s), which are kept in a vector.
15 |  */
16 | template <class T>
17 | class Output : public Port<T>
18 | {
19 | public:
20 | 	// Constructor: port name and the module the port belongs to
21 | 	Output(std::string t_name, Module *t_module); 
22 | 	// Helper functions (declared only; NOTE(review): writeToChannel()/isChannelFull() presumably act on every connected channel -- confirm in the definitions)
23 | 	void connectTo(Channel<T> *t_channel);
24 | 	void writeToChannel(T t_data);
25 | 	bool isChannelFull();
26 | 	// Destructor
27 | 	~Output();
28 | 
29 | private:
30 | 	std::vector<Channel<T>*> channels;		// outgoing channels (raw pointers)
31 | };
32 | 
33 | #endif


--------------------------------------------------------------------------------
/simulator/inc/port.h:
--------------------------------------------------------------------------------
 1 | #ifndef PORT_H_
 2 | #define PORT_H_
 3 | 
 4 | #include <string>
 5 | #include <iostream>
 6 | #include <vector>
 7 | #include <cstring>
 8 | #include "inst.h"
 9 | #include "utils.h"
10 | #include "module.h"
11 | #include "channel.h"
12 | 
13 | /* 
14 |  * This class implements a module port. It is not used directly in the implementation of the 
15 |  * simulator; the Input and Output port classes both inherit from it.
16 |  */
17 | template <class T>
18 | class Port { 
19 | public:
20 | 	// Constructor: port name and the module the port belongs to
21 | 	Port (std::string t_name, Module *t_module);
22 | 	// Getter functions
23 | 	std::string getName();
24 | 	Module* getModule();
25 | 	virtual ~Port() {};
26 | 
27 | protected:
28 | 	// Port name (protected, so the Input and Output subclasses can access it)
29 | 	std::string name;
30 | 	// Module the port belongs to
31 | 	Module* module;
32 | };
33 | #endif
34 | 
35 | 


--------------------------------------------------------------------------------
/simulator/inc/register_file.h:
--------------------------------------------------------------------------------
 1 | #ifndef REGISTER_FILE_H_
 2 | #define REGISTER_FILE_H_
 3 | 
 4 | #include <string>
 5 | #include <vector>
 6 | #include <tuple>
 7 | #include <iostream>
 8 | #include <assert.h>
 9 | #include <type_traits>
10 | #include "module.h"
11 | #include "input.h"
12 | #include "output.h"
13 | #include "channel.h"
14 | #include "utils.h"
15 | #include "defines.h"
16 | 
17 | /* 
18 |  * This class implements a simple dual-port register file (1 read and 1 write ports) that is used 
19 |  * in different modules of the NPU.
20 |  * Input Ports:
21 |  * - VRF write data
22 |  * - VRF write address
23 |  * - VRF read address
24 |  * Output Ports:
25 |  * - VRF read data
26 |  */
27 | template<class T>
28 | class RegisterFile : public Module { 
29 | public:
30 | 	// Constructor: name, depth, and an optional pointer to a file name that defaults to nullptr (presumably used to preload contents -- confirm in the definition)
31 | 	RegisterFile(std::string t_name, unsigned int t_depth, std::string *t_file_name = nullptr);
32 | 	// Clock function: same signature as the virtual Module::clock(), so it overrides it (no override keyword is used)
33 | 	void clock();
34 | 	// Getter functions (getName() is inherited from Module; this class declares no name member of its own)
35 | 	Input<unsigned int>* getPortRaddr();
36 | 	Output<T>* getPortRdata();
37 | 	Input<unsigned int>* getPortWaddr();
38 | 	Input<T>* getPortWdata();
39 | 	// Helper functions
40 | 	void write();
41 | 	void read();
42 | 	void print();
43 | 	// Destructor
44 | 	~RegisterFile();
45 | 
46 | private:
47 | 	// Input and Output ports
48 | 	Input<unsigned int>* raddr;
49 | 	Output<T>* rdata;
50 | 	Input<unsigned int>* waddr;
51 | 	Input<T>* wdata;
52 | 	// Local variables (std::queue is not included by this header directly; it arrives through channel.h)
53 | 	std::vector<T> register_file;
54 | 	unsigned int depth;
55 | 	std::queue<std::tuple<unsigned int, unsigned int>> read_pipeline; // pairs of unsigned values; presumably (address, remaining latency) -- TODO confirm
56 | 	unsigned int reads_in_flight;
57 | 	std::queue<std::tuple<unsigned int, T, unsigned int>> write_pipeline; // (unsigned, T, unsigned) triples; presumably (address, data, remaining latency) -- TODO confirm
58 | 	unsigned int writes_in_flight;
59 | };
60 | 
61 | #endif
62 | 		
63 | 


--------------------------------------------------------------------------------
/simulator/inc/tile.h:
--------------------------------------------------------------------------------
 1 | #ifndef TILE_H
 2 | #define TILE_H
 3 | 
 4 | #include <vector>
 5 | #include <string>
 6 | #include "input.h"
 7 | #include "output.h"
 8 | #include "dpe.h"
 9 | #include "mvu_vrf.h"
10 | #include "register_file.h"
11 | #include "accumulator.h"
12 | #include "inst.h"
13 | #include "defines.h"
14 | 
/*
 * This class implements the matrix-vector multiplication unit (MVU) tile.
 * Input Ports:
 * - VRFs write data (from Loader)
 * - VRFs write address (from Loader)
 * - MVU uOP (from decoder)
 * Output Ports:
 * - MVU tile output 0 (to MVU reduction)
 * - MVU tile output 1 (to MVU reduction)
 * - MVU tile output 2 (to MVU reduction)
 */
class Tile : public Module {
public:
    // Constructor: takes the tile name and the tile ID
    Tile (std::string t_name, unsigned int t_tile_id);
    // Clock function
    void clock();
    // Getter functions for the VRF write data, VRF write address and uOP input ports
    Input<std::vector<TYPE>> *getPortVrfWdata();
    Input<unsigned int> *getPortVrfWaddr();
    Input<mvu_uOP> *getPortuOP();
    // Getter function for a result output port. The MVU calls this with accum in {0, 1, 2} and
    // idx in [0, DPES).
    // NOTE(review): accum presumably selects between accum0/1/2_results and idx the entry
    // within that vector -- confirm against the implementation.
    Output<TYPE> *getPortResults(unsigned int accum, unsigned int idx);
    // Destructor
    ~Tile();

private:
    // Module name
    std::string name;
    // Input and Output ports
    Input<std::vector<TYPE>>* vrf_wdata;
    Input<unsigned int>* vrf_waddr;
    Input<mvu_uOP>* uOP;
    std::vector<Output<TYPE>*> accum0_results;
    std::vector<Output<TYPE>*> accum1_results;
    std::vector<Output<TYPE>*> accum2_results;
    // Internal modules: vector register file, matrix register files, dot-product engines and
    // accumulators
    MVUVRF* vrf;
    std::vector<RegisterFile<std::vector<TYPE>>*> mrfs;
    std::vector<DPE*> dpes;
    std::vector<Accumulator*> accums;
    // Internal channels
    Channel<unsigned int>* vrf_raddr;
    Channel<unsigned int>* vrf_sel;
    std::vector<Channel<unsigned int>*> mrf_raddr;
    std::vector<Channel<std::vector<TYPE>>*> mrf_rdata;
    std::vector<Channel<unsigned int>*> mrf_waddr;
    std::vector<Channel<std::vector<TYPE>>*> mrf_wdata;
    std::vector<Channel<std::vector<TYPE>>*> vrf_to_dpe_channels;
    std::vector<Channel<std::vector<TYPE>>*> mrf_to_dpe_channels;
    std::vector<Channel<unsigned int>*> dpe_reg_sel_channels;
    std::vector<Channel<unsigned int>*> dpe_vrf_en_channels;
    std::vector<Channel<unsigned int>*> accum_uOP;
    std::vector<Channel<unsigned int>*> accum_size;
    std::vector<Channel<TYPE>*> accum0_channels;
    std::vector<Channel<TYPE>*> accum1_channels;
    std::vector<Channel<TYPE>*> accum2_channels;
    // Local variables
    unsigned int tile_id;
    unsigned int accum_latency;
    unsigned int reg_sel_latency;
};
76 | 
77 | #endif
78 | 


--------------------------------------------------------------------------------
/simulator/inc/utils.h:
--------------------------------------------------------------------------------
 1 | #ifndef UTILS_H
 2 | #define UTILS_H
 3 | 
 4 | #include <string>
 5 | #include <iostream>
 6 | #include <vector>
 7 | #include <queue>
 8 | #include <iomanip>
 9 | #include <fstream>
10 | #include <sstream>
11 | #include <assert.h>
12 | #include "defines.h"
13 | 
14 | /*
15 |  * This header file declares several utility functions used throughout the simulator
16 |  */
17 | 
// Operator overload for printing a vector to an output stream
template <typename T>
std::ostream& operator<< (std::ostream& out, const std::vector<T>& v);

// Used for populating vector register file contents from a file; the data read from file_name
// is returned through vec_data
void readVectorFile(std::string &file_name, std::vector<std::vector<TYPE>> &vec_data);

// Used for populating vector FIFO contents from a file; the data read from file_name is
// returned through que_data
void readVectorFile(std::string &file_name, std::queue<std::vector<TYPE>> &que_data);

// Operator overload for adding two vectors
// NOTE(review): presumably an element-wise sum of equally sized vectors -- confirm against the
// implementation
std::vector<TYPE> operator+ (const std::vector<TYPE> &v1, const std::vector<TYPE> &v2);

// Used for reading golden simulation outputs from a file into vec_data
// NOTE(review): v_size is presumably the number of elements per vector -- confirm against the
// implementation
template <typename T>
void readGoldenOutput(std::string &file_name, std::vector<T> &vec_data, int v_size);
34 | 
35 | #endif
36 | 


--------------------------------------------------------------------------------
/simulator/main/obj/README.md:
--------------------------------------------------------------------------------
1 | Directory for object files produced from simulation


--------------------------------------------------------------------------------
/simulator/register_files/README.md:
--------------------------------------------------------------------------------
1 | Directory for register files content created by compiler


--------------------------------------------------------------------------------
/simulator/src/accumulator.cpp:
--------------------------------------------------------------------------------
  1 | #include "accumulator.h"
  2 | 
  3 | // Reset helper function to set accumulated values to zeros
  4 | void Accumulator::reset(){
  5 |     accum0_values.erase(accum0_values.begin(), accum0_values.end());
  6 |     accum1_values.erase(accum1_values.begin(), accum1_values.end());
  7 |     accum2_values.erase(accum2_values.begin(), accum2_values.end());
  8 |     for(unsigned int i = 0; i < num_accum_values; i++){
  9 |         accum0_values.push_back(0);
 10 |         accum1_values.push_back(0);
 11 |         accum2_values.push_back(0);
 12 |     }
 13 | }
 14 | 
 15 | // Accumulator Constructor
 16 | Accumulator::Accumulator(std::string t_name, unsigned int t_accum_id) : Module (t_name) {
 17 | 	// Initialize local variables
 18 |     accum_id = t_accum_id;
 19 |     this->reset();
 20 | 	channel_full_count = 0;
 21 | 	accum_idx = 0;
 22 | 	// Create Input and Output ports
 23 |     input0 = new Input<TYPE>(t_name + "_input0", this);
 24 |     input1 = new Input<TYPE>(t_name + "_input1", this);
 25 |     input2 = new Input<TYPE>(t_name + "_input2", this);
 26 |     uOP = new Input<unsigned int>(t_name + "_uOP", this);
 27 |     size = new Input<unsigned int>(t_name + "_size", this);
 28 |     result0 = new Output<TYPE>(t_name + "_output0", this);
 29 |     result1 = new Output<TYPE>(t_name + "_output1", this);
 30 |     result2 = new Output<TYPE>(t_name + "_output2", this);
 31 | }
 32 | 
 33 | // Clock cycle update function
 34 | void Accumulator::clock(){
 35 | 	// If no input data/size or uOP ready, abort
 36 | 	if (input0->isChannelEmpty() || uOP->isChannelEmpty() || size->isChannelEmpty()) return;
 37 | 	
 38 | 	// Peek uOP and size to decide how to proceed
 39 | 	unsigned int temp_uOP = uOP->peekChannel();
 40 | 	unsigned int temp_size = size->peekChannel(); 
 41 | 	
 42 | 	//Accumlate input values
 43 | 	TYPE input0_data = input0->readFromChannel();
 44 | 	TYPE input1_data = input1->readFromChannel();
 45 | 	TYPE input2_data = input2->readFromChannel();
 46 | 	temp_uOP = uOP->readFromChannel();
 47 | 	temp_size = size->readFromChannel(); 
 48 | 	accum0_values[accum_idx] = accum0_values[accum_idx] + input0_data;
 49 | 	accum1_values[accum_idx] = accum1_values[accum_idx] + input1_data;
 50 | 	accum2_values[accum_idx] = accum2_values[accum_idx] + input2_data;
 51 | 
 52 | 	// Write out the final result & reset the accumulator
 53 | 	if (temp_uOP)   {
 54 | 		result0->writeToChannel(accum0_values[accum_idx]);
 55 | 		result1->writeToChannel(accum1_values[accum_idx]);
 56 | 		result2->writeToChannel(accum2_values[accum_idx]);
 57 | 		accum0_values[accum_idx] = 0;
 58 | 		accum1_values[accum_idx] = 0;
 59 | 		accum2_values[accum_idx] = 0;
 60 | 		channel_full_count = 0;
 61 | 	} 
 62 | 
 63 | 	// Update accumulator index
 64 | 	if(accum_idx == temp_size-1)
 65 | 		accum_idx = 0;
 66 | 	else
 67 | 		accum_idx++;
 68 | }
 69 | 
 70 | // Getter function for name
 71 | std::string Accumulator::getName() { 
 72 | 	return name; 
 73 | }
 74 | 
 75 | // Getter function for ID
 76 | unsigned int Accumulator::getId() { 
 77 | 	return accum_id; 
 78 | }
 79 | 
 80 | // Getter function for input ports
 81 | Input<TYPE>* Accumulator::getPortInput(unsigned int i) {
 82 |     if(i == 0) 
 83 |         return input0;
 84 |     else if (i == 1)
 85 |         return input1;
 86 |     else
 87 |         return input2; 
 88 | }
 89 | 
 90 | // Getter function for uOP input port
 91 | Input<unsigned int>* Accumulator::getPortuOP() { 
 92 | 	return uOP; 
 93 | }
 94 | 
 95 | // Getter function for port size
 96 | Input<unsigned int>* Accumulator::getPortSize() { 
 97 | 	return size; 
 98 | }
 99 | 
100 | // Getter function for output ports
101 | Output<TYPE>* Accumulator::getPortRes(unsigned int i) { 
102 |     if(i == 0)
103 |         return result0;
104 |     else if (i == 1)
105 |         return result1;
106 |     else
107 |         return result2; 
108 | }
109 | 
110 | // Destructor
111 | Accumulator::~Accumulator(){
112 | 	delete input0;
113 | 	delete input1;
114 | 	delete input2;
115 | 	delete uOP;
116 | 	delete size;
117 | 	delete result0;
118 | 	delete result1;
119 | 	delete result2;
120 | }


--------------------------------------------------------------------------------
/simulator/src/channel.cpp:
--------------------------------------------------------------------------------
  1 | #include "channel.h"
  2 | 
  3 | // Channel constructor
  4 | template <class T>
  5 | Channel<T>::Channel(std::string t_name, unsigned int t_size, unsigned int t_latency){
  6 | 	name = t_name;
  7 | 	size = t_size;
  8 | 	latency = t_latency;
  9 | }
 10 | 
 11 | // Helper function to write to a channel
 12 | template <class T>
 13 | void Channel<T>::write(T t_value){
 14 | 	if (this->isFull()) 
 15 | 		std::cerr << "Channel "<< name <<" buffer size "<<
 16 | 		  buffer.size() << " out of " << size << std::endl;
 17 | 	assert((!this->isFull()) && "Writing to a full channel");
 18 | 	buffer.push(std::make_tuple(t_value, latency));
 19 | }
 20 | 
 21 | // Helper function to read from a channel
 22 | template <class T>
 23 | T Channel<T>::read(){
 24 | 	assert((!buffer.empty() || (std::get<1>(buffer.front()) == 0)) && "Reading from empty channel");
 25 | 	T temp = std::get<0>(buffer.front());
 26 | 	buffer.pop();
 27 | 	return temp;
 28 | };
 29 | 
 30 | // Helper function to peek a channel (look at the next element in the channel)
 31 | template <class T>
 32 | T Channel<T>::peek(){
 33 |     assert((!buffer.empty() || (std::get<1>(buffer.front()) == 0)) && "Peeking an empty channel");
 34 |     T temp = std::get<0>(buffer.front());
 35 |     return temp;
 36 | };
 37 | 
 38 | // Helper function to get the element at a specific location in the channel
 39 | template <class T>
 40 | T Channel<T>::at(unsigned int idx){
 41 |     assert((buffer.size() > idx) && "Channel size is less that accessed index");
 42 |     unsigned int i = 0;
 43 |     T temp;
 44 |     std::tuple<T, unsigned int> temp_tuple;
 45 |     for(unsigned int itr = 0; itr < buffer.size(); itr++){
 46 |     	temp_tuple = buffer.front();
 47 |     	buffer.pop();
 48 |     	if(i == idx)
 49 |     		temp = std::get<0>(temp_tuple);
 50 |     	buffer.push(temp_tuple);
 51 |     	i++;
 52 |     }
 53 |     return temp;
 54 | };
 55 | 
 56 | // Helper function to check if channel is empty
 57 | template <class T>
 58 | bool Channel<T>::isEmpty(){
 59 | 	return buffer.empty() || (std::get<1>(buffer.front()) != 0);
 60 | }
 61 | 
 62 | // Helper function to check if channel is full
 63 | template <class T>
 64 | bool Channel<T>::isFull(){
 65 | 	return !(buffer.size() <= size);
 66 | }
 67 | 
 68 | // Clock cycle update function
 69 | template <class T>
 70 | void Channel<T>::clock(){
 71 |     if(!buffer.empty()){
 72 |         for(unsigned int i = 0; i < buffer.size(); i++){
 73 |             std::tuple<T, unsigned int> temp = buffer.front();
 74 |             buffer.pop();
 75 |             if(std::get<1>(temp) > 0){
 76 |                 std::get<1>(temp)--;
 77 |             }
 78 |             buffer.push(temp);
 79 |         }
 80 |     }
 81 | }
 82 | 
 83 | // Getter function for name
 84 | template <class T>
 85 | std::string Channel<T>::getName () { 
 86 |     return name; 
 87 | }
 88 | 
 89 | // Getter function for size
 90 | template <class T>
 91 | unsigned int Channel<T>::getSize() { 
 92 |     return buffer.size(); 
 93 | }
 94 | 
// Explicit instantiations: the Channel member functions are defined in this source file, so
// every element type a channel is used with has to be instantiated here.
// Data types
template class Channel<TYPE>;
template class Channel<std::vector<TYPE>>;
template class Channel<bool>;
template class Channel<unsigned int>;
// uOP types
template class Channel<mvu_uOP>;
template class Channel<evrf_uOP>;
template class Channel<mfu_uOP>;
template class Channel<ld_uOP>;
// Instruction type
template class Channel<npu_instruction>;
// mOP types
template class Channel<mvu_mOP>;
template class Channel<evrf_mOP>;
template class Channel<mfu_mOP>;
template class Channel<ld_mOP>;


--------------------------------------------------------------------------------
/simulator/src/dpe.cpp:
--------------------------------------------------------------------------------
  1 | #include "dpe.h"
  2 | 
  3 | // DPE Constructor
  4 | DPE::DPE (std::string t_name, unsigned int t_dpe_id, unsigned int t_tile_id) : Module(t_name) {
  5 | 	// Create Input and Output ports
  6 | 	vSeq = new Input<std::vector<TYPE>>(t_name + "_vSeq", this);
  7 | 	vBroadcast = new Input<std::vector<TYPE>>(t_name + "_vBroadcast", this);
  8 | 	reg_sel = new Input<unsigned int>(t_name + "_reg_sel", this);
  9 | 	vrf_en = new Input<unsigned int>(t_name + "_vrf_en", this);
 10 | 	dpe_res0  = new Output<TYPE>(t_name + "_dpe_res0", this);
 11 | 	dpe_res1  = new Output<TYPE>(t_name + "_dpe_res1", this);
 12 | 	dpe_res2  = new Output<TYPE>(t_name + "_dpe_res2", this);
 13 | 	// Create internal channels
 14 | 	dpe_result0_channel = new Channel<TYPE>(t_name + "_dpe_result0_channel", dpe_result_latency, 
 15 | 		dpe_result_latency);
 16 | 	dpe_result1_channel = new Channel<TYPE>(t_name + "_dpe_result1_channel", dpe_result_latency, 
 17 | 		dpe_result_latency);
 18 | 	dpe_result2_channel = new Channel<TYPE>(t_name + "_dpe_result2_channel", dpe_result_latency, 
 19 | 		dpe_result_latency);
 20 | 	pingpong0 = new Channel<std::vector<TYPE>>(t_name + "_pingpong0_channel", pingpong_length, 
 21 | 		pingpong_length);
 22 | 	pingpong1 = new Channel<std::vector<TYPE>>(t_name + "_pingpong1_channel", pingpong_length, 
 23 | 		pingpong_length);
 24 | 	broadcast_delay = new Channel<std::vector<TYPE>>(t_name + "_broadcast_delay_channel", 
 25 | 		pingpong_length, pingpong_length);
 26 | 	input_sel_delay = new Channel<unsigned int>(t_name + "_input_sel_delay_channel", 
 27 | 		pingpong_length, pingpong_length);
 28 | 	reg_sel_delay = new Channel<unsigned int>(t_name + "_input_reg_delay_channel", 3, 3);
 29 | 	vrf_en_delay = new Channel<unsigned int>(t_name + "_vrf_en_delay_channel", 3, 3);
 30 | 	// Initialize local variables
 31 | 	dpe_id = t_dpe_id;
 32 | 	tile_id = t_tile_id;
 33 | }
 34 | 
 35 | // Dot product helper function
 36 | TYPE dot_product(std::vector<TYPE> &v1, std::vector<TYPE> &v2){
 37 | 	TYPE result = 0;
 38 | 	for(unsigned int i = 0; i < LANES; i++){
 39 | 		result += (v1[i] * v2[i]);
 40 | 	}
 41 | 	return result;
 42 | }
 43 | 
 44 | // Clock cycle update function
 45 | void DPE::clock() {
 46 | 	std::vector<TYPE> temp_vSeq, temp_vBroadcast;
 47 | 	TYPE dpe_result0, dpe_result1, dpe_result2;
 48 | 	// Write output results when ready
 49 | 	if(!dpe_result0_channel->isEmpty() && !dpe_res0->isChannelFull()){
 50 | 		dpe_res0->writeToChannel(dpe_result0_channel->read());
 51 | 		dpe_res1->writeToChannel(dpe_result1_channel->read());
 52 | 		dpe_res2->writeToChannel(dpe_result2_channel->read());
 53 | 	}
 54 | 	// Prepare operands
 55 | 	if(!broadcast_delay->isEmpty() && !dpe_result0_channel->isFull()){
 56 | 		std::vector<TYPE> v0, v1, v2, temp, vb;
 57 | 		unsigned int input_sel = input_sel_delay->read();
 58 | 		if(input_sel == 0){
 59 | 			for(unsigned int i = 0; i < (LANES/10 * 3); i++){
 60 | 				temp = pingpong0->at(i);
 61 | 				if(i % 3 == 0){			
 62 | 					v0.insert(v0.end(), temp.begin(), temp.end());
 63 | 				} else if (i % 3 == 1) {
 64 | 					v1.insert(v1.end(), temp.begin(), temp.end());
 65 | 				} else {
 66 | 					v2.insert(v2.end(), temp.begin(), temp.end());
 67 | 				}
 68 | 			}
 69 | 		} else {
 70 | 			for(unsigned int i = 0; i < (LANES/10 * 3); i++){
 71 | 				temp = pingpong1->at(i);
 72 | 				if(i % 3 == 0){			
 73 | 					v0.insert(v0.end(), temp.begin(), temp.end());
 74 | 				} else if (i % 3 == 1) {
 75 | 					v1.insert(v1.end(), temp.begin(), temp.end());
 76 | 				} else {
 77 | 					v2.insert(v2.end(), temp.begin(), temp.end());
 78 | 				}
 79 | 			}
 80 | 		}
 81 | 		vb = broadcast_delay->read();
 82 | 		// Perform computation
 83 | 		dpe_result0 = dot_product(vb, v0);
 84 | 		dpe_result1 = dot_product(vb, v1);
 85 | 		dpe_result2 = dot_product(vb, v2);
 86 | 		// Write dot product results to delay channels
 87 | 		dpe_result0_channel->write(dpe_result0);
 88 | 		dpe_result1_channel->write(dpe_result1);
 89 | 		dpe_result2_channel->write(dpe_result2);
 90 | 	}
 91 | 
 92 |     // Accept new inputs
 93 |     if(!vSeq->isChannelEmpty() && !vBroadcast->isChannelEmpty() 
 94 |     	&& !reg_sel->isChannelEmpty() && !broadcast_delay->isFull()) {
 95 |         
 96 |         temp_vSeq = vSeq->readFromChannel();
 97 |         temp_vBroadcast = vBroadcast->readFromChannel();
 98 |         unsigned int temp_reg_sel = reg_sel->readFromChannel();
 99 |         unsigned int temp_vrf_en = vrf_en->readFromChannel();
100 |         unsigned int delayed_reg_sel;
101 |         unsigned int delayed_vrf_en;
102 |         if(!reg_sel_delay->isEmpty()){
103 |         	delayed_reg_sel = reg_sel_delay->read();
104 |         	delayed_vrf_en = vrf_en_delay->read();
105 |         } else {
106 |         	delayed_reg_sel = temp_reg_sel;
107 |         	delayed_vrf_en = temp_vrf_en;
108 |         }
109 | 
110 |         if(!pingpong0->isEmpty() && delayed_reg_sel == 0 && (delayed_vrf_en == 1))
111 |         	pingpong0->read();
112 |         if(!pingpong1->isEmpty() && delayed_reg_sel == 1 && (delayed_vrf_en == 1))
113 |         	pingpong1->read();
114 | 
115 |         if((temp_reg_sel == 0) && (temp_vrf_en == 1)){
116 |         	pingpong0->write(temp_vSeq);
117 |         } else if ((temp_reg_sel == 1) && (temp_vrf_en == 1)){
118 |         	pingpong1->write(temp_vSeq);
119 |         }
120 | 
121 |         if(((temp_reg_sel == 0) && (temp_vrf_en == 1)) || (delayed_reg_sel == 0 && 
122 |         	delayed_vrf_en == 1))
123 |         		pingpong0->clock();
124 | 
125 |        	if(((temp_reg_sel == 1) && (temp_vrf_en == 1)) || (delayed_reg_sel == 1 && 
126 |        		delayed_vrf_en == 1))
127 |        			pingpong1->clock();
128 | 
129 |         broadcast_delay->write(temp_vBroadcast);
130 |         input_sel_delay->write(temp_reg_sel);   
131 |         reg_sel_delay->write(temp_reg_sel);
132 |         vrf_en_delay->write(temp_vrf_en);
133 |     } else if(!reg_sel_delay->isEmpty()){
134 |     	unsigned int delayed_reg_sel = reg_sel_delay->read();
135 |     	unsigned int delayed_vrf_en = vrf_en_delay->read();
136 | 
137 |         if(!pingpong0->isEmpty() && delayed_reg_sel == 0 && delayed_vrf_en == 1)
138 |         	pingpong0->read();
139 |         if(!pingpong1->isEmpty() && delayed_reg_sel == 1 && delayed_vrf_en == 1)
140 |         	pingpong1->read();
141 | 
142 |         if(delayed_reg_sel == 0 && delayed_vrf_en == 1)
143 |         	pingpong0->clock();
144 | 
145 |        	if(delayed_reg_sel == 1 && delayed_vrf_en == 1)
146 |        		pingpong1->clock();
147 |     }
148 | 
149 |     // Clock internal channels
150 |     broadcast_delay->clock();
151 |     input_sel_delay->clock();
152 |     reg_sel_delay->clock();
153 |     vrf_en_delay->clock();
154 |     dpe_result0_channel->clock();
155 |     dpe_result1_channel->clock();
156 |     dpe_result2_channel->clock();
157 | }
158 | 
159 | // Getter function for name
160 | std::string DPE::getName() { 
161 | 	return name; 
162 | }
163 | 
164 | // Getter function for sequentially loaded input port
165 | Input<std::vector<TYPE>>* DPE::getPortVSeq()  { 
166 | 	return vSeq; 
167 | }
168 | 
169 | // Getter function for broadcast input port
170 | Input<std::vector<TYPE>>* DPE::getPortVBroadcast()  { 
171 | 	return vBroadcast; 
172 | }
173 | 
174 | // Getter function for register select input port
175 | Input<unsigned int>* DPE::getPortRegSel() { 
176 | 	return reg_sel; 
177 | }
178 | 
179 | // Getter function for VRF enable input port
180 | Input<unsigned int>* DPE::getPortVrfEn() { 
181 | 	return vrf_en; 
182 | }
183 | 
184 | // Getter function for DPE output ports
185 | Output<TYPE>* DPE::getPortDPERes(unsigned int i) { 
186 | 	if(i == 0)
187 | 		return dpe_res0; 
188 | 	else if (i == 1)
189 | 		return dpe_res1;
190 | 	else
191 | 		return dpe_res2;
192 | }
193 | 
194 | DPE::~DPE() {
195 | 	delete vSeq;
196 | 	delete vBroadcast;
197 | 	delete reg_sel;
198 | 	delete vrf_en;
199 | 	delete dpe_res0;
200 | 	delete dpe_res1;
201 | 	delete dpe_res2;
202 | 	delete dpe_result0_channel;
203 | 	delete dpe_result1_channel;
204 | 	delete dpe_result2_channel;
205 | 	delete pingpong0;
206 | 	delete pingpong1;
207 | 	delete broadcast_delay;
208 | 	delete input_sel_delay;
209 | 	delete reg_sel_delay;
210 | 	delete vrf_en_delay;
211 | }


--------------------------------------------------------------------------------
/simulator/src/evrf.cpp:
--------------------------------------------------------------------------------
  1 | #include "evrf.h"
  2 | 
  3 | // eVRF Constructor
  4 | EVRF::EVRF(std::string t_name) : Module (t_name) {
  5 |     // Create Input and Output ports
  6 |     evrf_input = new Input<std::vector<TYPE>>(t_name + "_input", this);
  7 |     uOP = new Input<evrf_uOP>(t_name + "_uOP", this);
  8 |     update_tag = new Input<bool>(t_name + "_update_tag", this);
  9 |     evrf_result = new Output<std::vector<TYPE>>(t_name + "_result", this);
 10 | 
 11 |     // Create internal modules
 12 |     evrf = new RegisterFile<std::vector<TYPE>>(t_name, EVRF_DEPTH);
 13 |     evrf_wdata = evrf->getPortWdata();
 14 |     evrf_waddr = evrf->getPortWaddr();
 15 | 
 16 |     // Create internal channels
 17 |     mvu_channel = new Channel<std::vector<TYPE>>(t_name + "_mvu_channel", 
 18 |         RF_READ_LATENCY + 1, RF_READ_LATENCY + 1);
 19 |     evrf_raddr = new Channel<unsigned int>(t_name + "_evrf_raddr", 1, 0);
 20 |     evrf->getPortRaddr()->connectTo(evrf_raddr);
 21 |     evrf_rdata = new Channel<std::vector<TYPE>>(t_name + "_evrf_rdata", 1, 0);
 22 |     evrf->getPortRdata()->connectTo(evrf_rdata);
 23 | 
 24 |     // Initialize local variables
 25 |     current_tag = 0;
 26 | }
 27 | 
 28 | // Clock cycle update function
 29 | void EVRF::clock(unsigned int &cycle_count){
 30 |     // If uOP is ready to dispatch
 31 |     if(!uOP->isChannelEmpty()){
 32 |         // Peek ready uOP to decide how to proceed
 33 |         evrf_uOP temp = uOP->peekChannel();
 34 |         // If ready operation is NOP, read and ignore
 35 | 		if (temp.op == 0) {
 36 |             temp = uOP->readFromChannel();
 37 |             LOG(this->getName(), "NOP");
 38 | 
 39 |         // If ready operation is flush
 40 |         } else if (temp.op == 2 && !evrf_input->isChannelEmpty() && temp.tag <= current_tag) {
 41 |             evrf_input->readFromChannel();
 42 | 
 43 |         // If ready operation is bypass
 44 |         } else if(!temp.src && !mvu_channel->isFull() && !evrf_input->isChannelEmpty() && 
 45 |             temp.tag <= current_tag){
 46 |             mvu_channel->write(evrf_input->readFromChannel());
 47 |             temp = uOP->readFromChannel();
 48 |             if(temp.first_flag) {
 49 |                 LOG(this->getName(), "Issued first uOP " + std::to_string(temp.first_flag));
 50 |             }
 51 | 
 52 |         // If ready operation is read eVRF
 53 |         } else if (temp.src && !evrf_rdata->isFull() && temp.tag <= current_tag) {
 54 |             evrf_raddr->write(temp.vrf_addr);
 55 |             temp = uOP->readFromChannel();
 56 |             if(temp.first_flag) {
 57 |                 LOG(this->getName(), "Issued first uOP " + std::to_string(temp.first_flag));
 58 |             }
 59 |         }
 60 |     }
 61 | 
 62 |     // Write eVRF output when ready
 63 |     if(!mvu_channel->isEmpty()){
 64 |         evrf_result->writeToChannel(mvu_channel->read());
 65 |         LOG(this->getName(), "Produced Output");
 66 |     } else if (!evrf_rdata->isEmpty()){
 67 |         evrf_result->writeToChannel(evrf_rdata->read());
 68 |         LOG(this->getName(), "Produced Output");
 69 |     }
 70 | 
 71 |     // Update local instruction tag (if required)
 72 |     if(!update_tag->isChannelEmpty()){
 73 |         update_tag->readFromChannel();
 74 |         current_tag++;
 75 |     }
 76 | 
 77 |     // Clock internal modules
 78 |     evrf->clock();
 79 |     // Clock internal channels
 80 |     evrf_raddr->clock();
 81 |     evrf_rdata->clock();
 82 |     mvu_channel->clock();
 83 | }
 84 | 
 85 | // Getter function for name
 86 | std::string EVRF::getName() { 
 87 |     return name; 
 88 | }
 89 | 
 90 | // Getter function for eVRF input port
 91 | Input<std::vector<TYPE>>* EVRF::getPortInput() { 
 92 |     return evrf_input; 
 93 | }
 94 | 
 95 | // Getter function for uOP input port
 96 | Input<evrf_uOP>* EVRF::getPortuOP() { 
 97 |     return uOP; 
 98 | }
 99 | 
100 | // Getter function for eVRF write data input port
101 | Input<std::vector<TYPE>>* EVRF::getPortEvrfWdata() { 
102 |     return evrf_wdata; 
103 | }
104 | 
105 | // Getter function for eVRF write address input port
106 | Input<unsigned int>* EVRF::getPortEvrfWaddr() { 
107 |     return evrf_waddr; 
108 | }
109 | 
110 | // Getter function for update tag inputy port
111 | Input<bool>* EVRF::getPortUpdateTag() { 
112 |     return update_tag; 
113 | }
114 | 
115 | // Getter function for eVRF output port
116 | Output<std::vector<TYPE>>* EVRF::getPortRes() { 
117 |     return evrf_result; 
118 | }
119 | 
120 | EVRF::~EVRF() {
121 |     delete evrf_input;
122 |     delete uOP;
123 |     delete update_tag;
124 |     delete evrf_result;
125 |     delete evrf;
126 |     delete mvu_channel;
127 |     delete evrf_raddr;
128 |     delete evrf_rdata;
129 | }


--------------------------------------------------------------------------------
/simulator/src/input.cpp:
--------------------------------------------------------------------------------
 1 | #include "input.h"
 2 | 
 3 | // Input Port Constructor
 4 | template <class T>
 5 | Input<T>::Input(std::string t_name, Module *t_module) : Port<T>(t_name, t_module) { }
 6 | 
 7 | // Helper function for connecting an Input port to incoming channel
 8 | template <class T>
 9 | void Input<T>::connectTo(Channel<T> *t_channel) {
10 |     channel = t_channel;
11 | }
12 | 
13 | // Helper function for reading from the incoming channel connected to this port
14 | template <class T>
15 | T Input<T>::readFromChannel() {
16 |     return channel->read();
17 | }
18 | 
19 | // Helper function for peeking the contents of a channel connected to this port
20 | template <class T>
21 | T Input<T>::peekChannel() {
22 |     return channel->peek();
23 | }
24 | 
25 | // Helper function for checking if the channel connected to this port is empty
26 | template <class T>
27 | bool Input<T>::isChannelEmpty(){
28 |     assert((channel) && "no channel for input");
29 |     return channel->isEmpty();
30 | }
31 | 
32 | template <class T>
33 | Input<T>::~Input(){ channel = NULL; }
34 | 
// Explicit instantiations: the Input member functions are defined in this source file, so
// every element type an input port is used with has to be instantiated here.
// Data types
template class Input<TYPE>;	
template class Input<std::vector<TYPE>>;	
template class Input<bool>;	
template class Input<unsigned int>;
// uOP types
template class Input<mvu_uOP>;
template class Input<evrf_uOP>;
template class Input<mfu_uOP>;
template class Input<ld_uOP>;
// Instruction type
template class Input<npu_instruction>;


--------------------------------------------------------------------------------
/simulator/src/mvu.cpp:
--------------------------------------------------------------------------------
  1 | #include "mvu.h"
  2 | 
  3 | // MVU Constructor
  4 | MVU::MVU(std::string t_name) : Module (t_name) {
  5 |     // Create Input and Output ports
  6 |     update_tag = new Input<bool>(t_name + "_update_tag", this);
  7 |     uOP = new Input<mvu_uOP>(t_name + "_uOP", this);
  8 |     mvu_results = new Output<std::vector<TYPE>>(t_name + "_results", this);
  9 | 
 10 |     // Create internal modules
 11 |     for(unsigned int i = 0; i < TILES; i++){
 12 |         Tile* mvu_tile = new Tile(t_name+"_tile"+std::to_string(i), i);
 13 |         mvu_tiles.push_back(mvu_tile);
 14 |         vrfs_wdata.push_back(mvu_tile->getPortVrfWdata());
 15 |         vrfs_waddr.push_back(mvu_tile->getPortVrfWaddr());
 16 | 
 17 |         Channel<mvu_uOP>* uOP_channel = new Channel<mvu_uOP>(t_name+"_uOP" + std::to_string(i), 
 18 |             1, 0);
 19 |         uOP_channels.push_back(uOP_channel);
 20 |         mvu_tile->getPortuOP()->connectTo(uOP_channels[i]);
 21 | 
 22 |         std::vector<Channel<TYPE>*> temp_tile_results0, temp_tile_results1, temp_tile_results2;
 23 |         for(unsigned int j = 0; j < DPES; j++){
 24 |             Channel<TYPE> *temp0 = new Channel<TYPE>(t_name + "_tile" + std::to_string(i) + 
 25 |                 "_result0", 1, 0);
 26 |             Channel<TYPE> *temp1 = new Channel<TYPE>(t_name + "_tile" + std::to_string(i) + 
 27 |                 "_result1", 1, 0);
 28 |             Channel<TYPE> *temp2 = new Channel<TYPE>(t_name + "_tile" + std::to_string(i) + 
 29 |                 "_result2", 1, 0);
 30 |             mvu_tile->getPortResults(0, j)->connectTo(temp0);
 31 |             mvu_tile->getPortResults(1, j)->connectTo(temp1);
 32 |             mvu_tile->getPortResults(2, j)->connectTo(temp2);
 33 |             temp_tile_results0.push_back(temp0);
 34 |             temp_tile_results1.push_back(temp1);
 35 |             temp_tile_results2.push_back(temp2);
 36 |         }
 37 |         tile_results0.push_back(temp_tile_results0);
 38 |         tile_results1.push_back(temp_tile_results1);
 39 |         tile_results2.push_back(temp_tile_results2);
 40 |     }
 41 | 
 42 |     // Create internal channels
 43 |     reduction_channel0 = new Channel<std::vector<TYPE>>(t_name + "_reduction0",
 44 |             MVU_REDUCTION_LATENCY, MVU_REDUCTION_LATENCY-1);
 45 |     reduction_channel1 = new Channel<std::vector<TYPE>>(t_name + "_reduction1",
 46 |             MVU_REDUCTION_LATENCY, MVU_REDUCTION_LATENCY-1);
 47 |     reduction_channel2 = new Channel<std::vector<TYPE>>(t_name + "_reduction2",
 48 |             MVU_REDUCTION_LATENCY, MVU_REDUCTION_LATENCY-1);
 49 | 
 50 |     // Initialize local variables
 51 |     current_tag = 0;
 52 | }
 53 | 
 54 | // Clock cycle update function
 55 | void MVU::clock(unsigned int &cycle_count){
 56 |     // If uOP is ready to dispatch
 57 |     if(!uOP->isChannelEmpty()) {
 58 |         // Peek ready uOP to decide how to proceed
 59 |         mvu_uOP temp = uOP->peekChannel();
 60 | 
 61 |         // If ready operation is NOP, read and ignore
 62 | 		if (temp.op == 0) {
 63 |             temp = uOP->readFromChannel();
 64 |             LOG(this->getName(), "NOP");
 65 | 
 66 |         // If ready operation is not NOP, read and dispatch
 67 |         } else if (!uOP->isChannelEmpty() && temp.tag <= current_tag && 
 68 |             !uOP_channels[0]->isFull()) {
 69 |             temp = uOP->readFromChannel();
 70 |             if(temp.first_flag){
 71 |                 LOG(this->getName(), "Issued first uOP " + std::to_string(temp.first_flag/3));
 72 |             }
 73 |             for (unsigned int i = 0; i < TILES; i++) {
 74 |                 uOP_channels[i]->write(temp);
 75 |             }
 76 |         }
 77 |     }
 78 | 
 79 |     // Perform reduction of corresponding DPEs from different tiles
 80 |     if((!tile_results0[0][0]->isEmpty()) && (!reduction_channel0->isFull())){
 81 |         std::vector<TYPE> partial_results0(DPES);
 82 |         std::vector<TYPE> partial_results1(DPES);
 83 |         std::vector<TYPE> partial_results2(DPES);
 84 |         for(unsigned int i = 0; i < TILES; i++){
 85 |             for(unsigned int j = 0; j < DPES; j++){
 86 |                 partial_results0[j] += tile_results0[i][j]->read();
 87 |                 partial_results1[j] += tile_results1[i][j]->read();
 88 |                 partial_results2[j] += tile_results2[i][j]->read();
 89 |             }
 90 |         }
 91 |         reduction_channel0->write(partial_results0);
 92 |         reduction_channel1->write(partial_results1);
 93 |         reduction_channel2->write(partial_results2);
 94 |     }
 95 | 
 96 |     // Write MVU output when ready
 97 |     if((!reduction_channel0->isEmpty()) && (!mvu_results->isChannelFull())){
 98 |         // Read reduction result
 99 |         std::vector<TYPE> mvu_res_vec0 = reduction_channel0->read();
100 |         std::vector<TYPE> mvu_res_vec1 = reduction_channel1->read();
101 |         std::vector<TYPE> mvu_res_vec2 = reduction_channel2->read();
102 |         // Reshape it from vectors of length DPES to vectors of length LANES (Asymmetric FIFO)
103 |         for(unsigned int i = 0; i < (DPES/LANES); i++){
104 |             std::vector<TYPE> mvu_res_part0, mvu_res_part1, mvu_res_part2;
105 |             for(unsigned int j = 0; j < LANES; j++){
106 |                 mvu_res_part0.push_back(mvu_res_vec0[0]);
107 |                 mvu_res_part1.push_back(mvu_res_vec1[0]);
108 |                 mvu_res_part2.push_back(mvu_res_vec2[0]);
109 |                 mvu_res_vec0.erase(mvu_res_vec0.begin());
110 |                 mvu_res_vec1.erase(mvu_res_vec1.begin());
111 |                 mvu_res_vec2.erase(mvu_res_vec2.begin());
112 |             }
113 |             mvu_results->writeToChannel(mvu_res_part0);
114 |             mvu_results->writeToChannel(mvu_res_part1);
115 |             mvu_results->writeToChannel(mvu_res_part2);
116 |             LOG(this->getName(), "Produced Output");
117 | 			#if(VERBOSE_MVU)
118 | 	            std::cout << "MVU OUTPUT0: " << mvu_res_part0 << std::endl;
119 |                 std::cout << "MVU OUTPUT1: " << mvu_res_part1 << std::endl;
120 |                 std::cout << "MVU OUTPUT2: " << mvu_res_part2 << std::endl;
121 | 			#endif
122 |         }
123 |     }
124 | 
125 |     // Update local instruction tag (if required)
126 |     if(!update_tag->isChannelEmpty()){
127 |         update_tag->readFromChannel();
128 |         current_tag++;
129 |     }
130 | 
131 |     // Clock internal modules
132 |     for(unsigned int i = 0; i < TILES; i++){
133 |         mvu_tiles[i]->clock();
134 |     }
135 |     // Clock internal channels
136 |     for(unsigned int i = 0; i < TILES; i++){
137 |         uOP_channels[i]->clock();
138 |     }
139 |     reduction_channel0->clock();
140 |     reduction_channel1->clock();
141 |     reduction_channel2->clock();
142 | }
143 | 
144 | // Getter function for name
145 | std::string MVU::getName() { 
146 |     return name; 
147 | }
148 | 
149 | // Getter function for VRF write data input port
150 | Input<std::vector<TYPE>>* MVU::getPortVrfWdata(unsigned int idx) { 
151 |     return vrfs_wdata[idx]; 
152 | }
153 | 
154 | // Getter function for VRF write address input port
155 | Input<unsigned int>* MVU::getPortVrfWaddr(unsigned int idx) { 
156 |     return vrfs_waddr[idx]; 
157 | }
158 | 
159 | // Getter function for uOP input port
160 | Input<mvu_uOP>* MVU::getPortuOP() { 
161 |     return uOP; 
162 | }
163 | 
164 | // Getter function for update tag input port
165 | Input<bool>* MVU::getPortUpdateTag() { 
166 |     return update_tag; 
167 | }
168 | 
169 | // Getter function for MVU output port
170 | Output<std::vector<TYPE>>* MVU::getPortRes() { 
171 |     return mvu_results; 
172 | }
173 | 
174 | MVU::~MVU() {
175 |     delete uOP;
176 |     delete update_tag;
177 |     delete mvu_results;
178 |     for (unsigned int i = 0; i < TILES; i++) {
179 |         delete mvu_tiles[i];
180 |         delete uOP_channels[i];
181 |         for (unsigned int j = 0; j < DPES; j++) {
182 |             delete tile_results0[i][j];
183 |             delete tile_results1[i][j];
184 |             delete tile_results2[i][j];
185 |         }   
186 |     }
187 |     delete reduction_channel0;
188 |     delete reduction_channel1;
189 |     delete reduction_channel2;
190 | }


--------------------------------------------------------------------------------
/simulator/src/mvu_vrf.cpp:
--------------------------------------------------------------------------------
  1 | #include "mvu_vrf.h"
  2 | 
  3 | // MVU VRF Constructor
  4 | MVUVRF::MVUVRF (std::string t_name, unsigned int t_tile_id) : Module(t_name) {
  5 | 	// Create Input and Output ports
  6 | 	vrf_wdata = new Input<std::vector<TYPE>>(t_name + "_vrf_wdata", this);
  7 |     vrf_waddr = new Input<unsigned int>(t_name + "_vrf_waddr", this);
  8 |     vrf_rdata = new Output<std::vector<TYPE>>(t_name + "_vrf_rdata", this);
  9 |     vrf_raddr = new Input<unsigned int>(t_name + "_vrf_raddr", this);
 10 |     vrf_sel = new Input<unsigned int>(t_name + "_vrf_sel", this);
 11 | 	// Create internal modules and channels
 12 |     for(unsigned int i = 0; i < num_vrfs; i++){
 13 |     	RegisterFile<std::vector<TYPE>>* ivrf = new RegisterFile<std::vector<TYPE>>(t_name + 
 14 |     		"_vrf_" + std::to_string(i), MVU_VRF_DEPTH);
 15 |     	Channel<unsigned int>* ivrf_raddr = new Channel<unsigned int>(t_name + "_vrf_raddr_" + 
 16 |     		std::to_string(i), 1, 0);
 17 |     	Channel<std::vector<TYPE>>* ivrf_rdata = new Channel<std::vector<TYPE>>(t_name + 
 18 |     		"_vrf_rdata_" + std::to_string(i), 1, 0);
 19 |     	Channel<unsigned int>* ivrf_waddr = new Channel<unsigned int>(t_name + "_vrf_waddr_" + 
 20 |     		std::to_string(i), 1, 0);
 21 |     	Channel<std::vector<TYPE>>* ivrf_wdata = new Channel<std::vector<TYPE>>(t_name + 
 22 |     		"_vrf_wdata_" + std::to_string(i), 1, 0);
 23 |     	ivrf->getPortRaddr()->connectTo(ivrf_raddr);
 24 |     	ivrf->getPortRdata()->connectTo(ivrf_rdata);
 25 |     	ivrf->getPortWaddr()->connectTo(ivrf_waddr);
 26 |     	ivrf->getPortWdata()->connectTo(ivrf_wdata);
 27 |     	vrfs.push_back(ivrf);
 28 |     	vrf_raddr_channel.push_back(ivrf_raddr);
 29 |     	vrf_rdata_channel.push_back(ivrf_rdata);
 30 |     	vrf_waddr_channel.push_back(ivrf_waddr);
 31 |     	vrf_wdata_channel.push_back(ivrf_wdata);
 32 |     }
 33 | 	// Initialize local variables
 34 | 	tile_id = t_tile_id; 
 35 | }
 36 | 
 37 | // Clock cycle update function
 38 | void MVUVRF::clock() {
 39 | 	// Parallel write to all VRFs
 40 | 	if(!vrf_wdata->isChannelEmpty() && !vrf_waddr->isChannelEmpty()){
 41 | 		std::vector<TYPE> wdata = vrf_wdata->readFromChannel();
 42 | 		unsigned int waddr = vrf_waddr->readFromChannel();
 43 | 		for(unsigned int i = 0; i < num_vrfs; i++){
 44 | 			vrf_waddr_channel[i]->write(waddr);
 45 | 			std::vector<TYPE> vrf_wdata;
 46 | 			vrf_wdata.insert(vrf_wdata.end(), wdata.begin()+(i*10), wdata.begin()+(i*10)+10);
 47 | 			assert(vrf_wdata.size() == 10);
 48 | 			vrf_wdata_channel[i]->write(vrf_wdata);
 49 | 		}
 50 | 	}
 51 | 
 52 | 	// Read from a single VRF
 53 | 	if(!vrf_sel->isChannelEmpty() && !vrf_raddr->isChannelEmpty()){
 54 | 		unsigned int temp_vrf_sel = vrf_sel->readFromChannel();
 55 | 		unsigned int temp_vrf_raddr = vrf_raddr->readFromChannel();
 56 | 		vrf_raddr_channel[temp_vrf_sel]->write(temp_vrf_raddr);
 57 | 	}
 58 | 
 59 | 	// Write output to ports & clock all internal VRFs and channels
 60 | 	for(unsigned int i = 0; i < num_vrfs; i++){
 61 | 		if(!vrf_rdata_channel[i]->isEmpty() && !vrf_rdata->isChannelFull())
 62 | 			vrf_rdata->writeToChannel(vrf_rdata_channel[i]->read());
 63 | 		vrfs[i]->clock();
 64 | 		vrf_raddr_channel[i]->clock();
 65 | 		vrf_rdata_channel[i]->clock();
 66 | 		vrf_waddr_channel[i]->clock();
 67 | 		vrf_wdata_channel[i]->clock();
 68 | 	}
 69 | }
 70 | 
 71 | // Getter function for VRF write data input port
 72 | Input<std::vector<TYPE>>* MVUVRF::getPortVrfWdata() { 
 73 | 	return vrf_wdata; 
 74 | }
 75 | 
 76 | // Getter function for VRF write address input port
 77 | Input<unsigned int>* MVUVRF::getPortVrfWaddr() { 
 78 | 	return vrf_waddr; 
 79 | }
 80 | 
 81 | // Getter function for VRF read data output port
 82 | Output<std::vector<TYPE>>* MVUVRF::getPortVrfRdata() { 
 83 | 	return vrf_rdata; 
 84 | }
 85 | 
 86 | // Getter function for VRF read address input port
 87 | Input<unsigned int>* MVUVRF::getPortVrfRaddr() { 
 88 | 	return vrf_raddr; 
 89 | }
 90 | 
 91 | // Getter funtion for VRF select input port
 92 | Input<unsigned int>* MVUVRF::getPortVrfSel() { 
 93 | 	return vrf_sel; 
 94 | }	
 95 | 
 96 | MVUVRF::~MVUVRF() {
 97 | 	delete vrf_wdata;
 98 | 	delete vrf_waddr;
 99 | 	delete vrf_rdata;
100 | 	delete vrf_raddr;
101 | 	delete vrf_sel;
102 | 	for (unsigned int i = 0; i < vrfs.size(); i++) {
103 | 		delete vrfs[i];
104 | 		delete vrf_raddr_channel[i];
105 | 		delete vrf_rdata_channel[i];
106 | 		delete vrf_waddr_channel[i];
107 | 		delete vrf_wdata_channel[i];
108 | 	}
109 | }


--------------------------------------------------------------------------------
/simulator/src/npu.cpp:
--------------------------------------------------------------------------------
 1 | #include "npu.h"
 2 | 
 3 | // NPU constructor
 4 | NPU::NPU(std::string t_name) : Module(t_name) {
 5 |     // Create channels for the micro-OPs to connect between datapath and instruction decoders
 6 |     mvu_uOP_channel  = new Channel<mvu_uOP>(t_name+"_mvu_uOP", FIFO_DEPTH, 1);
 7 |     evrf_uOP_channel = new Channel<evrf_uOP>(t_name+"_evrf_uOP", FIFO_DEPTH, 1);
 8 |     mfu0_uOP_channel = new Channel<mfu_uOP>(t_name+"_mfu0_uOP", FIFO_DEPTH, 1);
 9 |     mfu1_uOP_channel = new Channel<mfu_uOP>(t_name+"_mfu1_uOP", FIFO_DEPTH, 1);
10 |     ld_uOP_channel   = new Channel<ld_uOP>(t_name+"_ld_uOP", FIFO_DEPTH, 1);
11 | 
12 |     // Create NPU datapath and connect input uOP ports to channels
13 |     npu_datapath = new Datapath(t_name);
14 |     npu_datapath->getPortMVUuOP()->connectTo(mvu_uOP_channel);
15 |     npu_datapath->getPortEVRFuOP()->connectTo(evrf_uOP_channel);
16 |     npu_datapath->getPortMFU0uOP()->connectTo(mfu0_uOP_channel);
17 |     npu_datapath->getPortMFU1uOP()->connectTo(mfu1_uOP_channel);
18 |     npu_datapath->getPortLDuOP()->connectTo(ld_uOP_channel);
19 | 
20 |     // Create NPU instruction decoder and connect output uOP ports to channels
21 |     npu_decoders = new Decoder(t_name+"_decoder");
22 |     npu_decoders->getPortMVUuOP()->connectTo(mvu_uOP_channel);
23 |     npu_decoders->getPortEVRFuOP()->connectTo(evrf_uOP_channel);
24 |     npu_decoders->getPortMFU0uOP()->connectTo(mfu0_uOP_channel);
25 |     npu_decoders->getPortMFU1uOP()->connectTo(mfu1_uOP_channel);
26 |     npu_decoders->getPortLDuOP()->connectTo(ld_uOP_channel);
27 | 
28 |     // Connect NPU input and output ports
29 |     npu_inst = npu_decoders->getPortInputVLIW();
30 |     npu_output = npu_datapath->getPortOutput();
31 | }
32 | 
33 | // Clock cycle update function
34 | void NPU::clock(unsigned int &cycle_count) {
35 |     npu_datapath->clock(cycle_count);
36 |     npu_decoders->clock(cycle_count);
37 | 
38 |     mvu_uOP_channel->clock();
39 |     evrf_uOP_channel->clock();
40 |     mfu0_uOP_channel->clock();
41 |     mfu1_uOP_channel->clock();
42 |     ld_uOP_channel->clock();
43 | }
44 | 
45 | // Getter function for name
46 | std::string NPU::getName() { 
47 |     return name; 
48 | }
49 | 
50 | // Getter function for instruction port
51 | Input<npu_instruction>* NPU::getPortInst() { 
52 |     return npu_inst; 
53 | }
54 | 
55 | // Getter function for output port
56 | Output<std::vector<TYPE>>* NPU::getPortOutput() { 
57 |     return npu_output; 
58 | }
59 | 
60 | NPU::~NPU(){
61 |     delete npu_datapath;
62 |     delete npu_decoders;
63 |     delete mvu_uOP_channel;
64 |     delete evrf_uOP_channel;
65 |     delete mfu0_uOP_channel;
66 |     delete mfu1_uOP_channel;
67 |     delete ld_uOP_channel;
68 | }


--------------------------------------------------------------------------------
/simulator/src/obj/README.md:
--------------------------------------------------------------------------------
1 | Directory for object files produced from simulation


--------------------------------------------------------------------------------
/simulator/src/output.cpp:
--------------------------------------------------------------------------------
 1 | #include "output.h"
 2 | 
 3 | // Output Port Constructor
 4 | template <class T>
 5 | Output<T>::Output(std::string t_name, Module *t_module): Port<T>(t_name, t_module) { }
 6 | 
 7 | // Helper function for connecting an output port to an outgoing channel
 8 | template <class T> 
 9 | void Output<T>::connectTo(Channel<T> *t_channel) { 
10 | 	channels.push_back(t_channel);
11 | }
12 | 
13 | // Helper function for writing to all the channels connected to this output port
14 | template <class T>
15 | void Output<T>::writeToChannel(T t_data) {
16 |     for(unsigned int i = 0; i < channels.size(); i++){
17 |         channels[i]->write(t_data);
18 |     }
19 | }
20 | 
21 | // Helper function for checking if the channel connected to this port is full
22 | template <class T>
23 | bool Output<T>::isChannelFull() {
24 |     bool full = false;
25 |     for(unsigned int i = 0; i < channels.size(); i++){
26 |         full = full || channels[i]->isFull();
27 |     }
28 |     return full;
29 | }
30 | 
31 | template <class T>
32 | Output<T>::~Output(){ 
33 | 	for (unsigned int i = 0; i < channels.size(); i++){
34 | 		channels[i] = NULL;
35 | 	} 
36 | }
37 | 
38 | template class Output<TYPE>;
39 | template class Output<std::vector<TYPE>>;
40 | template class Output<bool>;
41 | template class Output<unsigned int>;
42 | template class Output<mvu_uOP>;
43 | template class Output<evrf_uOP>;
44 | template class Output<mfu_uOP>;
45 | template class Output<ld_uOP>;
46 | 


--------------------------------------------------------------------------------
/simulator/src/port.cpp:
--------------------------------------------------------------------------------
 1 | #include "port.h"
 2 | 
 3 | // Port Constructor
 4 | template <class T> 
 5 | Port<T>::Port (std::string t_name, Module *t_module) { 
 6 | 	name = t_name;
 7 | 	module = t_module;
 8 | }
 9 | 
10 | // Getter function for name
11 | template <class T> 
12 | std::string Port<T>::getName() { 
13 | 	return name; 
14 | }
15 | 
16 | // Getter function for port module
17 | template <class T> 
18 | Module* Port<T>::getModule() { 
19 | 	return module; 
20 | }
21 | 
22 | template class Port<TYPE>;
23 | template class Port<std::vector<TYPE>>;
24 | template class Port<bool>;
25 | template class Port<unsigned int>;
26 | template class Port<mvu_uOP>;
27 | template class Port<evrf_uOP>;
28 | template class Port<mfu_uOP>;
29 | template class Port<ld_uOP>;
30 | template class Port<npu_instruction>;


--------------------------------------------------------------------------------
/simulator/src/register_file.cpp:
--------------------------------------------------------------------------------
  1 | #include "register_file.h"
  2 | 
  3 | // Helper function for initializing vector register files
  4 | void init_rf(std::vector<std::vector<TYPE>> &rf, unsigned int depth){
  5 |     for (unsigned int i = rf.size(); i < depth; i++) {
  6 |         std::vector <TYPE> zeros;
  7 |         for (int j = 0; j < LANES; j++) {
  8 |             zeros.push_back(0);
  9 |         }
 10 |         rf.push_back(zeros);
 11 |     }
 12 | }
 13 | 
 14 | // Register File Constructor
 15 | template <class T>
 16 | RegisterFile<T>::RegisterFile (std::string t_name, unsigned int t_depth, 
 17 | 	std::string *t_file_name): Module(t_name) { 
 18 | 		// Create Input and Output ports
 19 | 		raddr = new Input<unsigned int> (t_name + "_raddr", this);
 20 | 		rdata = new Output<T> (t_name + "_rdata", this);
 21 | 		waddr = new Input<unsigned int> (t_name + "_waddr", this);
 22 | 		wdata = new Input<T> (t_name + "_wdata", this);
 23 | 		// Initialize local variables
 24 | 		depth = t_depth;
 25 | 		reads_in_flight = 0;
 26 | 		writes_in_flight = 0;
 27 | 		// Initialize register file contents
 28 | 		if (t_file_name)
 29 | 		    readVectorFile(*t_file_name, register_file);
 30 | 		init_rf(register_file, t_depth);
 31 | }
 32 | 
 33 | // Helper function for read operation
 34 | template <class T>
 35 | void RegisterFile<T>::read(){
 36 | 	// Advance the pipeline if there is any data in it
 37 | 	if(read_pipeline.size() > 0){
 38 | 	    bool retire = false;
 39 | 		for (unsigned int i = 0; i < reads_in_flight; i++){
 40 | 			std::tuple<unsigned int, unsigned int> temp = read_pipeline.front();
 41 | 			if((std::get<1>(temp) == 0) && (!rdata->isChannelFull())){
 42 | 				read_pipeline.pop();
 43 | 				assert(std::get<0>(temp) < depth && "Read address out of bound");
 44 | 				rdata->writeToChannel(register_file[std::get<0>(temp)]);
 45 | 				retire = true;
 46 | 			} else if (reads_in_flight <= RF_READ_LATENCY) {
 47 | 				read_pipeline.pop();
 48 | 				if(std::get<1>(temp) > 0){
 49 | 				    std::get<1>(temp)--;
 50 | 				}
 51 | 				read_pipeline.push(temp);
 52 | 			}
 53 | 		}
 54 | 		reads_in_flight = (retire)? reads_in_flight-1: reads_in_flight;
 55 | 	}
 56 | 
 57 | 	// Read in new address if the pipeline is not stalled (i.e. reads in flight
 58 | 	// less than the pipeline depth/latency)
 59 | 	if(!raddr->isChannelEmpty() && reads_in_flight <= RF_READ_LATENCY){
 60 | 		unsigned int temp_raddr = raddr->readFromChannel();
 61 | 		read_pipeline.push(std::make_tuple(temp_raddr, RF_READ_LATENCY-1));
 62 | 		reads_in_flight++;
 63 | 	}
 64 | }
 65 | 
 66 | // Helper function for write operation
 67 | template <class T>
 68 | void RegisterFile<T>::write(){
 69 | 	// Advance the pipeline if there is any data in it
 70 | 	if(write_pipeline.size() > 0){
 71 | 	    bool retire = false;
 72 | 		for (unsigned int i = 0; i < writes_in_flight; i++){
 73 | 			std::tuple<unsigned int, T, unsigned int> temp = write_pipeline.front();
 74 | 			if((std::get<2>(temp) == 0)){
 75 | 				write_pipeline.pop();
 76 | 				assert(std::get<0>(temp) < depth && "Write address out of bound");
 77 | 				register_file[std::get<0>(temp)] = std::get<1>(temp);
 78 | 				retire = true;
 79 | 			} else if (writes_in_flight <= RF_WRITE_LATENCY) {
 80 | 				write_pipeline.pop();
 81 | 				if(std::get<2>(temp) > 0) {
 82 | 					std::get<2>(temp)--;
 83 | 				}
 84 | 				write_pipeline.push(temp);
 85 | 			}
 86 | 		}
 87 | 		writes_in_flight = (retire)? writes_in_flight-1: writes_in_flight;
 88 | 	}
 89 | 
 90 | 	// Read in new address and data if the pipeline is not stalled (i.e. reads 
 91 | 	// in flight less than the pipeline depth/latency)
 92 | 	if((!waddr->isChannelEmpty()) && (!wdata->isChannelEmpty()) && 
 93 | 		(writes_in_flight <= RF_WRITE_LATENCY)){
 94 | 		unsigned int temp_waddr = waddr->readFromChannel();
 95 | 		T temp_wdata = wdata->readFromChannel();
 96 | 		write_pipeline.push(std::make_tuple(temp_waddr, temp_wdata, RF_WRITE_LATENCY-1));
 97 | 		writes_in_flight++;
 98 | 	}
 99 | }
100 | 
101 | // Clock cycle update function
102 | template <class T>
103 | void RegisterFile<T>::clock() {
104 | 	this->write();
105 | 	this->read();
106 | }
107 | 
108 | // Helper function for printing out the contents of a register file (used for debugging)
109 | template <class T>
110 | void RegisterFile<T>::print() { 
111 | 	std::cout<<"Register file elements: ";
112 | 	for (unsigned int i = 0; i < depth; i++)
113 | 		std::cout<< register_file.at(i) << ", ";
114 | 	std::cout << std::endl;
115 | }
116 | 
117 | // Getter function for read address input port
118 | template <class T>
119 | Input<unsigned int>* RegisterFile<T>::getPortRaddr() { 
120 | 	return raddr; 
121 | }
122 | 
123 | // Getter function for read data output port
124 | template <class T>
125 | Output<T>* RegisterFile<T>::getPortRdata() { 
126 | 	return rdata; 
127 | }
128 | 
129 | // Getter function for write address input port
130 | template <class T>
131 | Input<unsigned int>* RegisterFile<T>::getPortWaddr() { 
132 | 	return waddr; 
133 | }
134 | 
135 | // Getter function for write data input port
136 | template <class T>
137 | Input<T>* RegisterFile<T>::getPortWdata() { 
138 | 	return wdata; 
139 | }
140 | 
141 | template <class T>
142 | RegisterFile<T>::~RegisterFile() {
143 | 	delete raddr;
144 | 	delete rdata;
145 | 	delete waddr;
146 | 	delete wdata;
147 | }
148 | 
149 | template class RegisterFile<std::vector<TYPE>>;


--------------------------------------------------------------------------------
/simulator/src/utils.cpp:
--------------------------------------------------------------------------------
 1 | #include "utils.h"
 2 | 
 3 | // Operator overload for printing a vector
 4 | template <typename T>
 5 | std::ostream& operator<< (std::ostream& out, const std::vector<T>& v) {
 6 |     out << "{";
 7 |     size_t last = v.size() - 1;
 8 |     for(size_t i = 0; i < v.size(); ++i) {
 9 |         out << v[i];
10 |         if (i != last)
11 |             out << ", ";
12 |     }
13 |     out << "}";
14 |     return out;
15 | }
16 | 
17 | // Used for populating vector register file contents from a file
18 | void readVectorFile(std::string &file_name, std::vector<std::vector<TYPE>> &vec_data) {
19 |     std::ifstream in(file_name);
20 | 
21 |     if (!in) assert(0 && "file not open");
22 |     std::string line;
23 |     while(std::getline(in, line)) {
24 |         std::stringstream line_stream(line);
25 |         std::vector<TYPE> data;
26 |         TYPE temp;
27 |         while (line_stream >> temp) {
28 |             data.push_back(temp);
29 |         }
30 |         vec_data.push_back(data);
31 |     }
32 | }
33 | 
34 | // Used for populating vector FIFO contents from a file
35 | void readVectorFile(std::string &file_name, std::queue<std::vector<TYPE>> &que_data) {
36 |     std::ifstream in(file_name);
37 |     if (!in) assert(0 && "file not open");
38 |     std::string line;
39 |     while(std::getline(in, line)) {
40 |         std::stringstream line_stream(line);
41 |         std::vector<TYPE> data;
42 |         TYPE temp;
43 |         while (line_stream >> temp) {
44 |             data.push_back(temp);
45 |         }
46 |         que_data.push(data);
47 |     }
48 | }
49 | 
50 | // Operator overload for adding two vectors
51 | std::vector<TYPE> operator+ (const std::vector<TYPE> &v1, const std::vector<TYPE> &v2){
52 |     assert(v1.size() == v2.size() && "The two vectors have different lengths");
53 |     std::vector<TYPE> result;
54 |     for(unsigned int i = 0; i < v1.size(); i++){
55 |         result.push_back(v1[i] + v2[i]);
56 |     }
57 |     return result;
58 | }
59 | 
60 | // Used for reading simulating golden outputs
61 | template <typename T>
62 | void readGoldenOutput(std::string &file_name, std::vector<T> &vec_data, int v_size) {
63 |     std::ifstream in(file_name);
64 | 
65 |     if (!in) assert(0 && "file not open");
66 |     std::string line;
67 |     while(std::getline(in, line)) {
68 |         std::stringstream line_stream(line);
69 |         std::vector<TYPE> data;
70 |         TYPE temp;
71 |         int count = 0;
72 |         while (line_stream >> temp) {
73 |             data.push_back(temp);
74 |             count++;
75 |             if(count == v_size){
76 |                 count = 0;
77 |                 vec_data.push_back(data);
78 |                 data.erase(data.begin(), data.end());
79 |             }
80 |         }
81 |     }
82 | }
83 | 
84 | template std::ostream& operator<< (std::ostream& out, const std::vector<TYPE>& v);
85 | template void readGoldenOutput(std::string &file_name, std::vector<std::vector<TYPE>> &vec_data, 
86 |     int v_size);


--------------------------------------------------------------------------------