├── .gitignore ├── CMakeLists.txt ├── LICENSE ├── README.md ├── cmake └── Modules │ ├── FindOpenCv.cmake │ ├── FindVitis.cmake │ └── FindVivado.cmake ├── include ├── dma │ ├── axis_lib.h │ ├── svd_dma.h │ └── width_converter.h ├── hls_utils │ ├── adder_tree.h │ ├── dot_prod_dsp.h │ ├── hls_debugging.h │ ├── hls_metaprogramming.h │ ├── hw_timer.h │ └── priority_encoder.h ├── kernel │ ├── gemv_kernel.h │ ├── s_kernel.h │ ├── svd_kernel.h │ ├── u_kernel.h │ └── v_kernel.h ├── layers │ ├── dense │ │ └── hls │ │ │ └── dense_svd.h │ └── lstm │ │ ├── hls │ │ ├── lstm_hardware.h │ │ ├── lstm_svd.h │ │ └── lstm_svd_emulator.h │ │ ├── lstm_data_handler.h │ │ └── sw │ │ ├── soft_lstm.h │ │ └── soft_lstm_svd.h ├── math_utils │ ├── activation_functions.h │ ├── blas_utils.h │ └── data_handler.h ├── svd_ip.h ├── svd_params.h └── testbenches │ ├── test_dense_svd.h │ ├── test_lstm_svd.h │ ├── test_svd_kernel.h │ ├── test_u_kernel.h │ ├── test_u_kernel_pruned.h │ ├── test_v_kernel.h │ └── test_v_kernel_pruned.h ├── make_hls.py ├── pynq ├── README.md ├── dense_svd │ ├── dense_svd.ipynb │ └── overlay │ │ ├── dense_svd.bit │ │ ├── dense_svd.hwh │ │ └── dense_svd.tcl ├── kernel_svd │ ├── kernel_svd.ipynb │ └── overlay │ │ ├── kernel_svd.bit │ │ ├── kernel_svd.hwh │ │ └── kernel_svd.tcl ├── kernel_u │ ├── kernel_u.ipynb │ ├── kernel_u_hier.ipynb │ └── overlay │ │ ├── kernel_u.bit │ │ ├── kernel_u.hwh │ │ └── kernel_u.tcl └── kernel_v │ ├── binfile_example.bin │ ├── kernel_v.ipynb │ └── overlay │ ├── kernel_v.bit │ ├── kernel_v.hwh │ └── kernel_v.tcl ├── python ├── README.md ├── SVD_Approximation.ipynb ├── __init__.py ├── models │ └── __init__.py ├── requirements.txt ├── roofline │ ├── README.md │ └── __init__.py ├── svd │ └── __init__.py └── svd_approximation.py ├── run_hls.tcl ├── run_hls_test.tcl ├── src ├── CMakeLists.txt ├── dma │ ├── CMakeLists.txt │ ├── axis_lib.cpp │ ├── svd_dma.cpp │ └── width_converter.cpp ├── hls_utils │ ├── CMakeLists.txt │ ├── adder_tree.cpp │ ├── 
dot_prod_dsp.cpp │ └── hw_timer.cpp ├── kernel │ ├── CMakeLists.txt │ ├── README.md │ ├── gemv_kernel.cpp │ ├── s_kernel.cpp │ ├── svd_kernel.cpp │ ├── u_kernel.cpp │ └── v_kernel.cpp ├── layers │ ├── CMakeLists.txt │ ├── dense │ │ ├── CMakeLists.txt │ │ ├── hls │ │ │ ├── CMakeLists.txt │ │ │ └── dense_svd.cpp │ │ └── sw │ │ │ └── CMakeLists.txt │ └── lstm │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ ├── hls │ │ ├── CMakeLists.txt │ │ ├── lstm_hardware.cpp │ │ ├── lstm_svd.cpp │ │ └── lstm_svd_emulator.cpp │ │ ├── lstm_data_handler.cpp │ │ └── sw │ │ ├── CMakeLists.txt │ │ ├── soft_lstm.cpp │ │ └── soft_lstm_svd.cpp ├── math_utils │ ├── CMakeLists.txt │ ├── activation_functions.cpp │ ├── blas_utils.cpp │ └── data_handler.cpp ├── svd.cpp ├── svd_ip.cpp ├── svd_params.cpp └── testbenches │ ├── CMakeLists.txt │ ├── test_dense_svd.cpp │ ├── test_gemv_kernel.cpp │ ├── test_lstm_svd.cpp │ ├── test_svd_kernel.cpp │ ├── test_u_kernel.cpp │ ├── test_u_kernel_pruned.cpp │ ├── test_v_kernel.cpp │ └── test_v_kernel_pruned.cpp └── tcl ├── lstm_params.tcl └── utils.tcl /.gitignore: -------------------------------------------------------------------------------- 1 | # CMake Directories 2 | build 3 | bin/* 4 | # Images 5 | data/pong_* 6 | data/game_* 7 | data/windows* 8 | # Hardware Directories 9 | vivado_hls.log 10 | vitis_hls.log 11 | hls_prj/ 12 | vivado/ 13 | vitis_include/ 14 | ./token 15 | 16 | # PYNQ 17 | **/.ipynb_checkpoints 18 | **/sds_trace_data.dat -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.10) 2 | # Set the project name 3 | project(Svd VERSION 1.0) 4 | 5 | if(NOT CMAKE_BUILD_TYPE) 6 | set(CMAKE_BUILD_TYPE Release) 7 | endif() 8 | 9 | # To locate "custom", i.e. 
manually added, libraries 10 | list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Modules) 11 | 12 | # Locate external libraries and headers (see Files in ./cmake/Modules/) 13 | # Search for HLS: if Vitis is found, use C++14, else fall back to C++11. 14 | find_package(Vitis REQUIRED) 15 | if (Vitis_FOUND) 16 | # Specify the C++14 standard 17 | message("[INFO] Vitis HLS FOUND.") 18 | set(CMAKE_CXX_STANDARD 14) 19 | set(CMAKE_CXX_STANDARD_REQUIRED True) 20 | set(HLS_INCLUDE_DIRS ${VITIS_INCLUDE_DIRS}) 21 | add_compile_definitions(__VITIS_HLS__) 22 | else() 23 | message("[INFO] Vivado HLS FOUND.") 24 | find_package(Vivado REQUIRED) 25 | # Specify the C++11 standard 26 | set(CMAKE_CXX_STANDARD 11) 27 | set(CMAKE_CXX_STANDARD_REQUIRED True) 28 | set(HLS_INCLUDE_DIRS ${VIVADO_INCLUDE_DIRS}) 29 | endif() 30 | # find_package(OpenCv REQUIRED) 31 | 32 | message(${HLS_INCLUDE_DIRS}) 33 | 34 | # set(CMAKE_CXX_FLAGS "-Wall -Wextra") 35 | set(CMAKE_CXX_FLAGS_DEBUG "-g") 36 | set(CMAKE_CXX_FLAGS_RELEASE "-O3") 37 | 38 | # Add all definitions 39 | # The following definitions is required for compiling half-precision numbers. 
40 | add_compile_definitions(HLS_NO_XIL_FPO_LIB) 41 | # add_compile_definitions(USE_FLOAT) 42 | add_compile_definitions(DEBUG_LEVEL=2) 43 | 44 | add_compile_definitions(INPUT_SIZE=1024) 45 | add_compile_definitions(HIDDEN_SIZE=512) 46 | add_compile_definitions(NUM_GATES=4) 47 | add_compile_definitions(NUM_SAMPLES=2) 48 | add_compile_definitions(NUM_TILES_U=8) 49 | add_compile_definitions(NUM_ZERO_TILES_U=2) 50 | add_compile_definitions(NUM_TILES_V=8) 51 | add_compile_definitions(NUM_ZERO_TILES_V=2) 52 | add_compile_definitions(NUM_TIMESTEPS=28) 53 | add_compile_definitions(FIX_WIDTH=16) 54 | add_compile_definitions(FIX_FRACT_WIDTH=5) 55 | 56 | # Move executable in bin/, along side the DLLs (copied) 57 | set(EXECUTABLE_OUTPUT_PATH ${PROJECT_SOURCE_DIR}/bin) 58 | # file(COPY ${OpenCv_LIBS} DESTINATION ${EXECUTABLE_OUTPUT_PATH}) 59 | 60 | # NOTE: an object file becomes a library. All libraries/objects must be LINKED later! 61 | # Tell the application where to find the other CMake config files. 62 | enable_testing() 63 | add_subdirectory(${CMAKE_SOURCE_DIR}/src) 64 | 65 | add_executable(Svd ${CMAKE_CURRENT_SOURCE_DIR}/src/svd.cpp) 66 | target_link_libraries(Svd SVD) 67 | if (WIN32) 68 | set_target_properties(Svd PROPERTIES LINK_FLAGS -Wl,--stack,10485760) 69 | endif() 70 | -------------------------------------------------------------------------------- /cmake/Modules/FindOpenCv.cmake: -------------------------------------------------------------------------------- 1 | 2 | if (WIN32) 3 | set(OpenCv_DIR C:/Xilinx/Vivado/2018.3/win64/tools/opencv) 4 | set(OpenCv_INCLUDE_DIRS C:/Xilinx/Vivado/2018.3/win64/tools/opencv/include) 5 | else() 6 | set(OpenCv_DIR /mnt/c/Xilinx/Vivado/2018.3/win64/tools/opencv) 7 | set(OpenCv_INCLUDE_DIRS /mnt/c/Xilinx/Vivado/2018.3/win64/tools/opencv/include) 8 | endif() 9 | 10 | file(GLOB OpenCv_LIBS ${OpenCv_DIR}/*.dll) 11 | 12 | # NOTE: It handles the REQUIRED, QUIET and version-related arguments of find_package. 
13 | # It also sets the _FOUND variable. The package is considered found 14 | # if all variables listed contain valid results, e.g. valid filepaths. 15 | include(FindPackageHandleStandardArgs) 16 | find_package_handle_standard_args(OpenCv DEFAULT_MSG OpenCv_INCLUDE_DIRS) -------------------------------------------------------------------------------- /cmake/Modules/FindVitis.cmake: -------------------------------------------------------------------------------- 1 | if (WIN32) 2 | # set(VITIS_INCLUDE_DIRS D:/Programs/Xilinx/Vitis_HLS/2021.1/include/) 3 | set(VITIS_INCLUDE_DIRS C:/Users/ste/phd/hls_projects/hls_svd/vitis_include/2020.2/include/) 4 | # set(VITIS_INCLUDE_DIRS C:/Users/ste/phd/hls_projects/hls_svd/vitis_include/2021.1/include/) 5 | else() 6 | # set(VITIS_INCLUDE_DIRS /mnt/d/Programs/Xilinx/Vitis_HLS/2021.1/include/) 7 | set(VITIS_INCLUDE_DIRS /gdrive/MyDrive/Colab\ Notebooks/svd/hls_svd/vitis_include/2020.2/include) 8 | endif() 9 | 10 | # NOTE: It handles the REQUIRED, QUIET and version-related arguments of find_package. 11 | # It also sets the _FOUND variable. The package is considered found 12 | # if all variables listed contain valid results, e.g. valid filepaths. 13 | include(FindPackageHandleStandardArgs) 14 | find_package_handle_standard_args(Vitis DEFAULT_MSG VITIS_INCLUDE_DIRS) -------------------------------------------------------------------------------- /cmake/Modules/FindVivado.cmake: -------------------------------------------------------------------------------- 1 | if (WIN32) 2 | set(VIVADO_INCLUDE_DIRS C:/Xilinx/Vivado/2018.3/include/) 3 | else() 4 | set(VIVADO_INCLUDE_DIRS /mnt/c/Xilinx/Vivado/2018.3/include/) 5 | endif() 6 | 7 | # NOTE: It handles the REQUIRED, QUIET and version-related arguments of find_package. 8 | # It also sets the _FOUND variable. The package is considered found 9 | # if all variables listed contain valid results, e.g. valid filepaths. 
10 | include(FindPackageHandleStandardArgs) 11 | find_package_handle_standard_args(Vivado DEFAULT_MSG VIVADO_INCLUDE_DIRS) 12 | -------------------------------------------------------------------------------- /include/dma/width_converter.h: -------------------------------------------------------------------------------- 1 | #ifndef DMA_WIDTH_CONVERTER_H_ 2 | #define DMA_WIDTH_CONVERTER_H_ 3 | 4 | #include "ap_int.h" 5 | #include "assert.h" 6 | 7 | #include 8 | #include 9 | 10 | /** 11 | * @brief Adjust stream width. Used for DMA-ing input ports. 12 | * 13 | * @param[in] in The input stream 14 | * @param[out] out The output stream 15 | * 16 | * @tparam OutD { description } 17 | * @tparam InWidth Width of input stream 18 | * @tparam OutWidth Width of output stream 19 | * @tparam NumInWords Number of input words (OutWidth) to process 20 | */ 21 | template 22 | void Mem2MemDataWidthConverter(const ap_uint *in, OutD *out) { 23 | assert(InWidth % 8 == 0); 24 | assert(OutWidth % 8 == 0); 25 | if (InWidth > OutWidth) { 26 | // Store multiple output words per input word read 27 | assert(InWidth % OutWidth == 0); 28 | const unsigned kOutPerIn = InWidth / OutWidth; 29 | unsigned out_idx = 0; 30 | unsigned out_addr = 0; 31 | ap_uint elem_in = 0; 32 | for (int i = 0; i < NumInWords; ++i) { 33 | #pragma HLS PIPELINE II=1 34 | if (out_idx == 0) { 35 | elem_in = in[i]; 36 | } 37 | // TODO(15/03/2019 - performance opt) 38 | // if constexpr (std::is_same::value || std::is_same::value) { 39 | // out[out_addr] = elem_in(OutWidth - 1, 0); 40 | // } else { 41 | // out[out_addr].range() = elem_in(OutWidth - 1, 0); 42 | // } 43 | #if USE_FIX 44 | out[out_addr].range() = elem_in(OutWidth - 1, 0); 45 | #else 46 | out[out_addr] = elem_in(OutWidth - 1, 0); 47 | #endif 48 | elem_in = elem_in >> OutWidth; 49 | out_addr++; 50 | out_idx++; 51 | // Wraparound indices to recreate the nested loop structure 52 | if (out_idx == kOutPerIn) { 53 | out_idx = 0; 54 | } 55 | } 56 | } else if (InWidth == 
OutWidth) { 57 | // Read multiple input words per output word stored 58 | assert(OutWidth % InWidth == 0); 59 | } else { // InWidth < OutWidth 60 | // Read multiple input words per output word stored 61 | assert(OutWidth % InWidth == 0); 62 | } 63 | } 64 | 65 | /** 66 | * @brief Adjust stream width. Used for DMA-ing input ports. 67 | * 68 | * @param[in] num_in_words Number of input words (OutWidth) to process 69 | * @param[in] in The input stream 70 | * @param[out] out The output stream 71 | * 72 | * @tparam OutD { description } 73 | * @tparam InWidth Width of input stream 74 | * @tparam OutWidth Width of output stream 75 | * @tparam NumInWords Number of input words (OutWidth) to process 76 | */ 77 | template 78 | void Mem2MemDataWidthConverter(const int num_in_words, 79 | const ap_uint *in, OutD *out) { 80 | assert(InWidth % 8 == 0); 81 | assert(OutWidth % 8 == 0); 82 | if (InWidth > OutWidth) { 83 | // Store multiple output words per input word read 84 | assert(InWidth % OutWidth == 0); 85 | const unsigned kOutPerIn = InWidth / OutWidth; 86 | unsigned out_idx = 0; 87 | unsigned out_addr = 0; 88 | ap_uint elem_in = 0; 89 | for (int i = 0; i < num_in_words; ++i) { 90 | #pragma HLS PIPELINE II=1 91 | if (out_idx == 0) { 92 | elem_in = in[i]; 93 | } 94 | // TODO(15/03/2019 - performance opt) 95 | // if constexpr (std::is_same::value || std::is_same::value) { 96 | // out[out_addr] = elem_in(OutWidth - 1, 0); 97 | // } else { 98 | // out[out_addr].range() = elem_in(OutWidth - 1, 0); 99 | // } 100 | #if USE_FIX 101 | out[out_addr].range() = elem_in(OutWidth - 1, 0); 102 | #else 103 | out[out_addr] = elem_in(OutWidth - 1, 0); 104 | #endif 105 | elem_in = elem_in >> OutWidth; 106 | // Wraparound indices to recreate the nested loop structure 107 | if (out_idx == kOutPerIn - 1) { 108 | out_idx = 0; 109 | } 110 | out_addr++; 111 | out_idx++; 112 | } 113 | } else if (InWidth == OutWidth) { 114 | assert(OutWidth % InWidth == 0); 115 | ap_uint elem_in = 0; 116 | for (int i = 0; 
i < num_in_words; ++i) { 117 | #pragma HLS PIPELINE II=1 118 | elem_in = in[i]; 119 | #if USE_FIX 120 | out[i].range() = elem_in; 121 | #else 122 | out[i] = elem_in; 123 | #endif 124 | } 125 | } 126 | } 127 | 128 | template 129 | void Mem2MemDataWidthConverter(const int num_in_words, 130 | const InD *in, ap_uint *out) { 131 | assert(InWidth % 8 == 0); 132 | assert(OutWidth % 8 == 0); 133 | if (InWidth < OutWidth) { 134 | // Read multiple input words per output word stored 135 | assert(OutWidth % InWidth == 0); 136 | 137 | 138 | const unsigned kOutPerIn = InWidth / OutWidth; 139 | const unsigned kInPerOut = OutWidth / InWidth; 140 | unsigned out_idx = 0; 141 | unsigned out_addr = 0; 142 | ap_uint elem_in = 0; 143 | ap_uint elem_out = 0; 144 | 145 | for (int i = 0; i < num_in_words; ++i) { 146 | #pragma HLS PIPELINE II=1 147 | const int kHi = ((i + 1) * InWidth) % OutWidth - 1; 148 | const int kLo = (i * InWidth) % OutWidth; 149 | // if constexpr (std::is_same::value || std::is_same::value) { 150 | // elem_out(kHi, kLo) = in[i]; 151 | // } else { 152 | // elem_out(kHi, kLo) = in[i].range(); 153 | // } 154 | #if USE_FIX 155 | elem_out(kHi, kLo) = in[i].range(); 156 | #else 157 | elem_out(kHi, kLo) = in[i]; 158 | #endif 159 | // Wraparound indices to recreate the nested loop structure 160 | if (out_idx == kInPerOut - 1) { 161 | out[out_addr] = elem_out; 162 | out_addr++; 163 | out_idx = 0; 164 | } 165 | out_idx++; 166 | } 167 | } else if (InWidth == OutWidth) { 168 | assert(OutWidth % InWidth == 0); 169 | ap_uint elem_in = 0; 170 | for (int i = 0; i < num_in_words; ++i) { 171 | #pragma HLS PIPELINE II=1 172 | elem_in = in[i]; 173 | #if USE_FIX 174 | out[i].range() = elem_in; 175 | #else 176 | out[i] = elem_in; 177 | #endif 178 | } 179 | } 180 | } 181 | 182 | template 183 | void Mem2MemDataWidthConverter(const ap_uint *in_dmem, 184 | ap_uint *out_dmem) { 185 | if (InWidth > OutWidth) { 186 | // Emit multiple output words per input word read 187 | assert(InWidth % 
OutWidth == 0); 188 | const unsigned int kOutPerIn = InWidth / OutWidth; 189 | unsigned int in_idx = 0; 190 | unsigned int out_idx = 0; 191 | ap_uint elem_in = 0; 192 | for (int i = 0; i < NumInWords; ++i) { 193 | #pragma HLS PIPELINE II=1 194 | if (out_idx == 0) { 195 | elem_in = in_dmem[in_idx]; 196 | ++in_idx; 197 | } 198 | ap_uint elem_out = elem_in.range(OutWidth - 1, 0); 199 | out_dmem[i] = elem_out; 200 | elem_in = elem_in >> OutWidth; 201 | out_idx++; 202 | // Wraparound indices to recreate the nested loop structure 203 | if (out_idx == kOutPerIn) { 204 | out_idx = 0; 205 | } 206 | } 207 | } else if (InWidth == OutWidth) { 208 | for (int i = 0; i < NumInWords; ++i) { 209 | #pragma HLS PIPELINE II=1 210 | out_dmem[i] = in_dmem[i]; 211 | } 212 | } else { // InWidth < OutWidth 213 | // Read multiple input words per output word emitted 214 | assert(OutWidth % InWidth == 0); 215 | const unsigned int kInPerOut = OutWidth / InWidth; 216 | const unsigned int kTotalIters = NumInWords; 217 | unsigned int in_idx = 0; 218 | unsigned int out_idx = 0; 219 | ap_uint elem_out = 0; 220 | for (int i = 0; i < kTotalIters; i++) { 221 | #pragma HLS PIPELINE II=1 222 | auto elem_in = in_dmem[i]; 223 | elem_out = elem_out >> InWidth; 224 | elem_out.range(OutWidth - 1, OutWidth - InWidth) = elem_in; 225 | in_idx++; 226 | // Wraparound logic to recreate nested loop functionality 227 | if (in_idx == kInPerOut) { 228 | in_idx = 0; 229 | out_dmem[out_idx] = elem_out; 230 | ++out_idx; 231 | } 232 | } 233 | } 234 | } 235 | 236 | template 237 | void Mem2MemDataWidthConverter(const int num_in_words, 238 | const ap_uint *in_dmem, ap_uint *out_dmem) { 239 | if (InWidth > OutWidth) { 240 | // Emit multiple output words per input word read 241 | assert(InWidth % OutWidth == 0); 242 | const unsigned int kOutPerIn = InWidth / OutWidth; 243 | ap_uint elem_in = 0; 244 | // ========================================================================= 245 | // A NESTED FOR LOOP IS REQUIRED OTHERWISE 
THERE WOULD BE NO ITERATIONS WHEN 246 | // NumInWords IS EQUAL TO 1! 247 | // ========================================================================= 248 | for (int i = 0; i < num_in_words; ++i) { 249 | for (int j = 0; j < kOutPerIn; ++j) { 250 | #pragma HLS PIPELINE II=1 251 | if (j == 0) { 252 | elem_in = in_dmem[i]; 253 | } 254 | ap_uint elem_out = elem_in.range(OutWidth - 1, 0); 255 | out_dmem[i] = elem_out; 256 | elem_in = elem_in >> OutWidth; 257 | } 258 | } 259 | } else if (InWidth == OutWidth) { 260 | for (int i = 0; i < num_in_words; ++i) { 261 | #pragma HLS PIPELINE II=1 262 | out_dmem[i] = in_dmem[i]; 263 | } 264 | } else { // InWidth < OutWidth 265 | // Read multiple input words per output word emitted 266 | assert(OutWidth % InWidth == 0); 267 | const unsigned int kInPerOut = OutWidth / InWidth; 268 | const unsigned int kTotalIters = num_in_words; 269 | unsigned int in_idx = 0; 270 | unsigned int out_idx = 0; 271 | ap_uint elem_out = 0; 272 | for (int i = 0; i < kTotalIters; i++) { 273 | #pragma HLS PIPELINE II=1 274 | auto elem_in = in_dmem[i]; 275 | elem_out = elem_out >> InWidth; 276 | elem_out.range(OutWidth - 1, OutWidth - InWidth) = elem_in; 277 | in_idx++; 278 | // Wraparound logic to recreate nested loop functionality 279 | if (in_idx == kInPerOut) { 280 | in_idx = 0; 281 | out_dmem[out_idx] = elem_out; 282 | ++out_idx; 283 | } 284 | } 285 | } 286 | } 287 | 288 | #endif // end DMA_WIDTH_CONVERTER_H_ -------------------------------------------------------------------------------- /include/hls_utils/adder_tree.h: -------------------------------------------------------------------------------- 1 | #ifndef HLS_UTILS_ADDER_TREE_H_ 2 | #define HLS_UTILS_ADDER_TREE_H_ 3 | 4 | #include "hls_stream.h" 5 | #include "hls_utils/hls_metaprogramming.h" 6 | 7 | #ifdef __VITIS_HLS__ 8 | #include "hls_vector.h" 9 | #endif 10 | 11 | namespace hlsutils { 12 | 13 | /** 14 | * @brief Given a static array, sum-reduce all its elements. 
15 | * 16 | * NOTE: The array will be fully partitioned. 17 | * 18 | * @param x The input static array 19 | * 20 | * @tparam DataType The input and output data type 21 | * @tparam NumPE The number of array elements 22 | * 23 | * @return The sum of all the array elements. 24 | */ 25 | template 26 | DataType adder_tree(DataType x[NumPE]) { 27 | #pragma HLS ARRAY_PARTITION variable=x complete // to force II=1 28 | #pragma HLS PIPELINE II=1 29 | // Determine the number of ranks for the adder tree and declare array: 30 | // - The adder_tree is larger than required as each rank only needs to be 31 | // half the size of the previous rank. 32 | const unsigned kNumPEsLog2 = hlsutils::log2::value; 33 | const unsigned kNumPEsSub1Log2 = hlsutils::log2::value; 34 | const unsigned kNumRanks = kNumPEsLog2 != kNumPEsSub1Log2 ? kNumPEsLog2 : kNumPEsLog2 + 1; 35 | DataType adder_tree[kNumRanks][NumPE]; 36 | #pragma HLS ARRAY_PARTITION variable=adder_tree complete dim=0 37 | 38 | unsigned rank_size = NumPE; 39 | DataType ret_val = 0; 40 | 41 | add_level_loop: 42 | for(int adder_tree_rank = kNumRanks - 1; adder_tree_rank >= 0; --adder_tree_rank) { 43 | const bool kLoopInit = adder_tree_rank == kNumRanks - 1 ? true : false; 44 | const bool kLoopEpilog = adder_tree_rank == 0 ? true : false; 45 | 46 | if (kLoopInit) { 47 | rank_size = NumPE; 48 | } 49 | 50 | const bool prev_rank_is_odd = rank_size % 2 == 0 ? false : true; 51 | rank_size = (rank_size + 1) / 2; 52 | 53 | add_col_loop: 54 | for(int jj = 0; jj < (NumPE + 1) / 2; ++jj) { 55 | if (jj < rank_size) { 56 | if (prev_rank_is_odd && jj == rank_size - 1) { 57 | // Bypass, no adder required. 
58 | if (kLoopInit) { 59 | adder_tree[adder_tree_rank][jj] = x[jj * 2]; 60 | // adder_tree[adder_tree_rank][jj] = x[jj * 2]; 61 | } else { 62 | adder_tree[adder_tree_rank][jj] = adder_tree[adder_tree_rank + 1][jj * 2]; 63 | } 64 | } else { 65 | if (kLoopInit) { 66 | auto y_acc = x[jj * 2] + x[jj * 2 + 1]; 67 | // auto y_acc = x[jj * 2] + x[jj * 2 + 1]; 68 | #pragma HLS RESOURCE variable=y_acc core=AddSub_DSP 69 | adder_tree[adder_tree_rank][jj] = y_acc; 70 | } else{ 71 | auto y_acc = adder_tree[adder_tree_rank + 1][jj * 2] + adder_tree[adder_tree_rank + 1][jj * 2 + 1]; 72 | #pragma HLS RESOURCE variable=y_acc core=AddSub_DSP 73 | adder_tree[adder_tree_rank][jj] = y_acc; 74 | } 75 | } 76 | } 77 | } 78 | if (kLoopEpilog) { 79 | ret_val = adder_tree[0][0]; 80 | } 81 | } 82 | return ret_val; 83 | } 84 | 85 | /** 86 | * @brief Given a set of parallel streams, read the first elements element 87 | * from each stream and then sum-reduce them. 88 | * 89 | * NOTE: The streams will be fully partitioned. 90 | * 91 | * @param x The input parallel streams 92 | * 93 | * @tparam DataType The input and output data type 94 | * @tparam NumPE The number of parallel streams 95 | * 96 | * @return The sum of all the array elements. 97 | */ 98 | template 99 | DataType adder_tree(hls::stream x[NumPE]) { 100 | #pragma HLS ARRAY_PARTITION variable=x complete // to force II=1 101 | #pragma HLS PIPELINE II=1 102 | // Determine the number of ranks for the adder tree and declare array: 103 | // - The adder_tree is larger than required as each rank only needs to be 104 | // half the size of the previous rank. 105 | const unsigned kNumPEsLog2 = hlsutils::log2::value; 106 | const unsigned kNumPEsSub1Log2 = hlsutils::log2::value; 107 | const unsigned kNumRanks = kNumPEsLog2 != kNumPEsSub1Log2 ? 
kNumPEsLog2 : kNumPEsLog2 + 1; 108 | DataType adder_tree[kNumRanks][NumPE]; 109 | #pragma HLS ARRAY_PARTITION variable=adder_tree complete dim=0 110 | 111 | unsigned rank_size = NumPE; 112 | DataType ret_val = 0; 113 | 114 | add_level_loop: 115 | for(int adder_tree_rank = kNumRanks - 1; adder_tree_rank >= 0; --adder_tree_rank) { 116 | const bool kLoopInit = adder_tree_rank == kNumRanks - 1 ? true : false; 117 | const bool kLoopEpilog = adder_tree_rank == 0 ? true : false; 118 | 119 | if (kLoopInit) { 120 | rank_size = NumPE; 121 | } 122 | 123 | const bool prev_rank_is_odd = rank_size % 2 == 0 ? false : true; 124 | rank_size = (rank_size + 1) / 2; 125 | 126 | add_col_loop: 127 | for(int jj = 0; jj < (NumPE + 1) / 2; ++jj) { 128 | if (jj < rank_size) { 129 | if (prev_rank_is_odd && jj == rank_size - 1) { 130 | // Bypass, no adder required. 131 | if (kLoopInit) { 132 | adder_tree[adder_tree_rank][jj] = x[jj * 2].read(); 133 | // adder_tree[adder_tree_rank][jj] = x[jj * 2]; 134 | } else { 135 | adder_tree[adder_tree_rank][jj] = adder_tree[adder_tree_rank + 1][jj * 2]; 136 | } 137 | } else { 138 | if (kLoopInit) { 139 | auto y_acc = x[jj * 2].read() + x[jj * 2 + 1].read(); 140 | // auto y_acc = x[jj * 2] + x[jj * 2 + 1]; 141 | #pragma HLS RESOURCE variable=y_acc core=AddSub_DSP 142 | adder_tree[adder_tree_rank][jj] = y_acc; 143 | } else{ 144 | auto y_acc = adder_tree[adder_tree_rank + 1][jj * 2] + adder_tree[adder_tree_rank + 1][jj * 2 + 1]; 145 | #pragma HLS RESOURCE variable=y_acc core=AddSub_DSP 146 | adder_tree[adder_tree_rank][jj] = y_acc; 147 | } 148 | } 149 | } 150 | } 151 | if (kLoopEpilog) { 152 | ret_val = adder_tree[0][0]; 153 | } 154 | } 155 | return ret_val; 156 | } 157 | 158 | #ifdef __VITIS_HLS__ 159 | template 160 | DataType adder_tree(hls::vector x) { 161 | #pragma HLS ARRAY_PARTITION variable=x complete // to force II=1 162 | #pragma HLS PIPELINE II=1 163 | // Determine the number of ranks for the adder tree and declare array: 164 | // - The adder_tree 
is larger than required as each rank only needs to be 165 | // half the size of the previous rank. 166 | const unsigned kNumPEsLog2 = hlsutils::log2::value; 167 | const unsigned kNumPEsSub1Log2 = hlsutils::log2::value; 168 | const unsigned kNumRanks = kNumPEsLog2 != kNumPEsSub1Log2 ? kNumPEsLog2 : kNumPEsLog2 + 1; 169 | DataType adder_tree[kNumRanks][NumPE]; 170 | #pragma HLS ARRAY_PARTITION variable=adder_tree complete dim=0 171 | 172 | unsigned rank_size = NumPE; 173 | DataType ret_val = 0; 174 | 175 | add_level_loop: 176 | for(int adder_tree_rank = kNumRanks - 1; adder_tree_rank >= 0; --adder_tree_rank) { 177 | const bool kLoopInit = adder_tree_rank == kNumRanks - 1 ? true : false; 178 | const bool kLoopEpilog = adder_tree_rank == 0 ? true : false; 179 | 180 | if (kLoopInit) { 181 | rank_size = NumPE; 182 | } 183 | 184 | const bool prev_rank_is_odd = rank_size % 2 == 0 ? false : true; 185 | rank_size = (rank_size + 1) / 2; 186 | 187 | add_col_loop: 188 | for(int jj = 0; jj < (NumPE + 1) / 2; ++jj) { 189 | if (jj < rank_size) { 190 | if (prev_rank_is_odd && jj == rank_size - 1) { 191 | // Bypass, no adder required. 
192 | if (kLoopInit) { 193 | adder_tree[adder_tree_rank][jj] = x[jj * 2]; 194 | // adder_tree[adder_tree_rank][jj] = x[jj * 2]; 195 | } else { 196 | adder_tree[adder_tree_rank][jj] = adder_tree[adder_tree_rank + 1][jj * 2]; 197 | } 198 | } else { 199 | if (kLoopInit) { 200 | auto y_acc = x[jj * 2] + x[jj * 2 + 1]; 201 | // auto y_acc = x[jj * 2] + x[jj * 2 + 1]; 202 | #pragma HLS RESOURCE variable=y_acc core=AddSub_DSP 203 | adder_tree[adder_tree_rank][jj] = y_acc; 204 | } else{ 205 | auto y_acc = adder_tree[adder_tree_rank + 1][jj * 2] + adder_tree[adder_tree_rank + 1][jj * 2 + 1]; 206 | #pragma HLS RESOURCE variable=y_acc core=AddSub_DSP 207 | adder_tree[adder_tree_rank][jj] = y_acc; 208 | } 209 | } 210 | } 211 | } 212 | if (kLoopEpilog) { 213 | ret_val = adder_tree[0][0]; 214 | } 215 | } 216 | return ret_val; 217 | } 218 | #endif 219 | 220 | } // hlsutils 221 | 222 | #endif // end HLS_UTILS_ADDER_TREE_H_ -------------------------------------------------------------------------------- /include/hls_utils/dot_prod_dsp.h: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2019 Stefano Ribes. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 
13 | * 14 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 15 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 16 | * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 17 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR 18 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 19 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 20 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 21 | * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 22 | * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 23 | * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 24 | * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | *****************************************************************************/ 27 | /****************************************************************************** 28 | * 29 | * 30 | * @file dsp_functions.h 31 | * 32 | * @author Stefano Ribes 33 | * 34 | * Library of templated HLS functions for BNN deployment. 
35 | * This file lists a set of functions to access memory mapped values into 36 | * streams 37 | * 38 | *****************************************************************************/ 39 | #ifndef HLS_UTILS_DOT_PROD_DSP_H_ 40 | #define HLS_UTILS_DOT_PROD_DSP_H_ 41 | 42 | #include "ap_int.h" 43 | #include "assert.h" 44 | 45 | namespace hlsutils { 46 | 47 | /** 48 | * @brief Implements p0 += y_dsp * w_dsp + y_lut * w_lut; p1 += x_dsp * 49 | * w_dsp + x_lut * w_lut; 50 | * 51 | * @param[in] x_dsp The x to be mapped to a DSP 52 | * @param[in] y_dsp The y to be mapped to a DSP 53 | * @param[in] w_dsp The w to be mapped to a DSP 54 | * @param[in] x_lut The x to be mapped to a LUT 55 | * @param[in] y_lut The y to be mapped to a LUT 56 | * @param[in] w_lut The w to be mapped to a LUT 57 | * @param p0 The output p0 58 | * @param p1 The output p1 59 | * 60 | * @tparam DspD Must be a 8bit ap_(u)int or ap_(u)fixed. 61 | */ 62 | template 63 | void dot_prod_dsp_lut(const DspD x_dsp, const DspD y_dsp, const DspD w_dsp, 64 | const DspD x_lut, const DspD y_lut, const DspD w_lut, 65 | DspD &p0, DspD &p1) { 66 | #pragma HLS PIPELINE II=3 67 | // NOTE: inlining prevents a clear parent's structure and a simple ctrl logic. 68 | #pragma HLS INLINE off 69 | assert(DspD::width == 8); // Only allow 8bit ap_uint or ap_fixed 70 | // =========================================================================== 71 | // LUT Multiplication 72 | // =========================================================================== 73 | // NOTE: The method range() MUST be used, otherwise there's a cast operation 74 | // happening instead of a bit-by-bit copy. 
75 | ap_uint<17> xw_lut = (x_lut * w_lut).range(); 76 | ap_uint<17> yw_lut = (y_lut * w_lut).range(); 77 | #pragma HLS RESOURCE variable=xw_lut core=Mul_LUT latency=1 78 | #pragma HLS RESOURCE variable=yw_lut core=Mul_LUT latency=1 79 | ap_uint<48> p_lut = 0; 80 | p_lut(16, 0) = yw_lut; 81 | p_lut(32, 17) = xw_lut; 82 | // =========================================================================== 83 | // DSP 84 | // =========================================================================== 85 | ap_int<25> x_dsp25 = 0; 86 | ap_int<25> y_dsp25 = 0; 87 | ap_int<18> w_dsp18 = 0; 88 | x_dsp25(24, 17) = x_dsp.range(); 89 | y_dsp25(7, 0) = y_dsp.range(); 90 | w_dsp18(7, 0) = w_dsp.range(); 91 | // =========================================================================== 92 | // Sign extension 93 | // =========================================================================== 94 | const ap_uint<17> kNegativeSignY = 0b11111111111111111; 95 | const ap_uint<17> kPositiveSignY = 0b00000000000000000; 96 | const ap_uint<10> kNegativeSignW = 0b1111111111; 97 | const ap_uint<10> kPositiveSignW = 0b0000000000; 98 | y_dsp25(24, 8) = y_dsp[7] == 1 ? kNegativeSignY : kPositiveSignY; 99 | w_dsp18(17, 8) = w_dsp[7] == 1 ? 
kNegativeSignW : kPositiveSignW; 100 | // ap_int<48> p_dsp = (x_dsp25 + y_dsp25) * w_dsp18; 101 | //#pragma HLS RESOURCE variable=p_dsp core=DSP48 102 | // =========================================================================== 103 | // Adjust LSB LUT 104 | // =========================================================================== 105 | p_lut[16] = p_lut[15] = (y_dsp[7] ^ w_dsp[7]); 106 | // =========================================================================== 107 | // Final Sum DSP + LUT 108 | // =========================================================================== 109 | // auto p = p_dsp + p_lut; 110 | auto p = (x_dsp25 + y_dsp25) * w_dsp18 + p_lut; 111 | #pragma HLS RESOURCE variable=p core=DSP48 112 | p[16] = yw_lut[15] ^ (y_dsp25[7] ^ w_dsp18[7]); 113 | // =========================================================================== 114 | // Accumulation 115 | // =========================================================================== 116 | const int kIntWidth = DspD::width; // svd::SVDParameters.kFixFracWidth; 117 | ap_fixed<17, kIntWidth*2+1> p0_reg = 0; 118 | ap_fixed<17, kIntWidth*2+1> p1_reg = 0; 119 | p0_reg.range() = p(16, 0); 120 | p1_reg.range() = p(32, 17); 121 | p0 += p0_reg; 122 | p1 += p1_reg; 123 | } 124 | 125 | /** 126 | * @brief Implements p0 += y_dsp * w_dsp + y_lut * w_lut; p1 += x_dsp * 127 | * w_dsp + x_lut * w_lut; 128 | * 129 | * @param[in] x_dsp The x to be mapped to a DSP 130 | * @param[in] y_dsp The y to be mapped to a DSP 131 | * @param[in] w_dsp The w to be mapped to a DSP 132 | * @param[in] x_lut The x to be mapped to a LUT 133 | * @param[in] y_lut The y to be mapped to a LUT 134 | * @param[in] w_lut The w to be mapped to a LUT 135 | * @param p0 The output p0 136 | * @param p1 The output p1 137 | * 138 | * @tparam T Must be a 8bit ap_(u)int or ap_(u)fixed. 
139 | */ 140 | template 141 | void dot_prod_dsp_lut_generic(const T x_dsp, const T y_dsp, const T w_dsp, 142 | const T x_lut, const T y_lut, const T w_lut, 143 | T &p0, T &p1) { 144 | #pragma HLS PIPELINE II=3 145 | // NOTE: inlining prevents a clear parent's structure and a simple ctrl logic. 146 | #pragma HLS INLINE off 147 | auto p0_tmp = y_dsp * w_dsp + y_lut * w_lut; 148 | auto p1_tmp = x_dsp * w_dsp + x_lut * w_lut; 149 | #pragma HLS RESOURCE variable=p1_tmp core=DSP48 150 | p0 += p0_tmp; 151 | p1 += p1_tmp; 152 | } 153 | 154 | } // hlsutils 155 | 156 | #endif // end HLS_UTILS_DOT_PROD_DSP_H_ -------------------------------------------------------------------------------- /include/hls_utils/hls_debugging.h: -------------------------------------------------------------------------------- 1 | #ifndef HLS_UTILS_HLS_DEBUGGING 2 | #define HLS_UTILS_HLS_DEBUGGING 3 | 4 | #include "hls_utils/hw_timer.h" 5 | #ifdef __VITIS_HLS__ 6 | #include "hls_vector.h" 7 | #endif 8 | 9 | #include 10 | #include 11 | 12 | #ifndef HLS_DEBUG_LEVEL 13 | #define HLS_DEBUG_LEVEL 0 14 | #endif 15 | 16 | namespace hlsutils { 17 | 18 | static int hls_debug_level = HLS_DEBUG_LEVEL; 19 | 20 | template 21 | void Log(const int verbose_level, const T* str) { 22 | #ifndef __SYNTHESIS__ 23 | if (verbose_level < hls_debug_level) { 24 | std::cout << str << std::endl; 25 | } 26 | #endif 27 | } 28 | 29 | #ifdef __VITIS_HLS__ 30 | template 31 | void PrintVector(hls::vector &x) { 32 | for (int i = 0; i < N; ++i) { 33 | std::cout << x[i] << " "; 34 | } 35 | std::cout << std::endl; 36 | } 37 | #endif 38 | 39 | } // hlsutils 40 | 41 | #endif // HLS_UTILS_HLS_DEBUGGING -------------------------------------------------------------------------------- /include/hls_utils/hls_metaprogramming.h: -------------------------------------------------------------------------------- 1 | #ifndef HLS_UTILS_HLS_METAPROGRAMMING_H_ 2 | #define HLS_UTILS_HLS_METAPROGRAMMING_H_ 3 | 4 | namespace hlsutils { 5 | 6 | template 7 | 
struct log2 { 8 | enum {value = 1 + hlsutils::log2::value}; 9 | }; 10 | 11 | template <> 12 | struct log2<1> { 13 | enum {value = 0}; 14 | }; 15 | 16 | /** 17 | * @brief Class for Greatest Common Divisor (GCD) compile-time function. 18 | * 19 | * @tparam N First input 20 | * @tparam M Second input 21 | * @tparam K Temporary variable used for recursion 22 | */ 23 | template 24 | class GCDbase; 25 | 26 | template 27 | class GCD { 28 | public: 29 | static const int value = hlsutils::GCDbase::value; 30 | }; 31 | 32 | template 33 | class GCDbase { 34 | public: 35 | static const int value = hlsutils::GCDbase::value; 36 | }; 37 | 38 | template 39 | class GCDbase{ 40 | public: 41 | static const int value = M; 42 | }; 43 | 44 | template 45 | struct Bitwidth { 46 | static const int value = T::width; 47 | }; 48 | 49 | template<> 50 | struct Bitwidth { 51 | static const int value = 8; 52 | }; 53 | 54 | template<> 55 | struct Bitwidth { 56 | static const int value = 32; 57 | }; 58 | 59 | template<> 60 | struct Bitwidth { 61 | static const int value = 16; 62 | }; 63 | 64 | template<> 65 | struct Bitwidth { 66 | static const int value = 64; 67 | }; 68 | 69 | template<> 70 | struct Bitwidth { 71 | static const int value = 8; 72 | }; 73 | 74 | template<> 75 | struct Bitwidth { 76 | static const int value = 32; 77 | }; 78 | 79 | template<> 80 | struct Bitwidth { 81 | static const int value = 16; 82 | }; 83 | 84 | template<> 85 | struct Bitwidth { 86 | static const int value = 64; 87 | }; 88 | 89 | template<> 90 | struct Bitwidth { 91 | static const int value = 32; 92 | }; 93 | 94 | template<> 95 | struct Bitwidth { 96 | static const int value = 64; 97 | }; 98 | 99 | #ifdef __VITIS_HLS__ 100 | template 101 | struct VectBitwidth { 102 | static const int value = N * Bitwidth::value; 103 | }; 104 | #endif 105 | 106 | template 107 | struct PrunedSize { 108 | static const int value = R * X / T * (T - ZT); 109 | }; 110 | 111 | 112 | #ifndef IS_POW2 113 | #define IS_POW2(x) (x & (x - 1)) == 
0 114 | #endif 115 | 116 | template 117 | struct is_pow2 { 118 | static const bool value = (N & (N - 1)) == 0; 119 | }; 120 | 121 | template 122 | struct round_up_div { 123 | static const unsigned int value = (X + Y - 1) / Y; 124 | }; 125 | 126 | } // end namespace hls 127 | 128 | #endif // end HLS_UTILS_HLS_METAPROGRAMMING_H_ -------------------------------------------------------------------------------- /include/hls_utils/priority_encoder.h: -------------------------------------------------------------------------------- 1 | #ifndef HLS_UTILS_PRIORITY_ENCODER_H_ 2 | #define HLS_UTILS_PRIORITY_ENCODER_H_ 3 | 4 | #include "hls_utils/hls_metaprogramming.h" 5 | 6 | #include "assert.h" 7 | 8 | namespace hlsutils { 9 | 10 | /** 11 | * @brief Priority Encoder: returns the MSB set bit. 12 | * 13 | * @param[in] a The input value. 14 | * 15 | * @tparam Bitwidth The bit width of the input value. 16 | * 17 | * @return The index of the set MSB. Zero if no bit is set. 18 | */ 19 | template 20 | int PriorityEncoderMSB(const ap_uint a) { 21 | #pragma HLS PIPELINE II=1 22 | int index = 0; 23 | for (int i = 0; i < Bitwidth; ++i) { 24 | if (a[i] == 1) { 25 | index = i; 26 | } 27 | } 28 | return index; 29 | } 30 | 31 | template 32 | int PriorityEncoderLSB(const ap_uint a) { 33 | #pragma HLS PIPELINE II=1 34 | int index = 0; 35 | for (int i = Bitwidth - 1; i >= 0; --i) { 36 | if (a[i] == 1) { 37 | index = i; 38 | } 39 | } 40 | return index; 41 | } 42 | 43 | template 44 | int PriorityEncoderMSB(const T a) { 45 | #pragma HLS PIPELINE II=1 46 | int index = 0; 47 | for (int i = 0; i < T::width; ++i) { 48 | if (a[i] == 1) { 49 | index = i; 50 | } 51 | } 52 | return index; 53 | } 54 | 55 | template 56 | int PriorityEncoderLSB(const T a) { 57 | #pragma HLS INLINE 58 | #pragma HLS PIPELINE II=1 59 | int index = 0; 60 | for (int i = T::width - 1; i >= 0; --i) { 61 | if (a[i] == 1) { 62 | index = i; 63 | } 64 | } 65 | return index; 66 | } 67 | 68 | template 69 | void PriorityEncoder(const int 
num_zero_tiles, const ap_uint a, hls::stream::value> > &idx_stream) { 70 | ap_uint tmp = a; 71 | for (int i = 0; i < NumTiles - num_zero_tiles; ++i) { 72 | #pragma HLS PIPELINE II=1 73 | int bit_idx = PriorityEncoderLSB(tmp); 74 | assert(bit_idx < NumTiles); 75 | tmp[bit_idx] = 0; 76 | idx_stream.write(bit_idx); 77 | } 78 | } 79 | 80 | template 81 | void PriorityEncoder(const int num_zero_tiles, const T a, hls::stream::value> > &idx_stream) { 82 | T tmp = a; 83 | for (int i = 0; i < T::width - num_zero_tiles; ++i) { 84 | #pragma HLS PIPELINE II=1 85 | int bit_idx = PriorityEncoderLSB(tmp); 86 | assert(bit_idx < T::width); 87 | tmp[bit_idx] = 0; 88 | idx_stream.write(bit_idx); 89 | } 90 | } 91 | 92 | } // end namespace hlsutils 93 | 94 | #endif // end HLS_UTILS_PRIORITY_ENCODER_H_ -------------------------------------------------------------------------------- /include/kernel/gemv_kernel.h: -------------------------------------------------------------------------------- 1 | #ifndef KERNEL_GEMV_KERNEL_H_ 2 | #define KERNEL_GEMV_KERNEL_H_ 3 | 4 | #include "assert.h" 5 | #include "hls_stream.h" 6 | #ifdef __VITIS_HLS__ 7 | #include "hls_vector.h" 8 | #endif 9 | 10 | namespace testgemv { 11 | 12 | typedef int DataType; 13 | const int N = 2; 14 | const int I = 1024; 15 | const int T = 4; 16 | const int R = 64; 17 | 18 | } // testgemv 19 | 20 | namespace svd { 21 | 22 | /** 23 | * @brief Given x with shape (N, I) and w with shape (N, I, R), returns y 24 | * with shape (N, R). 25 | * 26 | * The x streams however, contain the `N * I / T` values repeated R 27 | * times (broadcasted in the R dimension). 28 | * 29 | * The w streams instead should be broadcasted to the N dimension. 
30 | * 31 | * @param[in] num_rows The number rows, dimension I in the example above 32 | * @param[in] num_cols The number cols, dimension R in the example above 33 | * @param x_streams The x streams 34 | * @param w_streams The w streams 35 | * @param y_streams The y streams 36 | * 37 | * @tparam Type The data type of the operands 38 | * @tparam T The tile size of the streams 39 | * @tparam N The number of parallel inputs. 40 | */ 41 | #ifdef __VITIS_HLS__ 42 | template 43 | void GemvKernel(const int num_rows, const int num_cols, 44 | hls::stream > x_streams[N], 45 | hls::stream > w_streams[N], 46 | hls::stream y_streams[N]) { 47 | assert(num_rows % T == 0); 48 | const int kNumTiles = num_rows / T; 49 | for (int i = 0; i < num_cols; ++i) { 50 | #pragma HLS LOOP_TRIPCOUNT min=testgemv::R max=testgemv::R 51 | hls::vector tmp[N] = {hls::vector(0)}; 52 | #pragma HLS ARRAY_PARTITION variable=tmp complete 53 | for (int j = 0; j < kNumTiles; ++j) { 54 | #pragma HLS LOOP_TRIPCOUNT min=testgemv::I/T max=testgemv::I/T 55 | #pragma HLS PIPELINE II=1 56 | for (int k = 0; k < N; ++k) { 57 | tmp[k] += x_streams[k].read() * w_streams[k].read(); 58 | if (j == kNumTiles - 1) { 59 | y_streams[k] << tmp[k].reduce_add(); 60 | } 61 | } 62 | } 63 | } 64 | } 65 | #endif 66 | 67 | } // svd 68 | 69 | void HlsGemvKernel(const int num_rows, const int num_cols, 70 | hls::stream >& x1_port, 71 | hls::stream >& x2_port, 72 | hls::stream >& w1_port, 73 | hls::stream >& w2_port, 74 | hls::stream& y1_port, 75 | hls::stream& y2_port); 76 | 77 | #endif // end KERNEL_GEMV_KERNEL_H_ -------------------------------------------------------------------------------- /include/kernel/s_kernel.h: -------------------------------------------------------------------------------- 1 | #ifndef KERNEL_S_KERNEL_H_ 2 | #define KERNEL_S_KERNEL_H_ 3 | 4 | #include "svd_params.h" 5 | #include "hls_utils/adder_tree.h" 6 | #include "dma/axis_lib.h" 7 | #include "hls_utils/hls_metaprogramming.h" 8 | 9 | #include 
"hls_stream.h" 10 | 11 | namespace svd { 12 | 13 | template 14 | void KernelS(const int num_refinements, svd::SvdStreams &streams) { 15 | typedef typename params::AccumulationD accum_t; 16 | for (int i = 0; i < num_refinements; ++i) { 17 | #pragma HLS PIPELINE II=1 18 | for (int j = 0; j < params::N; ++j) { 19 | for (int k = 0; k < params::G; ++k) { 20 | auto sum = hlsutils::adder_tree(streams.xu[j][k]); 21 | auto xs = sum * streams.s[j][k].read(); 22 | for (int ii = 0; ii < params::PeV; ++ii) { 23 | streams.xus[j][k][ii].write(xs); 24 | } 25 | } 26 | } 27 | } 28 | } 29 | 30 | template > 31 | struct KernelS_Params { 32 | static const int N = Ni; 33 | static const int G = Gi; 34 | typedef ActivationD_tp ActivationType; 35 | static const int ActivationWidth = hlsutils::Bitwidth::value; 36 | static const int VectG_AxiWidth = ActivationWidth * G; 37 | typedef typename svd::AxiStreamPort::AxiuPacketType VectG_AxiPacketType; 38 | #ifdef __VITIS_HLS__ 39 | typedef hls::vector VectG_Type; 40 | #endif 41 | }; 42 | 43 | #ifndef __VITIS_HLS__ 44 | #else 45 | template < 46 | typename params, 47 | typename PortWrapper = svd::AxiStreamPort 48 | > 49 | void KernelS(const int num_active_inputs, 50 | const int num_refinements[params::N], 51 | hls::stream& xu_port, 52 | hls::stream& s_port, 53 | hls::stream& xus_port) { 54 | #pragma HLS TOP name=KernelS 55 | #pragma HLS DATAFLOW 56 | #pragma HLS INLINE 57 | #ifndef __VITIS_HLS__ 58 | #pragma HLS STABLE variable=xu_port 59 | #pragma HLS STABLE variable=s_port 60 | #pragma HLS STABLE variable=xus_port 61 | #endif 62 | assert(num_active_inputs <= params::N); 63 | assert(num_active_inputs > 0); 64 | int R_max = num_refinements[0]; 65 | int R_total = num_refinements[0] * num_active_inputs; // Total elements. 
66 | Get_Total_R: 67 | for (int i = 1; i < num_active_inputs; ++i) { 68 | #pragma HLS PIPELINE II=1 69 | if (num_refinements[i] > R_max) { 70 | R_max = num_refinements[i]; 71 | } 72 | assert(num_refinements[i] >= num_refinements[i - 1]); 73 | R_total += (num_refinements[i] - num_refinements[i - 1]) * (num_active_inputs - i); 74 | } 75 | auto xu_axis = svd::AxiStreamInterface(xu_port); 76 | auto s_axis = svd::AxiStreamPort(s_port); 77 | auto xus_axis = svd::AxiStreamInterface(xus_port); 78 | S_Kernel: 79 | for (int i = 0; i < R_total; ++i) { 80 | #pragma HLS PIPELINE II=1 style=frp 81 | typedef typename params::ActivationD ActivationType; 82 | auto xu_val = xu_axis.template PopVector(); 83 | auto s_val = s_axis.template PopVector(); 84 | auto xus_val = xu_val * s_val; 85 | const bool kIsLast = i == R_total - 1; 86 | xus_axis.template PushVector(xus_val, kIsLast); 87 | } 88 | } 89 | #endif // __VITIS_HLS__ 90 | 91 | } // svd 92 | 93 | namespace tests { 94 | 95 | static const int kNumInputs = 2; 96 | static const int kInputSize = 512; 97 | static const int Tu = 4; 98 | // NOTE: The rest of the parameters are unused for now. 
99 | static const int kDummySize = 1; 100 | static const int R = 8; 101 | static const int Tv = 1; 102 | static const int ZTu = 0; 103 | static const int ZTv = 0; 104 | static const int G = 4; 105 | 106 | typedef svd::SvdParameters params; 110 | short, short, short> params; 111 | 112 | } // tests 113 | 114 | #ifndef __VITIS_HLS__ 115 | #else 116 | void HlsKernelS( 117 | const int num_refinements[tests::params::N], 118 | // const hls::vector num_refinements, 119 | hls::stream& xu_port, 120 | hls::stream& s_port, 121 | hls::stream& xus_port); 122 | #endif 123 | 124 | #endif // end KERNEL_S_KERNEL_H_ -------------------------------------------------------------------------------- /include/kernel/svd_kernel.h: -------------------------------------------------------------------------------- 1 | #ifndef KERNEL_SVD_KERNEL_H_ 2 | #define KERNEL_SVD_KERNEL_H_ 3 | 4 | #include "svd_params.h" 5 | #include "dma/svd_dma.h" 6 | #include "dma/axis_lib.h" 7 | #include "kernel/u_kernel.h" 8 | #include "kernel/s_kernel.h" 9 | #include "kernel/v_kernel.h" 10 | 11 | namespace svd { 12 | 13 | template 14 | inline void SvdKernel(svd::SvdStreams &streams) { 15 | #pragma HLS INLINE 16 | #ifndef __VITIS_HLS__ 17 | #pragma HLS DATAFLOW 18 | #endif 19 | svd::KernelU(params::R, streams); 20 | svd::KernelS(params::R, streams); 21 | svd::KernelV(params::R, streams); 22 | } 23 | 24 | template < 25 | typename params, 26 | typename WrapperAxisGTv = svd::AxiStreamPort 27 | > 28 | void SvdKernel(const int num_active_inputs, 29 | const int input_size, 30 | const int output_size, 31 | const int num_refinements[params::N], 32 | hls::stream& x_port, 33 | hls::stream& u_port, 34 | hls::stream& s_port, 35 | hls::stream& v_port, 36 | hls::stream& y_port) { 37 | #pragma HLS TOP name=SvdKernel 38 | #pragma HLS INLINE 39 | #pragma HLS DATAFLOW 40 | #ifndef __VITIS_HLS__ 41 | #pragma HLS STABLE variable=x_port 42 | #pragma HLS STABLE variable=u_port 43 | #pragma HLS STABLE variable=s_port 44 | #pragma HLS 
STABLE variable=v_port 45 | #pragma HLS STABLE variable=y_port 46 | #endif 47 | #pragma HLS ARRAY_PARTITION variable=num_refinements complete 48 | const bool pad_output = false; 49 | typedef svd::AxiStreamFifo WrapperFifoG; 50 | hls::stream xu_port("xu_port"); 51 | hls::stream xus_port("xus_port"); 52 | #pragma HLS STREAM variable=xu_port depth=2 53 | #pragma HLS STREAM variable=xus_port depth=2 54 | int num_refinements_u[params::N]; 55 | int num_refinements_s[params::N]; 56 | int num_refinements_v[params::N]; 57 | #pragma HLS ARRAY_PARTITION variable=num_refinements_u complete 58 | #pragma HLS ARRAY_PARTITION variable=num_refinements_s complete 59 | #pragma HLS ARRAY_PARTITION variable=num_refinements_v complete 60 | Duplicate_R_Stream: 61 | for (int i = 0; i < params::N; ++i) { 62 | #pragma HLS UNROLL 63 | num_refinements_u[i] = num_refinements[i]; 64 | num_refinements_s[i] = num_refinements[i]; 65 | num_refinements_v[i] = num_refinements[i]; 66 | } 67 | svd::KernelU(num_active_inputs, input_size, 68 | num_refinements_u, pad_output, x_port, u_port, xu_port); 69 | svd::KernelS(num_active_inputs, num_refinements_s, 70 | xu_port, s_port, xus_port); 71 | svd::KernelV(num_active_inputs, 72 | output_size, num_refinements_v, xus_port, v_port, y_port); 73 | } 74 | 75 | /** 76 | * @brief Sets the SVD kernel inputs, i.e. streams from arrays into 77 | * hls::streams. 78 | * 79 | * @param[in] num_active_inputs The number of active inputs 80 | * @param[in] input_size The input size 81 | * @param[in] output_size The output size 82 | * @param[in] num_refinements The number of refinements 83 | * @param[in] x The input array. Shape: (N, I) 84 | * @param[in] u The u array. Shape: (R, I, G) 85 | * @param[in] s The s array. Shape: (R, N, G) 86 | * @param[in] v The v array. 
Shape: (R, H, G) 87 | * @param x_port The x port to be used as argument to SvdKernel 88 | * @param u_port The u port to be used as argument to SvdKernel 89 | * @param s_port The s port to be used as argument to SvdKernel 90 | * @param v_port The v port to be used as argument to SvdKernel 91 | * 92 | * @tparam params Collection of SVD configuration params. 93 | */ 94 | #ifdef __VITIS_HLS__ 95 | template 96 | void SetSvdKernelInputs(const int num_active_inputs, 97 | const int input_size, 98 | const int output_size, 99 | const int num_refinements[params::N], 100 | const typename params::ActivationD* x, 101 | const typename params::ActivationD* u, 102 | const typename params::ActivationD* s, 103 | const typename params::ActivationD* v, 104 | hls::stream& x_port, 105 | hls::stream& u_port, 106 | hls::stream& s_port, 107 | hls::stream& v_port) { 108 | typedef typename params::ActivationD ActivationType; 109 | const int kG = params::G; 110 | const int kTu = params::Tu; 111 | const int kTv = params::Tv; 112 | const int kGTv = kG * kTv; 113 | const int kNumTilesU = input_size / kTu; 114 | const int kNumTilesV = output_size / kTv; 115 | auto x_axis = svd::AxiStreamPort(x_port); 116 | auto u_axis = svd::AxiStreamPort(u_port); 117 | auto s_axis = svd::AxiStreamPort(s_port); 118 | auto v_axis = svd::AxiStreamPort(v_port); 119 | int max_R = num_refinements[0]; 120 | typename params::VectTuType x_val; 121 | typename params::VectTuType u_val; 122 | typename params::VectG_Type s_val; 123 | typename params::VectTvType v_val; 124 | for (int i = i; i < params::N; ++i) { 125 | if (num_refinements[i] > max_R) { 126 | max_R = num_refinements[i]; 127 | } 128 | } 129 | for (int j = 0; j < kNumTilesU; ++j) { 130 | for (int i = 0; i < num_active_inputs; ++i) { 131 | for (int k = 0; k < kTu; ++k) { 132 | x_val[k] = x[i * input_size + j * kTu + k]; 133 | } 134 | x_axis.template PushVector(x_val); 135 | } 136 | } 137 | for (int i = 0; i < max_R; ++i) { 138 | for (int j = 0; j < kNumTilesU; ++j) 
{ 139 | for (int k = 0; k < kG; ++k) { 140 | for (int ii = 0; ii < kTu; ++ii) { 141 | u_val[ii] = u[i * kNumTilesU * kTu * kG + (j * kTu + ii) * kG + k]; 142 | } 143 | u_axis.template PushVector(u_val); 144 | } 145 | } 146 | } 147 | for (int i = 0; i < max_R; ++i) { 148 | for (int j = 0; j < num_active_inputs; ++j) { 149 | if (i < num_refinements[j]) { 150 | for (int k = 0; k < kG; ++k) { 151 | s_val[k] = s[i * num_active_inputs * kG + j * kG + k]; 152 | } 153 | s_axis.template PushVector(s_val); 154 | } 155 | } 156 | } 157 | for (int i = 0; i < max_R; ++i) { 158 | for (int j = 0; j < kNumTilesV; ++j) { 159 | for (int k = 0; k < kG; ++k) { 160 | for (int ii = 0; ii < kTv; ++ii) { 161 | v_val[ii] = v[i * kNumTilesV * kTv * kG + (j * kTv + ii) * kG + k]; 162 | } 163 | v_axis.template PushVector(v_val); 164 | } 165 | } 166 | } 167 | } 168 | #endif // __VITIS_HLS__ 169 | 170 | /** 171 | * @brief Gets the svd kernel outputs, i.e. fills in an array from 172 | * hls::streams. 173 | * 174 | * @param[in] num_active_inputs The number active inputs 175 | * @param[in] output_size The output size (H) 176 | * @param y_port The y port to be used as argument to SvdKernel 177 | * @param y The output array. Shape: (N, G, H) 178 | * 179 | * @tparam params Collection of SVD configuration params. 
180 | */ 181 | #ifdef __VITIS_HLS__ 182 | template 183 | void GetSvdKernelOutputs(const int num_active_inputs, const int output_size, 184 | hls::stream& y_port, 185 | typename params::ActivationD* y) { 186 | typedef typename params::ActivationD ActivationType; 187 | const int kG = params::G; 188 | const int kTv = params::Tv; 189 | const int kGTv = kG * kTv; 190 | const int kNumTilesV = output_size / kTv; 191 | auto y_axis = svd::AxiStreamPort(y_port); 192 | for (int j = 0; j < kNumTilesV; ++j) { 193 | for (int i = 0; i < num_active_inputs; ++i) { 194 | auto y_val = y_axis.template PopVector(); 195 | for (int k = 0; k < kTv; ++k) { 196 | for (int ii = 0; ii < kG; ++ii) { 197 | int y_idx = i * output_size * kG + ii * output_size + j * kTv + k; 198 | y[y_idx] = y_val[k * kG + ii]; 199 | } 200 | } 201 | } 202 | } 203 | } 204 | #endif // __VITIS_HLS__ 205 | 206 | } // svd 207 | 208 | void HlsSvdKernel(const int num_active_inputs, 209 | const int input_size, 210 | const int output_size, 211 | const int num_refinements[svd::svd_params::N], 212 | hls::stream& x_port, 213 | hls::stream& u_port, 214 | hls::stream& s_port, 215 | hls::stream& v_port, 216 | hls::stream& y_port); 217 | 218 | #endif // end KERNEL_SVD_KERNEL_H_ -------------------------------------------------------------------------------- /include/layers/dense/hls/dense_svd.h: -------------------------------------------------------------------------------- 1 | #ifndef LAYERS_DENSE_HLS_DENSE_SVD_H_ 2 | #define LAYERS_DENSE_HLS_DENSE_SVD_H_ 3 | 4 | #include "svd_params.h" 5 | #include "kernel/svd_kernel.h" 6 | 7 | #include "ap_int.h" 8 | 9 | 10 | namespace svd { 11 | 12 | static const int kDenseNumGates = 1; 13 | 14 | typedef svd::SvdParameters, 17 | ap_fixed, 18 | ap_fixed > dense_params; 19 | 20 | #ifndef __VITIS_HLS__ 21 | #else 22 | template 23 | void DenseSvdKernel(const int num_active_inputs, 24 | const int input_size, 25 | const int output_size, 26 | const int num_refinements[params::N], 27 | // const 
hls::vector num_refinements, 28 | hls::stream& x_port, 29 | hls::stream& u_port, 30 | hls::stream& s_port, 31 | hls::stream& v_port, 32 | hls::stream& bias_port, 33 | hls::stream& y_port) { 34 | #pragma HLS TOP name=DenseSvdKernel 35 | // #pragma HLS INLINE 36 | #pragma HLS DATAFLOW 37 | #ifndef __VITIS_HLS__ 38 | #pragma HLS STABLE variable=x_port 39 | #pragma HLS STABLE variable=u_port 40 | #pragma HLS STABLE variable=s_port 41 | #pragma HLS STABLE variable=v_port 42 | #pragma HLS STABLE variable=bias_port 43 | #pragma HLS STABLE variable=y_port 44 | #endif 45 | static_assert(params::G == 1, "DenseSvdKernel must have params::G equal to one."); 46 | assert(params::G == 1); 47 | typedef typename params::ActivationD ActivationType; 48 | typedef svd::AxiStreamFifo WrapperFifoGTv; 49 | hls::stream y_fifo; 50 | #pragma HLS STREAM variable=y_fifo depth=2 51 | auto y_axis = svd::AxiStreamFifo(y_fifo); 52 | auto y_out_axis = svd::AxiStreamPort(y_port); 53 | auto bias_axis = svd::AxiStreamPort(bias_port); 54 | svd::SvdKernel(num_active_inputs, input_size, 55 | output_size, num_refinements, x_port, u_port, s_port, v_port, y_fifo); 56 | Apply_Bias: 57 | for (int i = 0; i < output_size / params::Tv * num_active_inputs; ++i) { 58 | #pragma HLS PIPELINE II=1 59 | const int kGTv = params::G * params::Tv; // NOTE: G is actually equal to 1. 60 | const auto y_val = y_axis.template PopVector(); 61 | const auto bias_val = bias_axis.template PopVector(); 62 | const auto y_out = y_val + bias_val; 63 | // #pragma HLS BIND_OP variable=y_out op=add impl=dsp latency=3 64 | const bool kIsLast = i == output_size / params::Tv * num_active_inputs - 1; 65 | y_out_axis.template PushVector(y_out, kIsLast); 66 | } 67 | } 68 | #endif // end __VITIS_HLS__ 69 | 70 | /** 71 | * @brief Sets the DenseSvd kernel inputs, i.e. streams from arrays into 72 | * hls::streams. 
73 | * 74 | * @param[in] num_active_inputs The number of active inputs 75 | * @param[in] input_size The input size 76 | * @param[in] output_size The output size 77 | * @param[in] num_refinements The number of refinements 78 | * @param[in] x The input array. Shape: (N, I) 79 | * @param[in] u The u array. Shape: (R, I, G) 80 | * @param[in] s The s array. Shape: (R, N, G) 81 | * @param[in] v The v array. Shape: (R, H, G) 82 | * @param[in] bias The bias array. Shape: (N, G, H) 83 | * @param x_port The x port to be used as argument to SvdKernel 84 | * @param u_port The u port to be used as argument to SvdKernel 85 | * @param s_port The s port to be used as argument to SvdKernel 86 | * @param v_port The v port to be used as argument to SvdKernel 87 | * @param bias_port The bias port to be used as argument to 88 | * SvdKernel 89 | * 90 | * @tparam params Collection of SVD configuration params. 91 | */ 92 | #ifdef __VITIS_HLS__ 93 | template 94 | void SetDenseSvdInputs(const int num_active_inputs, 95 | const int input_size, 96 | const int output_size, 97 | const int num_refinements[params::N], 98 | const typename params::ActivationD* x, 99 | const typename params::ActivationD* u, 100 | const typename params::ActivationD* s, 101 | const typename params::ActivationD* v, 102 | const typename params::ActivationD* bias, 103 | hls::stream& x_port, 104 | hls::stream& u_port, 105 | hls::stream& s_port, 106 | hls::stream& v_port, 107 | hls::stream& bias_port) { 108 | typedef typename params::ActivationD ActivationType; 109 | const int kG = params::G; // NOTE: G is actually equal to 1. 
110 | const int kTv = params::Tv; 111 | const int kGTv = kG * kTv; 112 | const int kNumTilesV = output_size / kTv; 113 | auto bias_axis = svd::AxiStreamPort(bias_port); 114 | typename params::VectGTvType bias_val; 115 | for (int i = 0; i < kNumTilesV; ++i) { 116 | for (int j = 0; j < num_active_inputs; ++j) { 117 | for (int k = 0; k < kTv; ++k) { 118 | for (int ii = 0; ii < kG; ++ii) { 119 | int bias_idx = j * output_size * kG + ii * output_size + i * kTv + k; 120 | bias_val[k * kG + ii] = bias[bias_idx]; 121 | } 122 | } 123 | bias_axis.template PushVector(bias_val); 124 | } 125 | } 126 | svd::SetSvdKernelInputs(num_active_inputs, input_size, 127 | output_size, num_refinements, x, u, s, v, x_port, u_port, s_port, v_port); 128 | } 129 | #endif // __VITIS_HLS__ 130 | 131 | } // svd 132 | 133 | void HlsDenseSvd(const int num_active_inputs, 134 | const int input_size, 135 | const int output_size, 136 | const int num_refinements[svd::dense_params::N], 137 | hls::stream& x_port, 138 | hls::stream& u_port, 139 | hls::stream& s_port, 140 | hls::stream& v_port, 141 | hls::stream& bias_port, 142 | hls::stream& y_port); 143 | 144 | 145 | /** 146 | * @brief HLS Wrapper that calls a DenseSvd accelerator. 147 | * 148 | * Useful in Cosimulation. 149 | * 150 | * @param[in] num_active_inputs The number of active inputs 151 | * @param[in] input_size The input size 152 | * @param[in] output_size The output size 153 | * @param[in] num_refinements The number of refinements 154 | * @param[in] x The input array. Shape: (N, I) 155 | * @param[in] u The u array. Shape: (R, I, G) 156 | * @param[in] s The s array. Shape: (R, N, G) 157 | * @param[in] v The v array. Shape: (R, H, G) 158 | * @param[in] bias The bias array. Shape: (N, G, H) 159 | * @param y The y array. 
Shape: (N, G, H) 160 | */ 161 | void HlsWrapperDenseSvd(const int num_active_inputs, 162 | const int input_size, 163 | const int output_size, 164 | const int num_refinements[svd::dense_params::N], 165 | const typename svd::dense_params::ActivationD* x, 166 | const typename svd::dense_params::ActivationD* u, 167 | const typename svd::dense_params::ActivationD* s, 168 | const typename svd::dense_params::ActivationD* v, 169 | const typename svd::dense_params::ActivationD* bias, 170 | typename svd::dense_params::ActivationD* y); 171 | 172 | #endif // end DENSE_HLS_DENSE_SVD_H_); 173 | -------------------------------------------------------------------------------- /include/layers/lstm/hls/lstm_hardware.h: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2019 Stefano Ribes. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 15 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 16 | * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 17 | * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR 18 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 19 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 20 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 21 | * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 22 | * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 23 | * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 24 | * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | *****************************************************************************/ 27 | /****************************************************************************** 28 | * 29 | * 30 | * @file lstm_hardware.h 31 | * 32 | * @author Stefano Ribes 33 | * 34 | * Library of templated HLS functions for BNN deployment. 35 | * This file lists a set of functions to access memory mapped values into 36 | * streams 37 | * 38 | *****************************************************************************/ 39 | #ifndef LSTM_HLS_LSTM_HARDWARE_H_ 40 | #define LSTM_HLS_LSTM_HARDWARE_H_ 41 | 42 | #include "math_utils/activation_functions.h" 43 | #include "svd_params.h" 44 | 45 | #ifdef AP_INT_MAX_W 46 | #undef AP_INT_MAX_W 47 | #define AP_INT_MAX_W 4096 48 | #endif 49 | #include "ap_int.h" 50 | #include "ap_axi_sdata.h" 51 | #ifndef __VITIS_HLS__ 52 | #include "hls_linear_algebra.h" 53 | #endif 54 | 55 | #ifndef __SYNTHESIS__ 56 | #include 57 | #endif 58 | 59 | #ifdef SDS_DESIGN 60 | #include 61 | #include "sds_lib.h" 62 | #endif 63 | 64 | #ifndef TIMESTEPS_SIZE 65 | #define TIMESTEPS_SIZE NUM_TIMESTEPS 66 | #endif 67 | 68 | #ifndef TIMESTEPS_TILE_SIZE 69 | #define TIMESTEPS_TILE_SIZE NUM_TIMESTEPS // M // (svd::AxiD::width / FIX_WIDTH) 70 | #endif 71 | #ifndef HIDDEN_TILE_SIZE 72 | #define HIDDEN_TILE_SIZE 8 // N // (svd::AxiD::width / FIX_WIDTH) 73 | #endif 74 | #ifndef INPUT_TILE_SIZE 75 | #define INPUT_TILE_SIZE 8 
// K // (svd::AxiD::width / FIX_WIDTH)
#endif

namespace svd {

#ifndef __VITIS_HLS__
// struct MY_CONFIG: hls::matrix_multiply_traits {
//   static const int ARCH = 4;
//   static const int INNER_II = 1;
//   static const int UNROLL_FACTOR = 2; // ARCH4 will completely unroll the inner loop anyway.
// };

// Traits for the "current" (input-side) GEMM tile: a
// (TIMESTEPS_TILE_SIZE x INPUT_TILE_SIZE) tile times an
// (INPUT_TILE_SIZE x HIDDEN_TILE_SIZE) tile, no transposition on either
// operand. ARCH and INNER_II select the hls::matrix_multiply
// implementation variant and its inner-loop initiation interval.
struct MatrixConfigFixCurrent: hls::matrix_multiply_traits <
    hls::NoTranspose,
    hls::NoTranspose,
    TIMESTEPS_TILE_SIZE,
    INPUT_TILE_SIZE,
    INPUT_TILE_SIZE,
    HIDDEN_TILE_SIZE,
    ActivationD,
    ActivationD > {
  // static const int RowsATrans = HIDDEN_TILE_SIZE; // ( TransposeFormA::TransposeType != 0 ? ColsA : RowsA);
  // static const int ColsATrans = TIMESTEPS_TILE_SIZE; // ( TransposeFormA::TransposeType != 0 ? RowsA : ColsA);
  // static const int RowsBTrans = ( TransposeFormB::TransposeType != 0 ? ColsB : RowsB);
  // static const int ColsBTrans = ( TransposeFormB::TransposeType != 0 ? RowsB : ColsB);
  // static const int B_UNROLL_DIM = ( TransposeFormB::TransposeType != 0 ? 1 : 2);
  // static const int A_FULL_UNROLL_DIM = ( TransposeFormA::TransposeType != 0 ? 1 : 2);
  // static const int B_FULL_UNROLL_DIM = ( TransposeFormB::TransposeType != 0 ? 2 : 1);
  // typedef ap_fixed INPUT_T;
  // typedef ap_fixed MULT_T;
  // typedef ap_fixed::Value, I1+I1+BitWidth::Value, AP_TRN, AP_WRAP, 0> ACCUM_T;
  typedef ActivationD INPUT_T;
  typedef MultD MULT_T;
  typedef AccumD ACCUM_T;
  static const int ARCH = 4;
  static const int INNER_II = 1;
  // static const int UNROLL_FACTOR = 1;
  static const int M = TIMESTEPS_TILE_SIZE;
  static const int N = HIDDEN_TILE_SIZE;
  static const int K = INPUT_TILE_SIZE;
};

// Traits for the "recurrent" (hidden-side) product tile: a
// (TIMESTEPS_TILE_SIZE x HIDDEN_TILE_SIZE) tile times a
// (HIDDEN_TILE_SIZE x 1) column, i.e. a GEMV expressed as a degenerate GEMM.
struct MatrixConfigFixRecurrent: hls::matrix_multiply_traits <
    hls::NoTranspose,
    hls::NoTranspose,
    TIMESTEPS_TILE_SIZE,
    HIDDEN_TILE_SIZE,
    HIDDEN_TILE_SIZE,
    1,
    ActivationD,
    ActivationD> {
  // static const int RowsATrans = ( TransposeFormA::TransposeType != 0 ? ColsA : RowsA);
  // static const int ColsATrans = ( TransposeFormA::TransposeType != 0 ? RowsA : ColsA);
  // static const int RowsBTrans = ( TransposeFormB::TransposeType != 0 ? ColsB : RowsB);
  // static const int ColsBTrans = ( TransposeFormB::TransposeType != 0 ? RowsB : ColsB);
  // static const int B_UNROLL_DIM = ( TransposeFormB::TransposeType != 0 ? 1 : 2);
  // static const int A_FULL_UNROLL_DIM = ( TransposeFormA::TransposeType != 0 ? 1 : 2);
  // static const int B_FULL_UNROLL_DIM = ( TransposeFormB::TransposeType != 0 ? 2 : 1);
  // typedef ap_fixed INPUT_T;
  // typedef ap_fixed MULT_T;
  // typedef ap_fixed::Value, I1+I1+BitWidth::Value, AP_TRN, AP_WRAP, 0> ACCUM_T;
  typedef ActivationD INPUT_T;
  typedef MultD MULT_T;
  typedef AccumD ACCUM_T;
  static const int ARCH = 4;
  static const int INNER_II = 1;
  // static const int UNROLL_FACTOR = 1;
  static const int M = TIMESTEPS_TILE_SIZE;
  static const int N = 1;
  static const int K = HIDDEN_TILE_SIZE;
};
#endif

// AXI4-Stream-style DMA packet: one payload word plus the TLAST side-band bit.
// NOTE(review): the width argument of the payload's ap_uint was lost when this
// file was extracted (it reads `ap_uint data`) -- restore the original bus
// width before building.
typedef struct {
  ap_uint data;
  ap_uint<1> last;
} AxisPacketD;
// NOTE(review): the stream's element type was lost in extraction (presumably
// hls::stream<AxisPacketD>) -- confirm against the original source.
typedef hls::stream DmaInterfaceD;

// GEMM on the current (input) path, AXI-word port variant.
void svd_fpga_cur_gemm_axi(const AxiD *a, const AxiD *b, AxiD *c);

// NOTE(review): the template parameter list was lost in extraction.
template
void cur_gemm(const ActivationD *a, const ActivationD *b, ActivationD *c);

} // end namespace svd

// Per-gate and fused GEMM kernels for the current (input) contribution.
void svd_fpga_cur_gemm_gate(const svd::ActivationD *a, const svd::ActivationD *b, svd::ActivationD *c);
void svd_fpga_cur_gemm_summa_gate(const svd::ActivationD *a, const svd::ActivationD *b, svd::ActivationD *c);
void svd_fpga_cur_gemm(const svd::ActivationD *a, const svd::ActivationD *b, svd::ActivationD *c);

// Per-gate and fused GEMV kernels for the recurrent (hidden) contribution.
void svd_fpga_rec_gemv_gate(const svd::ActivationD *a, const svd::ActivationD *b, svd::ActivationD *c);
void svd_fpga_rec_gemv(const svd::ActivationD *a, const svd::ActivationD *b, svd::ActivationD *c);

// Systolic-array variants operating on packed ap_uint words.
// NOTE(review): the ap_uint width arguments were lost in extraction.
void svd_fpga_cur_gemv_gate_systolic(const ap_uint *a, const ap_uint *b, ap_uint *c);
void svd_fpga_rec_gemv_gate_systolic(const ap_uint *a, const ap_uint *b, ap_uint *c);

// LSTM element-wise tail: combines the eight per-gate GEMM/GEMV results with
// the biases and the previous cell state c_rec, producing the new cell state
// c_cur and the output h_port.
void svd_fpga_non_lin(const svd::AxiD *c_rec, const svd::AxiD *cur_gate_i,
                      const svd::AxiD *cur_gate_f, const svd::AxiD *cur_gate_c,
                      const svd::AxiD *cur_gate_o, const svd::AxiD *rec_gate_i,
                      const svd::AxiD *rec_gate_f, const svd::AxiD *rec_gate_c,
                      const svd::AxiD *rec_gate_o, const svd::AxiD *bias_i,
                      const svd::AxiD *bias_f, const svd::AxiD *bias_c, const svd::AxiD *bias_o,
                      svd::AxiD *c_cur, svd::AxiD *h_port);

// Full LSTM cell: v1 packs the four gate weight matrices into cur_gates /
// rec_gates single arrays.
void svd_fpga_lstm(const svd::ActivationD *x,
                   const svd::ActivationD *h,
                   const svd::WeightD *cur_gates,
                   const svd::WeightD *rec_gates,
                   const svd::WeightD *i_bias,
                   const svd::WeightD *f_bias,
                   const svd::WeightD *c_bias,
                   const svd::WeightD *o_bias,
                   const svd::ActivationD *c_rec,
                   svd::ActivationD *c_cur,
                   svd::ActivationD *out);

// v2: one separate port per gate weight matrix.
void svd_fpga_lstm_v2(const svd::ActivationD *x,
                      const svd::ActivationD *h,
                      const svd::WeightD *cur_i,
                      const svd::WeightD *cur_f,
                      const svd::WeightD *cur_c,
                      const svd::WeightD *cur_o,
                      const svd::WeightD *rec_i,
                      const svd::WeightD *rec_f,
                      const svd::WeightD *rec_c,
                      const svd::WeightD *rec_o,
                      const svd::WeightD *i_bias,
                      const svd::WeightD *f_bias,
                      const svd::WeightD *c_bias,
                      const svd::WeightD *o_bias,
                      const svd::ActivationD *c_rec,
                      svd::ActivationD *c_cur,
                      svd::ActivationD *out);

// v3: AXI-word ports; the `_T` suffix marks transposed weight matrices.
void svd_fpga_lstm_v3(const svd::AxiD *x,
                      const svd::AxiD *h,
                      const svd::AxiD *cur_i_T,
                      const svd::AxiD *cur_f_T,
                      const svd::AxiD *cur_c_T,
                      const svd::AxiD *cur_o_T,
                      const svd::AxiD *rec_i_T,
                      const svd::AxiD *rec_f_T,
                      const svd::AxiD *rec_c_T,
                      const svd::AxiD *rec_o_T,
                      const svd::AxiD *i_bias,
                      const svd::AxiD *f_bias,
                      const svd::AxiD *c_bias,
                      const svd::AxiD *o_bias,
                      const svd::AxiD *c_rec,
                      svd::AxiD *c_cur,
                      svd::AxiD *out);

// Test/bring-up stubs.
void dummy_gemm(svd::DmaInterfaceD a[2], svd::DmaInterfaceD b[2], svd::DmaInterfaceD c[2]);

void dummy_gemm_v0(const svd::ActivationD a[16][16], const svd::ActivationD b[16][16],
                   svd::ActivationD c[16][16]);

void test_dispatcher();

#endif // end LSTM_HLS_LSTM_HARDWARE_H_
-------------------------------------------------------------------------------- /include/layers/lstm/hls/lstm_svd_emulator.h: -------------------------------------------------------------------------------- 1 | #ifndef LSTM_HLS_LSTM_SVD_EMULATOR_H_ 2 | #define LSTM_HLS_LSTM_SVD_EMULATOR_H_ 3 | 4 | #include "math_utils/activation_functions.h" 5 | 6 | #include "hls_stream.h" 7 | 8 | #include 9 | #include 10 | 11 | namespace svd { 12 | 13 | /** 14 | * @brief Emulator used to test the accuracy of the HLS accelerator. It 15 | * allows for testing different design points without recompiling. 16 | * 17 | * @param[in] InputSize The input size 18 | * @param[in] HiddenSize The hidden size 19 | * @param[in] NumIter The number of refinement steps 20 | * @param[in] Tu The number of tiles of u 21 | * @param[in] ZTu The number of pruned tiles of u 22 | * @param[in] Tv The number of tiles of v 23 | * @param[in] ZTv The number of pruned tiles of v 24 | * @param[in] NumTimesteps The number timesteps (deprecated) 25 | * @param[in] x The input data 26 | * @param[in] cur_i_u The current i u 27 | * @param[in] cur_i_s The current i s 28 | * @param[in] cur_i_v The current i v 29 | * @param[in] cur_i_unz The current i unz 30 | * @param[in] cur_i_vnz The current i vnz 31 | * @param[in] cur_f_u The current f u 32 | * @param[in] cur_f_s The current f s 33 | * @param[in] cur_f_v The current f v 34 | * @param[in] cur_f_unz The current f unz 35 | * @param[in] cur_f_vnz The current f vnz 36 | * @param[in] cur_c_u The current c u 37 | * @param[in] cur_c_s The current c s 38 | * @param[in] cur_c_v The current c v 39 | * @param[in] cur_c_unz The current c unz 40 | * @param[in] cur_c_vnz The current c vnz 41 | * @param[in] cur_o_u The current o u 42 | * @param[in] cur_o_s The current o s 43 | * @param[in] cur_o_v The current o v 44 | * @param[in] cur_o_unz The current o unz 45 | * @param[in] cur_o_vnz The current o vnz 46 | * @param[in] rec_i_u The recurrent i u 47 | * @param[in] rec_i_s The 
recurrent i s
 * @param[in]  rec_i_v     The recurrent i v
 * @param[in]  rec_i_unz   The recurrent i unz
 * @param[in]  rec_i_vnz   The recurrent i vnz
 * @param[in]  rec_f_u     The recurrent f u
 * @param[in]  rec_f_s     The recurrent f s
 * @param[in]  rec_f_v     The recurrent f v
 * @param[in]  rec_f_unz   The recurrent f unz
 * @param[in]  rec_f_vnz   The recurrent f vnz
 * @param[in]  rec_c_u     The recurrent c u
 * @param[in]  rec_c_s     The recurrent c s
 * @param[in]  rec_c_v     The recurrent c v
 * @param[in]  rec_c_unz   The recurrent c unz
 * @param[in]  rec_c_vnz   The recurrent c vnz
 * @param[in]  rec_o_u     The recurrent o u
 * @param[in]  rec_o_s     The recurrent o s
 * @param[in]  rec_o_v     The recurrent o v
 * @param[in]  rec_o_unz   The recurrent o unz
 * @param[in]  rec_o_vnz   The recurrent o vnz
 * @param[in]  bias        The bias
 * @param[in]  c_prev      The c previous
 * @param[in]  h_prev      The h previous
 * @param      c_curr      The c current
 * @param      h_curr      The h current
 *
 * @tparam     DataA        Activation type
 * @tparam     DataW        Weight type
 * @tparam     DataAcc      Accumulation type
 * @tparam     DataMul      Multiplication type
 * @tparam     TanhLutSize  Size of the hard sigmoid LUT
 */
// NOTE(review): the template header was stripped during extraction; from the
// @tparam list above it was presumably:
//   template <typename DataA, typename DataW, typename DataAcc,
//             typename DataMul, int TanhLutSize>
template
void LstmSvdSoftEmulator(const int InputSize,
                         const int HiddenSize,
                         const int NumIter,
                         const int Tu,
                         const int ZTu,
                         const int Tv,
                         const int ZTv,
                         const int NumTimesteps,
                         const DataA *x,
                         const DataW *cur_i_u,
                         const DataW *cur_i_s,
                         const DataW *cur_i_v,
                         const int *cur_i_unz,
                         const int *cur_i_vnz,
                         const DataW *cur_f_u,
                         const DataW *cur_f_s,
                         const DataW *cur_f_v,
                         const int *cur_f_unz,
                         const int *cur_f_vnz,
                         const DataW *cur_c_u,
                         const DataW *cur_c_s,
                         const DataW *cur_c_v,
                         const int *cur_c_unz,
                         const int *cur_c_vnz,
                         const DataW *cur_o_u,
                         const DataW *cur_o_s,
                         const DataW *cur_o_v,
                         const int *cur_o_unz,
                         const int *cur_o_vnz,
                         const DataW *rec_i_u,
                         const DataW *rec_i_s,
                         const DataW *rec_i_v,
                         const int *rec_i_unz,
                         const int *rec_i_vnz,
                         const DataW *rec_f_u,
                         const DataW *rec_f_s,
                         const DataW *rec_f_v,
                         const int *rec_f_unz,
                         const int *rec_f_vnz,
                         const DataW *rec_c_u,
                         const DataW *rec_c_s,
                         const DataW *rec_c_v,
                         const int *rec_c_unz,
                         const int *rec_c_vnz,
                         const DataW *rec_o_u,
                         const DataW *rec_o_s,
                         const DataW *rec_o_v,
                         const int *rec_o_unz,
                         const int *rec_o_vnz,
                         const DataW *bias,
                         DataA *c_prev,
                         DataA *h_prev,
                         DataA *c_curr,
                         DataA *h_curr) {
  // Design-point sanity checks mirroring the HLS accelerator's constraints.
  assert(Tu % 2 == 0);
  assert(Tv % 2 == 0);
  assert(Tu >= 8);
  assert(Tv >= 8);
  assert(Tu > ZTu);
  assert(Tv > ZTv);
  assert(NumIter % 2 == 0);
  // Gate order used by every 8-wide array below:
  //   [0..3] = current  (input x)  gates i, f, c, o
  //   [4..7] = recurrent (state h) gates i, f, c, o
  const DataW *u[8];
  const DataW *s[8];
  const DataW *v[8];
  const int *unz[8];
  const int *vnz[8];
  u[0] = cur_i_u; u[1] = cur_f_u; u[2] = cur_c_u; u[3] = cur_o_u;
  u[4] = rec_i_u; u[5] = rec_f_u; u[6] = rec_c_u; u[7] = rec_o_u;
  s[0] = cur_i_s; s[1] = cur_f_s; s[2] = cur_c_s; s[3] = cur_o_s;
  s[4] = rec_i_s; s[5] = rec_f_s; s[6] = rec_c_s; s[7] = rec_o_s;
  v[0] = cur_i_v; v[1] = cur_f_v; v[2] = cur_c_v; v[3] = cur_o_v;
  v[4] = rec_i_v; v[5] = rec_f_v; v[6] = rec_c_v; v[7] = rec_o_v;
  unz[0] = cur_i_unz; unz[1] = cur_f_unz; unz[2] = cur_c_unz; unz[3] = cur_o_unz;
  unz[4] = rec_i_unz; unz[5] = rec_f_unz; unz[6] = rec_c_unz; unz[7] = rec_o_unz;
  vnz[0] = cur_i_vnz; vnz[1] = cur_f_vnz; vnz[2] = cur_c_vnz; vnz[3] = cur_o_vnz;
  vnz[4] = rec_i_vnz; vnz[5] = rec_f_vnz; vnz[6] = rec_c_vnz; vnz[7] = rec_o_vnz;
  // Per-gate, per-tile FIFOs feeding the non-linearity stage, one stream per
  // v-tile. NOTE(review): the streams' element type was stripped in
  // extraction (presumably hls::stream<DataAcc>).
  hls::stream **cur_out_fifo = new hls::stream*[4];
  hls::stream **rec_out_fifo = new hls::stream*[4];
  for (int i = 0; i < 4; ++i) {
    cur_out_fifo[i] = new hls::stream[Tv];
    rec_out_fifo[i] = new hls::stream[Tv];
  }
  DataAcc *u_acc[8];         // per-gate x*u (or h*u) dot-product accumulators, one per refinement step
  DataAcc **acc_buffer[8];   // per-gate [Tv][HiddenSize/Tv] output accumulators
  DataMul xs_val[8] = {0};
  for (int i = 0; i < 8; ++i) {
    u_acc[i] = new DataAcc[NumIter];
  }
  // Ping-pong state buffers: a multi-timestep run uses zero-initialized
  // internal scratch, a single-timestep run aliases the caller's buffers.
  DataA *h[2];
  DataA *c[2];
  if (NumTimesteps > 1) {
    for (int i = 0; i < 2; ++i) {
      h[i] = new DataA[HiddenSize];
      c[i] = new DataA[HiddenSize];
      std::memset(h[i], 0, HiddenSize * sizeof(DataA));
      std::memset(c[i], 0, HiddenSize * sizeof(DataA));
    }
  } else {
    c[0] = c_prev;
    c[1] = c_curr;
    h[0] = h_prev;
    h[1] = h_curr;
  }
  for (int i = 0; i < 8; ++i) {
    acc_buffer[i] = new DataAcc*[Tv];
    for (int j = 0; j < Tv; ++j) {
      acc_buffer[i][j] = new DataAcc[HiddenSize / Tv];
    }
  }
  for (int t = 0; t < NumTimesteps; ++t) {
    // Alternate which ping-pong buffer is read/written each timestep.
    const int in_ptr = (t % 2) == 0 ? 0 : 1;
    const int out_ptr = (t % 2) == 0 ? 1 : 0;
    for (int i = 0; i < 8; ++i) {
      std::memset(u_acc[i], 0, NumIter * sizeof(DataAcc));
      for (int j = 0; j < Tv; ++j) {
        std::memset(acc_buffer[i][j], 0, HiddenSize / Tv * sizeof(DataAcc));
      }
    }
    // SVD refinement loop: for each step i, accumulate x·u (current path) and
    // h·u (recurrent path) over the non-pruned u-tiles only; unz maps the
    // compacted tile index back to its position in the dense input.
    for (int i = 0; i < NumIter; ++i) {
      for (int q = 0; q < 4; ++q) {
        for (int j = 0; j < Tu - ZTu; ++j) {
          const int nz_idx = i * (Tu - ZTu) + j;
          for (int k = 0; k < InputSize / Tu; ++k) {
            int u_idx = i * InputSize / Tu * (Tu - ZTu) + j * InputSize / Tu + k;
            u_acc[q][i] += x[t * InputSize + unz[q][nz_idx] * InputSize / Tu + k] * u[q][u_idx];
          }
          for (int k = 0; k < HiddenSize / Tu; ++k) {
            int u_idx = i * HiddenSize / Tu * (Tu - ZTu) + j * HiddenSize / Tu + k;
            u_acc[q + 4][i] += h[in_ptr][unz[q + 4][nz_idx] * HiddenSize / Tu + k] * u[q + 4][u_idx];
          }
        }
      }
      // Scale by the singular value s_i and scatter along the non-pruned
      // v-tiles; vnz maps compacted v-tile index -> dense output tile.
      for (int q = 0; q < 8; ++q) {
        xs_val[q] = s[q][i] * DataA(u_acc[q][i]);
        for (int j = 0; j < Tv - ZTv; ++j) {
          for (int k = 0; k < HiddenSize / Tv; ++k) {
            const int v_idx = i * HiddenSize / Tv * (Tv - ZTv) + j * HiddenSize / Tv + k;
            const int nz_idx = i * (Tv - ZTv) + j;
            acc_buffer[q][vnz[q][nz_idx]][k] += xs_val[q] * v[q][v_idx];
          }
        }
      }
    }
    // Stream the accumulated gate pre-activations to the non-linearity unit.
    for (int i = 0; i < 4; ++i) {
      for (int j = 0; j < Tv; ++j) {
        for (int k = 0; k < HiddenSize / Tv; ++k) {
          cur_out_fifo[i][j].write(acc_buffer[i][j][k]);
          rec_out_fifo[i][j].write(acc_buffer[i + 4][j][k]);
        }
      }
    }
    // NOTE(review): possible stripped template argument list on this call --
    // confirm against the original source.
    NonLinearityUnitSoftware(HiddenSize,
      Tv, 4, c[in_ptr], cur_out_fifo, rec_out_fifo, h[out_ptr], c[out_ptr],
      true, bias);
  }
  // Multi-timestep runs wrote into internal scratch; copy the final hidden
  // state out to the caller (the buffer written last depends on parity).
  if (NumTimesteps > 1) {
    std::memcpy(h_curr, h[(NumTimesteps - 1) % 2 == 0 ? 1 : 0], HiddenSize * sizeof(DataA));
  }
  for (int i = 0; i < 4; ++i) {
    delete[] cur_out_fifo[i];
    delete[] rec_out_fifo[i];
  }
  delete[] cur_out_fifo;
  delete[] rec_out_fifo;
  for (int i = 0; i < 8; ++i) {
    delete[] u_acc[i];
    for (int j = 0; j < Tv; ++j) {
      delete[] acc_buffer[i][j];
    }
    delete[] acc_buffer[i];
  }
  // Scratch h/c buffers exist only in the multi-timestep case; in the
  // single-timestep case they alias caller memory and must NOT be freed.
  if (NumTimesteps > 1) {
    for (int i = 0; i < 2; ++i) {
      delete[] h[i];
      delete[] c[i];
    }
  }
}

} // svd

#endif // LSTM_HLS_LSTM_SVD_EMULATOR_H_
--------------------------------------------------------------------------------
/include/layers/lstm/sw/soft_lstm.h:
--------------------------------------------------------------------------------
/******************************************************************************
 * Copyright (c) 2019 Stefano Ribes.
3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 15 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 16 | * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 17 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR 18 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 19 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 20 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 21 | * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 22 | * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 23 | * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 24 | * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | *****************************************************************************/ 27 | /****************************************************************************** 28 | * 29 | * 30 | * @file lstm_software.h 31 | * 32 | * @author Stefano Ribes 33 | * 34 | * Library of templated HLS functions for BNN deployment. 
 * This file lists a set of functions to access memory mapped values into
 * streams
 *
 *****************************************************************************/
#ifndef LAYERS_LSTM_SW_LSTM_SOFTWARE_H_
#define LAYERS_LSTM_SW_LSTM_SOFTWARE_H_

// Software (CPU) reference LSTM over a whole batch: num_samples sequences of
// num_timesteps steps each; use_blas selects a BLAS-backed implementation.
#ifdef __cplusplus
extern "C"
#endif
void Lstm(const bool use_blas,
          const float *x,
          const int num_samples,
          const int num_timesteps,
          const int input_size,
          const int output_size,
          const float *cur_i,
          const float *cur_f,
          const float *cur_c,
          const float *cur_o,
          const float *rec_i,
          const float *rec_f,
          const float *rec_c,
          const float *rec_o,
          const float *bias_i,
          const float *bias_f,
          const float *bias_c,
          const float *bias_o,
          float *out);

// Same reference LSTM, but processing one sample at a time (unbatched).
#ifdef __cplusplus
extern "C"
#endif
void LstmUnbatched(const bool use_blas,
                   const float *x,
                   const int num_samples,
                   const int num_timesteps,
                   const int input_size,
                   const int output_size,
                   const float *cur_i,
                   const float *cur_f,
                   const float *cur_c,
                   const float *cur_o,
                   const float *rec_i,
                   const float *rec_f,
                   const float *rec_c,
                   const float *rec_o,
                   const float *bias_i,
                   const float *bias_f,
                   const float *bias_c,
                   const float *bias_o,
                   float *out);

#endif // end LAYERS_LSTM_SW_LSTM_SOFTWARE_H_
--------------------------------------------------------------------------------
/include/math_utils/blas_utils.h:
--------------------------------------------------------------------------------
#ifndef MATH_BLAS_UTILS_H_
#define MATH_BLAS_UTILS_H_

// NOTE(review): the header names below were stripped in extraction; a CBLAS
// header is required for the CBLAS_TRANSPOSE declarations -- confirm against
// the original source.
#ifdef USE_BLAS
#include 
#include 
#endif

#include 
#include 

#ifdef USE_BLAS
// NOTE(review): the template headers in this file were stripped in
// extraction (presumably `template <typename Dtype>` for each declaration).

// Dense GEMM wrapper; the signature mirrors cblas_?gemm
// (C = alpha * op(A) * op(B) + beta * C) -- confirm in the .cpp.
template
void svd_cpu_gemm(const CBLAS_TRANSPOSE TransA,
    const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
    const Dtype alpha, const Dtype* A, const Dtype* B, const Dtype beta,
    Dtype* C);

// Dense GEMV wrapper; the signature mirrors cblas_?gemv
// (y = alpha * op(A) * x + beta * y) -- confirm in the .cpp.
template
void svd_cpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N,
    const Dtype alpha, const Dtype* A, const Dtype* x, const Dtype beta,
    Dtype* y);

// Four-gate GEMM: multiplies X against the I, F, C and O weight matrices,
// writing one output buffer per LSTM gate.
template
void svd_cpu_gemm_gates(const CBLAS_TRANSPOSE TransA,
    const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
    const Dtype* X, const Dtype* I, const Dtype* F, const Dtype* C,
    const Dtype* O, Dtype* Y_I, Dtype* Y_F, Dtype* Y_C, Dtype* Y_O);
#endif

// Y[i] = alpha for i in [0, N).
template
void svd_set(const int N, const Dtype alpha, Dtype* Y);

// Y = X (element-wise copy of N elements).
template
void svd_copy(const int N, const Dtype *X, Dtype *Y);

// X *= alpha (in place, N elements).
template
void svd_scal(const int N, const Dtype alpha, Dtype *X);

// y = a + b (element-wise, n elements).
template
void svd_add(const int n, const Dtype* a, const Dtype* b, Dtype* y);

// y = a * b (element-wise, n elements).
template
void svd_mul(const int n, const Dtype* a, const Dtype* b, Dtype* y);

/**
 * @brief Transpose a matrix x into y.
 *
 * @param[in]  n     The initial number of rows (BEFORE transpose).
 * @param[in]  m     The initial number of columns (BEFORE transpose).
 * @param[in]  x     The input matrix to transpose. Shape: (n, m)
 * @param      y     The output matrix transposed. Shape: (m, n)
 *
 * @tparam     Dtype The real data type.
 */
// NOTE(review): template header stripped in extraction (presumably
// template <typename Dtype>).
template
void svd_transpose(const int n, const int m, const Dtype* x, Dtype* y);

#endif // end MATH_BLAS_UTILS_H_
--------------------------------------------------------------------------------
/include/math_utils/data_handler.h:
--------------------------------------------------------------------------------
#ifndef MATH_UTILS_DATA_HANDLER_H_
#define MATH_UTILS_DATA_HANDLER_H_

#include "hls_utils/hls_metaprogramming.h"

#include "ap_int.h"

// NOTE(review): the names of the six standard headers below were stripped in
// extraction; the code in this file uses std::cout, std::vector,
// std::random_shuffle, std::is_same, std::memset/memcpy, assert and
// std::bad_alloc -- restore the matching headers from the original source.
#include 
#include 
#include 
#include 
#include 
#include 

#ifdef SDS_DESIGN
#include 
#include "sds_lib.h"
#else
#include 
#endif

// Contiguous-allocation shims: sds_alloc/sds_free provide physically
// contiguous buffers required by SDSoC DMA; plain malloc/free otherwise.
#ifndef ALLOC
# ifdef SDS_DESIGN
# define ALLOC(x) sds_alloc(x)
# else
# define ALLOC(x) malloc(x)
# endif
#endif

#ifndef FREE
# ifdef SDS_DESIGN
# define FREE(x) sds_free(x)
# else
# define FREE(x) free(x)
# endif
#endif

namespace svd {

// Allocates `size` elements of T contiguously via ALLOC. Host builds throw
// std::bad_alloc on failure; synthesis builds exit(1) instead (no exceptions
// in synthesized code).
// NOTE(review): template header stripped (presumably template <typename T>).
template
T* AllocateContiguously(const int size) {
#ifndef __SYNTHESIS__
  T* tmp;
  try {
    tmp = (T*)ALLOC(size * sizeof(T));
  } catch(...) {
    std::cout << "[ERROR] Exception occurred while contiguously allocating." << std::endl;
    throw;
  }
  if (!tmp) {
    std::cout << "[ERROR] Contiguous allocation failed." << std::endl;
    std::bad_alloc except_alloc;
    throw except_alloc;
  }
  return tmp;
#else
  T* tmp = (T*)ALLOC(size * sizeof(T));
  if (!tmp) {
    std::cout << "[ERROR] Contiguous allocation failed." << std::endl;
    exit(1);
  }
  return tmp;
#endif
}

// Frees a buffer obtained from AllocateContiguously and nulls the caller's
// pointer to prevent accidental reuse.
// NOTE(review): template header stripped (presumably template <typename T>).
template
void FreeContiguously(T** x) {
#ifdef SDS_DESIGN
  std::cout << "[INFO] Calling FreeContiguously sds_free()." << std::endl;
#else
  std::cout << "[INFO] Calling FreeContiguously free()." << std::endl;
#endif
  FREE(*x);
  *x = nullptr;
}

// Randomly-initialized test vector organized in tiles, with optional tile
// pruning: num_zero_tiles tiles per refinement step are zeroed out. Keeps
// parallel floating-point and fixed-point copies (full and pruned), plus the
// zero / non-zero tile index bookkeeping both as int lists and as packed
// ap_uint bitmaps.
// NOTE(review): the template header was stripped in extraction (presumably
// template <typename FloatType, typename FixType, int NumTiles>), as were the
// element types of the std::vector members and the ap_uint width of IdxType
// (presumably ap_uint<NumTiles>) -- restore them from the original source.
template
class VectorBlob {
 private:
  typedef ap_uint IdxType;
  int num_tile_elems_;       // elements per tile = vector_size / num_tiles
  int size_;                 // dense size of one refinement step
  int pruned_size_;          // size of one step after dropping zero tiles
  int total_size_;           // refinement_steps * size_
  int pruned_total_size_;    // refinement_steps * pruned_size_
  int refinement_steps_;
  int num_tiles_;
  int num_zero_tiles_;
  std::vector data_;             // dense float data (zero tiles included)
  std::vector pruned_data_;      // compacted float data (zero tiles omitted)
  std::vector fix_data_;         // dense fixed-point copy
  std::vector fix_pruned_data_;  // compacted fixed-point copy
  std::vector z_idx_;            // per-step indices of pruned tiles
  std::vector nz_idx_;           // per-step indices of kept tiles
  std::vector fix_z_idx_;        // per-step pruned-tile bitmaps
  std::vector fix_nz_idx_;       // per-step kept-tile bitmaps

 public:
  VectorBlob(const int refinement_steps, const int vector_size,
      const int num_tiles, const int num_zero_tiles) {
    assert(refinement_steps > 0);
    assert(vector_size > 0);
    assert(num_tiles > 0);
    assert(vector_size % num_tiles == 0);
    this->num_tile_elems_ = vector_size / num_tiles;
    this->size_ = vector_size;
    this->pruned_size_ = this->num_tile_elems_ * (num_tiles - num_zero_tiles);
    this->total_size_ = refinement_steps * this->size_;
    this->pruned_total_size_ = refinement_steps * this->pruned_size_;
    this->refinement_steps_ = refinement_steps;
    this->num_tiles_ = num_tiles;
    this->num_zero_tiles_ = num_zero_tiles;
    // With no pruning every tile is listed in both z_idx_ and nz_idx_; the
    // bitmaps start as all-ones in either case.
    for (int i = 0; i < refinement_steps; ++i) {
      this->fix_nz_idx_.push_back(~IdxType(0));
      this->fix_z_idx_.push_back(~IdxType(0));
      if (num_zero_tiles == 0) {
        for (int j = 0; j < num_tiles; ++j) {
          this->nz_idx_.push_back(j);
          this->z_idx_.push_back(j);
        }
      }
    }
    if (num_zero_tiles > 0) {
      for (int i = 0; i < refinement_steps; ++i) {
        // Per refinement step, pick num_zero_tiles random tiles to prune:
        // build a 0/1 mask and shuffle it.
        // NOTE(review): std::random_shuffle was removed in C++17; consider
        // std::shuffle when modernizing. Element type of rand_idx was
        // stripped in extraction (presumably std::vector<int>).
        std::vector rand_idx;
        for (int j = 0; j < num_tiles; ++j) {
          rand_idx.push_back(1);
        }
        for (int j = 0; j < num_zero_tiles; ++j) {
          rand_idx[j] = 0;
        }
        std::random_shuffle(rand_idx.begin(), rand_idx.end());
        // Set the bits
        for (int j = 0; j < num_tiles; ++j) {
          if (num_tiles == NumTiles) {
            this->fix_nz_idx_[i][j] = rand_idx[j];
            this->fix_z_idx_[i][j] = rand_idx[j] == 0 ? 1 : 0;
          }
          if (rand_idx[j] == 0) {
            // Pruned tile
            for (int k = 0; k < this->num_tile_elems_; ++k) {
              this->data_.push_back(0);
              this->fix_data_.push_back(0);
            }
            this->z_idx_.push_back(j);
          } else {
            // Non-pruned tile: integer types get raw rand(), real types get
            // small values so fixed-point conversion does not saturate.
            // NOTE(review): the std::is_same type arguments were stripped in
            // extraction -- restore the original integer-type list.
            for (int k = 0; k < this->num_tile_elems_; ++k) {
              FloatType tmp;
              if (std::is_same::value ||
                  std::is_same::value ||
                  std::is_same::value ||
                  std::is_same::value) {
                tmp = rand();
              } else {
                tmp = 0.00001 * rand();
              }
              this->data_.push_back(tmp);
              this->pruned_data_.push_back(tmp);
              this->fix_data_.push_back(FixType(tmp));
              this->fix_pruned_data_.push_back(FixType(tmp));
            }
            this->nz_idx_.push_back(j);
          }
        }
      }
    } else {
      // No pruning: fill every element; pruned and dense copies coincide.
      for (int i = 0; i < this->total_size_; ++i) {
        FloatType tmp;
        if (std::is_same::value ||
            std::is_same::value ||
            std::is_same::value ||
            std::is_same::value) {
          tmp = rand();
        } else {
          tmp = 0.00001 * rand();
        }
        this->data_.push_back(tmp);
        this->pruned_data_.push_back(tmp);
        this->fix_data_.push_back(FixType(tmp));
        this->fix_pruned_data_.push_back(FixType(tmp));
      }
    }
  }

  ~VectorBlob() {};

  FloatType* data() {
    return this->data_.data();
  }

  FloatType* pruned_data() {
    return this->pruned_data_.data();
  }

  FixType* fix_data() {
    return this->fix_data_.data();
  }

  FixType* fix_pruned_data() {
    return this->fix_pruned_data_.data();
  }

  int get_total_size() {
    return this->total_size_;
  }

  int get_pruned_total_size() {
    assert(this->pruned_total_size_ == this->fix_pruned_data_.size());
    return this->pruned_total_size_;
  }

  int get_size() {
    return this->size_;
  }

  int get_pruned_size() {
    return this->pruned_size_;
  }

  int* get_z_idx() {
    return this->z_idx_.data();
  }

  int get_z_idx(const int i) {
    return this->z_idx_.at(i);
  }

  /**
   * @brief Gets the nz index.
   *
   * @return The nz index. Shape: (R, NZ-Tiles)
   */
  int* get_nz_idx() {
    return this->nz_idx_.data();
  }

  int get_nz_idx(const int i) {
    return this->nz_idx_.at(i);
  }

  /**
   * @brief Gets the nz index.
   *
   * @param[in]  r     The refinement step
   * @param[in]  t     The non-zero tile index (range 0 to NumT - ZNumT)
   *
   * @return The nz index.
   */
  int get_nz_idx(const int r, const int t) {
    return this->nz_idx_.at(r * (this->num_tiles_ - this->num_zero_tiles_) + t);
  }

  IdxType* get_fix_z_idx() {
    return this->fix_z_idx_.data();
  }

  IdxType get_fix_z_idx(const int refinement_step) {
    return this->fix_z_idx_[refinement_step];
  }

  IdxType* get_fix_nz_idx() {
    return this->fix_nz_idx_.data();
  }

  IdxType get_fix_nz_idx(const int refinement_step) {
    return this->fix_nz_idx_.at(refinement_step);
  }

  int get_refinement_steps() {
    return this->refinement_steps_;
  }
};


// Bundles the three SVD factors of a weight decomposition: one shared u
// blob, one shared v blob, and one per-input s blob (scalars per refinement
// step).
// NOTE(review): the template header and all VectorBlob/std::vector template
// arguments were stripped in extraction -- restore from the original source.
// NOTE(review): this class owns raw u_/v_ pointers but declares no copy
// constructor/assignment (rule of three): copying an SvdComponents would
// double-delete. Confirm it is never copied, or add/delete the copy ops.
template
class SvdComponents {
 private:
  int num_inputs_;
  VectorBlob* u_;
  std::vector > s_;
  VectorBlob* v_;
 public:
  SvdComponents(const int num_inputs, const int refinement_steps,
      const int u_size, const int v_size, const int num_tiles_u,
      const int num_zero_tiles_u, const int num_tiles_v,
      const int num_zero_tiles_v) {
    assert(num_inputs > 0);
    this->num_inputs_ = num_inputs;
    this->u_ = new VectorBlob(refinement_steps,
      u_size, num_tiles_u, num_zero_tiles_u);
    this->v_ = new VectorBlob(refinement_steps,
      v_size, num_tiles_v, num_zero_tiles_v);
    // s is a degenerate blob: one scalar per refinement step, single tile,
    // no pruning.
    for (int i = 0; i < num_inputs; ++i) {
      this->s_.push_back(VectorBlob(refinement_steps, 1, 1, 0));
    }
  }

  ~SvdComponents() {
    delete this->u_;
    delete this->v_;
  }

  VectorBlob* get_u() {
    return this->u_;
  }

  VectorBlob* get_v() {
    return this->v_;
  }

  int get_u_size() {
    return this->u_->get_size();
  }

  int get_v_size() {
    return this->v_->get_size();
  }

  int get_u_pruned_size() {
    return this->u_->get_pruned_size();
  }

  int get_v_pruned_size() {
    return this->v_->get_pruned_size();
  }

  std::vector > get_s() {
    return this->s_;
  }

  VectorBlob get_s(const int i) {
    return this->s_[i];
  }

  int get_num_inputs() {
    return this->num_inputs_;
  }

  int get_refinement_steps() {
    return this->s_[0].get_refinement_steps();
  }
};

} // svd

#endif // MATH_UTILS_DATA_HANDLER_H_
--------------------------------------------------------------------------------
/include/svd_ip.h:
--------------------------------------------------------------------------------
#ifndef SVD_IP_H_
#define SVD_IP_H_

#include "svd_params.h"
#include "kernel/svd_kernel.h"

namespace svd {

// template
// inline void SvdIP(
//   const typename params::ActivationD x_port[params::N][params::I],
//   const typename params::UPortD u_port[params::R * params::PrunedSizeU],
//   const typename params::SPortD s_port[params::N][params::R],
//   const typename params::VPortD v_port[params::R * params::PrunedSizeV],
//   const typename params::UnzD nz_u_port[params::G * params::R],
//   const typename params::VnzD nz_v_port[params::G * params::R],
//   typename params::ActivationD
y_port[params::N][params::G][params::H]) { 18 | // #pragma HLS INLINE 19 | // #pragma HLS DATAFLOW 20 | // assert(params::I % params::Tu == 0); 21 | // assert(params::H % params::Tv == 0); 22 | // assert(params::Tu - params::ZTu > 0); 23 | // assert(params::Tv - params::ZTv > 0); 24 | // svd::SvdStreams streams; 25 | // svd::SvdBuffers buffers; 26 | // SvdInDMA(x_port, u_port, s_port, v_port, nz_u_port, nz_v_port, streams, buffers); 27 | // svd::SvdKernel(streams); 28 | // SvdOutDMA(streams, y_port); 29 | // } 30 | 31 | // void SvdIp2Inputs( 32 | // const typename svd_params::ActivationD x_port[svd_params::N][svd_params::I], 33 | // const typename svd_params::UPortD u_port[svd_params::R * svd_params::PrunedSizeU], 34 | // const typename svd_params::SPortD s_port[svd_params::N][svd_params::R], 35 | // const typename svd_params::VPortD v_port[svd_params::R * svd_params::PrunedSizeV], 36 | // const ap_uint nz_u_port[svd_params::G * svd_params::R], 37 | // const ap_uint nz_v_port[svd_params::G * svd_params::R], 38 | // typename svd_params::ActivationD y_port[svd_params::N][svd_params::G][svd_params::H]); 39 | 40 | } // svd 41 | 42 | #endif // end SVD_IP_H_ -------------------------------------------------------------------------------- /include/testbenches/test_dense_svd.h: -------------------------------------------------------------------------------- 1 | #ifndef TESTBENCHES_TEST_DENSE_SVD_H_ 2 | #define TESTBENCHES_TEST_DENSE_SVD_H_ 3 | 4 | #include "layers/dense/hls/dense_svd.h" 5 | 6 | #endif // end TESTBENCHES_TEST_DENSE_SVD_H_ -------------------------------------------------------------------------------- /include/testbenches/test_lstm_svd.h: -------------------------------------------------------------------------------- 1 | #ifndef TESTBENCHES_TEST_LSTM_SVD_H_ 2 | #define TESTBENCHES_TEST_LSTM_SVD_H_ 3 | 4 | #include "layers/lstm/hls/lstm_svd.h" 5 | 6 | #endif // end TESTBENCHES_TEST_LSTM_SVD_H_ 
-------------------------------------------------------------------------------- /include/testbenches/test_svd_kernel.h: -------------------------------------------------------------------------------- 1 | #ifndef TESTBENCHES_TEST_SVD_KERNEL_H_ 2 | #define TESTBENCHES_TEST_SVD_KERNEL_H_ 3 | 4 | #include "svd_params.h" 5 | #include "kernel/svd_kernel.h" 6 | 7 | #endif // end TESTBENCHES_TEST_SVD_KERNEL_H_ -------------------------------------------------------------------------------- /include/testbenches/test_u_kernel.h: -------------------------------------------------------------------------------- 1 | #ifndef TESTBENCHES_TEST_U_KERNEL_H_ 2 | #define TESTBENCHES_TEST_U_KERNEL_H_ 3 | 4 | #include "kernel/u_kernel.h" 5 | #include "hls_utils/hls_debugging.h" 6 | 7 | #endif // end TESTBENCHES_TEST_U_KERNEL_H_ -------------------------------------------------------------------------------- /include/testbenches/test_u_kernel_pruned.h: -------------------------------------------------------------------------------- 1 | #ifndef TESTBENCHES_TEST_U_KERNEL_PRUNED__H_ 2 | #define TESTBENCHES_TEST_U_KERNEL_PRUNED__H_ 3 | 4 | #include "kernel/u_kernel.h" 5 | #include "hls_utils/hls_debugging.h" 6 | 7 | #endif // end TESTBENCHES_TEST_U_KERNEL_PRUNED__H_ -------------------------------------------------------------------------------- /include/testbenches/test_v_kernel.h: -------------------------------------------------------------------------------- 1 | #ifndef TESTBENCHES_TEST_V_KERNEL_H_ 2 | #define TESTBENCHES_TEST_V_KERNEL_H_ 3 | 4 | #include "kernel/v_kernel.h" 5 | 6 | #endif // end TESTBENCHES_TEST_V_KERNEL_H_ -------------------------------------------------------------------------------- /include/testbenches/test_v_kernel_pruned.h: -------------------------------------------------------------------------------- 1 | #ifndef TESTBENCHES_TEST_V_KERNEL_PRUNED_H_ 2 | #define TESTBENCHES_TEST_V_KERNEL_PRUNED_H_ 3 | 4 | #include "kernel/v_kernel.h" 5 | 6 | #endif // end 
TESTBENCHES_TEST_V_KERNEL_PRUNED_H_ -------------------------------------------------------------------------------- /pynq/README.md: -------------------------------------------------------------------------------- 1 | # Notes on PYNQ Designs 2 | 3 | ## Vivado Project 4 | 5 | ### Xilinx DMA 6 | 7 | The DMA should be configured in the following way: 8 | 9 | * Max burst length to maximum 10 | * Register buffer width to maximum 11 | 12 | ### HP Ports 13 | 14 | All HP ports should be set to their maximum size width (64bit for the PYNQ-Z1 board and 128bit for the ZCU104) in order to avoid receiving data interleaved by zeroes. 15 | 16 | ## Jupyter Notebook 17 | 18 | ### Generating Randomly-filled Buffer 19 | 20 | ```python 21 | import numpy as np 22 | 23 | R, N, G = 64, 2, 4 24 | 25 | xus = np.random.randn(R, N, G).astype(dtype=np.int16) 26 | xus_buffer = pynq.allocate(shape=(R, N, G), dtype=np.int16) 27 | np.copyto(xus_buffer, xus, casting='no') 28 | ``` 29 | 30 | ### Storing and Loading Weights from bin file 31 | 32 | ```python 33 | import numpy as np 34 | 35 | R, N, G = 64, 2, 4 36 | 37 | tmp = np.random.randn(R, N, G).astype(dtype=np.int16) 38 | tmp.tofile('binfile_example.bin') 39 | 40 | def load_from_bin(binfile, shape, dtype): 41 | tmp_buffer = pynq.allocate(shape=shape, dtype=dtype) 42 | tmp = np.fromfile(binfile, dtype=dtype).reshape(tmp_buffer.shape) 43 | np.copyto(tmp_buffer, tmp, casting='no') 44 | return tmp_buffer 45 | 46 | xus_buffer = load_from_bin('binfile_example.bin', shape=(R, N, G), dtype=np.int16) 47 | ``` 48 | -------------------------------------------------------------------------------- /pynq/dense_svd/overlay/dense_svd.bit: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ribesstefano/Mapping-Multiple-LSTM-Models-on-FPGAs/7cdf5c7b5c832ceb7984030a522677bdd88ebc0d/pynq/dense_svd/overlay/dense_svd.bit --------------------------------------------------------------------------------
/pynq/kernel_svd/overlay/kernel_svd.bit: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ribesstefano/Mapping-Multiple-LSTM-Models-on-FPGAs/7cdf5c7b5c832ceb7984030a522677bdd88ebc0d/pynq/kernel_svd/overlay/kernel_svd.bit -------------------------------------------------------------------------------- /pynq/kernel_u/kernel_u_hier.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Testing an IP that adds 1 to a stream\n", 8 | "\n", 9 | "This notebook will test an IP written in Vivado HLS. The IP adds +1 to a buffer. The HP ports **must** be configured at 64bit, not 32bit." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 8, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "from pynq import Overlay\n", 19 | "import pynq.lib.dma\n", 20 | "from pynq import allocate\n", 21 | "import numpy as np\n", 22 | "from pynq import DefaultIP\n", 23 | "from pynq import DefaultHierarchy" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "We need to define our own class **before** instantiating the overlay. In this way it will be automatically bound. 
We can use an accelerator driver as follows:" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 9, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "class AdderDriver(DefaultIP):\n", 40 | " def __init__(self, description):\n", 41 | " super().__init__(description=description)\n", 42 | " bindto = [\"xilinx.com:hls:hls_adder:1.0\"]\n", 43 | "\n", 44 | " def start_accel(self):\n", 45 | " self.write(0x0, 1)\n", 46 | "\n", 47 | " def set_state(self, state):\n", 48 | " self.write(0x0, state)\n", 49 | " return self.read(0x0)\n", 50 | "\n", 51 | " def get_state(self):\n", 52 | " return self.read(0x0)\n", 53 | "\n", 54 | " @property\n", 55 | " def stream_size(self):\n", 56 | " return self.read(0x10)\n", 57 | "\n", 58 | " @stream_size.setter\n", 59 | " def stream_size(self, size):\n", 60 | " self.write(0x10, size)" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "But it comes more handy to use an Hierarchy class as follows:" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 10, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "class StreamAdderDriver(DefaultHierarchy):\n", 77 | " def __init__(self, description):\n", 78 | " super().__init__(description)\n", 79 | "\n", 80 | " def stream_add(self, stream):\n", 81 | " in_buffer = allocate(shape=(len(stream),), dtype=np.float32)\n", 82 | " out_buffer = allocate(shape=(len(stream),), dtype=np.float32)\n", 83 | " for i, elem in enumerate(stream):\n", 84 | " in_buffer[i] = elem\n", 85 | " # NOTE: for managing the HLS accelerator, we exploit\n", 86 | " # the driver that we defined above.\n", 87 | " self.hls_adder.stream_size = len(stream)\n", 88 | " self.hls_adder.start_accel() # NOTE: The start must be sent before setting the other arguments \n", 89 | " self.dma.sendchannel.transfer(in_buffer)\n", 90 | " self.dma.recvchannel.transfer(out_buffer)\n", 91 | " self.dma.sendchannel.wait()\n", 92 | " 
self.dma.recvchannel.wait()\n", 93 | " result = out_buffer.view(dtype=np.float32).copy()\n", 94 | " del in_buffer, out_buffer\n", 95 | " return result\n", 96 | "\n", 97 | " @staticmethod\n", 98 | " def checkhierarchy(description):\n", 99 | " \"\"\"\n", 100 | " A Hierarchy that meets these requirements will be\n", 101 | " automatically registered to this driver.\n", 102 | " \"\"\"\n", 103 | " if \"dma\" in description[\"ip\"] and \"hls_adder\" in description[\"ip\"]:\n", 104 | " return True\n", 105 | " return False" 106 | ] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "metadata": {}, 111 | "source": [ 112 | "Finally, we can instantiate the overlay, so that the drivers above will be automatically registered." 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 11, 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "overlay = Overlay(\"overlay/streamed_add_hier.bit\", download=False)\n", 122 | "# overlay.download()\n", 123 | "# overlay?" 124 | ] 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "metadata": {}, 129 | "source": [ 130 | "### Width of Buffer Length Register\n", 131 | "This integer value specifies the number of valid bits used for the Control field buffer length and Status field bytes transferred in the Scatter/Gather descriptors. It also specifies the number of valid bits in the RX Length of the Status Stream App4 field when Use Rxlength is enabled. For Direct Register Mode, it specifies the number of valid bits in the MM2S_LENGTH and S2MM_LENGTH registers. The length width directly correlates to the number of bytes being specified in a Scatter/Gather descriptor or number of bytes being specified in App4.RxLength, MM2S_LENGTH, or S2MM_LENGTH. The number of bytes is equal to 2^Length Width. So a Length Width of 26 gives a byte count of 67,108,863 bytes. This value should be set to 23 for Multichannel mode." 
132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 25, 137 | "metadata": {}, 138 | "outputs": [ 139 | { 140 | "name": "stdout", 141 | "output_type": "stream", 142 | "text": [ 143 | "[ 1.00000000e+00 4.14159298e+00 7.28318548e+00 ..., 3.20856616e+03\n", 144 | " 3.21170776e+03 3.21484937e+03]\n", 145 | "[ True True True ..., True True True]\n", 146 | "3.469756501941687e-05\n" 147 | ] 148 | } 149 | ], 150 | "source": [ 151 | "stream = [i * np.pi for i in range(1024)]\n", 152 | "# print(stream)\n", 153 | "out_stream = overlay.adder.stream_add(stream)\n", 154 | "print(out_stream)\n", 155 | "print(np.isclose(np.array(stream) + 1, out_stream))\n", 156 | "print(np.abs((np.array(stream) - (out_stream - 1))).mean())\n", 157 | "\n", 158 | "# # NOTE: The following is a neat way of printing the np.floats in HEX format. \n", 159 | "# for orig, f32, u32 in zip(np.array(stream, dtype=np.float32).view(dtype=np.uint32), out_stream, out_stream.view(dtype=np.uint32)):\n", 160 | "# print(\"{:x}\\t{:03.3}\\t{:x}\".format(orig, f32, u32))" 161 | ] 162 | }, 163 | { 164 | "cell_type": "markdown", 165 | "metadata": {}, 166 | "source": [] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "metadata": {}, 172 | "outputs": [], 173 | "source": [] 174 | } 175 | ], 176 | "metadata": { 177 | "kernelspec": { 178 | "display_name": "Python 3", 179 | "language": "python", 180 | "name": "python3" 181 | }, 182 | "language_info": { 183 | "codemirror_mode": { 184 | "name": "ipython", 185 | "version": 3 186 | }, 187 | "file_extension": ".py", 188 | "mimetype": "text/x-python", 189 | "name": "python", 190 | "nbconvert_exporter": "python", 191 | "pygments_lexer": "ipython3", 192 | "version": "3.6.5" 193 | } 194 | }, 195 | "nbformat": 4, 196 | "nbformat_minor": 2 197 | } 198 | -------------------------------------------------------------------------------- /pynq/kernel_u/overlay/kernel_u.bit: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ribesstefano/Mapping-Multiple-LSTM-Models-on-FPGAs/7cdf5c7b5c832ceb7984030a522677bdd88ebc0d/pynq/kernel_u/overlay/kernel_u.bit -------------------------------------------------------------------------------- /pynq/kernel_v/binfile_example.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ribesstefano/Mapping-Multiple-LSTM-Models-on-FPGAs/7cdf5c7b5c832ceb7984030a522677bdd88ebc0d/pynq/kernel_v/binfile_example.bin -------------------------------------------------------------------------------- /pynq/kernel_v/overlay/kernel_v.bit: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ribesstefano/Mapping-Multiple-LSTM-Models-on-FPGAs/7cdf5c7b5c832ceb7984030a522677bdd88ebc0d/pynq/kernel_v/overlay/kernel_v.bit -------------------------------------------------------------------------------- /python/README.md: -------------------------------------------------------------------------------- 1 | # SVD Approximation Algorithms and ML Models 2 | 3 | In this folder we include the SVD-based approximation algorithms and the Machine Learning (ML) models (mainly written in Keras). 
4 | 5 | ## SVD 6 | 7 | ## Models 8 | 9 | The current working models are: 10 | 11 | * LSTMs on Fashion-MNIST 12 | -------------------------------------------------------------------------------- /python/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ribesstefano/Mapping-Multiple-LSTM-Models-on-FPGAs/7cdf5c7b5c832ceb7984030a522677bdd88ebc0d/python/__init__.py -------------------------------------------------------------------------------- /python/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ribesstefano/Mapping-Multiple-LSTM-Models-on-FPGAs/7cdf5c7b5c832ceb7984030a522677bdd88ebc0d/python/models/__init__.py -------------------------------------------------------------------------------- /python/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy -------------------------------------------------------------------------------- /python/roofline/README.md: -------------------------------------------------------------------------------- 1 | # Roofline Model 2 | 3 | The maximum bandwidth can be obtained [here](https://www.xilinx.com/support/documentation/user_guides/ug585-Zynq-7000-TRM.pdf). 4 | 5 | A discussion on the peak floating point performance can be obtained [here](https://www.intel.com/content/dam/www/programmable/us/en/pdfs/literature/wp/wp-01222-understanding-peak-floating-point-performance-claims.pdf). 6 | 7 | ## Hardware Design 8 | 9 | ## Number of Iterations 10 | 11 | Increasing the number of iterations shouldn't significantly increase the resource usage. 12 | The amount of iterations may heavily impact the DMAs. 13 | 14 | ## Dot Product Unit 15 | 16 | It consumes most of the DSPs. 17 | 18 | ## Non Linear Unit 19 | 20 | This block is the most resource hungry, so parallelizing it might be expensive. 
21 | 22 | ## Weight Memory Layout 23 | 24 | The weight matrices of two LSTM layers are divided into 8 gates: 4 current and 4 recurrent. 25 | Each gate matrix is 'shared' by the two layers thanks to the modified SVD algorithm. 26 | Because of the algorithm, each gate matrix is stored in different vectors, i.e. `U`, `V`, `S1` and `S2`. 27 | Each of these vectors is, in reality, a *tensor*, meaning that the hardware accelerator assumes a certain tensor shape in order to properly fetch the data and improve throughput. 28 | 29 | The tensors' shapes are: 30 | 31 | * `V`: (I, G, T, E) 32 | * `U`: (I, G, T, E) 33 | * `S1` + `S2`: (I, S, G, E) 34 | 35 | with: 36 | 37 | * I: number of iterations or refinements 38 | * G: number of LSTM gates 39 | * T: number of **non zero** tiles 40 | * E: number of elements (the actual weight values) 41 | 42 | Note that `S1` and `S2` are stored 'contiguously' in the **same tensor**. 43 | 44 | ### Zero Tiles Combinations: Layout 45 | 46 | The accelerator reads a set of indexes indicating which tiles have been pruned. 47 | These combinations are stored in the following layout: 48 | 49 | * `VZ`: (I, G, T) 50 | * `UZ`: (I, G, T) 51 | 52 | Where T is a bit vector of width `NumTiles`, with coding: `1` non-pruned tile, `0` pruned tile. 53 | 54 | **Assumption**: both current and recurrent gates share the same number of tiles and number of zero tiles, despite having **different matrix dimensions**. 55 | 56 | ## Points to Discuss / TODOs 57 | 58 | ### Roofline Model 59 | 60 | * Which peak FLOP performance shall we use? 61 | * The total number of operations: does it refer to the original LSTM algorithm or the SVD-based one? 62 | * In Computational Roof calculation, how do I decide the slowest module? 63 | * In Computational Roof calculation, is MAC flop equal to MUL flop, i.e. 1? Or `MAC = ADD + MUL = 2` 64 | 65 | ### Hardware Accelerator 66 | 67 | * The current and recurrent gates have different matrix dimensions! 
68 | * The previous hidden state `c` can, in principle, be stored on the device for reuse across timesteps. 69 | * If the LSTM is not required to **return a sequence**, the output writes to memory can be avoided and a set of buffers can be used instead. 70 | * If the RTL/Cosimulation succeeds, then there is no need to run the hardware accelerator for testing the accuracy (for performance measurements instead, we obviously need the hardware) 71 | * Both the C and the RTL/Cosimulations are breaking when printing the `mean_error_value`. I believe it might be related to the `UpdateErrorMean()` function, which is modifying it. 72 | -------------------------------------------------------------------------------- /python/roofline/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ribesstefano/Mapping-Multiple-LSTM-Models-on-FPGAs/7cdf5c7b5c832ceb7984030a522677bdd88ebc0d/python/roofline/__init__.py -------------------------------------------------------------------------------- /python/svd/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ribesstefano/Mapping-Multiple-LSTM-Models-on-FPGAs/7cdf5c7b5c832ceb7984030a522677bdd88ebc0d/python/svd/__init__.py -------------------------------------------------------------------------------- /run_hls_test.tcl: -------------------------------------------------------------------------------- 1 | # 2 | # @brief Greps a file content and writes matches to a file. 
3 | # 4 | # @param re Regular expression 5 | # @param lines Number of lines to report/include after the found match 6 | # @param fin The fin pointer 7 | # @param fout The fout pointer 8 | # 9 | proc grep {re lines fin write_fout fout} { 10 | set cnt 0 11 | set match false 12 | seek $fin 0 13 | while {[gets $fin line] >= 0} { 14 | if [regexp -- $re $line] { 15 | set cnt 0 16 | set match true 17 | } 18 | if {$match && ($cnt < $lines)} { 19 | puts $line 20 | if {$write_fout} { 21 | puts $fout $line 22 | } 23 | set cnt [expr {$cnt +1}] 24 | } else { 25 | set match false 26 | } 27 | } 28 | } 29 | 30 | exec mkdir -p -- C:/Users/ste/phd/hls_projects/hls_svd//hls_prj/ 31 | exec mkdir -p -- C:/Users/ste/phd/hls_projects/hls_svd//hls_prj//reports 32 | cd C:/Users/ste/phd/hls_projects/hls_svd//hls_prj/ 33 | open_project -reset "vitis_ZedBoard_HlsKernelU" 34 | set_top "HlsKernelU" 35 | open_solution -flow_target vivado -reset "solution_HlsKernelU" 36 | set_part xc7z020clg484-1 ;# ZedBoard 37 | create_clock -period 10 -name default 38 | config_compile -name_max_length=12 -pipeline_style=frp -enable_auto_rewind=1 39 | config_schedule -effort=high -relax_ii_for_timing=0 40 | config_core DSP48 -latency 3 41 | # Synthesis files 42 | add_files C:/Users/ste/phd/hls_projects/hls_svd/include/kernel/u_kernel.h -cflags "-O3 -std=c++14 -IC:/Users/ste/phd/hls_projects/hls_svd/include -I/usr/local/include" 43 | add_files C:/Users/ste/phd/hls_projects/hls_svd/include/kernel/gemv_kernel.h -cflags "-O3 -std=c++14 -IC:/Users/ste/phd/hls_projects/hls_svd/include -I/usr/local/include" 44 | add_files C:/Users/ste/phd/hls_projects/hls_svd/include/hls_utils/adder_tree.h -cflags "-O3 -std=c++14 -IC:/Users/ste/phd/hls_projects/hls_svd/include -I/usr/local/include" 45 | add_files C:/Users/ste/phd/hls_projects/hls_svd/include/dma/svd_dma.h -cflags "-O3 -std=c++14 -IC:/Users/ste/phd/hls_projects/hls_svd/include -I/usr/local/include" 46 | add_files 
C:/Users/ste/phd/hls_projects/hls_svd/include/dma/axis_lib.h -cflags "-O3 -std=c++14 -IC:/Users/ste/phd/hls_projects/hls_svd/include -I/usr/local/include" 47 | add_files C:/Users/ste/phd/hls_projects/hls_svd/include/kernel/u_kernel.h -cflags "-O3 -std=c++14 -IC:/Users/ste/phd/hls_projects/hls_svd/include -I/usr/local/include" 48 | add_files C:/Users/ste/phd/hls_projects/hls_svd/src/kernel/u_kernel.cpp -cflags "-O3 -std=c++14 -IC:/Users/ste/phd/hls_projects/hls_svd/include -I/usr/local/include" 49 | add_files C:/Users/ste/phd/hls_projects/hls_svd/include/kernel/gemv_kernel.h -cflags "-O3 -std=c++14 -IC:/Users/ste/phd/hls_projects/hls_svd/include -I/usr/local/include" 50 | add_files C:/Users/ste/phd/hls_projects/hls_svd/src/kernel/gemv_kernel.cpp -cflags "-O3 -std=c++14 -IC:/Users/ste/phd/hls_projects/hls_svd/include -I/usr/local/include" 51 | add_files C:/Users/ste/phd/hls_projects/hls_svd/include/hls_utils/adder_tree.h -cflags "-O3 -std=c++14 -IC:/Users/ste/phd/hls_projects/hls_svd/include -I/usr/local/include" 52 | add_files C:/Users/ste/phd/hls_projects/hls_svd/src/hls_utils/adder_tree.cpp -cflags "-O3 -std=c++14 -IC:/Users/ste/phd/hls_projects/hls_svd/include -I/usr/local/include" 53 | add_files C:/Users/ste/phd/hls_projects/hls_svd/include/dma/svd_dma.h -cflags "-O3 -std=c++14 -IC:/Users/ste/phd/hls_projects/hls_svd/include -I/usr/local/include" 54 | add_files C:/Users/ste/phd/hls_projects/hls_svd/src/dma/svd_dma.cpp -cflags "-O3 -std=c++14 -IC:/Users/ste/phd/hls_projects/hls_svd/include -I/usr/local/include" 55 | add_files C:/Users/ste/phd/hls_projects/hls_svd/include/dma/axis_lib.h -cflags "-O3 -std=c++14 -IC:/Users/ste/phd/hls_projects/hls_svd/include -I/usr/local/include" 56 | add_files C:/Users/ste/phd/hls_projects/hls_svd/src/dma/axis_lib.cpp -cflags "-O3 -std=c++14 -IC:/Users/ste/phd/hls_projects/hls_svd/include -I/usr/local/include" 57 | add_files C:/Users/ste/phd/hls_projects/hls_svd/include/svd_params.h -cflags "-O3 -std=c++14 
-IC:/Users/ste/phd/hls_projects/hls_svd/include -I/usr/local/include" 58 | add_files C:/Users/ste/phd/hls_projects/hls_svd/src/svd_params.cpp -cflags "-O3 -std=c++14 -IC:/Users/ste/phd/hls_projects/hls_svd/include -I/usr/local/include" 59 | add_files C:/Users/ste/phd/hls_projects/hls_svd/include/hls_utils/hls_metaprogramming.h -cflags "-O3 -std=c++14 -IC:/Users/ste/phd/hls_projects/hls_svd/include -I/usr/local/include" 60 | add_files C:/Users/ste/phd/hls_projects/hls_svd/include/hls_utils/hls_debugging.h -cflags "-O3 -std=c++14 -IC:/Users/ste/phd/hls_projects/hls_svd/include -I/usr/local/include" 61 | add_files C:/Users/ste/phd/hls_projects/hls_svd/include/hls_utils/priority_encoder.h -cflags "-O3 -std=c++14 -IC:/Users/ste/phd/hls_projects/hls_svd/include -I/usr/local/include" 62 | add_files C:/Users/ste/phd/hls_projects/hls_svd/include/dma/width_converter.h -cflags "-O3 -std=c++14 -IC:/Users/ste/phd/hls_projects/hls_svd/include -I/usr/local/include" 63 | add_files C:/Users/ste/phd/hls_projects/hls_svd/src/dma/width_converter.cpp -cflags "-O3 -std=c++14 -IC:/Users/ste/phd/hls_projects/hls_svd/include -I/usr/local/include" 64 | add_files C:/Users/ste/phd/hls_projects/hls_svd/include/hls_utils/hw_timer.h -cflags "-O3 -std=c++14 -IC:/Users/ste/phd/hls_projects/hls_svd/include -I/usr/local/include" 65 | add_files C:/Users/ste/phd/hls_projects/hls_svd/src/hls_utils/hw_timer.cpp -cflags "-O3 -std=c++14 -IC:/Users/ste/phd/hls_projects/hls_svd/include -I/usr/local/include" 66 | add_files C:/Users/ste/phd/hls_projects/hls_svd/include/kernel/u_kernel.h -cflags "-O3 -std=c++14 -IC:/Users/ste/phd/hls_projects/hls_svd/include -I/usr/local/include" 67 | add_files C:/Users/ste/phd/hls_projects/hls_svd/include/kernel/gemv_kernel.h -cflags "-O3 -std=c++14 -IC:/Users/ste/phd/hls_projects/hls_svd/include -I/usr/local/include" 68 | add_files C:/Users/ste/phd/hls_projects/hls_svd/include/hls_utils/adder_tree.h -cflags "-O3 -std=c++14 -IC:/Users/ste/phd/hls_projects/hls_svd/include 
-I/usr/local/include" 69 | add_files C:/Users/ste/phd/hls_projects/hls_svd/include/dma/svd_dma.h -cflags "-O3 -std=c++14 -IC:/Users/ste/phd/hls_projects/hls_svd/include -I/usr/local/include" 70 | add_files C:/Users/ste/phd/hls_projects/hls_svd/include/dma/axis_lib.h -cflags "-O3 -std=c++14 -IC:/Users/ste/phd/hls_projects/hls_svd/include -I/usr/local/include" 71 | add_files C:/Users/ste/phd/hls_projects/hls_svd/include/svd_params.h -cflags "-O3 -std=c++14 -IC:/Users/ste/phd/hls_projects/hls_svd/include -I/usr/local/include" 72 | add_files C:/Users/ste/phd/hls_projects/hls_svd/include/hls_utils/hls_metaprogramming.h -cflags "-O3 -std=c++14 -IC:/Users/ste/phd/hls_projects/hls_svd/include -I/usr/local/include" 73 | add_files C:/Users/ste/phd/hls_projects/hls_svd/include/hls_utils/hls_debugging.h -cflags "-O3 -std=c++14 -IC:/Users/ste/phd/hls_projects/hls_svd/include -I/usr/local/include" 74 | add_files C:/Users/ste/phd/hls_projects/hls_svd/include/hls_utils/priority_encoder.h -cflags "-O3 -std=c++14 -IC:/Users/ste/phd/hls_projects/hls_svd/include -I/usr/local/include" 75 | add_files C:/Users/ste/phd/hls_projects/hls_svd/include/dma/width_converter.h -cflags "-O3 -std=c++14 -IC:/Users/ste/phd/hls_projects/hls_svd/include -I/usr/local/include" 76 | add_files C:/Users/ste/phd/hls_projects/hls_svd/include/hls_utils/hw_timer.h -cflags "-O3 -std=c++14 -IC:/Users/ste/phd/hls_projects/hls_svd/include -I/usr/local/include" 77 | csynth_design 78 | puts "================================================================" 79 | puts "\[INFO\] Reporting information" 80 | puts "================================================================" 81 | set fin [open C:/Users/ste/phd/hls_projects/hls_svd//hls_prj/vitis_ZedBoard_HlsKernelU/solution_HlsKernelU/syn/report/HlsKernelU_csynth.rpt r] 82 | set fout [open C:/Users/ste/phd/hls_projects/hls_svd//hls_prj//reports/vitis_ZedBoard_HlsKernelU.rpt a] 83 | grep "== Performance Estimates" 18 $fin 0 $fout 84 | grep "== Utilization Estimates" 20 
$fin 0 $fout 85 | close $fin 86 | close $fout 87 | puts "================================================================" 88 | puts "\[INFO\] Closing project: vitis_ZedBoard_HlsKernelU" 89 | puts "================================================================" 90 | exit 91 | cd C:/Users/ste/phd/hls_projects/hls_svd/ 92 | -------------------------------------------------------------------------------- /src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.10) 2 | add_subdirectory(dma) 3 | add_subdirectory(kernel) 4 | add_subdirectory(math_utils) 5 | add_subdirectory(hls_utils) 6 | add_subdirectory(testbenches) 7 | # add_subdirectory(lstm) 8 | # add_subdirectory(dense) 9 | add_subdirectory(layers) 10 | 11 | # NOTE: Each library/object will have an identifier and that identifier will then 12 | # be used to link the final executable, i.e. target_link_libraries(ProjectName LibraryName) 13 | 14 | add_library(SVD_PARAMS STATIC ${CMAKE_SOURCE_DIR}/src/svd_ip.cpp) 15 | target_include_directories(SVD_PARAMS PUBLIC ${CMAKE_SOURCE_DIR}/include) 16 | target_include_directories(SVD_PARAMS PUBLIC ${HLS_INCLUDE_DIRS}) 17 | target_include_directories(SVD_PARAMS PUBLIC ${OpenCv_INCLUDE_DIRS}) 18 | target_link_libraries(SVD_PARAMS ${OpenCv_LIBS}) 19 | target_compile_options(SVD_PARAMS PRIVATE -fno-builtin) 20 | 21 | add_library(SVD_IP STATIC ${CMAKE_SOURCE_DIR}/src/svd_ip.cpp) 22 | target_include_directories(SVD_IP PUBLIC ${CMAKE_SOURCE_DIR}/include) 23 | target_include_directories(SVD_IP PUBLIC ${HLS_INCLUDE_DIRS}) 24 | target_include_directories(SVD_IP PUBLIC ${OpenCv_INCLUDE_DIRS}) 25 | target_link_libraries(SVD_IP ${OpenCv_LIBS}) 26 | target_compile_options(SVD_IP PRIVATE -fno-builtin) 27 | target_link_libraries(SVD_IP SVD_KERNEL) 28 | 29 | add_library(SVD STATIC ${CMAKE_SOURCE_DIR}/src/svd.cpp) 30 | target_include_directories(SVD PUBLIC ${CMAKE_SOURCE_DIR}/include) 31 | 
target_include_directories(SVD PUBLIC ${HLS_INCLUDE_DIRS}) 32 | target_include_directories(SVD PUBLIC ${OpenCv_INCLUDE_DIRS}) 33 | target_compile_options(SVD PRIVATE -fno-builtin) 34 | target_link_libraries(SVD ${OpenCv_LIBS}) 35 | target_link_libraries(SVD SVD_IP) 36 | target_link_libraries(SVD LSTM_SVD) 37 | target_link_libraries(SVD LSTM_SVD_EMULATOR) 38 | target_link_libraries(SVD SOFT_LSTM_SVD) 39 | -------------------------------------------------------------------------------- /src/dma/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.10) 2 | 3 | add_library(SVD_DMA STATIC ${CMAKE_SOURCE_DIR}/src/dma/svd_dma.cpp) 4 | target_include_directories(SVD_DMA PUBLIC ${CMAKE_SOURCE_DIR}/include) 5 | target_include_directories(SVD_DMA PUBLIC ${HLS_INCLUDE_DIRS}) 6 | target_compile_options(SVD_DMA PRIVATE -fno-builtin) 7 | target_link_libraries(SVD_DMA SVD_PARAMS) 8 | 9 | add_library(WIDTH_CONVERTER STATIC ${CMAKE_SOURCE_DIR}/src/dma/width_converter.cpp) 10 | target_include_directories(WIDTH_CONVERTER PUBLIC ${CMAKE_SOURCE_DIR}/include) 11 | target_include_directories(WIDTH_CONVERTER PUBLIC ${HLS_INCLUDE_DIRS}) 12 | target_compile_options(WIDTH_CONVERTER PRIVATE -fno-builtin) 13 | target_link_libraries(WIDTH_CONVERTER SVD_PARAMS) 14 | 15 | add_library(AXIS_LIB STATIC ${CMAKE_SOURCE_DIR}/src/dma/axis_lib.cpp) 16 | target_include_directories(AXIS_LIB PUBLIC ${CMAKE_SOURCE_DIR}/include) 17 | target_include_directories(AXIS_LIB PUBLIC ${HLS_INCLUDE_DIRS}) 18 | target_compile_options(AXIS_LIB PRIVATE -fno-builtin) -------------------------------------------------------------------------------- /src/dma/axis_lib.cpp: -------------------------------------------------------------------------------- 1 | #include "dma/axis_lib.h" -------------------------------------------------------------------------------- /src/dma/svd_dma.cpp: 
-------------------------------------------------------------------------------- 1 | #include "dma/svd_dma.h" -------------------------------------------------------------------------------- /src/dma/width_converter.cpp: -------------------------------------------------------------------------------- 1 | #include "dma/width_converter.h" -------------------------------------------------------------------------------- /src/hls_utils/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.10) 2 | 3 | add_library(DOT_PROD_DSP STATIC ${CMAKE_SOURCE_DIR}/src/hls_utils/dot_prod_dsp.cpp) 4 | target_include_directories(DOT_PROD_DSP PUBLIC ${CMAKE_SOURCE_DIR}/include) 5 | target_include_directories(DOT_PROD_DSP PUBLIC ${HLS_INCLUDE_DIRS}) 6 | target_include_directories(DOT_PROD_DSP PUBLIC ${OpenCv_INCLUDE_DIRS}) 7 | target_compile_options(DOT_PROD_DSP PRIVATE -fno-builtin) 8 | # target_link_libraries(DOT_PROD_DSP ${OpenCv_LIBS}) 9 | 10 | add_library(HW_TIMER STATIC ${CMAKE_SOURCE_DIR}/src/hls_utils/hw_timer.cpp) 11 | target_include_directories(HW_TIMER PUBLIC ${CMAKE_SOURCE_DIR}/include) 12 | target_include_directories(HW_TIMER PUBLIC ${HLS_INCLUDE_DIRS}) 13 | target_include_directories(HW_TIMER PUBLIC ${OpenCv_INCLUDE_DIRS}) 14 | target_compile_options(HW_TIMER PRIVATE -fno-builtin) 15 | 16 | add_library(ADDER_TREE STATIC ${CMAKE_SOURCE_DIR}/src/hls_utils/adder_tree.cpp) 17 | target_include_directories(ADDER_TREE PUBLIC ${CMAKE_SOURCE_DIR}/include) 18 | target_include_directories(ADDER_TREE PUBLIC ${HLS_INCLUDE_DIRS}) 19 | target_include_directories(ADDER_TREE PUBLIC ${OpenCv_INCLUDE_DIRS}) 20 | target_compile_options(ADDER_TREE PRIVATE -fno-builtin) -------------------------------------------------------------------------------- /src/hls_utils/adder_tree.cpp: -------------------------------------------------------------------------------- 1 | #include "hls_utils/adder_tree.h" 
-------------------------------------------------------------------------------- /src/hls_utils/dot_prod_dsp.cpp: -------------------------------------------------------------------------------- 1 | #include "hls_utils/dot_prod_dsp.h" -------------------------------------------------------------------------------- /src/hls_utils/hw_timer.cpp: -------------------------------------------------------------------------------- 1 | #include "hls_utils/hw_timer.h" -------------------------------------------------------------------------------- /src/kernel/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.10) 2 | 3 | add_library(U_KERNEL STATIC ${CMAKE_SOURCE_DIR}/src/kernel/u_kernel.cpp) 4 | target_include_directories(U_KERNEL PUBLIC ${CMAKE_SOURCE_DIR}/include) 5 | target_include_directories(U_KERNEL PUBLIC ${CMAKE_SOURCE_DIR}/include/kernel) 6 | target_include_directories(U_KERNEL PUBLIC ${HLS_INCLUDE_DIRS}) 7 | target_compile_options(U_KERNEL PRIVATE -fno-builtin) 8 | 9 | add_library(S_KERNEL STATIC ${CMAKE_SOURCE_DIR}/src/kernel/s_kernel.cpp) 10 | target_include_directories(S_KERNEL PUBLIC ${CMAKE_SOURCE_DIR}/include) 11 | target_include_directories(S_KERNEL PUBLIC ${CMAKE_SOURCE_DIR}/include/kernel) 12 | target_include_directories(S_KERNEL PUBLIC ${HLS_INCLUDE_DIRS}) 13 | target_compile_options(S_KERNEL PRIVATE -fno-builtin) 14 | 15 | add_library(V_KERNEL STATIC ${CMAKE_SOURCE_DIR}/src/kernel/v_kernel.cpp) 16 | target_include_directories(V_KERNEL PUBLIC ${CMAKE_SOURCE_DIR}/include) 17 | target_include_directories(V_KERNEL PUBLIC ${CMAKE_SOURCE_DIR}/include/kernel) 18 | target_include_directories(V_KERNEL PUBLIC ${HLS_INCLUDE_DIRS}) 19 | target_compile_options(V_KERNEL PRIVATE -fno-builtin) 20 | 21 | add_library(SVD_KERNEL STATIC ${CMAKE_SOURCE_DIR}/src/kernel/svd_kernel.cpp) 22 | target_include_directories(SVD_KERNEL PUBLIC ${CMAKE_SOURCE_DIR}/include) 23 | 
target_include_directories(SVD_KERNEL PUBLIC ${CMAKE_SOURCE_DIR}/include/kernel) 24 | target_compile_options(SVD_KERNEL PRIVATE -fno-builtin) 25 | target_link_libraries(SVD_KERNEL U_KERNEL) 26 | target_link_libraries(SVD_KERNEL S_KERNEL) 27 | target_link_libraries(SVD_KERNEL V_KERNEL) 28 | 29 | 30 | add_library(GEMV_KERNEL STATIC ${CMAKE_SOURCE_DIR}/src/kernel/gemv_kernel.cpp) 31 | target_include_directories(GEMV_KERNEL PUBLIC ${CMAKE_SOURCE_DIR}/include) 32 | target_include_directories(GEMV_KERNEL PUBLIC ${CMAKE_SOURCE_DIR}/include/kernel) 33 | target_compile_options(GEMV_KERNEL PRIVATE -fno-builtin) 34 | target_link_libraries(GEMV_KERNEL U_KERNEL) 35 | target_link_libraries(GEMV_KERNEL S_KERNEL) 36 | target_link_libraries(GEMV_KERNEL V_KERNEL) 37 | 38 | # add_library(DQNET STATIC ${CMAKE_SOURCE_DIR}/src/kernel/kernel.cpp) 39 | # target_include_directories(DQNET PUBLIC ${CMAKE_SOURCE_DIR}/include) 40 | # target_include_directories(DQNET PUBLIC ${CMAKE_SOURCE_DIR}/include/kernel) 41 | # target_link_libraries(DQNET CONV_LAYER) 42 | # target_link_libraries(DQNET DENSE_LAYER) 43 | -------------------------------------------------------------------------------- /src/kernel/README.md: -------------------------------------------------------------------------------- 1 | # Kernels 2 | 3 | ## U-Kernel 4 | 5 | ### HlsAxisKernelU 6 | 7 | To be used with external DMAs. 8 | ```c++ 9 | void HlsAxisKernelU(const int num_refinements, 10 | hls::stream& x_port, 11 | hls::stream& u_port, 12 | hls::stream& xu_port); 13 | ``` 14 | 15 | ### HlsManySamplingsKernelU 16 | 17 | Compared to the previous implementation, this kernel has a different number of refinements per input. The refinements and inputs must be **ordered**. Meaning that input at index zero has the lowest amount of refinements to process. 
18 | 19 | ```c++ 20 | void HlsManySamplingsKernelU(const hls::vector num_refinements, 21 | hls::stream& x_port, 22 | hls::stream& u_port, 23 | hls::stream& xu_port); 24 | ``` 25 | 26 | ### HlsKernelU 27 | 28 | Flexible Kernel-U. 29 | 30 | ```c++ 31 | void HlsKernelU(const int num_active_inputs, 32 | const int input_size, 33 | const hls::vector num_refinements, 34 | const bool pad_output, 35 | hls::stream& x_port, 36 | hls::stream& u_port, 37 | hls::stream& xu_port); 38 | ``` -------------------------------------------------------------------------------- /src/kernel/gemv_kernel.cpp: -------------------------------------------------------------------------------- 1 | #include "kernel/gemv_kernel.h" 2 | 3 | #ifdef __VITIS_HLS__ 4 | 5 | void HlsGemvKernel(const int num_rows, const int num_cols, 6 | hls::stream >& x1_port, 7 | hls::stream >& x2_port, 8 | hls::stream >& w1_port, 9 | hls::stream >& w2_port, 10 | hls::stream& y1_port, 11 | hls::stream& y2_port) { 12 | #pragma HLS INTERFACE s_axilite port=return bundle=ctrl 13 | #pragma HLS INTERFACE s_axilite port=num_cols bundle=ctrl 14 | #pragma HLS INTERFACE s_axilite port=num_rows bundle=ctrl 15 | #pragma HLS DATAFLOW 16 | 17 | hls::stream > x_streams[testgemv::N]; 18 | hls::stream > w_streams[testgemv::N]; 19 | hls::stream y_streams[testgemv::N]; 20 | #pragma HLS ARRAY_PARTITION variable=x_streams complete 21 | #pragma HLS ARRAY_PARTITION variable=w_streams complete 22 | #pragma HLS ARRAY_PARTITION variable=y_streams complete 23 | 24 | 25 | const int kNumTiles = num_rows / testgemv::T; 26 | 27 | DMA_in: 28 | for (int i = 0; i < kNumTiles; ++i) { 29 | for (int j = 0; j < num_cols; ++j) { 30 | #pragma HLS PIPELINE II=1 31 | x_streams[0] << x1_port.read(); 32 | x_streams[1] << x2_port.read(); 33 | w_streams[0] << w1_port.read(); 34 | w_streams[1] << w2_port.read(); 35 | } 36 | } 37 | 38 | svd::GemvKernel(num_rows, num_cols, 39 | x_streams, w_streams, y_streams); 40 | 41 | DMA_out: 42 | for (int i = 0; i < num_cols; ++i) {
43 | #pragma HLS PIPELINE II=1 44 | y1_port.write(y_streams[0].read()); 45 | y2_port.write(y_streams[1].read()); 46 | } 47 | } 48 | 49 | #endif 50 | -------------------------------------------------------------------------------- /src/kernel/s_kernel.cpp: -------------------------------------------------------------------------------- 1 | #include "kernel/s_kernel.h" 2 | #include "dma/axis_lib.h" 3 | 4 | #include "hls_stream.h" 5 | #ifdef __VITIS_HLS__ 6 | #include "hls_vector.h" 7 | #endif 8 | 9 | #ifdef __VITIS_HLS__ 10 | void HlsKernelS(const int num_active_inputs, 11 | const int num_refinements[tests::params::N], 12 | // const hls::vector num_refinements, 13 | hls::stream& xu_port, 14 | hls::stream& s_port, 15 | hls::stream& xus_port) { 16 | #pragma HLS INTERFACE axis port=xu_port 17 | #pragma HLS INTERFACE axis port=s_port 18 | #pragma HLS INTERFACE axis port=xus_port 19 | #pragma HLS INTERFACE s_axilite port=return 20 | #pragma HLS INTERFACE s_axilite port=num_active_inputs 21 | #pragma HLS INTERFACE s_axilite port=num_refinements 22 | svd::KernelS(num_active_inputs, num_refinements, xu_port, 23 | s_port, xus_port); 24 | } 25 | #endif 26 | 27 | namespace svd { 28 | 29 | } // svd -------------------------------------------------------------------------------- /src/kernel/svd_kernel.cpp: -------------------------------------------------------------------------------- 1 | #include "kernel/svd_kernel.h" 2 | 3 | void HlsSvdKernel(const int num_active_inputs, 4 | const int input_size, 5 | const int output_size, 6 | const int num_refinements[svd::svd_params::N], 7 | hls::stream& x_port, 8 | hls::stream& u_port, 9 | hls::stream& s_port, 10 | hls::stream& v_port, 11 | hls::stream& y_port) { 12 | #pragma HLS INTERFACE axis port=x_port 13 | #pragma HLS INTERFACE axis port=u_port 14 | #pragma HLS INTERFACE axis port=s_port 15 | #pragma HLS INTERFACE axis port=v_port 16 | #pragma HLS INTERFACE axis port=y_port 17 | #pragma HLS INTERFACE s_axilite port=return 18 | #pragma 
HLS INTERFACE s_axilite port=num_active_inputs 19 | #pragma HLS INTERFACE s_axilite port=input_size 20 | #pragma HLS INTERFACE s_axilite port=output_size 21 | #pragma HLS INTERFACE s_axilite port=num_refinements 22 | #pragma HLS DATAFLOW 23 | svd::SvdKernel(num_active_inputs, input_size, output_size, 24 | num_refinements, x_port, u_port, s_port, v_port, y_port); 25 | } 26 | 27 | void HlsSvdKernelFixed( 28 | hls::stream& x_port, 29 | hls::stream& u_port, 30 | hls::stream& s_port, 31 | hls::stream& v_port, 32 | hls::stream& y_port) { 33 | #pragma HLS INTERFACE axis port=x_port 34 | #pragma HLS INTERFACE axis port=u_port 35 | #pragma HLS INTERFACE axis port=s_port 36 | #pragma HLS INTERFACE axis port=v_port 37 | #pragma HLS INTERFACE axis port=y_port 38 | #pragma HLS INTERFACE s_axilite port=return 39 | #pragma HLS DATAFLOW 40 | const int kNumActiveInputs = svd::svd_params::N; 41 | const int kInputSize = svd::svd_params::I; 42 | const int kOutputSize = svd::svd_params::H; 43 | const int kNumRefinements[svd::svd_params::N] = {svd::svd_params::R}; 44 | svd::SvdKernel(kNumActiveInputs, kInputSize, kOutputSize, 45 | kNumRefinements, x_port, u_port, s_port, v_port, y_port); 46 | } -------------------------------------------------------------------------------- /src/kernel/u_kernel.cpp: -------------------------------------------------------------------------------- 1 | #include "kernel/u_kernel.h" 2 | #include "kernel/gemv_kernel.h" 3 | #include "hls_utils/adder_tree.h" 4 | #include "dma/svd_dma.h" 5 | #include "dma/axis_lib.h" 6 | 7 | #include "assert.h" 8 | #include "ap_axi_sdata.h" 9 | #include "hls_stream.h" 10 | #ifdef __VITIS_HLS__ 11 | #include "hls_vector.h" 12 | #endif 13 | 14 | #ifndef __VITIS_HLS__ 15 | /** 16 | * @brief Synthesizeable Kernel-U. 17 | * @deprecated Compile time parametrization only. 
18 | * 19 | * @param[in] num_refinements The number refinements 20 | * @param[in] x_port The x port 21 | * @param[in] u_port The u port 22 | * @param xu_port The xu port 23 | */ 24 | void HlsKernelU(const int num_refinements, 25 | const typename testu::params::ActivationD x_port[testu::params::N][testu::params::I], 26 | const typename testu::params::UPortD u_port[testu::params::R * testu::params::PrunedSizeU], 27 | typename testu::params::ActivationD xu_port[testu::params::N][testu::params::G * testu::params::R]) { 28 | #pragma HLS INTERFACE s_axilite port=return bundle=ctrl 29 | #pragma HLS INTERFACE s_axilite port=num_refinements bundle=ctrl 30 | #pragma HLS INTERFACE m_axi port=x_port offset=slave depth=testu::params::I 31 | #pragma HLS INTERFACE m_axi port=u_port offset=slave depth=testu::params::R*testu::params::PrunedSizeU 32 | #pragma HLS INTERFACE m_axi port=xu_port offset=slave depth=testu::params::R 33 | #pragma HLS DATAFLOW 34 | svd::SvdStreams streams; 35 | svd::SvdBuffers buffers; 36 | svd::InputDMA(num_refinements, x_port, streams, buffers); 37 | svd::StreamSplitter(num_refinements * testu::params::G * testu::params::PrunedSizeU, u_port, streams.u_dma); 38 | U_Dispatcher: 39 | for (int i = 0; i < num_refinements; ++i) { 40 | for (int j = 0; j < testu::params::PeU; ++j) { 41 | for (int k = 0; k < testu::params::PrunedSizeU / testu::params::PeU; ++k) { 42 | #pragma HLS PIPELINE II=1 43 | #pragma HLS LOOP_FLATTEN 44 | for (int g = 0; g < testu::params::G; ++g) { 45 | streams.u[g][j].write(streams.u_dma[g].read()); 46 | } 47 | } 48 | } 49 | } 50 | svd::KernelU(num_refinements, streams); 51 | for (int i = 0; i < num_refinements; ++i) { 52 | for (int j = 0; j < testu::params::N; ++j) { 53 | #pragma HLS PIPELINE II=1 54 | for (int k = 0; k < testu::params::G; ++k) { 55 | auto tmp = hlsutils::adder_tree(streams.xu[j][k]); 56 | xu_port[j][k * num_refinements + i] = tmp; 57 | } 58 | } 59 | } 60 | } 61 | #else 62 | /** 63 | * @brief Synthesizeable flexible 
Kernel-U. 64 | * 65 | * @param[in] num_active_inputs The number of active inputs 66 | * @param[in] input_size The input size 67 | * @param[in] num_refinements The number of refinements steps (R) per input: 68 | * the Rs must be positive, greater than zero and 69 | * in ASCENDING ORDER. Their amount must be less 70 | * or equal to num_active_inputs. 71 | * @param[in] pad_output Wether to pad output with zeroes 72 | * @param x_port The input x port 73 | * @param u_port The input u port 74 | * @param xu_port The output xu port 75 | */ 76 | void HlsKernelU(const int num_active_inputs, 77 | const int input_size, 78 | const int num_refinements[testu::params::N], 79 | const bool pad_output, 80 | hls::stream& x_port, 81 | hls::stream& u_port, 82 | hls::stream& xu_port) { 83 | #pragma HLS INTERFACE s_axilite port=return 84 | #pragma HLS INTERFACE s_axilite port=num_active_inputs 85 | #pragma HLS INTERFACE s_axilite port=input_size 86 | #pragma HLS INTERFACE s_axilite port=num_refinements 87 | #pragma HLS INTERFACE s_axilite port=pad_output 88 | #pragma HLS INTERFACE axis port=x_port 89 | #pragma HLS INTERFACE axis port=u_port 90 | #pragma HLS INTERFACE axis port=xu_port 91 | #pragma HLS ARRAY_PARTITION variable=num_refinements complete dim=1 92 | svd::KernelU(num_active_inputs, input_size, num_refinements, 93 | pad_output, x_port, u_port, xu_port); 94 | } 95 | 96 | void HlsKernelU_Pruned(const int num_active_inputs, 97 | const int input_size, 98 | const int num_refinements[testu::params::N], 99 | const int num_zero_tiles_u, 100 | hls::stream& unz_idx_port, 101 | hls::stream& x_port, 102 | hls::stream& u_port, 103 | hls::stream& xu_port) { 104 | #pragma HLS INTERFACE s_axilite port=return 105 | #pragma HLS INTERFACE s_axilite port=num_active_inputs 106 | #pragma HLS INTERFACE s_axilite port=input_size 107 | #pragma HLS INTERFACE s_axilite port=num_refinements 108 | #pragma HLS INTERFACE s_axilite port=num_zero_tiles_u 109 | #pragma HLS INTERFACE axis port=unz_idx_port 110 | 
#pragma HLS INTERFACE axis port=x_port 111 | #pragma HLS INTERFACE axis port=u_port 112 | #pragma HLS INTERFACE axis port=xu_port 113 | #pragma HLS ARRAY_PARTITION variable=num_refinements complete dim=1 114 | svd::KernelU_Pruned(num_active_inputs, input_size, 115 | num_refinements, num_zero_tiles_u, unz_idx_port, x_port, u_port, xu_port); 116 | } 117 | 118 | #endif // __VITIS_HLS__ 119 | 120 | namespace svd { 121 | 122 | } // svd -------------------------------------------------------------------------------- /src/kernel/v_kernel.cpp: -------------------------------------------------------------------------------- 1 | #include "kernel/v_kernel.h" 2 | 3 | #include "hls_stream.h" 4 | #ifdef __VITIS_HLS__ 5 | #include "hls_vector.h" 6 | #endif 7 | 8 | #include "assert.h" 9 | 10 | #ifndef __VITIS_HLS__ 11 | #else 12 | void HlsKernelV(const int num_active_inputs, 13 | const int output_size, 14 | const int num_refinements[testv::params::N], 15 | hls::stream& xus_port, 16 | hls::stream& v_port, 17 | hls::stream& y_port) { 18 | #pragma HLS INTERFACE axis port=xus_port 19 | #pragma HLS INTERFACE axis port=v_port 20 | #pragma HLS INTERFACE axis port=y_port 21 | #pragma HLS INTERFACE s_axilite port=return 22 | #pragma HLS INTERFACE s_axilite port=num_active_inputs 23 | #pragma HLS INTERFACE s_axilite port=output_size 24 | #pragma HLS INTERFACE s_axilite port=num_refinements 25 | #pragma HLS DATAFLOW 26 | #pragma HLS ARRAY_PARTITION variable=num_refinements complete dim=1 27 | svd::KernelV(num_active_inputs, output_size, 28 | num_refinements, xus_port, v_port, y_port); 29 | } 30 | 31 | void HlsKernelV_Pruned(const int num_active_inputs, 32 | const int output_size, 33 | const int num_refinements[testv::params::N], 34 | const int num_zero_tiles_v, 35 | hls::stream& vnz_idx_port, 36 | hls::stream& xus_port, 37 | hls::stream& v_port, 38 | hls::stream& y_port) { 39 | #pragma HLS INTERFACE axis port=vnz_idx_port 40 | #pragma HLS INTERFACE axis port=xus_port 41 | #pragma HLS 
INTERFACE axis port=v_port 42 | #pragma HLS INTERFACE axis port=y_port 43 | #pragma HLS INTERFACE s_axilite port=return 44 | #pragma HLS INTERFACE s_axilite port=num_active_inputs 45 | #pragma HLS INTERFACE s_axilite port=output_size 46 | #pragma HLS INTERFACE s_axilite port=num_refinements 47 | #pragma HLS INTERFACE s_axilite port=num_zero_tiles_v 48 | #pragma HLS DATAFLOW 49 | #pragma HLS ARRAY_PARTITION variable=num_refinements complete dim=1 50 | svd::KernelV_Pruned(num_active_inputs, output_size, 51 | num_refinements, num_zero_tiles_v, vnz_idx_port, xus_port, v_port, y_port); 52 | } 53 | #endif // end __VITIS_HLS__ 54 | -------------------------------------------------------------------------------- /src/layers/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.10) 2 | 3 | add_subdirectory(dense) 4 | add_subdirectory(lstm) -------------------------------------------------------------------------------- /src/layers/dense/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.10) 2 | add_subdirectory(hls) 3 | add_subdirectory(sw) 4 | 5 | # add_library(LSTM_DATA_HANDLER STATIC ${CMAKE_SOURCE_DIR}/src/lstm/lstm_data_handler.cpp) 6 | # target_include_directories(LSTM_DATA_HANDLER PUBLIC ${CMAKE_SOURCE_DIR}/include) 7 | # target_include_directories(LSTM_DATA_HANDLER PUBLIC ${HLS_INCLUDE_DIRS}) 8 | # target_include_directories(LSTM_DATA_HANDLER PUBLIC ${OpenCv_INCLUDE_DIRS}) 9 | # target_link_libraries(LSTM_DATA_HANDLER ${OpenCv_LIBS}) -------------------------------------------------------------------------------- /src/layers/dense/hls/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.10) 2 | 3 | add_library(DENSE_SVD STATIC ${CMAKE_SOURCE_DIR}/src/layers/dense/hls/dense_svd.cpp) 4 | target_include_directories(DENSE_SVD 
PUBLIC ${CMAKE_SOURCE_DIR}/include) 5 | target_include_directories(DENSE_SVD PUBLIC ${HLS_INCLUDE_DIRS}) 6 | target_include_directories(DENSE_SVD PUBLIC ${OpenCv_INCLUDE_DIRS}) 7 | target_link_libraries(DENSE_SVD ${OpenCv_LIBS}) 8 | 9 | # add_library(SOFT_LSTM_SVD STATIC ${CMAKE_SOURCE_DIR}/src/lstm/sw/soft_lstm_svd.cpp) 10 | # target_include_directories(SOFT_LSTM_SVD PUBLIC ${CMAKE_SOURCE_DIR}/include) 11 | # target_include_directories(SOFT_LSTM_SVD PUBLIC ${HLS_INCLUDE_DIRS}) 12 | # target_include_directories(SOFT_LSTM_SVD PUBLIC ${OpenCv_INCLUDE_DIRS}) 13 | # target_link_libraries(SOFT_LSTM_SVD ${OpenCv_LIBS}) 14 | # target_link_libraries(SOFT_LSTM_SVD BLAS_UTILS) 15 | # target_link_libraries(SOFT_LSTM_SVD ACTIVATION_FUNCTIONS) 16 | -------------------------------------------------------------------------------- /src/layers/dense/hls/dense_svd.cpp: -------------------------------------------------------------------------------- 1 | #include "layers/dense/hls/dense_svd.h" 2 | 3 | #ifndef __VITIS_HLS__ 4 | #else 5 | void HlsDenseSvd(const int num_active_inputs, 6 | const int input_size, 7 | const int output_size, 8 | const int num_refinements[svd::dense_params::N], 9 | // const hls::vector num_refinements, 10 | hls::stream& x_port, 11 | hls::stream& u_port, 12 | hls::stream& s_port, 13 | hls::stream& v_port, 14 | hls::stream& bias_port, 15 | hls::stream& y_port) { 16 | #pragma HLS INTERFACE s_axilite port=return bundle=ctrl 17 | #pragma HLS INTERFACE s_axilite port=num_active_inputs bundle=ctrl 18 | #pragma HLS INTERFACE s_axilite port=input_size bundle=ctrl 19 | #pragma HLS INTERFACE s_axilite port=output_size bundle=ctrl 20 | #pragma HLS INTERFACE s_axilite port=num_refinements bundle=ctrl 21 | #pragma HLS INTERFACE axis port=x_port 22 | #pragma HLS INTERFACE axis port=u_port 23 | #pragma HLS INTERFACE axis port=s_port 24 | #pragma HLS INTERFACE axis port=v_port 25 | #pragma HLS INTERFACE axis port=bias_port 26 | #pragma HLS INTERFACE axis port=y_port 27 | 
svd::DenseSvdKernel(num_active_inputs, input_size, 28 | output_size, num_refinements, x_port, u_port, s_port, v_port, bias_port, 29 | y_port); 30 | } 31 | 32 | /** 33 | * @brief HLS Wrapper that calls a DenseSvd accelerator. 34 | * 35 | * Useful in Cosimulation. 36 | * 37 | * @param[in] num_active_inputs The number of active inputs 38 | * @param[in] input_size The input size 39 | * @param[in] output_size The output size 40 | * @param[in] num_refinements The number of refinements 41 | * @param[in] x The input array. Shape: (N, I) 42 | * @param[in] u The u array. Shape: (R, I, G) 43 | * @param[in] s The s array. Shape: (R, N, G) 44 | * @param[in] v The v array. Shape: (R, H, G) 45 | * @param[in] bias The bias array. Shape: (N, G, H) 46 | * @param y The y array. Shape: (N, G, H) 47 | */ 48 | void HlsWrapperDenseSvd(const int num_active_inputs, 49 | const int input_size, 50 | const int output_size, 51 | const int num_refinements[svd::dense_params::N], 52 | const typename svd::dense_params::ActivationD* x, 53 | const typename svd::dense_params::ActivationD* u, 54 | const typename svd::dense_params::ActivationD* s, 55 | const typename svd::dense_params::ActivationD* v, 56 | const typename svd::dense_params::ActivationD* bias, 57 | typename svd::dense_params::ActivationD* y) { 58 | #ifdef __VITIS_HLS__ 59 | hls::stream x_port("x_port"); 60 | hls::stream u_port("u_port"); 61 | hls::stream s_port("s_port"); 62 | hls::stream v_port("v_port"); 63 | hls::stream bias_port("bias_port"); 64 | hls::stream y_port("y_port"); 65 | svd::SetDenseSvdInputs(num_active_inputs, input_size, 66 | output_size, num_refinements, x, u, s, v, bias, x_port, u_port, s_port, 67 | v_port, bias_port); 68 | HlsDenseSvd(num_active_inputs, input_size, output_size, num_refinements, 69 | x_port, u_port, s_port, v_port, bias_port, y_port); 70 | svd::GetSvdKernelOutputs(num_active_inputs, output_size, 71 | y_port, y); 72 | #endif // __VITIS_HLS__ 73 | } 74 | 75 | #endif 
-------------------------------------------------------------------------------- /src/layers/dense/sw/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.10) 2 | 3 | # add_library(SOFT_LSTM STATIC ${CMAKE_SOURCE_DIR}/src/lstm/sw/soft_lstm.cpp) 4 | # target_include_directories(SOFT_LSTM PUBLIC ${CMAKE_SOURCE_DIR}/include) 5 | # target_include_directories(SOFT_LSTM PUBLIC ${HLS_INCLUDE_DIRS}) 6 | # target_include_directories(SOFT_LSTM PUBLIC ${OpenCv_INCLUDE_DIRS}) 7 | # target_link_libraries(SOFT_LSTM ${OpenCv_LIBS}) 8 | 9 | # add_library(SOFT_LSTM_SVD STATIC ${CMAKE_SOURCE_DIR}/src/lstm/sw/soft_lstm_svd.cpp) 10 | # target_include_directories(SOFT_LSTM_SVD PUBLIC ${CMAKE_SOURCE_DIR}/include) 11 | # target_include_directories(SOFT_LSTM_SVD PUBLIC ${HLS_INCLUDE_DIRS}) 12 | # target_include_directories(SOFT_LSTM_SVD PUBLIC ${OpenCv_INCLUDE_DIRS}) 13 | # target_link_libraries(SOFT_LSTM_SVD ${OpenCv_LIBS}) 14 | # target_link_libraries(SOFT_LSTM_SVD BLAS_UTILS) 15 | # target_link_libraries(SOFT_LSTM_SVD ACTIVATION_FUNCTIONS) 16 | -------------------------------------------------------------------------------- /src/layers/lstm/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.10) 2 | add_subdirectory(hls) 3 | add_subdirectory(sw) 4 | 5 | add_library(LSTM_DATA_HANDLER STATIC ${CMAKE_SOURCE_DIR}/src/layers/lstm/lstm_data_handler.cpp) 6 | target_include_directories(LSTM_DATA_HANDLER PUBLIC ${CMAKE_SOURCE_DIR}/include) 7 | target_include_directories(LSTM_DATA_HANDLER PUBLIC ${HLS_INCLUDE_DIRS}) 8 | target_include_directories(LSTM_DATA_HANDLER PUBLIC ${OpenCv_INCLUDE_DIRS}) 9 | target_link_libraries(LSTM_DATA_HANDLER ${OpenCv_LIBS}) -------------------------------------------------------------------------------- /src/layers/lstm/README.md: 
-------------------------------------------------------------------------------- 1 | # LSTM Software Models 2 | 3 | This folder contains the software implementation and the hardware emulator of an LSTM layer. 4 | 5 | ## Software 6 | 7 | The software version exploits the BLAS libraries for the fast computation of the gate matrix-matrix multiplications. 8 | 9 | ## Hardware 10 | 11 | The hardware emulation functions serve as a means to test the _accuracy_ of the HLS models. Because of that, the outputs produced by the emulator and the HLS must obviously match. 12 | 13 | Compared to the HLS counterpart, the emulator: 14 | * utilizes dynamic parameters, meaning that they aren't statically defined at compile-time 15 | * has a software-friendly execution: the HLS coding style is in fact un-optimized for fast execution (e.g. the HLS has numerous if-statements inside loops) -------------------------------------------------------------------------------- /src/layers/lstm/hls/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.10) 2 | 3 | add_library(LSTM_HARDWARE STATIC ${CMAKE_SOURCE_DIR}/src/layers/lstm/hls/lstm_hardware.cpp) 4 | target_include_directories(LSTM_HARDWARE PUBLIC ${CMAKE_SOURCE_DIR}/include) 5 | target_include_directories(LSTM_HARDWARE PUBLIC ${HLS_INCLUDE_DIRS}) 6 | target_include_directories(LSTM_HARDWARE PUBLIC ${OpenCv_INCLUDE_DIRS}) 7 | target_link_libraries(LSTM_HARDWARE ${OpenCv_LIBS}) 8 | 9 | add_library(LSTM_SVD STATIC ${CMAKE_SOURCE_DIR}/src/layers/lstm/hls/lstm_svd.cpp) 10 | target_include_directories(LSTM_SVD PUBLIC ${CMAKE_SOURCE_DIR}/include) 11 | target_include_directories(LSTM_SVD PUBLIC ${HLS_INCLUDE_DIRS}) 12 | target_include_directories(LSTM_SVD PUBLIC ${OpenCv_INCLUDE_DIRS}) 13 | target_link_libraries(LSTM_SVD ${OpenCv_LIBS}) 14 | # target_link_libraries(LSTM_SVD SVD_PARAMS) 15 | # target_link_libraries(LSTM_SVD SVD_DMA) 16 | # target_link_libraries(LSTM_SVD
U_KERNEL) 17 | # target_link_libraries(LSTM_SVD S_KERNEL) 18 | # target_link_libraries(LSTM_SVD V_KERNEL) 19 | # target_link_libraries(LSTM_SVD ACTIVATION_FUNCTIONS) 20 | # target_link_libraries(LSTM_SVD HLS_DEBUGGING) 21 | 22 | add_library(LSTM_SVD_SHARED SHARED ${CMAKE_SOURCE_DIR}/src/layers/lstm/hls/lstm_svd.cpp) 23 | target_include_directories(LSTM_SVD_SHARED PUBLIC ${CMAKE_SOURCE_DIR}/include) 24 | target_include_directories(LSTM_SVD_SHARED PUBLIC ${HLS_INCLUDE_DIRS}) 25 | target_include_directories(LSTM_SVD_SHARED PUBLIC ${OpenCv_INCLUDE_DIRS}) 26 | target_link_libraries(LSTM_SVD_SHARED ${OpenCv_LIBS}) 27 | set_property(TARGET LSTM_SVD_SHARED PROPERTY POSITION_INDEPENDENT_CODE ON) 28 | 29 | set(LSTM_SVD_EMULATOR_H ${CMAKE_SOURCE_DIR}/include/math_utils/activation_functions.h) 30 | add_library(LSTM_SVD_EMULATOR STATIC ${CMAKE_SOURCE_DIR}/src/layers/lstm/hls/lstm_svd_emulator.cpp ${LSTM_SVD_EMULATOR_H}) 31 | target_include_directories(LSTM_SVD_EMULATOR PUBLIC ${CMAKE_SOURCE_DIR}/include) 32 | target_include_directories(LSTM_SVD_EMULATOR PUBLIC ${HLS_INCLUDE_DIRS}) 33 | target_include_directories(LSTM_SVD_EMULATOR PUBLIC ${OpenCv_INCLUDE_DIRS}) 34 | target_link_libraries(LSTM_SVD_EMULATOR ${OpenCv_LIBS}) 35 | # target_link_libraries(LSTM_SVD_EMULATOR ACTIVATION_FUNCTIONS) -------------------------------------------------------------------------------- /src/layers/lstm/hls/lstm_svd_emulator.cpp: -------------------------------------------------------------------------------- 1 | #include "layers/lstm/hls/lstm_svd_emulator.h" -------------------------------------------------------------------------------- /src/layers/lstm/lstm_data_handler.cpp: -------------------------------------------------------------------------------- 1 | #include "layers/lstm/lstm_data_handler.h" -------------------------------------------------------------------------------- /src/layers/lstm/sw/CMakeLists.txt: -------------------------------------------------------------------------------- 
1 | cmake_minimum_required(VERSION 3.10) 2 | 3 | add_library(SOFT_LSTM STATIC ${CMAKE_SOURCE_DIR}/src/layers/lstm/sw/soft_lstm.cpp) 4 | target_include_directories(SOFT_LSTM PUBLIC ${CMAKE_SOURCE_DIR}/include) 5 | target_include_directories(SOFT_LSTM PUBLIC ${HLS_INCLUDE_DIRS}) 6 | target_include_directories(SOFT_LSTM PUBLIC ${OpenCv_INCLUDE_DIRS}) 7 | target_link_libraries(SOFT_LSTM ${OpenCv_LIBS}) 8 | 9 | add_library(SOFT_LSTM_SVD STATIC ${CMAKE_SOURCE_DIR}/src/layers/lstm/sw/soft_lstm_svd.cpp) 10 | target_include_directories(SOFT_LSTM_SVD PUBLIC ${CMAKE_SOURCE_DIR}/include) 11 | target_include_directories(SOFT_LSTM_SVD PUBLIC ${HLS_INCLUDE_DIRS}) 12 | target_include_directories(SOFT_LSTM_SVD PUBLIC ${OpenCv_INCLUDE_DIRS}) 13 | target_link_libraries(SOFT_LSTM_SVD ${OpenCv_LIBS}) 14 | target_link_libraries(SOFT_LSTM_SVD BLAS_UTILS) 15 | target_link_libraries(SOFT_LSTM_SVD ACTIVATION_FUNCTIONS) 16 | -------------------------------------------------------------------------------- /src/math_utils/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.10) 2 | 3 | add_library(BLAS_UTILS STATIC ${CMAKE_SOURCE_DIR}/src/math_utils/blas_utils.cpp) 4 | target_include_directories(BLAS_UTILS PUBLIC ${CMAKE_SOURCE_DIR}/include) 5 | target_include_directories(BLAS_UTILS PUBLIC ${HLS_INCLUDE_DIRS}) 6 | target_include_directories(BLAS_UTILS PUBLIC ${OpenCv_INCLUDE_DIRS}) 7 | target_link_libraries(BLAS_UTILS ${OpenCv_LIBS}) 8 | 9 | add_library(ACTIVATION_FUNCTIONS STATIC ${CMAKE_SOURCE_DIR}/src/math_utils/activation_functions.cpp) 10 | target_include_directories(ACTIVATION_FUNCTIONS PUBLIC ${CMAKE_SOURCE_DIR}/include) 11 | target_include_directories(ACTIVATION_FUNCTIONS PUBLIC ${HLS_INCLUDE_DIRS}) 12 | target_include_directories(ACTIVATION_FUNCTIONS PUBLIC ${OpenCv_INCLUDE_DIRS}) 13 | target_link_libraries(ACTIVATION_FUNCTIONS ${OpenCv_LIBS}) 14 | 15 | add_library(DATA_HANDLER STATIC 
${CMAKE_SOURCE_DIR}/src/math_utils/data_handler.cpp) 16 | target_include_directories(DATA_HANDLER PUBLIC ${CMAKE_SOURCE_DIR}/include) 17 | target_include_directories(DATA_HANDLER PUBLIC ${HLS_INCLUDE_DIRS}) 18 | target_include_directories(DATA_HANDLER PUBLIC ${OpenCv_INCLUDE_DIRS}) 19 | target_link_libraries(DATA_HANDLER ${OpenCv_LIBS}) 20 | 21 | # # add_test(NAME TestOpenCv COMMAND OPENCV_TEST) 22 | # # add_test(NAME TestAxisLib COMMAND TEST_AXIS_LIB) 23 | # # add_test(NAME TestConvLayer COMMAND TEST_CONV_LAYER) 24 | # add_test(NAME TestDenseLayer COMMAND TEST_DENSE_LAYER) 25 | # # add_test(NAME TestDQNet COMMAND TEST_DQNET) 26 | # # add_test(NAME TestGame COMMAND TEST_GAME) 27 | # # add_test(NAME TestPong COMMAND TEST_PONG) -------------------------------------------------------------------------------- /src/math_utils/activation_functions.cpp: -------------------------------------------------------------------------------- 1 | #include "math_utils/activation_functions.h" 2 | #include "hls_stream.h" 3 | #include 4 | #include "assert.h" 5 | 6 | // #include "hls_math.h" 7 | 8 | template <> 9 | void svd_sigmoid(const int n, const float* a, float* y) { 10 | // NOTE: Using OpenMP drastically slows down the execution. 11 | // #pragma omp parallel for num_threads(4) private(i) 12 | for (int i = 0; i < n; ++i) { 13 | y[i] = 1 / (1 + exp(-a[i])); 14 | } 15 | } 16 | 17 | template <> 18 | void svd_sigmoid(const int n, const double* a, double* y) { 19 | // NOTE: Using OpenMP drastically slows down the execution. 20 | // #pragma omp parallel for num_threads(4) private(i) 21 | for (int i = 0; i < n; ++i) { 22 | y[i] = 1 / (1 + exp(-a[i])); 23 | } 24 | } 25 | 26 | template <> 27 | void svd_hard_sigmoid(const int n, const float* a, float* y) { 28 | // NOTE: Using OpenMP drastically slows down the execution. 
29 | // #pragma omp parallel for num_threads(4) private(i) 30 | for (int i = 0; i < n; ++i) { 31 | if (a[i] < -2.5) { 32 | y[i] = 0; 33 | } else if (a[i] > 2.5) { 34 | y[i] = 1; 35 | } else { 36 | y[i] = 0.2 * a[i] + 0.5; 37 | } 38 | } 39 | } 40 | 41 | template <> 42 | void svd_hard_sigmoid(const int n, const double* a, double* y) { 43 | // NOTE: Using OpenMP drastically slows down the execution. 44 | // #pragma omp parallel for num_threads(4) private(i) 45 | for (int i = 0; i < n; ++i) { 46 | if (a[i] < -2.5) { 47 | y[i] = 0; 48 | } else if (a[i] > 2.5) { 49 | y[i] = 1; 50 | } else { 51 | y[i] = 0.2 * a[i] + 0.5; 52 | } 53 | } 54 | } 55 | 56 | template <> 57 | void svd_tanh(const int n, const float* a, float* y) { 58 | // NOTE: Using OpenMP drastically slows down the execution. 59 | // #pragma omp parallel for num_threads(4) 60 | for (int i = 0; i < n; ++i) { 61 | y[i] = std::tanh(a[i]); 62 | } 63 | } 64 | 65 | template <> 66 | void svd_tanh(const int n, const double* a, double* y) { 67 | // NOTE: Using OpenMP drastically slows down the execution. 68 | // #pragma omp parallel for num_threads(4) 69 | for (int i = 0; i < n; ++i) { 70 | y[i] = std::tanh(a[i]); 71 | } 72 | } -------------------------------------------------------------------------------- /src/math_utils/blas_utils.cpp: -------------------------------------------------------------------------------- 1 | #include "math_utils/blas_utils.h" 2 | 3 | #ifdef USE_BLAS 4 | template<> 5 | void svd_cpu_gemm(const CBLAS_TRANSPOSE TransA, 6 | const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, 7 | const float alpha, const float* A, const float* B, const float beta, 8 | float* C) { 9 | int lda = (TransA == CblasNoTrans) ? K : M; 10 | int ldb = (TransB == CblasNoTrans) ? 
N : K; 11 | cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, 12 | ldb, beta, C, N); 13 | } 14 | 15 | template<> 16 | void svd_cpu_gemm(const CBLAS_TRANSPOSE TransA, 17 | const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, 18 | const double alpha, const double* A, const double* B, const double beta, 19 | double* C) { 20 | int lda = (TransA == CblasNoTrans) ? K : M; 21 | int ldb = (TransB == CblasNoTrans) ? N : K; 22 | cblas_dgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, 23 | ldb, beta, C, N); 24 | } 25 | 26 | template <> 27 | void svd_cpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, 28 | const int N, const float alpha, const float* A, const float* x, 29 | const float beta, float* y) { 30 | cblas_sgemv(CblasRowMajor, TransA, M, N, alpha, A, N, x, 1, beta, y, 1); 31 | } 32 | 33 | template <> 34 | void svd_cpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, 35 | const int N, const double alpha, const double* A, const double* x, 36 | const double beta, double* y) { 37 | cblas_dgemv(CblasRowMajor, TransA, M, N, alpha, A, N, x, 1, beta, y, 1); 38 | } 39 | 40 | template <> 41 | void svd_cpu_gemm_gates(const CBLAS_TRANSPOSE TransA, 42 | const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, 43 | const float* X, const float* I, const float* F, const float* C, 44 | const float* O, float* Y_I, float* Y_F, float* Y_C, float* Y_O) { 45 | svd_cpu_gemm(TransA, TransB, M, N, K, (float) 1., X, I, (float) 0., Y_I); 46 | svd_cpu_gemm(TransA, TransB, M, N, K, (float) 1., X, F, (float) 0., Y_F); 47 | svd_cpu_gemm(TransA, TransB, M, N, K, (float) 1., X, C, (float) 0., Y_C); 48 | svd_cpu_gemm(TransA, TransB, M, N, K, (float) 1., X, O, (float) 0., Y_O); 49 | } 50 | 51 | template <> 52 | void svd_cpu_gemm_gates(const CBLAS_TRANSPOSE TransA, 53 | const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, 54 | const double* X, const double* I, const double* F, const double* C, 55 | const double* O, double* Y_I, 
double* Y_F, double* Y_C, double* Y_O) { 56 | svd_cpu_gemm(TransA, TransB, M, N, K, (double) 1., X, I, (double) 0., Y_I); 57 | svd_cpu_gemm(TransA, TransB, M, N, K, (double) 1., X, F, (double) 0., Y_F); 58 | svd_cpu_gemm(TransA, TransB, M, N, K, (double) 1., X, C, (double) 0., Y_C); 59 | svd_cpu_gemm(TransA, TransB, M, N, K, (double) 1., X, O, (double) 0., Y_O); 60 | } 61 | #else 62 | #endif // end USE_BLAS 63 | 64 | template 65 | void svd_set(const int N, const Dtype alpha, Dtype* Y) { 66 | if (alpha == 0) { 67 | memset(Y, 0, sizeof(Dtype) * N); // NOLINT(caffe/alt_fn) 68 | return; 69 | } 70 | for (int i = 0; i < N; ++i) { 71 | Y[i] = alpha; 72 | } 73 | } 74 | 75 | template void svd_set(const int N, const int alpha, int* Y); 76 | template void svd_set(const int N, const float alpha, float* Y); 77 | template void svd_set(const int N, const double alpha, double* Y); 78 | 79 | template 80 | void svd_copy(const int N, const Dtype* X, Dtype* Y) { 81 | if (X != Y) { 82 | memcpy(Y, X, sizeof(Dtype) * N); 83 | } else { 84 | for (int i = 0; i < N; ++i) { 85 | Y[i] = X[i]; 86 | } 87 | } 88 | } 89 | 90 | template void svd_copy(const int N, const int* X, int* Y); 91 | template void svd_copy(const int N, const unsigned int* X, 92 | unsigned int* Y); 93 | template void svd_copy(const int N, const float* X, float* Y); 94 | template void svd_copy(const int N, const double* X, double* Y); 95 | 96 | template <> 97 | void svd_scal(const int N, const float alpha, float *X) { 98 | #ifdef USE_BLAS 99 | cblas_sscal(N, alpha, X, 1); 100 | #else 101 | for (int i = 0; i < N; ++i) { 102 | X[i] *= alpha; 103 | } 104 | #endif 105 | } 106 | 107 | template <> 108 | void svd_scal(const int N, const double alpha, double *X) { 109 | #ifdef USE_BLAS 110 | cblas_dscal(N, alpha, X, 1); 111 | #else 112 | for (int i = 0; i < N; ++i) { 113 | X[i] *= alpha; 114 | } 115 | #endif 116 | } 117 | 118 | template <> 119 | void svd_add(const int n, const float* a, const float* b, float* y) { 120 | // NOTE: Using 
OpenMP drastically slows down the execution. 121 | // #pragma omp parallel for num_threads(4) private(i) 122 | for (int i = 0; i < n; ++i) { 123 | y[i] = a[i] + b[i]; 124 | } 125 | } 126 | 127 | template <> 128 | void svd_add(const int n, const double* a, const double* b, double* y) { 129 | // NOTE: Using OpenMP drastically slows down the execution. 130 | // #pragma omp parallel for num_threads(4) private(i) 131 | for (int i = 0; i < n; ++i) { 132 | y[i] = a[i] + b[i]; 133 | } 134 | } 135 | 136 | template <> 137 | void svd_mul(const int n, const float* a, const float* b, float* y) { 138 | // NOTE: Using OpenMP drastically slows down the execution. 139 | // #pragma omp parallel for num_threads(4) private(i) 140 | for (int i = 0; i < n; ++i) { 141 | y[i] = a[i] * b[i]; 142 | } 143 | } 144 | 145 | template <> 146 | void svd_mul(const int n, const double* a, const double* b, double* y) { 147 | for (int i = 0; i < n; ++i) { 148 | y[i] = a[i] * b[i]; 149 | } 150 | } 151 | 152 | template <> 153 | void svd_transpose(const int n, const int m, const float* x, float* y) { 154 | for(int i = 0; i < n; ++i) { 155 | for(int j = 0; j < m; ++j) { 156 | y[j * n + i] = x[i * m + j]; 157 | } 158 | } 159 | } 160 | 161 | template <> 162 | void svd_transpose(const int n, const int m, const double* x, double* y) { 163 | for(int i = 0; i < n; ++i) { 164 | for(int j = 0; j < m; ++j) { 165 | y[j * n + i] = x[i * m + j]; 166 | } 167 | } 168 | } 169 | -------------------------------------------------------------------------------- /src/math_utils/data_handler.cpp: -------------------------------------------------------------------------------- 1 | #include "math_utils/data_handler.h" -------------------------------------------------------------------------------- /src/svd_ip.cpp: -------------------------------------------------------------------------------- 1 | #include "svd_ip.h" 2 | 3 | namespace svd { 4 | 5 | // void SvdIp2Inputs( 6 | // const typename svd_params::ActivationD 
x_port[svd_params::N][svd_params::I], 7 | // const typename svd_params::UPortD u_port[svd_params::R * svd_params::PrunedSizeU], 8 | // const typename svd_params::SPortD s_port[svd_params::N][svd_params::R], 9 | // const typename svd_params::VPortD v_port[svd_params::R * svd_params::PrunedSizeV], 10 | // const ap_uint nz_u_port[svd_params::G * svd_params::R], 11 | // const ap_uint nz_v_port[svd_params::G * svd_params::R], 12 | // typename svd_params::ActivationD y_port[svd_params::N][svd_params::G][svd_params::H]) { 13 | // svd::SvdIP(x_port, u_port, s_port, v_port, nz_u_port, nz_v_port, y_port); 14 | // } 15 | 16 | } // svd -------------------------------------------------------------------------------- /src/svd_params.cpp: -------------------------------------------------------------------------------- 1 | #include "svd_params.h" -------------------------------------------------------------------------------- /src/testbenches/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.10) 2 | 3 | add_executable(TEST_U_KERNEL ${CMAKE_SOURCE_DIR}/src/testbenches/test_u_kernel.cpp) 4 | target_include_directories(TEST_U_KERNEL PUBLIC ${CMAKE_SOURCE_DIR}/include) 5 | target_include_directories(TEST_U_KERNEL PUBLIC ${HLS_INCLUDE_DIRS}) 6 | target_include_directories(TEST_U_KERNEL PUBLIC ${OpenCv_INCLUDE_DIRS}) 7 | target_link_libraries(TEST_U_KERNEL ${OpenCv_LIBS}) 8 | target_link_libraries(TEST_U_KERNEL U_KERNEL) 9 | 10 | add_executable(TEST_U_KERNEL_PRUNED ${CMAKE_SOURCE_DIR}/src/testbenches/test_u_kernel_pruned.cpp) 11 | target_include_directories(TEST_U_KERNEL_PRUNED PUBLIC ${CMAKE_SOURCE_DIR}/include) 12 | target_include_directories(TEST_U_KERNEL_PRUNED PUBLIC ${HLS_INCLUDE_DIRS}) 13 | target_include_directories(TEST_U_KERNEL_PRUNED PUBLIC ${OpenCv_INCLUDE_DIRS}) 14 | target_link_libraries(TEST_U_KERNEL_PRUNED ${OpenCv_LIBS}) 15 | target_link_libraries(TEST_U_KERNEL_PRUNED U_KERNEL) 16 | 17 | 
# ---------------------------------------------------------------------------
# Testbench executables.
# Each testbench is built from a single source file, sees the project headers,
# the vendor HLS headers and OpenCV, and links the kernel library under test.
# NOTE(review): target_link_libraries() now uses an explicit PRIVATE keyword;
# the keyword-less signature has legacy scoping semantics, and PRIVATE is
# always correct for executables since no other target links against them.
# The include directories per target are grouped into one call for brevity.
# ---------------------------------------------------------------------------

add_executable(TEST_V_KERNEL ${CMAKE_SOURCE_DIR}/src/testbenches/test_v_kernel.cpp)
target_include_directories(TEST_V_KERNEL PUBLIC
  ${CMAKE_SOURCE_DIR}/include
  ${HLS_INCLUDE_DIRS}
  ${OpenCv_INCLUDE_DIRS}
)
target_link_libraries(TEST_V_KERNEL PRIVATE ${OpenCv_LIBS} V_KERNEL)

add_executable(TEST_V_KERNEL_PRUNED ${CMAKE_SOURCE_DIR}/src/testbenches/test_v_kernel_pruned.cpp)
target_include_directories(TEST_V_KERNEL_PRUNED PUBLIC
  ${CMAKE_SOURCE_DIR}/include
  ${HLS_INCLUDE_DIRS}
  ${OpenCv_INCLUDE_DIRS}
)
target_link_libraries(TEST_V_KERNEL_PRUNED PRIVATE ${OpenCv_LIBS} V_KERNEL)

add_executable(TEST_GEMV_KERNEL ${CMAKE_SOURCE_DIR}/src/testbenches/test_gemv_kernel.cpp)
target_include_directories(TEST_GEMV_KERNEL PUBLIC
  ${CMAKE_SOURCE_DIR}/include
  ${HLS_INCLUDE_DIRS}
  ${OpenCv_INCLUDE_DIRS}
)
target_link_libraries(TEST_GEMV_KERNEL PRIVATE ${OpenCv_LIBS} GEMV_KERNEL)

add_executable(TEST_DENSE_SVD ${CMAKE_SOURCE_DIR}/src/testbenches/test_dense_svd.cpp)
target_include_directories(TEST_DENSE_SVD PUBLIC
  ${CMAKE_SOURCE_DIR}/include
  ${HLS_INCLUDE_DIRS}
  ${OpenCv_INCLUDE_DIRS}
)
target_link_libraries(TEST_DENSE_SVD PRIVATE ${OpenCv_LIBS} DENSE_SVD)

# TEST_LSTM_SVD is only partially configured at this point; its remaining
# include directories and link libraries are set up immediately below.
add_executable(TEST_LSTM_SVD ${CMAKE_SOURCE_DIR}/src/testbenches/test_lstm_svd.cpp)
target_include_directories(TEST_LSTM_SVD PUBLIC ${CMAKE_SOURCE_DIR}/include)
target_include_directories(TEST_LSTM_SVD PUBLIC ${HLS_INCLUDE_DIRS}) 48 | target_include_directories(TEST_LSTM_SVD PUBLIC ${OpenCv_INCLUDE_DIRS}) 49 | target_link_libraries(TEST_LSTM_SVD ${OpenCv_LIBS}) 50 | target_link_libraries(TEST_LSTM_SVD LSTM_SVD) 51 | 52 | add_executable(TEST_SVD_KERNEL ${CMAKE_SOURCE_DIR}/src/testbenches/test_svd_kernel.cpp) 53 | target_include_directories(TEST_SVD_KERNEL PUBLIC ${CMAKE_SOURCE_DIR}/include) 54 | target_include_directories(TEST_SVD_KERNEL PUBLIC ${HLS_INCLUDE_DIRS}) 55 | target_include_directories(TEST_SVD_KERNEL PUBLIC ${OpenCv_INCLUDE_DIRS}) 56 | target_link_libraries(TEST_SVD_KERNEL ${OpenCv_LIBS}) 57 | target_link_libraries(TEST_SVD_KERNEL SVD_KERNEL) 58 | 59 | add_test(NAME TestU_Kernel COMMAND TEST_U_KERNEL) 60 | add_test(NAME TestU_Kernel_Pruned COMMAND TEST_U_KERNEL_PRUNED) 61 | add_test(NAME TestV_Kernel_Pruned COMMAND TEST_V_KERNEL_PRUNED) 62 | add_test(NAME TestV_Kernel COMMAND TEST_V_KERNEL) 63 | add_test(NAME TestGemvKernel COMMAND TEST_GEMV_KERNEL) 64 | add_test(NAME TestDenseSvd COMMAND TEST_DENSE_SVD) 65 | add_test(NAME TestLstmSvd COMMAND TEST_LSTM_SVD) 66 | add_test(NAME TestSvdKernel COMMAND TEST_SVD_KERNEL) -------------------------------------------------------------------------------- /src/testbenches/test_dense_svd.cpp: -------------------------------------------------------------------------------- 1 | #include "testbenches/test_dense_svd.h" 2 | #include "dma/axis_lib.h" 3 | 4 | #ifdef __VITIS_HLS__ 5 | #include "hls_vector.h" 6 | #endif 7 | #include "ap_int.h" 8 | #include "hls_stream.h" 9 | #include 10 | #include 11 | 12 | int main(int argc, char const *argv[]) { 13 | #ifndef __VITIS_HLS__ 14 | return 0; 15 | #else 16 | std::cout << "[INFO] Starting HlsDenseSvd test." 
<< std::endl; 17 | typedef typename svd::dense_params::ActivationD ActivationType; 18 | const int kG = svd::dense_params::G; 19 | int num_active_inputs = svd::dense_params::N; 20 | int input_size = 16; 21 | int output_size = 16; 22 | int max_R = 1; 23 | int num_tests = 2; 24 | auto get_arg = [&](const int i, const int max_val, int& arg) { 25 | if (argc >= i) { 26 | arg = atoi(argv[i -1]); 27 | arg = (arg > max_val) ? max_val : arg; 28 | } 29 | }; 30 | get_arg(2, svd::dense_params::N, num_active_inputs); 31 | get_arg(3, 512, max_R); 32 | get_arg(4, svd::dense_params::I, input_size); 33 | get_arg(5, svd::dense_params::H, output_size); 34 | get_arg(6, 32, num_tests); 35 | int num_refinements[svd::dense_params::N]; 36 | ActivationType* x = new ActivationType[num_active_inputs * input_size]; 37 | ActivationType* u = new ActivationType[max_R * input_size * kG]; 38 | ActivationType* s = new ActivationType[max_R * num_active_inputs * kG]; 39 | ActivationType* v = new ActivationType[max_R * output_size * kG]; 40 | ActivationType* bias = new ActivationType[num_active_inputs * kG * output_size]; 41 | ActivationType* y = new ActivationType[num_active_inputs * kG * output_size]; 42 | auto init_random = [&](const int size, ActivationType* x) { 43 | for (int i = 0; i < size; ++i) { 44 | if (std::is_same::value) { 45 | x[i] = ActivationType(rand()); 46 | } else { 47 | x[i] = ActivationType(rand() * 0.00001); 48 | } 49 | } 50 | }; 51 | for (int i = 0; i < svd::dense_params::N; ++i) { 52 | num_refinements[i] = max_R; 53 | } 54 | init_random(num_active_inputs * input_size, x); 55 | init_random(max_R * input_size * kG, u); 56 | init_random(max_R * num_active_inputs * kG, s); 57 | init_random(max_R * output_size * kG, v); 58 | init_random(num_active_inputs * kG * output_size, bias); 59 | std::cout << "[INFO] Calling accelerator." 
<< std::endl; 60 | for (int i = 0; i < num_tests; ++i) { 61 | HlsWrapperDenseSvd(num_active_inputs, input_size, output_size, 62 | num_refinements, x, u, s, v, bias, y); 63 | } 64 | delete[] x; 65 | delete[] u; 66 | delete[] s; 67 | delete[] v; 68 | delete[] bias; 69 | delete[] y; 70 | std::cout << "[INFO] Exiting." << std::endl; 71 | return 0; 72 | #endif // end __VITIS_HLS__ 73 | } -------------------------------------------------------------------------------- /src/testbenches/test_gemv_kernel.cpp: -------------------------------------------------------------------------------- 1 | #include "kernel/gemv_kernel.h" 2 | 3 | #include "hls_stream.h" 4 | #ifdef __VITIS_HLS__ 5 | #include "hls_vector.h" 6 | #endif 7 | 8 | int main(int argc, char const *argv[]) { 9 | #ifndef __VITIS_HLS__ 10 | return 0; 11 | #else 12 | typedef hls::vector VectType; 13 | testgemv::DataType x[testgemv::I]; 14 | testgemv::DataType w[testgemv::I][testgemv::R]; 15 | 16 | testgemv::DataType y[testgemv::R] = {0}; 17 | 18 | hls::stream x_port[testgemv::N]; 19 | hls::stream w_port[testgemv::N]; 20 | hls::stream y_port[testgemv::N]; 21 | for (int i = 0; i < testgemv::I; ++i) { 22 | 23 | x[i] = testgemv::DataType(rand() * 0.0001); 24 | for (int j = 0; j < testgemv::R; ++j) { 25 | w[i][j] = testgemv::DataType(rand() * 0.0001); 26 | } 27 | } 28 | 29 | for (int i = 0; i < testgemv::R; ++i) { 30 | for (int j = 0; j < testgemv::I / testgemv::T; ++j) { 31 | VectType tmp; 32 | for (int k = 0; k < testgemv::T; ++k) { 33 | tmp[k] = w[j * testgemv::T + k][i]; 34 | } 35 | for (int ii = 0; ii < testgemv::N; ++ii) { 36 | w_port[ii] << tmp; 37 | } 38 | } 39 | } 40 | 41 | for (int i = 0; i < testgemv::R; ++i) { 42 | for (int j = 0; j < testgemv::I / testgemv::T; ++j) { 43 | VectType tmp; 44 | for (int k = 0; k < testgemv::T; ++k) { 45 | tmp[k] = x[j * testgemv::T + k]; 46 | } 47 | for (int ii = 0; ii < testgemv::N; ++ii) { 48 | x_port[ii] << tmp; 49 | } 50 | } 51 | } 52 | 53 | HlsGemvKernel(testgemv::I, 
testgemv::R, x_port[0], x_port[1], w_port[0], w_port[1], y_port[0], y_port[1]); 54 | for (int i = 0; i < testgemv::R; ++i) { 55 | y[i] = 0; 56 | for (int j = 0; j < testgemv::I; ++j) { 57 | y[i] += x[j] * w[j][i]; 58 | } 59 | } 60 | 61 | std::cout << "Checking results." << std::endl; 62 | int num_errors = 0; 63 | for (int i = 0; i < testgemv::R; ++i) { 64 | for (int j = 0; j < testgemv::N; ++j) { 65 | auto y_test = y_port[j].read(); 66 | if (y[i] - y_test > testgemv::DataType(0.001)) { 67 | std::cout << i << ") test/gold: " << y_test << " / " 68 | << y[i] << std::endl; 69 | ++num_errors; 70 | } 71 | } 72 | } 73 | std::cout << "[INFO] Number of mismatches: " << num_errors << std::endl; 74 | return 0; // num_errors; 75 | #endif 76 | } -------------------------------------------------------------------------------- /src/testbenches/test_lstm_svd.cpp: -------------------------------------------------------------------------------- 1 | #include "testbenches/test_lstm_svd.h" 2 | 3 | #include "dma/axis_lib.h" 4 | 5 | #ifdef __VITIS_HLS__ 6 | #include "hls_vector.h" 7 | #endif 8 | #include "ap_int.h" 9 | #include "hls_stream.h" 10 | #include 11 | #include 12 | 13 | int main(int argc, char const *argv[]) { 14 | #ifndef __VITIS_HLS__ 15 | return 0; 16 | #else 17 | std::cout << "[INFO] Starting HlsLstmSvd test." << std::endl; 18 | typedef typename svd::lstm_params::ActivationD ActivationType; 19 | const int kG = svd::lstm_params::G; 20 | int num_active_inputs = svd::lstm_params::N; 21 | int input_size = 16; 22 | int output_size = 16; 23 | int max_R = 1; 24 | int num_tests = 2; 25 | auto get_arg = [&](const int i, const int max_val, int& arg) { 26 | if (argc >= i) { 27 | arg = atoi(argv[i -1]); 28 | arg = (arg > max_val) ? 
max_val : arg; 29 | } 30 | }; 31 | get_arg(2, svd::lstm_params::N, num_active_inputs); 32 | get_arg(3, 512, max_R); 33 | get_arg(4, svd::lstm_params::I, input_size); 34 | get_arg(5, svd::lstm_params::H, output_size); 35 | get_arg(6, 32, num_tests); 36 | int num_refinements[svd::lstm_params::N]; 37 | ActivationType* x = new ActivationType[num_active_inputs * input_size]; 38 | ActivationType* h_prev = new ActivationType[num_active_inputs * output_size]; 39 | ActivationType* c_prev = new ActivationType[num_active_inputs * output_size]; 40 | ActivationType* h_curr = new ActivationType[num_active_inputs * output_size]; 41 | ActivationType* c_curr = new ActivationType[num_active_inputs * output_size]; 42 | ActivationType* u_cur = new ActivationType[max_R * input_size * kG]; 43 | ActivationType* s_cur = new ActivationType[max_R * num_active_inputs * kG]; 44 | ActivationType* v_cur = new ActivationType[max_R * output_size * kG]; 45 | ActivationType* u_rec = new ActivationType[max_R * output_size * kG]; 46 | ActivationType* s_rec = new ActivationType[max_R * num_active_inputs * kG]; 47 | ActivationType* v_rec = new ActivationType[max_R * output_size * kG]; 48 | ActivationType* bias = new ActivationType[num_active_inputs * kG * output_size]; 49 | auto init_random = [&](const int size, ActivationType* x) { 50 | for (int i = 0; i < size; ++i) { 51 | if (std::is_same::value) { 52 | x[i] = ActivationType(rand()); 53 | } else { 54 | x[i] = ActivationType(rand() * 0.00001); 55 | } 56 | } 57 | }; 58 | for (int i = 0; i < svd::lstm_params::N; ++i) { 59 | num_refinements[i] = max_R; 60 | } 61 | init_random(num_active_inputs * input_size, x); 62 | init_random(max_R * input_size * kG, u_cur); 63 | init_random(max_R * num_active_inputs * kG, s_cur); 64 | init_random(max_R * output_size * kG, v_cur); 65 | init_random(max_R * output_size * kG, u_rec); 66 | init_random(max_R * num_active_inputs * kG, s_rec); 67 | init_random(max_R * output_size * kG, v_rec); 68 | 
init_random(num_active_inputs * kG * output_size, bias); 69 | std::cout << "[INFO] Calling accelerator." << std::endl; 70 | for (int i = 0; i < num_tests; ++i) { 71 | HlsWrapperLstmSvd(num_active_inputs, input_size, output_size, 72 | num_refinements, x, u_cur, s_cur, v_cur, h_prev, u_rec, s_rec, v_rec, 73 | bias, c_prev, h_curr, c_curr); 74 | } 75 | delete[] x; 76 | delete[] h_prev; 77 | delete[] c_prev; 78 | delete[] h_curr; 79 | delete[] c_curr; 80 | delete[] u_cur; 81 | delete[] s_cur; 82 | delete[] v_cur; 83 | delete[] u_rec; 84 | delete[] s_rec; 85 | delete[] v_rec; 86 | delete[] bias; 87 | std::cout << "[INFO] Exiting." << std::endl; 88 | return 0; 89 | #endif // end __VITIS_HLS__ 90 | } -------------------------------------------------------------------------------- /src/testbenches/test_svd_kernel.cpp: -------------------------------------------------------------------------------- 1 | #include "testbenches/test_svd_kernel.h" 2 | #include "dma/axis_lib.h" 3 | 4 | #ifdef __VITIS_HLS__ 5 | #include "hls_vector.h" 6 | #endif 7 | #include "ap_int.h" 8 | #include "hls_stream.h" 9 | #include 10 | #include 11 | 12 | int main(int argc, char const *argv[]) { 13 | #ifndef __VITIS_HLS__ 14 | return 0; 15 | #else 16 | std::cout << "[INFO] Starting HlsSvdKernel test." << std::endl; 17 | typedef typename svd::svd_params::ActivationD ActivationType; 18 | const int kG = svd::svd_params::G; 19 | int num_active_inputs = svd::svd_params::N; 20 | int input_size = 16; 21 | int output_size = 16; 22 | int max_R = 1; 23 | int num_tests = 2; 24 | auto get_arg = [&](const int i, const int max_val, int& arg) { 25 | if (argc >= i) { 26 | arg = atoi(argv[i -1]); 27 | arg = (arg > max_val) ? 
max_val : arg; 28 | } 29 | }; 30 | get_arg(2, svd::svd_params::N, num_active_inputs); 31 | get_arg(3, 512, max_R); 32 | get_arg(4, svd::svd_params::I, input_size); 33 | get_arg(5, svd::svd_params::H, output_size); 34 | get_arg(6, 32, num_tests); 35 | int num_refinements[svd::svd_params::N]; 36 | ActivationType* x = new ActivationType[num_active_inputs * input_size]; 37 | ActivationType* u = new ActivationType[max_R * input_size * kG]; 38 | ActivationType* s = new ActivationType[max_R * num_active_inputs * kG]; 39 | ActivationType* v = new ActivationType[max_R * output_size * kG]; 40 | ActivationType* y = new ActivationType[num_active_inputs * kG * output_size]; 41 | hls::stream x_port("x_port"); 42 | hls::stream u_port("u_port"); 43 | hls::stream s_port("s_port"); 44 | hls::stream v_port("v_port"); 45 | hls::stream y_port("y_port"); 46 | auto init_random = [&](const int size, ActivationType* x) { 47 | for (int i = 0; i < size; ++i) { 48 | if (std::is_same::value) { 49 | x[i] = ActivationType(rand()); 50 | } else { 51 | x[i] = ActivationType(rand() * 0.00001); 52 | } 53 | } 54 | }; 55 | auto init_zero = [&](const int size, ActivationType* x) { 56 | for (int i = 0; i < size; ++i) { 57 | x[i] = ActivationType(0); 58 | } 59 | }; 60 | for (int i = 0; i < svd::svd_params::N; ++i) { 61 | num_refinements[i] = max_R; 62 | } 63 | init_random(num_active_inputs * input_size, x); 64 | init_random(num_active_inputs * kG * output_size, y); 65 | init_random(max_R * input_size * kG, u); 66 | init_random(max_R * num_active_inputs * kG, s); 67 | init_random(max_R * output_size * kG, v); 68 | std::cout << "[INFO] Calling accelerator." 
<< std::endl; 69 | for (int i = 0; i < num_tests; ++i) { 70 | svd::SetSvdKernelInputs(num_active_inputs, input_size, 71 | output_size, num_refinements, x, u, s, v, x_port, u_port, s_port, v_port); 72 | HlsSvdKernel(num_active_inputs, input_size, output_size, num_refinements, 73 | x_port, u_port, s_port, v_port, y_port); 74 | svd::GetSvdKernelOutputs(num_active_inputs, output_size, 75 | y_port, y); 76 | } 77 | delete[] x; 78 | delete[] u; 79 | delete[] s; 80 | delete[] v; 81 | delete[] y; 82 | std::cout << "[INFO] Exiting." << std::endl; 83 | return 0; 84 | #endif // end __VITIS_HLS__ 85 | } -------------------------------------------------------------------------------- /src/testbenches/test_u_kernel.cpp: -------------------------------------------------------------------------------- 1 | #include "testbenches/test_u_kernel.h" 2 | #include "dma/axis_lib.h" 3 | 4 | #ifdef __VITIS_HLS__ 5 | #include "hls_vector.h" 6 | #endif 7 | #include "ap_int.h" 8 | #include "hls_stream.h" 9 | #include 10 | #include 11 | 12 | int main(int argc, char const *argv[]) { 13 | #ifdef COSIM_DESIGN 14 | srand(1); 15 | #else 16 | srand(time(NULL)); 17 | #endif 18 | std::cout << "[INFO] Starting HlsKernelU test." << std::endl; 19 | #ifndef __VITIS_HLS__ 20 | return 0; 21 | #else 22 | const int num_refinements = testu::params::R; 23 | hls::vector num_refinements_vect = hls::vector(num_refinements); 24 | for (int i = testu::params::N - 1; i >= 0; --i) { 25 | int R_tmp = testu::params::R - 2 * (testu::params::N - i - 1); 26 | num_refinements_vect[i] = R_tmp > 0 ? R_tmp : 1; 27 | } 28 | const int kNumActiveInputs = (testu::params::N / 2 > 0) ? testu::params::N / 2 : 1; 29 | const int kInputSize_tmp = testu::params::I / 1; 30 | const int kInputSize = (kInputSize_tmp > testu::params::I) ? 
testu::params::I : kInputSize_tmp; 31 | const int kNumTilesU = kInputSize / testu::params::Tu; 32 | typedef typename testu::params::ActivationD ActivationType; 33 | typedef hls::vector VectN_Type; 34 | typedef hls::vector VectG_Type; 35 | typedef hls::vector VectTuAct_Type; 36 | assert(testu::params::I == testu::params::PrunedSizeU); 37 | 38 | ActivationType x[testu::params::N][testu::params::I]; 39 | ActivationType u[num_refinements][testu::params::PrunedSizeU][testu::params::G]; 40 | ActivationType xu[num_refinements][testu::params::N][testu::params::G]; 41 | 42 | hls::stream x_port; //[testu::params::N * kNumTilesU]; 43 | hls::stream u_port; //[num_refinements * kNumTilesU * testu::params::G]; 44 | hls::stream xu_port; //[num_refinements * testu::params::G]; 45 | 46 | hls::stream unz_idx_axis("unz_idx_axis"); 47 | hls::stream x_axis("x_axis"); 48 | hls::stream u_axis("u_axis"); 49 | hls::stream xu_gn_axis("xu_gn_axis"); 50 | hls::stream xu_n_axis("xu_n_axis"); 51 | hls::stream xu_g_axis("xu_g_axis"); 52 | VectN_Type xu_gold[num_refinements * testu::params::G]; 53 | 54 | auto x_axis_interface = svd::AxiStreamPort(x_axis); 55 | auto u_axis_interface = svd::AxiStreamPort(u_axis); 56 | auto xu_gn_axis_interface = svd::AxiStreamPort(xu_gn_axis); 57 | auto xu_n_axis_interface = svd::AxiStreamPort(xu_n_axis); 58 | auto xu_g_axis_interface = svd::AxiStreamPort(xu_g_axis); 59 | 60 | for (int i = 0; i < testu::params::N; ++i) { 61 | for (int j = 0; j < testu::params::I; ++j) { 62 | x[i][j] = rand(); // * 0.00001; 63 | } 64 | } 65 | for (int i = 0; i < num_refinements; ++i) { 66 | for (int j = 0; j < testu::params::PrunedSizeU; ++j) { 67 | for (int k = 0; k < testu::params::G; ++k) { 68 | u[i][j][k] = rand(); // * 0.00001; 69 | } 70 | } 71 | } 72 | for (int i = 0; i < num_refinements; ++i) { 73 | for (int j = 0; j < testu::params::N; ++j) { 74 | for (int k = 0; k < testu::params::G; ++k) { 75 | xu[i][j][k] = 0; 76 | } 77 | } 78 | } 79 | for (int i = 0; i < num_refinements; 
++i) { 80 | for (int j = 0; j < kInputSize; ++j) { 81 | for (int k = 0; k < testu::params::G; ++k) { 82 | for (int ii = 0; ii < testu::params::N; ++ii) { 83 | xu[i][ii][k] += u[i][j][k] * x[ii][j]; 84 | } 85 | } 86 | } 87 | } 88 | 89 | 90 | for (int i = 0; i < num_refinements; ++i) { 91 | for (int j = 0; j < testu::params::N; ++j) { 92 | for (int k = 0; k < testu::params::G; ++k) { 93 | xu_gold[i * testu::params::G + k][j] = xu[i][j][k]; 94 | } 95 | } 96 | } 97 | 98 | const int num_tests = 2; 99 | int num_errors = 0; 100 | 101 | for (int t = 0; t < num_tests; ++t) { 102 | // NOTE: The streaming order differs from before! kNumTilesU is swapped with 103 | // testu::params::N. 104 | for (int j = 0; j < kNumTilesU; ++j) { 105 | for (int i = 0; i < kNumActiveInputs; ++i) { 106 | VectTuAct_Type x_val; 107 | for (int k = 0; k < testu::params::Tu; ++k) { 108 | x_val[k] = x[i][j * testu::params::Tu + k]; 109 | } 110 | x_axis_interface.PushVector(x_val); 111 | } 112 | } 113 | // NOTE: The streaming order differs from before! kNumTilesU is swapped with 114 | // testu::params::G. 115 | for (int i = 0; i < num_refinements_vect[kNumActiveInputs - 1]; ++i) { 116 | for (int j = 0; j < kNumTilesU; ++j) { 117 | for (int k = 0; k < testu::params::G; ++k) { 118 | VectTuAct_Type u_val; 119 | for (int ii = 0; ii < testu::params::Tu; ++ii) { 120 | u_val[ii] = u[i][j * testu::params::Tu + ii][k]; 121 | } 122 | u_axis_interface.PushVector(u_val); 123 | } 124 | } 125 | } 126 | std::cout << "[INFO] Starting HlsKernelU." 
<< std::endl; 127 | 128 | int refinements_tmp[testu::params::N]; 129 | for (int i = 0; i < testu::params::N; ++i) { 130 | refinements_tmp[i] = num_refinements_vect[i]; 131 | } 132 | // HlsKernelU(kNumActiveInputs, kInputSize, refinements_tmp, false, x_axis, u_axis, xu_g_axis); 133 | 134 | const int num_zero_tiles_u = 0; 135 | HlsKernelU_Pruned(kNumActiveInputs, kInputSize, refinements_tmp, 136 | num_zero_tiles_u, unz_idx_axis, x_axis, u_axis, xu_g_axis); 137 | 138 | 139 | testu::params::VectG_Type xu_g_val; 140 | int total_cnt = 0; 141 | int last_at = -1; 142 | for (int i = 0; i < num_refinements_vect[kNumActiveInputs - 1]; ++i) { // R_max 143 | for (int j = 0; j < kNumActiveInputs; ++j) { 144 | if (i < num_refinements_vect[j]) { 145 | bool is_last = xu_g_axis_interface.isLastPopVector(xu_g_val); 146 | if (is_last) { 147 | last_at = total_cnt; 148 | std::cout << "[INFO] Last index arrived at iteration: " << last_at << std::endl; 149 | } 150 | ++total_cnt; 151 | // std::cout << "\t[INFO] Reading xu[R." << i << "][N." 
<< j << "]" << std::endl; 152 | for (int k = 0; k < testu::params::G; ++k) { 153 | // VectN_Type xu_gold[num_refinements * testu::params::G]; 154 | std::cout << i << ") test/gold: " << xu_g_val[k] << " / " 155 | << xu[i][j][k] << std::endl; 156 | if (xu_g_val[k] != xu[i][j][k]) { 157 | ++num_errors; 158 | } 159 | } 160 | } 161 | } 162 | } 163 | std::cout << "[INFO] Last index arrived at iteration: " << last_at << std::endl; 164 | std::cout << "[INFO] Total iterations: " << total_cnt << std::endl; 165 | std::cout << "[INFO] Number of mismatches: " << num_errors << std::endl; 166 | 167 | while(!xu_n_axis.empty()) { 168 | auto xu_n_val = xu_n_axis_interface.PopVector(); 169 | } 170 | } 171 | std::cout << "[INFO] Number of mismatches: " << num_errors << std::endl; 172 | return 0; // num_errors; 173 | #endif 174 | } -------------------------------------------------------------------------------- /src/testbenches/test_u_kernel_pruned.cpp: -------------------------------------------------------------------------------- 1 | #include "testbenches/test_u_kernel_pruned.h" 2 | #include "layers/lstm/lstm_data_handler.h" 3 | #include "dma/axis_lib.h" 4 | 5 | #ifdef __VITIS_HLS__ 6 | #include "hls_vector.h" 7 | #endif 8 | #include "ap_int.h" 9 | #include "hls_stream.h" 10 | #include 11 | #include 12 | 13 | int main(int argc, char const *argv[]) { 14 | #ifdef COSIM_DESIGN 15 | srand(1); 16 | #else 17 | srand(time(NULL)); 18 | #endif 19 | std::cout << "[INFO] Starting HlsKernelU_Pruned test." << std::endl; 20 | #ifndef __VITIS_HLS__ 21 | return 0; 22 | #else 23 | const int max_num_refinements = testu::params::R; 24 | int num_refinements[testu::params::N] = {max_num_refinements}; 25 | for (int i = testu::params::N - 1; i >= 0; --i) { 26 | int R_tmp = testu::params::R - 2 * (testu::params::N - i - 1); 27 | num_refinements[i] = R_tmp > 0 ? 
R_tmp : 1; 28 | } 29 | 30 | const int kNumActiveInputs = 1; // testu::params::N; 31 | const int kInputSize_tmp = testu::params::I / 16; 32 | const int kInputSize = (kInputSize_tmp > testu::params::I) ? testu::params::I : kInputSize_tmp; 33 | const int kNumTilesU = kInputSize / testu::params::Tu; 34 | const int kN = testu::params::N; 35 | const int kR = testu::params::R; 36 | const int kI = testu::params::I; 37 | const int kH = testu::params::H; 38 | const int kTu = testu::params::Tu; 39 | const int kNTu = testu::params::MaxNumTu; 40 | const int kZTu_tmp = 10; 41 | const int kZTu = kZTu_tmp >= kNumTilesU ? 0 : kZTu_tmp; // testu::params::ZTu; 42 | const int kNTv = testu::params::MaxNumTv; 43 | const int kZTv = testu::params::ZTv; 44 | 45 | typedef typename testu::params::ActivationD ActivationType; 46 | typedef ap_uint IndexType; 47 | 48 | typedef hls::vector VectN_Type; 49 | typedef hls::vector VectG_Type; 50 | typedef hls::vector VectTuAct_Type; 51 | assert(testu::params::I == testu::params::PrunedSizeU); 52 | 53 | ActivationType x[testu::params::N][testu::params::I]; 54 | ActivationType u[max_num_refinements][testu::params::PrunedSizeU][testu::params::G]; 55 | ActivationType xu[max_num_refinements][testu::params::N][testu::params::G]; 56 | 57 | hls::stream unz_idx_axis("unz_idx_axis"); 58 | hls::stream x_axis("x_axis"); 59 | hls::stream u_axis("u_axis"); 60 | hls::stream xu_axis("xu_axis"); 61 | 62 | VectN_Type xu_gold[max_num_refinements * testu::params::G]; 63 | 64 | auto unz_idx_interface = svd::AxiStreamPort(unz_idx_axis); 65 | auto x_interface = svd::AxiStreamPort(x_axis); 66 | auto u_interface = svd::AxiStreamPort(u_axis); 67 | auto xu_interface = svd::AxiStreamPort(xu_axis); 68 | 69 | std::cout << "kN: " << kN << std::endl; 70 | std::cout << "kR: " << kR << std::endl; 71 | std::cout << "kI: " << kI << std::endl; 72 | std::cout << "kH: " << kH << std::endl; 73 | std::cout << "kNTu: " << kNTu << std::endl; 74 | std::cout << "kZTu: " << kZTu << std::endl; 75 
| std::cout << "kNTv: " << kNTv << std::endl; 76 | std::cout << "kZTv: " << kZTv << std::endl; 77 | 78 | std::cout << "Setting AcceleratorBlob." << std::endl; 79 | auto storage = svd::AcceleratorBlob( 80 | kN, kR, kInputSize, kInputSize, kH, kNumTilesU, kZTu, kNTv, kZTv); 81 | 82 | // const int kPrunedSize = storage.get_cur_gates("i")->get_pruned_total_size(); 83 | auto i_gate = storage.get_cur_gates("i")->get_u(); 84 | auto f_gate = storage.get_cur_gates("f")->get_u(); 85 | auto c_gate = storage.get_cur_gates("c")->get_u(); 86 | auto o_gate = storage.get_cur_gates("o")->get_u(); 87 | 88 | int* nz_i_idx = i_gate->get_nz_idx(); 89 | int* nz_f_idx = f_gate->get_nz_idx(); 90 | int* nz_c_idx = c_gate->get_nz_idx(); 91 | int* nz_o_idx = o_gate->get_nz_idx(); 92 | 93 | std::cout << "i_gate->get_nz_idx(0, 0): "; 94 | int tmp = i_gate->get_nz_idx(0, 0); 95 | std::cout << tmp << std::endl; 96 | 97 | std::cout << "xu setup." << std::endl; 98 | for (int i = 0; i < max_num_refinements; ++i) { 99 | for (int j = 0; j < testu::params::N; ++j) { 100 | for (int k = 0; k < testu::params::G; ++k) { 101 | xu[i][j][k] = 0; 102 | } 103 | } 104 | } 105 | 106 | std::cout << "[INFO] Generating gold results." 
<< std::endl; 107 | auto i_weight = i_gate->fix_data(); 108 | auto f_weight = f_gate->fix_data(); 109 | auto c_weight = c_gate->fix_data(); 110 | auto o_weight = o_gate->fix_data(); 111 | auto i_weight_pruned = i_gate->fix_pruned_data(); 112 | auto f_weight_pruned = f_gate->fix_pruned_data(); 113 | auto c_weight_pruned = c_gate->fix_pruned_data(); 114 | auto o_weight_pruned = o_gate->fix_pruned_data(); 115 | for (int i = 0; i < max_num_refinements; ++i) { 116 | for (int j = 0; j < kInputSize; ++j) { 117 | // std::cout << i_weight[i * kInputSize + j] << " "; 118 | for (int ii = 0; ii < testu::params::N; ++ii) { 119 | xu[i][ii][0] += i_weight[i * kInputSize + j] * storage.get_fix_x(ii)[j]; 120 | xu[i][ii][1] += f_weight[i * kInputSize + j] * storage.get_fix_x(ii)[j]; 121 | xu[i][ii][2] += c_weight[i * kInputSize + j] * storage.get_fix_x(ii)[j]; 122 | xu[i][ii][3] += o_weight[i * kInputSize + j] * storage.get_fix_x(ii)[j]; 123 | } 124 | } 125 | // std::cout << std::endl; 126 | } 127 | 128 | #if 1 129 | const int num_tests = 2; 130 | int num_errors = 0; 131 | 132 | std::cout << "[INFO] Starting tests." << std::endl; 133 | for (int t = 0; t < num_tests; ++t) { 134 | // NOTE: The streaming order differs from before! kNumTilesU is swapped with 135 | // testu::params::N. 136 | std::cout << "[INFO] Sending x." << std::endl; 137 | for (int j = 0; j < kNumTilesU; ++j) { 138 | for (int i = 0; i < kNumActiveInputs; ++i) { 139 | VectTuAct_Type x_val; 140 | for (int k = 0; k < testu::params::Tu; ++k) { 141 | x_val[k] = storage.get_fix_x(i)[j * testu::params::Tu + k]; 142 | } 143 | x_interface.PushVector(x_val); 144 | } 145 | } 146 | // NOTE: The streaming order differs from before! kNumTilesU is swapped with 147 | // testu::params::G. 148 | std::cout << "[INFO] Sending u." 
<< std::endl; 149 | for (int i = 0; i < num_refinements[kNumActiveInputs - 1]; ++i) { 150 | for (int j = 0; j < kNumTilesU - kZTu; ++j) { 151 | VectTuAct_Type u_val; 152 | for (int k = 0; k < testu::params::Tu; ++k) { 153 | u_val[k] = i_weight[i * kInputSize + i_gate->get_nz_idx(i, j) * kTu + k]; 154 | } 155 | u_interface.PushVector(u_val); 156 | for (int k = 0; k < testu::params::Tu; ++k) { 157 | u_val[k] = f_weight[i * kInputSize + f_gate->get_nz_idx(i, j) * kTu + k]; 158 | } 159 | u_interface.PushVector(u_val); 160 | for (int k = 0; k < testu::params::Tu; ++k) { 161 | u_val[k] = c_weight[i * kInputSize + c_gate->get_nz_idx(i, j) * kTu + k]; 162 | } 163 | u_interface.PushVector(u_val); 164 | for (int k = 0; k < testu::params::Tu; ++k) { 165 | u_val[k] = o_weight[i * kInputSize + o_gate->get_nz_idx(i, j) * kTu + k]; 166 | } 167 | u_interface.PushVector(u_val); 168 | } 169 | } 170 | 171 | std::cout << "[INFO] Sending nzu." << std::endl; 172 | for (int i = 0; i < num_refinements[kNumActiveInputs - 1]; ++i) { 173 | for (int j = 0; j < kNumTilesU - kZTu; ++j) { 174 | const int bits = testu::params::NumTuBits; 175 | IndexType nzu_val; 176 | nzu_val.range(1 * bits - 1, 0 * bits) = i_gate->get_nz_idx(i, j); 177 | nzu_val.range(2 * bits - 1, 1 * bits) = f_gate->get_nz_idx(i, j); 178 | nzu_val.range(3 * bits - 1, 2 * bits) = c_gate->get_nz_idx(i, j); 179 | nzu_val.range(4 * bits - 1, 3 * bits) = o_gate->get_nz_idx(i, j); 180 | // std::cout << i_gate->get_nz_idx(i, j) << std::endl; 181 | unz_idx_interface.Push(nzu_val); 182 | } 183 | } 184 | 185 | std::cout << "[INFO] Starting HlsKernelU." 
<< std::endl; 186 | // HlsKernelU(kNumActiveInputs, kInputSize, refinements_tmp, false, x_axis, u_axis, xu_axis); 187 | HlsKernelU_Pruned(kNumActiveInputs, kInputSize, num_refinements, kZTu, unz_idx_axis, x_axis, u_axis, xu_axis); 188 | 189 | testu::params::VectG_Type xu_g_val; 190 | int total_cnt = 0; 191 | int last_at = -1; 192 | for (int i = 0; i < num_refinements[kNumActiveInputs - 1]; ++i) { // R_max 193 | for (int j = 0; j < kNumActiveInputs; ++j) { 194 | if (i < num_refinements[j]) { 195 | bool is_last = xu_interface.isLastPopVector(xu_g_val); 196 | if (is_last) { 197 | last_at = total_cnt; 198 | std::cout << "[INFO] Last index arrived at iteration: " << last_at << std::endl; 199 | } 200 | ++total_cnt; 201 | // std::cout << "\t[INFO] Reading xu[R." << i << "][N." << j << "]" << std::endl; 202 | for (int k = 0; k < testu::params::G; ++k) { 203 | // VectN_Type xu_gold[max_num_refinements * testu::params::G]; 204 | std::cout << i << ") test/gold: " << xu_g_val[k] << " / " 205 | << xu[i][j][k] << std::endl; 206 | if (xu_g_val[k] != xu[i][j][k]) { 207 | ++num_errors; 208 | } 209 | } 210 | } 211 | } 212 | } 213 | std::cout << "[INFO] Last index arrived at iteration: " << last_at << std::endl; 214 | std::cout << "[INFO] Total iterations: " << total_cnt << std::endl; 215 | std::cout << "[INFO] Number of mismatches: " << num_errors << std::endl; 216 | } 217 | std::cout << "[INFO] Number of mismatches: " << num_errors << std::endl; 218 | return 0; // num_errors; 219 | 220 | #endif 221 | std::cout << "Exiting." 
<< std::endl; 222 | 223 | 224 | #endif 225 | } -------------------------------------------------------------------------------- /src/testbenches/test_v_kernel.cpp: -------------------------------------------------------------------------------- 1 | #include "testbenches/test_v_kernel.h" 2 | #include "dma/axis_lib.h" 3 | 4 | #ifdef __VITIS_HLS__ 5 | #include "hls_vector.h" 6 | #endif 7 | #include "ap_int.h" 8 | #include "hls_stream.h" 9 | #include 10 | #include 11 | 12 | int main(int argc, char const *argv[]) { 13 | #ifdef COSIM_DESIGN 14 | srand(1); 15 | #else 16 | srand(1); 17 | // srand(time(NULL)); 18 | #endif 19 | std::cout << "[INFO] Starting HlsKernelV test." << std::endl; 20 | #ifndef __VITIS_HLS__ 21 | return 0; 22 | #else 23 | int num_active_inputs = testv::params::N; 24 | int output_size = testv::params::H; 25 | int num_refinements = testv::params::R; 26 | if (argc >= 2) { 27 | num_active_inputs = atoi(argv[1]); 28 | } 29 | if (argc >= 3) { 30 | output_size = atoi(argv[2]); 31 | } 32 | if (argc >= 4) { 33 | num_refinements = atoi(argv[3]); 34 | } 35 | const int kMaxRefinements = num_refinements; 36 | typedef hls::vector VectN; 37 | VectN num_refinements_vect = VectN(kMaxRefinements); 38 | const int kNumTests = 2; 39 | const int kNumActiveInputs = (num_active_inputs > testv::params::N) ? testv::params::N : num_active_inputs; 40 | const int kOutputSize = (output_size > testv::params::H) ? testv::params::H : output_size; 41 | const int kNumTilesV = kOutputSize / testv::params::Tv; 42 | for (int i = kNumActiveInputs-1; i >= 0; --i) { 43 | // num_refinements_vect[i] = kMaxRefinements; 44 | int R_tmp = kMaxRefinements - 2 * (kNumActiveInputs - i - 1); 45 | num_refinements_vect[i] = R_tmp > 0 ? R_tmp : 1; 46 | } 47 | typedef typename testv::params::ActivationD ActivationType; 48 | assert(testv::params::H == testv::params::PrunedSizeV); // No pruning. 
49 | 50 | ActivationType xus[kMaxRefinements][testv::params::N][testv::params::G] = {ActivationType(0.001)}; 51 | ActivationType v[kMaxRefinements][testv::params::PrunedSizeV][testv::params::G] = {ActivationType(0.001)}; 52 | ActivationType y_gold[testv::params::N][testv::params::G][testv::params::H] = {0}; 53 | 54 | for (int i = 0; i < kMaxRefinements; ++i) { 55 | for (int j = 0; j < testv::params::G; ++j) { 56 | for (int k = 0; k < testv::params::N; ++k) { 57 | if (std::is_same::value) { 58 | xus[i][k][j] = ActivationType(rand()); 59 | } else { 60 | xus[i][k][j] = ActivationType(rand() * 0.00001); 61 | } 62 | } 63 | for (int k = 0; k < testv::params::PrunedSizeV; ++k) { 64 | if (std::is_same::value) { 65 | v[i][k][j] = ActivationType(rand()); 66 | } else { 67 | v[i][k][j] = ActivationType(rand() * 0.00001); 68 | } 69 | } 70 | } 71 | } 72 | 73 | for (int i = 0; i < kMaxRefinements; ++i) { 74 | for (int j = 0; j < kNumActiveInputs; ++j) { 75 | if (i < num_refinements_vect[j]) { 76 | for (int k = 0; k < kOutputSize; ++k) { 77 | for (int ii = 0; ii < testv::params::G; ++ii) { 78 | y_gold[j][ii][k] += v[i][k][ii] * xus[i][j][ii]; 79 | } 80 | } 81 | } 82 | } 83 | } 84 | 85 | hls::stream vnz_idx_port("vnz_idx_port"); 86 | hls::stream xus_port("xus_port"); 87 | hls::stream v_port("v_port"); 88 | hls::stream y_port("y_port"); 89 | 90 | auto xus_axis = svd::AxiStreamPort(xus_port); 91 | auto v_axis = svd::AxiStreamPort(v_port); 92 | auto y_axis = svd::AxiStreamPort(y_port); 93 | 94 | int num_errors = 0; 95 | std::cout << "[INFO] Pushing into FIFOs." << std::endl; 96 | for (int t = 0; t < kNumTests; ++t) { 97 | std::cout << "[INFO] Pushing into XUS." 
<< std::endl; 98 | typename testv::params::VectG_Type xus_val; 99 | for (int i = 0; i < kMaxRefinements; ++i) { 100 | for (int j = 0; j < kNumActiveInputs; ++j) { 101 | if (i < num_refinements_vect[j]) { 102 | for (int k = 0; k < testv::params::G; ++k) { 103 | xus_val[k] = xus[i][j][k]; 104 | } 105 | xus_axis.PushVector(xus_val); 106 | } 107 | } 108 | } 109 | std::cout << "[INFO] Pushing into V." << std::endl; 110 | typename testv::params::VectTvType v_val; 111 | for (int i = 0; i < kMaxRefinements; ++i) { 112 | for (int k = 0; k < kNumTilesV; ++k) { 113 | for (int j = 0; j < testv::params::G; ++j) { 114 | for (int ii = 0; ii < testv::params::Tv; ++ii) { 115 | v_val[ii] = v[i][k * testv::params::Tv + ii][j]; 116 | } 117 | v_axis.PushVector(v_val); 118 | } 119 | } 120 | } 121 | } 122 | std::cout << "[INFO] Starting HlsKernelV." << std::endl; 123 | std::cout << "[INFO] v_port.size(): " << v_port.size() << std::endl; 124 | for (int t = 0; t < kNumTests; ++t) { 125 | int R_tmp[testv::params::N]; 126 | for (int i = 0; i < testv::params::N; ++i) { 127 | R_tmp[i] = num_refinements_vect[i]; 128 | } 129 | // HlsKernelV(kNumActiveInputs, kOutputSize, R_tmp, xus_port, v_port, y_port); 130 | 131 | const int num_zero_tiles_v = 0; 132 | HlsKernelV_Pruned(kNumActiveInputs, kOutputSize, R_tmp, num_zero_tiles_v, vnz_idx_port, xus_port, v_port, y_port); 133 | 134 | std::cout << "[INFO] v_port.size(): " << v_port.size() << std::endl; 135 | } 136 | int num_elems = 0; 137 | for (int t = 0; t < kNumTests; ++t) { 138 | std::cout << "[INFO] Checking results test n." 
<< t << std::endl; 139 | int test_errors = 0; 140 | num_elems = 0; 141 | for (int j = 0; j < kNumTilesV; ++j) { 142 | for (int i = 0; i < kNumActiveInputs; ++i) { 143 | const int kGTv = testv::params::G * testv::params::Tv; 144 | auto y_val = y_axis.PopVector(); 145 | for (int k = 0; k < testv::params::Tv; ++k) { 146 | for (int ii = 0; ii < testv::params::G; ++ii) { 147 | if (y_val[k * testv::params::G + ii] != y_gold[i][ii][j * testv::params::Tv + k]) { 148 | std::cout << "N:" << i << "][NTv:" << j << "][Tv:" << k << "][G:" 149 | << ii << "] test/gold: " 150 | << y_val[k * testv::params::G + ii] << " / " 151 | << y_gold[i][ii][j * testv::params::Tv + k] << std::endl; 152 | ++test_errors; 153 | } else { 154 | // std::cout << "\tN:" << i << "][NTv:" << j << "][Tv:" << k << "][G:" 155 | // << ii << "] test/gold: " 156 | // << y_val[k * testv::params::G + ii] << " / " 157 | // << y_gold[i][ii][j * testv::params::Tv + k] << std::endl; 158 | } 159 | ++num_elems; 160 | } 161 | } 162 | } 163 | } 164 | std::cout << "[INFO] Number of mismatches per test / total: " << test_errors 165 | << " / " << num_elems << std::endl; 166 | num_errors += test_errors; 167 | } 168 | std::cout << "[INFO] Total number of mismatches / total: " << num_errors 169 | << " / " << num_elems * kNumTests << std::endl; 170 | return 0; // num_errors; 171 | #endif // end __VITIS_HLS__ 172 | } 173 | -------------------------------------------------------------------------------- /src/testbenches/test_v_kernel_pruned.cpp: -------------------------------------------------------------------------------- 1 | #include "testbenches/test_v_kernel_pruned.h" 2 | #include "dma/axis_lib.h" 3 | 4 | #ifdef __VITIS_HLS__ 5 | #include "hls_vector.h" 6 | #endif 7 | #include "ap_int.h" 8 | #include "hls_stream.h" 9 | #include 10 | #include 11 | 12 | int main(int argc, char const *argv[]) { 13 | #ifdef COSIM_DESIGN 14 | srand(1); 15 | #else 16 | srand(1); 17 | // srand(time(NULL)); 18 | #endif 19 | std::cout << "[INFO] 
Starting HlsKernelV test." << std::endl; 20 | #ifndef __VITIS_HLS__ 21 | return 0; 22 | #else 23 | int num_active_inputs = testv::params::N; 24 | int output_size = testv::params::H; 25 | int num_refinements = testv::params::R; 26 | if (argc >= 2) { 27 | num_active_inputs = atoi(argv[1]); 28 | } 29 | if (argc >= 3) { 30 | output_size = atoi(argv[2]); 31 | } 32 | if (argc >= 4) { 33 | num_refinements = atoi(argv[3]); 34 | } 35 | const int kMaxRefinements = num_refinements; 36 | typedef hls::vector VectN; 37 | VectN num_refinements_vect = VectN(kMaxRefinements); 38 | const int kNumTests = 2; 39 | const int kNumActiveInputs = (num_active_inputs > testv::params::N) ? testv::params::N : num_active_inputs; 40 | const int kOutputSize = (output_size > testv::params::H) ? testv::params::H : output_size; 41 | const int kNumTilesV = kOutputSize / testv::params::Tv; 42 | for (int i = kNumActiveInputs-1; i >= 0; --i) { 43 | // num_refinements_vect[i] = kMaxRefinements; 44 | int R_tmp = kMaxRefinements - 2 * (kNumActiveInputs - i - 1); 45 | num_refinements_vect[i] = R_tmp > 0 ? R_tmp : 1; 46 | } 47 | typedef typename testv::params::ActivationD ActivationType; 48 | assert(testv::params::H == testv::params::PrunedSizeV); // No pruning. 
49 | 50 | ActivationType xus[kMaxRefinements][testv::params::N][testv::params::G] = {ActivationType(0.001)}; 51 | ActivationType v[kMaxRefinements][testv::params::PrunedSizeV][testv::params::G] = {ActivationType(0.001)}; 52 | ActivationType y_gold[testv::params::N][testv::params::G][testv::params::H] = {0}; 53 | 54 | for (int i = 0; i < kMaxRefinements; ++i) { 55 | for (int j = 0; j < testv::params::G; ++j) { 56 | for (int k = 0; k < testv::params::N; ++k) { 57 | if (std::is_same::value) { 58 | xus[i][k][j] = ActivationType(rand()); 59 | } else { 60 | xus[i][k][j] = ActivationType(rand() * 0.00001); 61 | } 62 | } 63 | for (int k = 0; k < testv::params::PrunedSizeV; ++k) { 64 | if (std::is_same::value) { 65 | v[i][k][j] = ActivationType(rand()); 66 | } else { 67 | v[i][k][j] = ActivationType(rand() * 0.00001); 68 | } 69 | } 70 | } 71 | } 72 | 73 | for (int i = 0; i < kMaxRefinements; ++i) { 74 | for (int j = 0; j < kNumActiveInputs; ++j) { 75 | if (i < num_refinements_vect[j]) { 76 | for (int k = 0; k < kOutputSize; ++k) { 77 | for (int ii = 0; ii < testv::params::G; ++ii) { 78 | y_gold[j][ii][k] += v[i][k][ii] * xus[i][j][ii]; 79 | } 80 | } 81 | } 82 | } 83 | } 84 | 85 | hls::stream vnz_idx_port("vnz_idx_port"); 86 | hls::stream xus_port("xus_port"); 87 | hls::stream v_port("v_port"); 88 | hls::stream y_port("y_port"); 89 | 90 | auto xus_axis = svd::AxiStreamPort(xus_port); 91 | auto v_axis = svd::AxiStreamPort(v_port); 92 | auto y_axis = svd::AxiStreamPort(y_port); 93 | 94 | int num_errors = 0; 95 | std::cout << "[INFO] Pushing into FIFOs." << std::endl; 96 | for (int t = 0; t < kNumTests; ++t) { 97 | std::cout << "[INFO] Pushing into XUS." 
<< std::endl; 98 | typename testv::params::VectG_Type xus_val; 99 | for (int i = 0; i < kMaxRefinements; ++i) { 100 | for (int j = 0; j < kNumActiveInputs; ++j) { 101 | if (i < num_refinements_vect[j]) { 102 | for (int k = 0; k < testv::params::G; ++k) { 103 | xus_val[k] = xus[i][j][k]; 104 | } 105 | xus_axis.PushVector(xus_val); 106 | } 107 | } 108 | } 109 | std::cout << "[INFO] Pushing into V." << std::endl; 110 | typename testv::params::VectTvType v_val; 111 | for (int i = 0; i < kMaxRefinements; ++i) { 112 | for (int k = 0; k < kNumTilesV; ++k) { 113 | for (int j = 0; j < testv::params::G; ++j) { 114 | for (int ii = 0; ii < testv::params::Tv; ++ii) { 115 | v_val[ii] = v[i][k * testv::params::Tv + ii][j]; 116 | } 117 | v_axis.PushVector(v_val); 118 | } 119 | } 120 | } 121 | } 122 | std::cout << "[INFO] Starting HlsKernelV." << std::endl; 123 | std::cout << "[INFO] v_port.size(): " << v_port.size() << std::endl; 124 | for (int t = 0; t < kNumTests; ++t) { 125 | int R_tmp[testv::params::N]; 126 | for (int i = 0; i < testv::params::N; ++i) { 127 | R_tmp[i] = num_refinements_vect[i]; 128 | } 129 | // HlsKernelV(kNumActiveInputs, kOutputSize, R_tmp, xus_port, v_port, y_port); 130 | 131 | const int num_zero_tiles_v = 0; 132 | HlsKernelV_Pruned(kNumActiveInputs, kOutputSize, R_tmp, num_zero_tiles_v, vnz_idx_port, xus_port, v_port, y_port); 133 | 134 | std::cout << "[INFO] v_port.size(): " << v_port.size() << std::endl; 135 | } 136 | int num_elems = 0; 137 | for (int t = 0; t < kNumTests; ++t) { 138 | std::cout << "[INFO] Checking results test n." 
<< t << std::endl; 139 | int test_errors = 0; 140 | num_elems = 0; 141 | for (int j = 0; j < kNumTilesV; ++j) { 142 | for (int i = 0; i < kNumActiveInputs; ++i) { 143 | const int kGTv = testv::params::G * testv::params::Tv; 144 | auto y_val = y_axis.PopVector(); 145 | for (int k = 0; k < testv::params::Tv; ++k) { 146 | for (int ii = 0; ii < testv::params::G; ++ii) { 147 | if (y_val[k * testv::params::G + ii] != y_gold[i][ii][j * testv::params::Tv + k]) { 148 | std::cout << "N:" << i << "][NTv:" << j << "][Tv:" << k << "][G:" 149 | << ii << "] test/gold: " 150 | << y_val[k * testv::params::G + ii] << " / " 151 | << y_gold[i][ii][j * testv::params::Tv + k] << std::endl; 152 | ++test_errors; 153 | } else { 154 | // std::cout << "\tN:" << i << "][NTv:" << j << "][Tv:" << k << "][G:" 155 | // << ii << "] test/gold: " 156 | // << y_val[k * testv::params::G + ii] << " / " 157 | // << y_gold[i][ii][j * testv::params::Tv + k] << std::endl; 158 | } 159 | ++num_elems; 160 | } 161 | } 162 | } 163 | } 164 | std::cout << "[INFO] Number of mismatches per test / total: " << test_errors 165 | << " / " << num_elems << std::endl; 166 | num_errors += test_errors; 167 | } 168 | std::cout << "[INFO] Total number of mismatches / total: " << num_errors 169 | << " / " << num_elems * kNumTests << std::endl; 170 | return 0; // num_errors; 171 | #endif // end __VITIS_HLS__ 172 | } 173 | -------------------------------------------------------------------------------- /tcl/lstm_params.tcl: -------------------------------------------------------------------------------- 1 | proc append_lstm_params {&defines} { 2 | dict set params NUM_GATES 4 3 | dict set params NUM_INPUTS 2 4 | dict set params NUM_SAMPLES 2 5 | dict set params INPUT_SIZE 128 6 | dict set params HIDDEN_SIZE 64 7 | dict set params NUM_ITERATIONS 32 8 | dict set params NUM_TILES_U 8 9 | dict set params NUM_ZERO_TILES_U 2 10 | dict set params NUM_TILES_V 16 ;# NOTE: The parallelism is HIDDEN_SIZE / NUM_TILES_V! 
11 | dict set params NUM_ZERO_TILES_V 2 12 | dict set params NUM_TIMESTEPS 28 13 | dict set params FIX_WIDTH 16 14 | dict set params FIX_FRACT_WIDTH 6 15 | 16 | set tmp {} 17 | append tmp " " 18 | foreach key [dict keys $params] { 19 | set value [dict get $params $key] 20 | append tmp "-D${key}=${value} " 21 | } 22 | puts "================================================================" 23 | puts "\[INFO\] LSTM parameters:" 24 | puts $tmp 25 | puts "================================================================" 26 | upvar 1 ${&defines} defines ;# To have a "pass by reference" argument. 27 | append defines $tmp 28 | } -------------------------------------------------------------------------------- /tcl/utils.tcl: -------------------------------------------------------------------------------- 1 | # 2 | # @brief Find all files in a directory and return them in a list. 3 | # 4 | # @param basedir The directory to start looking in pattern. 5 | # @param pattern A pattern, as defined by the glob command, that 6 | # the files must match. 7 | # @param exclude_dirs_list Ignore searching in specified directories 8 | # 9 | # @return The list of found files. 
#
proc findFiles { basedir pattern exclude_dirs_list } {
  # Normalize the directory name: [file normalize] yields the platform-native
  # absolute form, and joining with { } then trimming the trailing space
  # guarantees the name ends with a directory separator, so -path matching
  # below anchors at the directory boundary.
  set basedir [string trimright [file join [file normalize $basedir] { }]]
  set fileList {}
  # Collect matching entries in the current directory. -type {f r} restricts
  # the result to readable regular files; -nocomplain returns an empty list
  # (instead of raising an error) when nothing matches.
  foreach fileName [glob -nocomplain -type {f r} -path $basedir $pattern] {
    lappend fileList $fileName
  }
  # Recursively descend into readable subdirectories, skipping any directory
  # listed (by exact path) in exclude_dirs_list.
  foreach dirName [glob -nocomplain -type {d r} -path $basedir *] {
    if {[lsearch -exact $exclude_dirs_list $dirName] == -1} {
      # {*} splices the recursive result directly into fileList; an empty
      # result splices nothing, so no length check is needed.
      lappend fileList {*}[findFiles $dirName $pattern $exclude_dirs_list]
    }
  }
  return $fileList
}

#
# @brief Greps a file's content and writes the matching region to a file.
#
# @param re    Regular expression to search for.
# @param lines Size of the window of lines emitted per match. NOTE: the
#              matching line itself is counted against this budget, so a
#              match emits the matching line plus at most (lines - 1)
#              following lines.
# @param fin   Input channel; must be seekable (it is rewound first).
# @param fout  Output channel the matching lines are written to.
#
proc grep {re lines fin fout} {
  set cnt 0
  set match false
  # Rewind so the whole file is scanned regardless of prior reads.
  seek $fin 0
  while {[gets $fin line] >= 0} {
    # A fresh match (re)opens the emission window, even mid-window.
    if {[regexp -- $re $line]} {
      set cnt 0
      set match true
    }
    if {$match && ($cnt < $lines)} {
      # Echo to stdout for visibility and persist to the report file.
      puts $line
      puts $fout $line
      incr cnt
    } else {
      set match false
    }
  }
}
--------------------------------------------------------------------------------