├── .clang-format
├── .gitignore
├── CMakeLists.txt
├── LICENSE
├── README.md
├── doc
│   └── acri-room-howto.md
├── host
│   ├── CMakeLists.txt
│   ├── host_util.h
│   ├── link_u200_template.ini
│   ├── link_u50_template.ini
│   └── run_inference.cc
├── include
│   └── dnn-kernel
│       ├── conv2d.h
│       ├── inference.h
│       ├── linear.h
│       ├── maxpool2d.h
│       └── relu.h
├── learning
│   ├── requirements.txt
│   └── train_mnist.py
├── tests
│   ├── CMakeLists.txt
│   ├── hls
│   │   ├── CMakeLists.txt
│   │   ├── conv2d
│   │   │   ├── CMakeLists.txt
│   │   │   ├── conv2d_hls.cc
│   │   │   ├── conv2d_hls.h
│   │   │   └── conv2d_test.cc
│   │   ├── inference
│   │   │   ├── CMakeLists.txt
│   │   │   ├── inference_hls.cc
│   │   │   ├── inference_hls.h
│   │   │   └── inference_test.cc
│   │   ├── linear
│   │   │   ├── CMakeLists.txt
│   │   │   ├── linear_hls.cc
│   │   │   ├── linear_hls.h
│   │   │   └── linear_test.cc
│   │   ├── maxpool2d
│   │   │   ├── CMakeLists.txt
│   │   │   ├── maxpool2d_hls.cc
│   │   │   ├── maxpool2d_hls.h
│   │   │   └── maxpool2d_test.cc
│   │   ├── relu
│   │   │   ├── CMakeLists.txt
│   │   │   ├── relu_hls.cc
│   │   │   ├── relu_hls.h
│   │   │   └── relu_test.cc
│   │   └── tb.tcl
│   ├── ref
│   │   ├── CMakeLists.txt
│   │   ├── conv2d.cc
│   │   ├── inference.cc
│   │   ├── linear.cc
│   │   ├── maxpool2d.cc
│   │   └── relu.cc
│   └── util.h
└── thirdparty
    └── download.sh

--------------------------------------------------------------------------------
/.clang-format:
--------------------------------------------------------------------------------
---
Language: Cpp
# BasedOnStyle: Google
AccessModifierOffset: -1
AlignAfterOpenBracket: Align
AlignConsecutiveAssignments: false
AlignConsecutiveDeclarations: false
AlignEscapedNewlines: Left
AlignOperands: true
AlignTrailingComments: true
AllowAllParametersOfDeclarationOnNextLine: true
AllowShortBlocksOnASingleLine: false
AllowShortCaseLabelsOnASingleLine: false
AllowShortFunctionsOnASingleLine: All
AllowShortIfStatementsOnASingleLine: true
AllowShortLoopsOnASingleLine: true
AlwaysBreakAfterDefinitionReturnType: None
AlwaysBreakAfterReturnType: None
AlwaysBreakBeforeMultilineStrings: true
AlwaysBreakTemplateDeclarations: true
BinPackArguments: true
BinPackParameters: true
BraceWrapping:
  AfterClass: false
  AfterControlStatement: false
  AfterEnum: false
  AfterFunction: false
  AfterNamespace: false
  AfterObjCDeclaration: false
  AfterStruct: false
  AfterUnion: false
  AfterExternBlock: false
  BeforeCatch: false
  BeforeElse: false
  IndentBraces: false
  SplitEmptyFunction: true
  SplitEmptyRecord: true
  SplitEmptyNamespace: true
BreakBeforeBinaryOperators: None
BreakBeforeBraces: Attach
BreakBeforeInheritanceComma: false
BreakBeforeTernaryOperators: true
BreakConstructorInitializersBeforeComma: false
BreakConstructorInitializers: BeforeColon
BreakAfterJavaFieldAnnotations: false
BreakStringLiterals: true
ColumnLimit: 80
CommentPragmas: '^ IWYU pragma:'
CompactNamespaces: false
ConstructorInitializerAllOnOneLineOrOnePerLine: true
ConstructorInitializerIndentWidth: 4
ContinuationIndentWidth: 4
Cpp11BracedListStyle: true
DerivePointerAlignment: true
DisableFormat: false
ExperimentalAutoDetectBinPacking: false
FixNamespaceComments: true
ForEachMacros:
  - foreach
  - Q_FOREACH
  - BOOST_FOREACH
IncludeBlocks: Preserve
IncludeCategories:
  - Regex: '^<ext/.*\.h>'
    Priority: 2
  - Regex: '^<.*\.h>'
    Priority: 1
  - Regex: '^<.*'
    Priority: 2
  - Regex: '.*'
    Priority: 3
IncludeIsMainRegex: '([-_](test|unittest))?$'
IndentCaseLabels: true
IndentPPDirectives: None
IndentWidth: 2
IndentWrappedFunctionNames: false
JavaScriptQuotes: Leave
JavaScriptWrapImports: true
KeepEmptyLinesAtTheStartOfBlocks: false
MacroBlockBegin: ''
MacroBlockEnd: ''
MaxEmptyLinesToKeep: 1
NamespaceIndentation: None
ObjCBlockIndentWidth: 2
ObjCSpaceAfterProperty: false
ObjCSpaceBeforeProtocolList: false
PenaltyBreakAssignment: 2
PenaltyBreakBeforeFirstCallParameter: 1
PenaltyBreakComment: 300
PenaltyBreakFirstLessLess: 120
PenaltyBreakString: 1000
PenaltyExcessCharacter: 1000000
PenaltyReturnTypeOnItsOwnLine: 200
PointerAlignment: Left
RawStringFormats:
  - Delimiter: pb
    Language: TextProto
    BasedOnStyle: google
ReflowComments: true
SortIncludes: true
SortUsingDeclarations: true
SpaceAfterCStyleCast: false
SpaceAfterTemplateKeyword: true
SpaceBeforeAssignmentOperators: true
SpaceBeforeParens: ControlStatements
SpaceInEmptyParentheses: false
SpacesBeforeTrailingComments: 2
SpacesInAngles: false
SpacesInContainerLiterals: true
SpacesInCStyleCastParentheses: false
SpacesInParentheses: false
SpacesInSquareBrackets: false
Standard: Auto
TabWidth: 8
UseTab: Never
...

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
build*

--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
cmake_minimum_required(VERSION 3.11 FATAL_ERROR)
enable_testing()

# Project
set(PROJECT_NAME dnn-kernel)
project(${PROJECT_NAME} LANGUAGES C CXX)


# Default to Debug build type
if(NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE Debug CACHE STRING "Build type" FORCE)
endif()

# Project settings
set(DNNK_INCLUDE_DIRS ${PROJECT_SOURCE_DIR} ${PROJECT_SOURCE_DIR}/include)
set(DNNK_CXX_FLAGS "-g" "-Wall" "-Wno-uninitialized" "-Wno-unused-function" "-Wno-unknown-pragmas")

# Vivado HLS
set(VIVADO_HLS_ROOT "/opt/Xilinx/VivadoHLS/2019.1" CACHE STRING "Path to Vivado HLS root directory")
set(VHLS_INCLUDE_DIRS ${VIVADO_HLS_ROOT}/include)
set(XILINX_XRT "/opt/xilinx/xrt")

# Target board
set(TARGET_BOARD "u200" CACHE STRING "Select target Alveo board (available: \"u200\", \"u250\", \"u280\", \"u50\")")
if (${TARGET_BOARD} STREQUAL "u200")
  set(CHIP_PART "xcu200-fsgd2104-2-e")
  set(VITIS_PLATFORM "/opt/xilinx/platforms/xilinx_u200_xdma_201830_2/xilinx_u200_xdma_201830_2.xpfm")
elseif (${TARGET_BOARD} STREQUAL "u250")
  set(CHIP_PART "xcu250-figd2104-2L-e")
  set(VITIS_PLATFORM "/opt/xilinx/platforms/xilinx_u250_xdma_201830_2/xilinx_u250_xdma_201830_2.xpfm")
elseif (${TARGET_BOARD} STREQUAL "u280")
  set(CHIP_PART "xcu280-fsvh2892-2L-e")
  set(VITIS_PLATFORM "/opt/xilinx/platforms/xilinx_u280-es1_xdma_201910_1/xilinx_u280-es1_xdma_201910_1.xpfm")
elseif (${TARGET_BOARD} STREQUAL "u50")
  set(CHIP_PART "xcu50-fsvh2104-2-e")
  set(VITIS_PLATFORM "/opt/xilinx/platforms/xilinx_u50_gen3x16_xdma_201920_3/xilinx_u50_gen3x16_xdma_201920_3.xpfm")
else()
  message(FATAL_ERROR "Unknown TARGET_BOARD value \"${TARGET_BOARD}\"")
endif()

# third-party dependencies
include(ExternalProject)
include(FetchContent)

## libtorch
set(LIBTORCH_LOCAL_PATH "file://${CMAKE_SOURCE_DIR}/thirdparty/libtorch-cxx11-abi-shared-with-deps-1.4.0+cpu.zip")
FetchContent_Declare(
  libtorch
  URL ${LIBTORCH_LOCAL_PATH} https://download.pytorch.org/libtorch/cpu/libtorch-cxx11-abi-shared-with-deps-1.4.0%2Bcpu.zip
)
FetchContent_GetProperties(libtorch)
if(NOT libtorch_POPULATED)
  FetchContent_Populate(libtorch)
endif()

list(APPEND CMAKE_PREFIX_PATH ${libtorch_SOURCE_DIR})
set(TORCH_LIBRARY_DIRS ${libtorch_SOURCE_DIR}/lib)
find_package(Torch REQUIRED)

## googletest
set(GTEST_PREFIX ${PROJECT_BINARY_DIR}/thirdparty/googletest)
set(GTEST_INSTALL ${GTEST_PREFIX}/install)
set(GTEST_INCLUDE_DIRS ${GTEST_INSTALL}/include)
set(GTEST_LIBRARY_DIRS ${GTEST_INSTALL}/lib)
set(GTEST_LIBRARIES ${GTEST_INSTALL}/lib/libgtest.a)
set(GTEST_LOCAL_PATH "file://${CMAKE_SOURCE_DIR}/thirdparty/release-1.10.0.zip")

ExternalProject_Add(
  googletest
  PREFIX ${GTEST_PREFIX}
  URL ${GTEST_LOCAL_PATH} https://github.com/google/googletest/archive/release-1.10.0.zip
  CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${GTEST_INSTALL} -DCMAKE_BUILD_TYPE=Release
  BUILD_BYPRODUCTS ${GTEST_LIBRARIES}
)

# Logging
message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
message(STATUS "Path to Vivado HLS: ${VIVADO_HLS_ROOT}")

# tests
add_subdirectory(tests)
add_subdirectory(host)

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2020 Fixstars

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# DNN-Kernel-FPGA

A from-scratch implementation of deep learning for FPGAs

## Overview

This project implements a small convolutional network on an FPGA.
Targeting the MNIST dataset, it runs a network model written entirely from scratch on an Alveo FPGA card.

If you use this code on the ACRi room servers in particular, see [doc/acri-room-howto.md](doc/acri-room-howto.md).

## Development environment
- Ubuntu (>= 18.04)
- Python (>= 3.5.2)
- CMake (>= 3.11)
- Vivado HLS (>= 2019.2)

## Training on MNIST

The steps below use virtualenv; if you use a different Python virtual environment, adjust them accordingly.

```sh
cd learning
virtualenv -p python3 venv
source venv/bin/activate
pip install -r requirements.txt
python train_mnist.py
```

## Build

#### Host application, etc.
```sh
mkdir build && cd build
cmake -DTARGET_BOARD=u200 ../
cmake --build .
```

#### FPGA image

Set the environment variables.
```
$ source /tools/Xilinx/Vitis/2019.2/settings64.sh
$ source /opt/xilinx/xrt/setup.sh
```

Synthesize the bitstream.
This step takes about two hours.
```sh
cmake --build . --target inference_top_hw_xo
cmake --build . --target inference_top_hw
```

The synthesis report can be inspected as follows:
```sh
vitis_analyzer host/inference_top_hw.xclbin.link_summary
```

## Inference

### Running inference

Create `xrt.ini` so that traces are captured.
```sh
echo -e "[Debug]\nprofile=true\ntimeline_trace=true" > xrt.ini
```

The following command runs the inference:
```sh
./host/run_inference ./host/inference_top_hw.xclbin inference_top
```

The run report can be inspected as follows:
```sh
vitis_analyzer inference_top_hw.xclbin.run_summary
```

## Tests

#### Unit tests

Unit tests can be run as follows (shown for ReLU):

```sh
ctest -V -R "relu_ref"        # Test of reference implementation
ctest -V -R "relu_hls_csim"   # C simulation test of HLS implementation
ctest -V -R "relu_hls_cosim"  # C/RTL co-simulation test of HLS implementation
```
--------------------------------------------------------------------------------
/doc/acri-room-howto.md:
--------------------------------------------------------------------------------

# How to run on the ACRi room servers

This document describes how to run the MNIST model built in this repository on an Alveo server in the ACRi room.
The target environment is the `as001` server.

Because the ACRi room servers have no access to external networks, some preparation on your own development machine is required first.


## Steps on your own development machine

1. Clone this repository
2. Run `download.sh` under `thirdparty`
3. Enter `learning` and run the training (the procedure is described in [README.md](../README.md))
4. Compress the whole repository and copy it to your home directory on the ACRi room server

## Steps on the ACRi room server

Log in following the official instructions.
- Servers in general: http://gw.acri.c.titech.ac.jp/wp/manual/how-to-reserve
- Alveo servers: http://gw.acri.c.titech.ac.jp/wp/manual/alveo-server

Because GUI features are used to display the reports, using remote desktop is recommended.

### Preparation

The following assumes the compressed code is located at `/home/<username>/dnn-kernel-fpga.zip`.

First, copy the data onto `/scratch`, which is a fast local directory.
```
$ cp /home/<username>/dnn-kernel-fpga.zip /scratch
```

Move the working directory to `/scratch` and extract the copied file.
```
$ cd /scratch
$ unzip dnn-kernel-fpga.zip
```

Add cmake 3.16.8 to your PATH.
```
$ export PATH=/scratch/dnn-kernel-fpga/thirdparty/cmake-3.16.8-Linux-x86_64/bin:${PATH}
```

From here on, follow the build and inference steps described in [README.md](../README.md).
The MNIST training and test steps cannot be performed on the ACRi room servers.

--------------------------------------------------------------------------------
/host/CMakeLists.txt:
--------------------------------------------------------------------------------

# host
file(GLOB HOST_SRCS *.cc)

find_library(XRT_LIBRARIES NAMES xrt_core PATHS ${XILINX_XRT}/lib)

set(VITIS_INCLUDE_DIRS ${XILINX_XRT}/include ${XILINX_VIVADO}/include)
set(VITIS_LIBRARIES OpenCL pthread ${XRT_LIBRARIES})

add_executable(run_inference ${HOST_SRCS})
target_include_directories(run_inference PRIVATE ${DNNK_INCLUDE_DIRS} ${TORCH_INCLUDE_DIRS} ${VITIS_INCLUDE_DIRS})
target_link_libraries(run_inference PRIVATE ${TORCH_LIBRARIES} ${VITIS_LIBRARIES})
target_compile_options(run_inference PRIVATE "-DPROJECT_ROOT=\"${CMAKE_SOURCE_DIR}\"")
target_compile_features(run_inference PRIVATE cxx_std_14)

# xo
function (add_xo name top target sources platform)
  set(include_dirs ${DNNK_INCLUDE_DIRS})
  prepend_option("${include_dirs}" "-I" include_options)

  add_custom_target(
    ${name}_xo
    COMMAND v++ -g --compile --target ${target} --kernel ${top} --platform ${platform} --profile_kernel data:all:all:all --profile_kernel stall:all:all:all --temp_dir build_${name} --save-temps ${include_options} ${sources} -o ${name}.xo
    WORKING_DIRECTORY ${WORK_DIR}
  )
endfunction()

# xclbin
function (add_xclbin name top target ini_file platform)

  abs_path(${ini_file} abs_ini_file)

  set(top_func ${top})
  set(target_ini_file ${name}_${TARGET_BOARD}.ini)

  configure_file(${abs_ini_file} ${target_ini_file})

  add_custom_target(
    ${name}
    COMMAND v++ -g --link --target ${target} --platform ${platform} --config ${target_ini_file} --temp_dir build_${name} --save-temps ${name}.xo -o ${name}.xclbin
    WORKING_DIRECTORY ${WORK_DIR}
  )
endfunction()

function (add_xo_and_xclbin name top sources ini_file platform)

  add_xo(${name}_hw ${top} hw ${sources} ${platform})
  add_xo(${name}_hw_emu ${top} hw_emu ${sources} ${platform})
  add_xo(${name}_sw_emu ${top} sw_emu ${sources} ${platform})
  add_xclbin(${name}_hw ${top} hw ${ini_file} ${platform})
  add_xclbin(${name}_hw_emu ${top} hw_emu ${ini_file} ${platform})
  add_xclbin(${name}_sw_emu ${top} sw_emu ${ini_file} ${platform})
endfunction()

get_filename_component(inference_src ../tests/hls/inference/inference_hls.cc ABSOLUTE)

add_xo_and_xclbin(inference_top inference_top ${inference_src} link_${TARGET_BOARD}_template.ini ${VITIS_PLATFORM})
add_xo_and_xclbin(inference_dataflow inference_dataflow ${inference_src} link_${TARGET_BOARD}_template.ini ${VITIS_PLATFORM})
add_xo_and_xclbin(inference_with_local_buffer inference_with_local_buffer ${inference_src} link_${TARGET_BOARD}_template.ini ${VITIS_PLATFORM})
add_xo_and_xclbin(inference_pipelined_conv_v1 inference_pipelined_conv_v1 ${inference_src} link_${TARGET_BOARD}_template.ini ${VITIS_PLATFORM})
add_xo_and_xclbin(inference_pipelined_conv_v2 inference_pipelined_conv_v2 ${inference_src} link_${TARGET_BOARD}_template.ini ${VITIS_PLATFORM})
add_xo_and_xclbin(inference_unrolledx4_conv_v1 inference_unrolledx4_conv_v1 ${inference_src} link_${TARGET_BOARD}_template.ini ${VITIS_PLATFORM})
add_xo_and_xclbin(inference_unrolledx4_conv_v2 inference_unrolledx4_conv_v2 ${inference_src} link_${TARGET_BOARD}_template.ini ${VITIS_PLATFORM})
add_xo_and_xclbin(inference_final inference_final ${inference_src} link_${TARGET_BOARD}_template.ini ${VITIS_PLATFORM})

--------------------------------------------------------------------------------
/host/host_util.h:
--------------------------------------------------------------------------------
#ifndef DNNKERNEL_HOST_UTIL_H
#define DNNKERNEL_HOST_UTIL_H

#include <CL/cl2.hpp>
#include <chrono>
#include <cstdlib>
#include <fstream>
#include <new>
#include <string>
#include <vector>

namespace dnnk {

class ClHelper {
 public:
  ClHelper(const std::string& xclbin_name) {

    cl::Platform::get(&platforms_);
    for (std::size_t i = 0; i < platforms_.size(); i++) {
      cl::Platform& platform = platforms_[i];
      std::string platform_name = platform.getInfo<CL_PLATFORM_NAME>();

      if (platform_name == "Xilinx") {
        platform.getDevices(CL_DEVICE_TYPE_ACCELERATOR, &devices_);
        break;
      }
    }

    cl::Device device = devices_[0];

    context_ = cl::Context(device);

    auto xclbin = read_binary_file(xclbin_name);
    cl::Program::Binaries binaries;
    binaries.push_back(xclbin);

    program_ = cl::Program(context_, devices_, binaries);
  }

  cl::Program& get_program() {
    return program_;
  }

  cl::Context& get_context() {
    return context_;
  }

  cl::Device& get_device() {
    return devices_[0];
  }

 private:
  std::vector<unsigned char> read_binary_file(const std::string& filename) {
    std::vector<unsigned char> ret;
    std::ifstream ifs(filename, std::ifstream::binary);

    ifs.seekg(0, ifs.end);
    std::size_t size = ifs.tellg();
    ifs.seekg(0, ifs.beg);

    ret.resize(size);
    ifs.read(reinterpret_cast<char*>(ret.data()), ret.size());

    return ret;
  }

  std::vector<cl::Platform> platforms_;
  std::vector<cl::Device> devices_;
  cl::Context context_;
  cl::Program program_;
};


template <typename T>
class aligned_allocator {
 public:
  using value_type = T;

  aligned_allocator() = default;

  template <typename U>
  constexpr aligned_allocator(const aligned_allocator<U>&) noexcept {}

  T* allocate(std::size_t size) {
    void* ptr = nullptr;

    // XRT expects 4096-byte-aligned host buffers for zero-copy transfers
    if (posix_memalign(&ptr, 4096, size * sizeof(T))) {
      throw std::bad_alloc();
    }

    return reinterpret_cast<T*>(ptr);
  }

  void deallocate(T* ptr, std::size_t size) {
    free(ptr);
  }
};

template <typename T>
using aligned_vector = std::vector<T, aligned_allocator<T>>;


class StopWatch {
 public:
  StopWatch() = default;

  void start() {
    tstart_ = clock::now();
  }

  void stop() {
    tstop_ = clock::now();
  }

  double elapsed_time_ms() const {
    auto elapsed_micro = std::chrono::duration_cast<std::chrono::microseconds>(tstop_ - tstart_).count();
    return elapsed_micro / 1000.0;
  }

 private:
  using clock = std::chrono::high_resolution_clock;
  using time_point = std::chrono::time_point<clock>;

  time_point tstart_;
  time_point tstop_;
};

}  // namespace dnnk

#endif  // DNNKERNEL_HOST_UTIL_H
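A brief note on how these helpers compose (a usage sketch, not code from the repository; the xclbin path and kernel name below are placeholders):

```cpp
// Usage sketch for the helpers above; error handling omitted for brevity.
#include "host_util.h"

int main() {
  dnnk::ClHelper helper("inference_top_hw.xclbin");  // selects the Xilinx platform
  cl::CommandQueue queue(helper.get_context(), helper.get_device());
  cl::Kernel kernel(helper.get_program(), "inference_top");

  // aligned_vector provides 4096-byte alignment, which lets XRT use the host
  // pointer directly (CL_MEM_USE_HOST_PTR) instead of making an extra copy.
  dnnk::aligned_vector<float> host_buf(1024, 0.0f);
  cl::Buffer dev_buf(helper.get_context(), CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
                     host_buf.size() * sizeof(float), host_buf.data());

  dnnk::StopWatch sw;
  sw.start();
  queue.enqueueMigrateMemObjects({dev_buf}, 0);  // host -> device transfer
  queue.finish();
  sw.stop();
  // sw.elapsed_time_ms() now holds the transfer time in milliseconds
  return 0;
}
```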
--------------------------------------------------------------------------------
/host/link_u200_template.ini:
--------------------------------------------------------------------------------
# Debug
dk=chipscope:${top_func}_1:M_AXI_GMEM0
dk=chipscope:${top_func}_1:M_AXI_GMEM1
dk=chipscope:${top_func}_1:M_AXI_GMEM2
dk=chipscope:${top_func}_1:M_AXI_GMEM3
dk=chipscope:${top_func}_1:M_AXI_GMEM4
dk=chipscope:${top_func}_1:M_AXI_GMEM5
dk=chipscope:${top_func}_1:M_AXI_GMEM6
dk=chipscope:${top_func}_1:M_AXI_GMEM7
dk=chipscope:${top_func}_1:M_AXI_GMEM8
dk=chipscope:${top_func}_1:M_AXI_GMEM9
dk=chipscope:${top_func}_1:S_AXI_CONTROL
dk=protocol:${top_func}_1:M_AXI_GMEM0
dk=protocol:${top_func}_1:M_AXI_GMEM1
dk=protocol:${top_func}_1:M_AXI_GMEM2
dk=protocol:${top_func}_1:M_AXI_GMEM3
dk=protocol:${top_func}_1:M_AXI_GMEM4
dk=protocol:${top_func}_1:M_AXI_GMEM5
dk=protocol:${top_func}_1:M_AXI_GMEM6
dk=protocol:${top_func}_1:M_AXI_GMEM7
dk=protocol:${top_func}_1:M_AXI_GMEM8
dk=protocol:${top_func}_1:M_AXI_GMEM9
dk=protocol:${top_func}_1:S_AXI_CONTROL

# Profile
profile_kernel=stall:all:all:all
profile_kernel=data:${top_func}:${top_func}_1:x:all
profile_kernel=data:${top_func}:${top_func}_1:weight0:all
profile_kernel=data:${top_func}:${top_func}_1:bias0:all
profile_kernel=data:${top_func}:${top_func}_1:weight1:all
profile_kernel=data:${top_func}:${top_func}_1:bias1:all
profile_kernel=data:${top_func}:${top_func}_1:weight2:all
profile_kernel=data:${top_func}:${top_func}_1:bias2:all
profile_kernel=data:${top_func}:${top_func}_1:weight3:all
profile_kernel=data:${top_func}:${top_func}_1:bias3:all
profile_kernel=data:${top_func}:${top_func}_1:y:all

[connectivity]
nk=${top_func}:1:${top_func}_1
sp=${top_func}_1.x:DDR[0]
sp=${top_func}_1.weight0:DDR[0]
sp=${top_func}_1.bias0:DDR[0]
sp=${top_func}_1.weight1:DDR[0]
sp=${top_func}_1.bias1:DDR[0]
sp=${top_func}_1.weight2:DDR[0]
sp=${top_func}_1.bias2:DDR[0]
sp=${top_func}_1.weight3:DDR[0]
sp=${top_func}_1.bias3:DDR[0]
sp=${top_func}_1.y:DDR[0]

--------------------------------------------------------------------------------
/host/link_u50_template.ini:
--------------------------------------------------------------------------------
# Debug
dk=chipscope:${top_func}_1:M_AXI_GMEM0
dk=chipscope:${top_func}_1:M_AXI_GMEM1
dk=chipscope:${top_func}_1:M_AXI_GMEM2
dk=chipscope:${top_func}_1:M_AXI_GMEM3
dk=chipscope:${top_func}_1:M_AXI_GMEM4
dk=chipscope:${top_func}_1:M_AXI_GMEM5
dk=chipscope:${top_func}_1:M_AXI_GMEM6
dk=chipscope:${top_func}_1:M_AXI_GMEM7
dk=chipscope:${top_func}_1:M_AXI_GMEM8
dk=chipscope:${top_func}_1:M_AXI_GMEM9
dk=chipscope:${top_func}_1:S_AXI_CONTROL
dk=protocol:${top_func}_1:M_AXI_GMEM0
dk=protocol:${top_func}_1:M_AXI_GMEM1
dk=protocol:${top_func}_1:M_AXI_GMEM2
dk=protocol:${top_func}_1:M_AXI_GMEM3
dk=protocol:${top_func}_1:M_AXI_GMEM4
dk=protocol:${top_func}_1:M_AXI_GMEM5
dk=protocol:${top_func}_1:M_AXI_GMEM6
dk=protocol:${top_func}_1:M_AXI_GMEM7
dk=protocol:${top_func}_1:M_AXI_GMEM8
dk=protocol:${top_func}_1:M_AXI_GMEM9
dk=protocol:${top_func}_1:S_AXI_CONTROL

# Profile
profile_kernel=stall:all:all:all
profile_kernel=data:${top_func}:${top_func}_1:x:all
profile_kernel=data:${top_func}:${top_func}_1:weight0:all
profile_kernel=data:${top_func}:${top_func}_1:bias0:all
profile_kernel=data:${top_func}:${top_func}_1:weight1:all
profile_kernel=data:${top_func}:${top_func}_1:bias1:all
profile_kernel=data:${top_func}:${top_func}_1:weight2:all
profile_kernel=data:${top_func}:${top_func}_1:bias2:all
profile_kernel=data:${top_func}:${top_func}_1:weight3:all
profile_kernel=data:${top_func}:${top_func}_1:bias3:all
profile_kernel=data:${top_func}:${top_func}_1:y:all

[connectivity]
nk=${top_func}:1:${top_func}_1
sp=${top_func}_1.x:HBM[0]
sp=${top_func}_1.weight0:HBM[0]
sp=${top_func}_1.bias0:HBM[0]
sp=${top_func}_1.weight1:HBM[0]
sp=${top_func}_1.bias1:HBM[0]
sp=${top_func}_1.weight2:HBM[0]
sp=${top_func}_1.bias2:HBM[0]
sp=${top_func}_1.weight3:HBM[0]
sp=${top_func}_1.bias3:HBM[0]
sp=${top_func}_1.y:HBM[0]

--------------------------------------------------------------------------------
/host/run_inference.cc:
--------------------------------------------------------------------------------

#define CL_HPP_CL_1_2_DEFAULT_BUILD
#define CL_HPP_TARGET_OPENCL_VERSION 120
#define CL_HPP_MINIMUM_OPENCL_VERSION 120

#include "host_util.h"

#include <algorithm>
#include <array>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <map>
#include <stdexcept>
#include <string>
#include <vector>

#include <torch/script.h>
#include <torch/torch.h>


static const std::size_t kNumTestImages = 1000;


void setup_parameters(cl::Context& context,
                      cl::CommandQueue& queue,
                      cl::Kernel& kernel,
                      std::map<std::string, cl::Buffer>& buf_params) {

  std::vector<std::string> kernel_args = {
      "-",
      "conv1.weight",
      "conv1.bias",
      "conv2.weight",
      "conv2.bias",
      "fc1.weight",
      "fc1.bias",
      "fc2.weight",
      "fc2.bias",
  };

  // load model file
  auto model = torch::jit::load(PROJECT_ROOT "/learning/traced_model.pt");

  // load parameter values from model and copy to the device memory
  for (const auto& param_ref : model.named_parameters()) {

    dnnk::aligned_vector<float> host_buf(param_ref.value.numel());

    float* ptr = param_ref.value.data_ptr<float>();
    std::copy(ptr, ptr + host_buf.size(), host_buf.begin());

    // use param_ref.name as key (ex: "conv1.weight"), and initialize device buffer
    {
      cl::Buffer buf(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, host_buf.size() * sizeof(float), host_buf.data(), nullptr);
      buf_params[param_ref.name] = std::move(buf);
    }

    // set kernel argument
    auto index = std::distance(kernel_args.begin(), std::find(kernel_args.begin(), kernel_args.end(), param_ref.name));
    if (index == static_cast<decltype(index)>(kernel_args.size())) {
      throw std::runtime_error("Unknown parameter name: " + param_ref.name);
    }
    kernel.setArg(index, buf_params[param_ref.name]);

    // copy parameter data into the device buffer
    queue.enqueueMigrateMemObjects({buf_params[param_ref.name]}, 0);
    queue.finish();
  }
}

void setup_inouts(cl::Context& context,
                  cl::CommandQueue& queue,
                  cl::Kernel& kernel,
                  std::vector<cl::Buffer>& buf_x,
                  std::vector<cl::Buffer>& buf_y,
                  std::vector<long>& answers) {
  // read MNIST dataset
  auto dataset = torch::data::datasets::MNIST(PROJECT_ROOT "/learning/data/MNIST/raw")
                     .map(torch::data::transforms::Stack<>());

  // define loader and set batch_size to 1
  auto data_loader =
      torch::data::make_data_loader(std::move(dataset),
                                    torch::data::DataLoaderOptions().batch_size(1));

  // create reference data
  std::size_t num_iter = 0;
  for (auto& batch : *data_loader) {
    auto& x_ref = batch.data;
    auto& y_ref = batch.target;

    auto x_size = x_ref.numel() * sizeof(float);
    auto y_size = 10 * sizeof(float);

    dnnk::aligned_vector<float> host_buf(x_ref.numel());
    float* x_ptr = x_ref.data_ptr<float>();
    std::copy(x_ptr, x_ptr + host_buf.size(), host_buf.begin());

    buf_x.emplace_back(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, x_size, host_buf.data(), nullptr);
    buf_y.emplace_back(context, CL_MEM_WRITE_ONLY, y_size);
    answers.push_back(*(y_ref.data_ptr<long>()));

    // copy to device
    cl::Buffer& target = buf_x[buf_x.size() - 1];
    kernel.setArg(0, target);
    queue.enqueueMigrateMemObjects({target}, 0);
    queue.finish();

    if (++num_iter == kNumTestImages) {
      break;
    }
  }
}


int main(int argc, char* argv[]) {
  if (argc != 3 && argc != 4) {
    printf("Usage: %s <xclbin_path> <kernel_name> [enable_OoO]\n", argv[0]);
    return 0;
  }

  dnnk::ClHelper clhelper(argv[1]);
  std::string kernel_name(argv[2]);
  bool enable_OoO = (argc == 3) ? false : (std::atoi(argv[3]) != 0);

  auto device = clhelper.get_device();
  auto context = clhelper.get_context();
  auto program = clhelper.get_program();

  auto queue_flag = (enable_OoO) ? (CL_QUEUE_PROFILING_ENABLE | CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE)
                                 : CL_QUEUE_PROFILING_ENABLE;
  cl::CommandQueue queue(context, device, queue_flag);

  // create kernel object
  cl::Kernel kernel(program, kernel_name.c_str());

  // define device buffer
  std::vector<cl::Buffer> buf_x;
  std::map<std::string, cl::Buffer> buf_params;
  std::vector<cl::Buffer> buf_y;

  // MNIST answers
  std::vector<long> answers;

  // setup device buffers
  setup_parameters(context, queue, kernel, buf_params);
  setup_inouts(context, queue, kernel, buf_x, buf_y, answers);

  // run
  dnnk::StopWatch sw;
  sw.start();
  for (std::size_t i = 0; i < buf_x.size(); i++) {
    kernel.setArg(0, buf_x[i]);
    kernel.setArg(9, buf_y[i]);

    queue.enqueueTask(kernel);
  }
  queue.finish();
  sw.stop();

  std::cout << "Elapsed time: " << sw.elapsed_time_ms() / kNumTestImages << " [ms/image]" << std::endl;

  // get results from device buffer
  std::vector<std::array<float, 10>> results(buf_x.size());
  for (std::size_t i = 0; i < results.size(); i++) {
    queue.enqueueReadBuffer(buf_y[i], false, 0, results[i].size() * sizeof(float), results[i].data());
  }
  queue.finish();

  // report
  auto argmax = [](const std::array<float, 10>& vec) {
    return std::distance(vec.begin(), std::max_element(vec.begin(), vec.end()));
  };

  std::size_t num_corrects = 0;
  for (std::size_t i = 0; i < results.size(); i++) {
    if (argmax(results[i]) == answers[i]) {
      num_corrects++;
    }
  }

  std::cout << "accuracy: " << double(num_corrects) / results.size() << std::endl;

  return 0;
}
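The host program synchronizes with `queue.finish()`, which is valid for both queue modes; with `CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE` the runtime is free to overlap the enqueued tasks, which is the point of the flag. If finer-grained ordering were ever needed on an out-of-order queue, standard OpenCL events would express it. A sketch (names follow the code above; this is not part of the original program):

```cpp
// Sketch: explicit dependency on an out-of-order queue via events.
cl::Event run_done;
queue.enqueueTask(kernel, nullptr, &run_done);  // launch, capture completion event

// Read back y only after this particular launch has finished.
std::vector<cl::Event> deps = {run_done};
queue.enqueueReadBuffer(buf_y[0], CL_FALSE, 0, 10 * sizeof(float),
                        results[0].data(), &deps);
queue.finish();
```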
--------------------------------------------------------------------------------
/include/dnn-kernel/conv2d.h:
--------------------------------------------------------------------------------
#ifndef DNNKERNEL_CONV2D_H
#define DNNKERNEL_CONV2D_H

#include <cstddef>
#include <cstdint>

namespace dnnk {

static void conv2d(const float* x, const float* weight, const float* bias, int32_t width, int32_t height,
                   int32_t in_channels, int32_t out_channels, int32_t ksize, float* y) {
  for (int32_t och = 0; och < out_channels; ++och) {
    for (int32_t h = 0; h < height; ++h) {
      for (int32_t w = 0; w < width; ++w) {
        float sum = 0.f;

        for (int32_t ich = 0; ich < in_channels; ++ich) {
          for (int32_t kh = 0; kh < ksize; ++kh) {
            for (int32_t kw = 0; kw < ksize; ++kw) {
              int32_t ph = h + kh - ksize / 2;
              int32_t pw = w + kw - ksize / 2;

              // zero padding
              if (ph < 0 || ph >= height || pw < 0 || pw >= width) {
                continue;
              }

              int64_t pix_idx = (ich * height + ph) * width + pw;
              int64_t weight_idx = ((och * in_channels + ich) * ksize + kh) * ksize + kw;

              sum += x[pix_idx] * weight[weight_idx];
            }
          }
        }

        // add bias
        sum += bias[och];

        y[(och * height + h) * width + w] = sum;
      }
    }
  }
}


static void conv2d_pipelined_v1(const float* x, const float* weight, const float* bias, int32_t width, int32_t height,
                                int32_t in_channels, int32_t out_channels, int32_t ksize, float* y) {
  for (int32_t och = 0; och < out_channels; ++och) {
    for (int32_t h = 0; h < height; ++h) {
      for (int32_t w = 0; w < width; ++w) {
        float sum = 0.f;

        for (int32_t ich = 0; ich < in_channels; ++ich) {
          for (int32_t kh = 0; kh < ksize; ++kh) {
            for (int32_t kw = 0; kw < ksize; ++kw) {
#pragma HLS pipeline II=1

              int32_t ph = h + kh - ksize / 2;
              int32_t pw = w + kw - ksize / 2;

              // zero padding
              if (ph < 0 || ph >= height || pw < 0 || pw >= width) {
                continue;
              }

              int64_t pix_idx = (ich * height + ph) * width + pw;
              int64_t weight_idx = ((och * in_channels + ich) * ksize + kh) * ksize + kw;

              sum += x[pix_idx] * weight[weight_idx];
            }
          }
        }

        // add bias
        sum += bias[och];

        y[(och * height + h) * width + w] = sum;
      }
    }
  }
}

static void conv2d_pipelined_v2(const float* x, const float* weight, const float* bias, int32_t width, int32_t height,
                                int32_t in_channels, int32_t out_channels, int32_t ksize, float* y) {
  static const int kShiftRegLength = 4;

  for (int32_t och = 0; och < out_channels; ++och) {
    for (int32_t h = 0; h < height; ++h) {
      for (int32_t w = 0; w < width; ++w) {
        float shift_reg[kShiftRegLength + 1];
#pragma HLS array_partition variable=shift_reg complete

        int32_t glob_idx = 0;
        for (int32_t ich = 0; ich < in_channels; ++ich) {
          for (int32_t kh = 0; kh < ksize; ++kh) {
            for (int32_t kw = 0; kw < ksize; ++kw) {
#pragma HLS pipeline II=1

              int32_t ph = h + kh - ksize / 2;
              int32_t pw = w + kw - ksize / 2;

              // zero padding
              if (ph < 0 || ph >= height || pw < 0 || pw >= width) {
                continue;
              }

              int64_t pix_idx = (ich * height + ph) * width + pw;
              int64_t weight_idx = ((och * in_channels + ich) * ksize + kh) * ksize + kw;

              float mul = x[pix_idx] * weight[weight_idx];

              // local sum
              for (int i = 0; i < kShiftRegLength; ++i) {
                if (i == 0) {
                  if (glob_idx < kShiftRegLength) {
                    shift_reg[kShiftRegLength] = mul;
                  } else {
                    shift_reg[kShiftRegLength] = shift_reg[0] + mul;
                  }
                }

                shift_reg[i] = shift_reg[i + 1];
              }

              ++glob_idx;
            }
          }
        }

        // global sum
        float sum = 0.f;
        for (int i = 0; i < kShiftRegLength; ++i) {
#pragma HLS pipeline II=1
          sum += shift_reg[i];
        }

        // add bias
        sum += bias[och];

        y[(och * height + h) * width + w] = sum;
      }
    }
  }
}


template <int UNROLL_X>
static void conv2d_unrolled_v1(const float* x, const float* weight, const float* bias, int32_t width, int32_t height,
                               int32_t in_channels, int32_t out_channels, int32_t ksize, float* y) {

  for (int32_t och = 0; och < out_channels; ++och) {
    for (int32_t h = 0; h < height; ++h) {
      for (int32_t block_w = 0; block_w < width; block_w += UNROLL_X) {
        float sum[UNROLL_X];
#pragma HLS array_partition variable=sum complete

        for (int32_t ich = 0; ich < in_channels; ++ich) {
          for (int32_t kh = 0; kh < ksize; ++kh) {
            for (int32_t kw = 0; kw < ksize; ++kw) {
#pragma HLS pipeline II=4
              for (int local_w = 0; local_w < UNROLL_X; local_w++) {
#pragma HLS unroll
                if (block_w + local_w < width) {

                  int32_t w = block_w + local_w;

                  int32_t ph = h + kh - ksize / 2;
                  int32_t pw = w + kw - ksize / 2;

                  float last = (ich == 0 && kh == 0 && kw == 0) ? 0 : sum[local_w];

                  // zero padding
                  if (ph < 0 || ph >= height || pw < 0 || pw >= width) {
                    sum[local_w] = last;
                    continue;
                  }

                  int64_t pix_idx = (ich * height + ph) * width + pw;
                  int64_t weight_idx = ((och * in_channels + ich) * ksize + kh) * ksize + kw;

                  sum[local_w] = last + x[pix_idx] * weight[weight_idx];
                }
              }
            }
          }
        }

        for (int local_w = 0; local_w < UNROLL_X; local_w++) {
#pragma HLS unroll
          if (block_w + local_w < width) {

            int32_t w = block_w + local_w;

            // add bias
            y[(och * height + h) * width + w] = sum[local_w] + bias[och];
          }
        }
      }
    }
  }
}


template <int UNROLL_OCH, int UNROLL_X>
static void conv2d_unrolled_v2(const float* x, const float* weight, const float* bias, int32_t width, int32_t height,
                               int32_t in_channels, int32_t out_channels, int32_t ksize, float* y) {

  for (int32_t block_och = 0; block_och < out_channels; block_och += UNROLL_OCH) {
    for (int32_t h = 0; h < height; ++h) {
      for (int32_t block_w = 0; block_w < width; block_w += UNROLL_X) {
        float sum[UNROLL_OCH][UNROLL_X];
#pragma HLS array_partition variable=sum complete dim=0

        for (int32_t ich = 0; ich < in_channels; ++ich) {
          for (int32_t kh = 0; kh < ksize; ++kh) {
            for (int32_t kw = 0; kw < ksize; ++kw) {
#pragma HLS pipeline II=4
              for (int local_och = 0; local_och < UNROLL_OCH; local_och++) {
#pragma HLS unroll
                for (int local_w = 0; local_w < UNROLL_X; local_w++) {
#pragma HLS unroll
                  if (block_w + local_w < width && block_och + local_och < out_channels) {

                    int32_t och = block_och + local_och;
                    int32_t w = block_w + local_w;

                    int32_t ph = h + kh - ksize / 2;
                    int32_t pw = w + kw - ksize / 2;

                    float last = (ich == 0 && kh == 0 && kw == 0) ? 0 : sum[local_och][local_w];

                    // zero padding
                    if (ph < 0 || ph >= height || pw < 0 || pw >= width) {
                      sum[local_och][local_w] = last;
                      continue;
                    }

                    int64_t pix_idx = (ich * height + ph) * width + pw;
                    int64_t weight_idx = ((och * in_channels + ich) * ksize + kh) * ksize + kw;

                    sum[local_och][local_w] = last + x[pix_idx] * weight[weight_idx];
                  }
                }
              }
            }
          }
        }

        for (int local_och = 0; local_och < UNROLL_OCH; local_och++) {
#pragma HLS unroll
          for (int local_w = 0; local_w < UNROLL_X; local_w++) {
#pragma HLS unroll
            if (block_w + local_w < width && block_och + local_och < out_channels) {
              int32_t och = block_och + local_och;
              int32_t w = block_w + local_w;

              // add bias
              y[(och * height + h) * width + w] = sum[local_och][local_w] + bias[och];
            }
          }
        }
      }
    }
  }
}

}  // namespace dnnk

#endif  // DNNKERNEL_CONV2D_H
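The shift register in `conv2d_pipelined_v2` exists to hide floating-point adder latency: each product is added to the partial sum that left the register `kShiftRegLength` iterations earlier, so consecutive loop iterations no longer depend on each other's accumulator and the inner loop can reach II=1. A self-contained host-side sketch of the same reduction scheme (illustrative only; assumes at least `kDepth` inputs):

```cpp
#include <cstdio>

// Accumulate n values through a 4-deep shift register, then reduce.
// Mirrors the scheme used in conv2d_pipelined_v2; assumes n >= kDepth.
float shift_register_sum(const float* vals, int n) {
  const int kDepth = 4;
  float reg[kDepth + 1] = {};  // reg[kDepth] is the insertion slot

  for (int i = 0; i < n; ++i) {
    // The new partial sum reuses a value produced kDepth iterations ago,
    // so there is no tight dependency on the previous iteration's result.
    reg[kDepth] = (i < kDepth) ? vals[i] : reg[0] + vals[i];
    for (int j = 0; j < kDepth; ++j) {
      reg[j] = reg[j + 1];
    }
  }

  // Final reduction of the kDepth independent partial sums.
  float sum = 0.f;
  for (int j = 0; j < kDepth; ++j) {
    sum += reg[j];
  }
  return sum;
}

int main() {
  float v[7] = {1, 2, 3, 4, 5, 6, 7};
  std::printf("%f\n", shift_register_sum(v, 7));  // prints 28.000000
  return 0;
}
```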
--------------------------------------------------------------------------------
/include/dnn-kernel/inference.h:
--------------------------------------------------------------------------------
#ifndef DNNKERNEL_INFERENCE_H
#define DNNKERNEL_INFERENCE_H

#include "conv2d.h"
#include "maxpool2d.h"
#include "relu.h"
#include "linear.h"

#include <cstddef>
#include <cstdint>

namespace dnnk {

template <typename CONV_FUNC, typename RELU_FUNC, typename MAXPOOL_FUNC, typename LINEAR_FUNC>
static void inference_custom(const float* x,
                             const float* weight0, const float* bias0,
                             const float* weight1, const float* bias1,
                             const float* weight2, const float* bias2,
                             const float* weight3, const float* bias3,
                             float* y,
                             CONV_FUNC* conv1_f,
                             RELU_FUNC* relu1_f,
                             MAXPOOL_FUNC* maxpool1_f,
                             CONV_FUNC* conv2_f,
                             RELU_FUNC* relu2_f,
                             MAXPOOL_FUNC* maxpool2_f,
                             LINEAR_FUNC* linear1_f,
                             RELU_FUNC* relu3_f,
                             LINEAR_FUNC* linear2_f) {
#pragma HLS inline

  static const int kWidths[] = {28, 14, 7};
  static const int kHeights[] = {28, 14, 7};
  static const int kChannels[] = {1, 4, 8, 32, 10};

  float x1[kWidths[0] * kHeights[0] * kChannels[1]];
  float x2[kWidths[0] * kHeights[0] * kChannels[1]];
  float x3[kWidths[1] * kHeights[1] * kChannels[1]];
  float x4[kWidths[1] * kHeights[1] * kChannels[2]];
  float x5[kWidths[1] * kHeights[1] * kChannels[2]];
  float x6[kWidths[2] * kHeights[2] * kChannels[2]];
  float x7[kChannels[3]];
  float x8[kChannels[3]];

  // 1st layer
  conv1_f(x, weight0, bias0, kWidths[0], kHeights[0], kChannels[0], kChannels[1], 3, x1);
  relu1_f(x1, kWidths[0] * kHeights[0] * kChannels[1], x2);
  maxpool1_f(x2, kWidths[0], kHeights[0], kChannels[1], 2, x3);

  // 2nd layer
  conv2_f(x3, weight1, bias1, kWidths[1], kHeights[1], kChannels[1], kChannels[2], 3, x4);
  relu2_f(x4, kWidths[1] * kHeights[1] * kChannels[2], x5);
  maxpool2_f(x5, kWidths[1], kHeights[1], kChannels[2], 2, x6);

  // 3rd layer
  linear1_f(x6, weight2, bias2, kWidths[2] * kHeights[2] * kChannels[2], kChannels[3], x7);
  relu3_f(x7, kChannels[3], x8);

  // 4th layer
  linear2_f(x8, weight3, bias3, kChannels[3], kChannels[4], y);
}

template <typename CONV_FUNC, typename MAXPOOL_FUNC, typename RELU_FUNC, typename LINEAR_FUNC>
static void inference_custom(const float* x,
                             const float* weight0, const float* bias0,
                             const float* weight1, const float* bias1,
                             const float* weight2, const float* bias2,
                             const float* weight3, const float* bias3,
                             float* y,
                             CONV_FUNC* conv2d_f,
                             MAXPOOL_FUNC* maxpool2d_f,
                             RELU_FUNC* relu_f,
                             LINEAR_FUNC* linear_f) {
#pragma HLS inline
  inference_custom(x,
                   weight0, bias0,
                   weight1, bias1,
                   weight2, bias2,
                   weight3, bias3,
                   y,
                   conv2d_f, relu_f, maxpool2d_f,
                   conv2d_f, relu_f, maxpool2d_f,
                   linear_f, relu_f,
                   linear_f);
}

static void inference(const float* x,
                      const float* weight0, const float* bias0,
                      const float* weight1, const float* bias1,
                      const float* weight2, const float* bias2,
                      const float* weight3, const float* bias3,
                      float* y) {
#pragma HLS inline

  inference_custom(x,
                   weight0, bias0,
                   weight1, bias1,
                   weight2, bias2,
                   weight3, bias3,
                   y,
                   conv2d, maxpool2d, relu, linear);
}

}  // namespace dnnk

#endif  // DNNKERNEL_INFERENCE_H
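`inference_custom` is the extension point behind the optimized variants built in `host/CMakeLists.txt`: each variant passes different kernel implementations for the individual layers. As a hypothetical illustration (the actual wrappers live in `tests/hls/inference/inference_hls.cc`, which also adds the HLS interface pragmas), a variant that swaps in the pipelined convolution might look like this:

```cpp
// Hypothetical variant sketch: same network, pipelined convolutions.
static void inference_pipelined_sketch(const float* x,
                                       const float* weight0, const float* bias0,
                                       const float* weight1, const float* bias1,
                                       const float* weight2, const float* bias2,
                                       const float* weight3, const float* bias3,
                                       float* y) {
  dnnk::inference_custom(x,
                         weight0, bias0,
                         weight1, bias1,
                         weight2, bias2,
                         weight3, bias3,
                         y,
                         dnnk::conv2d_pipelined_v1,  // used for both conv layers
                         dnnk::maxpool2d,
                         dnnk::relu,
                         dnnk::linear);
}
```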
--------------------------------------------------------------------------------
/include/dnn-kernel/linear.h:
--------------------------------------------------------------------------------
#ifndef DNNKERNEL_LINEAR_H
#define DNNKERNEL_LINEAR_H

#include <cstddef>
#include <cstdint>

namespace dnnk {

static void linear(const float* x, const float* weight, const float* bias, int64_t in_features, int64_t out_features, float* y) {
  for (int64_t i = 0; i < out_features; ++i) {
    float sum = 0.f;
    for (int64_t j = 0; j < in_features; ++j) {
      sum += x[j] * weight[i * in_features + j];
    }
    y[i] = sum + bias[i];
  }
}

template <int UNROLL_OCH>
static void linear_opt(const float* x, const float* weight, const float* bias, int64_t in_features, int64_t out_features, float* y) {

  for (int64_t block_i = 0; block_i < out_features; block_i += UNROLL_OCH) {
    float sum[UNROLL_OCH];
#pragma HLS array_partition variable=sum complete

    for (int64_t j = 0; j < in_features; ++j) {
#pragma HLS pipeline II=1
      for (int64_t local_i = 0; local_i < UNROLL_OCH; local_i++) {
#pragma HLS unroll
        int64_t i = block_i + local_i;
        if (i < out_features) {
          float last = (j == 0) ? 0 : sum[local_i];
          sum[local_i] = last + x[j] * weight[i * in_features + j];
        }
      }
    }

    for (int64_t local_i = 0; local_i < UNROLL_OCH; local_i++) {
#pragma HLS unroll
      int64_t i = block_i + local_i;
      if (i < out_features) {
        y[i] = sum[local_i] + bias[i];
      }
    }
  }
}

}  // namespace dnnk

#endif  // DNNKERNEL_LINEAR_H
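For a fixed output `i`, `linear_opt` walks `j` in the same order as the reference `linear`, so the two accumulate in the same order and should agree exactly, not merely within a tolerance. A quick host-side check (a sketch, not one of the project's tests):

```cpp
#include <cstdio>
#include <vector>

#include "dnn-kernel/linear.h"

// Compare linear_opt<4> against the reference linear on small random-ish data.
int main() {
  const int64_t in_f = 49, out_f = 10;
  std::vector<float> x(in_f), w(out_f * in_f), b(out_f);
  for (int64_t i = 0; i < in_f; ++i) x[i] = 0.01f * i;
  for (int64_t i = 0; i < out_f * in_f; ++i) w[i] = 0.001f * (i % 17);
  for (int64_t i = 0; i < out_f; ++i) b[i] = 0.1f * i;

  std::vector<float> y_ref(out_f), y_opt(out_f);
  dnnk::linear(x.data(), w.data(), b.data(), in_f, out_f, y_ref.data());
  dnnk::linear_opt<4>(x.data(), w.data(), b.data(), in_f, out_f, y_opt.data());

  for (int64_t i = 0; i < out_f; ++i) {
    if (y_ref[i] != y_opt[i]) {  // same accumulation order -> exact match
      std::printf("mismatch at %lld\n", (long long)i);
      return 1;
    }
  }
  std::printf("OK\n");
  return 0;
}
```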
--------------------------------------------------------------------------------
/include/dnn-kernel/maxpool2d.h:
--------------------------------------------------------------------------------
#ifndef DNNKERNEL_MAXPOOL2D_H
#define DNNKERNEL_MAXPOOL2D_H

#include <algorithm>
#include <cfloat>
#include <cstddef>
#include <cstdint>

namespace dnnk {

static void maxpool2d(const float* x, int32_t width, int32_t height, int32_t channels, int32_t stride, float* y) {
  for (int ch = 0; ch < channels; ++ch) {
    for (int32_t h = 0; h < height; h += stride) {
      for (int32_t w = 0; w < width; w += stride) {
        float maxval = -FLT_MAX;

        for (int bh = 0; bh < stride; ++bh) {
          for (int bw = 0; bw < stride; ++bw) {
            maxval = std::max(maxval, x[(ch * height + h + bh) * width + w + bw]);
          }
        }

        y[(ch * (height / stride) + (h / stride)) * (width / stride) + w / stride] = maxval;
      }
    }
  }
}

}  // namespace dnnk

#endif  // DNNKERNEL_MAXPOOL2D_H

--------------------------------------------------------------------------------
/include/dnn-kernel/relu.h:
--------------------------------------------------------------------------------
#ifndef DNNKERNEL_RELU_H
#define DNNKERNEL_RELU_H

#include <algorithm>
#include <cstdint>

namespace dnnk {

void relu(const float* x, int64_t size, float* y) {
  for (int64_t i = 0; i < size; ++i) {
    y[i] = std::max(x[i], .0f);
  }
}

}  // namespace dnnk

#endif  // DNNKERNEL_RELU_H

--------------------------------------------------------------------------------
/learning/requirements.txt:
--------------------------------------------------------------------------------
joblib==0.14.1
numpy==1.18.5
Pillow==7.1.2
scikit-learn==0.22.2.post1
scipy==1.4.1
six==1.15.0
sklearn==0.0
torch==1.4.0
torchvision==0.5.0

--------------------------------------------------------------------------------
/learning/train_mnist.py:
--------------------------------------------------------------------------------

import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
from sklearn.metrics import accuracy_score, confusion_matrix


# 1. Define the network model
class Net(nn.Module):
    def __init__(self, num_output_classes=10):
        super(Net, self).__init__()

        # The input is a 28x28 grayscale image (1 channel).
        # Apply a convolution that produces 4 output channels.
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=4, kernel_size=3, padding=1)

        # The activation function is ReLU
        self.relu1 = nn.ReLU(inplace=True)

        # Shrink the image from 28x28 to 14x14
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)

        # 4ch -> 8ch, 14x14 -> 7x7
        self.conv2 = nn.Conv2d(in_channels=4, out_channels=8, kernel_size=3, padding=1)
        self.relu2 = nn.ReLU(inplace=True)
        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)

        # Fully connected layer:
        # treat the 8-channel 7x7 image as a single vector and reduce it to a
        # 32-element vector
        self.fc1 = nn.Linear(8 * 7 * 7, 32)
        self.relu3 = nn.ReLU(inplace=True)

        # Second fully connected layer:
        # reduce to the number of output classes
        self.fc2 = nn.Linear(32, num_output_classes)

    def forward(self, x):
        # First convolution layer
        # The activation function is ReLU
        x = self.conv1(x)
        x = self.relu1(x)

        # Downsampling
        x = self.pool1(x)

        # Second layer + downsampling
        x = self.conv2(x)
        x = self.relu2(x)
        x = self.pool2(x)

        # Reshape (Batch, Ch, Height, Width) -> (Batch, Ch * Height * Width)
        x = x.view(x.shape[0], -1)

        # Fully connected layers
        x = self.fc1(x)
        x = self.relu3(x)
        x = self.fc2(x)

        return x


net = Net()

# 2. Define how the dataset is read
# Fetch the MNIST training and test data
trainset = torchvision.datasets.MNIST(root='./data', train=True, download=True, transform=transforms.ToTensor())
testset = torchvision.datasets.MNIST(root='./data', train=False, download=True, transform=transforms.ToTensor())

# Define the data loaders
# Read 16 images per training/test step
trainloader = torch.utils.data.DataLoader(trainset, batch_size=16, shuffle=True)
testloader = torch.utils.data.DataLoader(testset, batch_size=16, shuffle=False)

# Define the loss function and the optimizer
loss_func = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=0.0001)

# 3. Training
# Loop until every image in the dataset has been used 10 times
for epoch in range(10):
    running_loss = 0

    # Loop over the dataset
    for i, data in enumerate(trainloader, 0):
        # Read one input batch (images, ground-truth labels)
        inputs, labels = data

        # Zero the gradients held by the optimizer
        optimizer.zero_grad()

        # Run the input images through the model to get output labels
        outputs = net(inputs)

        # Compute the error against the ground truth + backpropagation
        loss = loss_func(outputs, labels)
        loss.backward()

        # Optimize the model using the error
        optimizer.step()
        running_loss += loss.item()
        if i % 1000 == 999:
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 1000))
            running_loss = 0.0

# 4. Test
ans = []
pred = []
for i, data in enumerate(testloader, 0):
    inputs, labels = data

    outputs = net(inputs)

    ans += labels.tolist()
    pred += torch.argmax(outputs, 1).tolist()

print('accuracy:', accuracy_score(ans, pred))
print('confusion matrix:')
print(confusion_matrix(ans, pred))

# 5. Save the models
# Model file for loading from PyTorch in the usual way
torch.save(net.state_dict(), 'model.pt')

# Save a Torch Script module for loading from libtorch (the C++ API)
example = torch.rand(1, 1, 28, 28)
traced_script_module = torch.jit.trace(net, example)
traced_script_module.save('traced_model.pt')
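The traced module saved in step 5 is the file that `host/run_inference.cc` consumes. A minimal libtorch sketch of reading the parameters back out of it, using the same `torch::jit::load` / `named_parameters` calls as the host code:

```cpp
#include <cstdio>

#include <torch/script.h>

// Minimal libtorch-side view of the file written by train_mnist.py.
int main() {
  auto module = torch::jit::load("traced_model.pt");

  // Iterate named parameters exactly as host/run_inference.cc does.
  for (const auto& param : module.named_parameters()) {
    std::printf("%s: %lld elements\n", param.name.c_str(),
                static_cast<long long>(param.value.numel()));
  }
  return 0;
}
```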
--------------------------------------------------------------------------------
/tests/CMakeLists.txt:
--------------------------------------------------------------------------------
add_subdirectory(ref)
add_subdirectory(hls)

--------------------------------------------------------------------------------
/tests/hls/CMakeLists.txt:
--------------------------------------------------------------------------------
include(CMakeParseArguments)

set(TB_TCL ${CMAKE_CURRENT_SOURCE_DIR}/tb.tcl)

macro(abs_path files abs_paths)
  foreach(f ${files})
    list(APPEND ${abs_paths} ${CMAKE_CURRENT_SOURCE_DIR}/${f})
  endforeach()
endmacro()

macro(append_include_option src dst)
  foreach(p ${src})
    list(APPEND ${dst} "includepath=${p}")
  endforeach()
endmacro()

macro(list2str l str)
  string(REPLACE ";" " " ${str} "${l}")
endmacro()

macro(prepend_option srcs option dsts)
  foreach(src ${srcs})
    list(APPEND ${dsts} "${option}${src}")
  endforeach()
endmacro()


function(add_csim name)
  cmake_parse_arguments(ARG "" "" "HLS_SRC;TB_SRC;CXXFLAGS" ${ARGN})

  add_executable(${name} ${ARG_HLS_SRC} ${ARG_TB_SRC})

  # For checking C++ standards
  set_source_files_properties("${ARG_TB_SRC}" PROPERTIES COMPILE_FLAGS "-std=c++14")
  set_source_files_properties("${ARG_HLS_SRC}" PROPERTIES COMPILE_FLAGS "-std=c++98")

  target_include_directories(${name} PRIVATE ${DNNK_INCLUDE_DIRS} ${VHLS_INCLUDE_DIRS} ${TORCH_INCLUDE_DIRS} ${GTEST_INCLUDE_DIRS})
  target_compile_options(${name} PRIVATE ${DNNK_CXX_FLAGS} ${ARG_CXXFLAGS})
  target_link_libraries(${name} PRIVATE ${TORCH_LIBRARIES} ${GTEST_LIBRARIES})

  add_test(
    NAME ${name}
    COMMAND ${name} ${ARG_UNPARSED_ARGUMENTS}
  )

endfunction()

function(add_cosim name top)
  cmake_parse_arguments(ARG "" "" "HLS_SRC;TB_SRC;CXXFLAGS" ${ARGN})

  abs_path(${ARG_HLS_SRC} abs_hls_srcs)
  abs_path(${ARG_TB_SRC} abs_tb_srcs)

  set(include_dirs ${DNNK_INCLUDE_DIRS} ${TORCH_INCLUDE_DIRS})
  prepend_option("${include_dirs}" "-I" include_options)

  set(library_dirs ${TORCH_LIBRARY_DIRS})
  prepend_option("${library_dirs}" "-L" library_dir_options)
  prepend_option("${library_dirs}" "-Wl,-rpath," rpath_options)

  set(libraries "torch" "c10" "gtest" "pthread")
  prepend_option("${libraries}" "-l" library_options)

  set(cxxflags ${ARG_CXXFLAGS} ${include_options})
  list2str("${cxxflags}" cxxflags_str)

  set(ldflags ${library_dir_options} ${library_options} ${rpath_options})
  list2str("${ldflags}" ldflags_str)

  add_test(
    NAME ${name}
    COMMAND vivado_hls -f ${TB_TCL} "cosim" ${name} "${abs_hls_srcs}" ${top} ${CHIP_PART} "cxxflags=${cxxflags_str}" "ldflags=${ldflags_str}" "${abs_tb_srcs}" "${ARG_UNPARSED_ARGUMENTS}"
    WORKING_DIRECTORY ${WORK_DIR}
  )
endfunction()

function(add_impl name top)
  cmake_parse_arguments(ARG "" "" "HLS_SRC;CXXFLAGS" ${ARGN})

  abs_path(${ARG_HLS_SRC} abs_hls_srcs)

  set(include_dirs ${DNNK_INCLUDE_DIRS} ${TORCH_INCLUDE_DIRS})
  prepend_option("${include_dirs}" "-I" include_options)
  set(cxxflags ${ARG_CXXFLAGS} ${include_options})
  list2str("${cxxflags}" cxxflags_str)

  add_custom_target(
    ${name}
    COMMAND vivado_hls -f ${TB_TCL} "impl" ${name} "${abs_hls_srcs}" ${top} ${CHIP_PART} "cxxflags=${cxxflags_str}" ${ARG_UNPARSED_ARGUMENTS}
    WORKING_DIRECTORY ${WORK_DIR}
  )
endfunction()

function(add_test_and_impl name top)
  cmake_parse_arguments(ARG "" "" "HLS_SRC;TB_SRC;CXXFLAGS" ${ARGN})

  add_csim(${name}_csim CXXFLAGS ${ARG_CXXFLAGS} HLS_SRC ${ARG_HLS_SRC} TB_SRC ${ARG_TB_SRC})
  add_cosim(${name}_cosim ${top} CXXFLAGS ${ARG_CXXFLAGS} HLS_SRC ${ARG_HLS_SRC} TB_SRC ${ARG_TB_SRC})
  add_impl(${name}_impl ${top} CXXFLAGS ${ARG_CXXFLAGS} HLS_SRC ${ARG_HLS_SRC})
endfunction()

add_subdirectory(relu)
add_subdirectory(conv2d)
add_subdirectory(maxpool2d)
add_subdirectory(linear)
add_subdirectory(inference)

--------------------------------------------------------------------------------
/tests/hls/conv2d/CMakeLists.txt:
--------------------------------------------------------------------------------
set(hls_src conv2d_hls.cc)
set(test_src conv2d_test.cc)

add_test_and_impl(conv2d_hls conv2d_hls HLS_SRC ${hls_src} TB_SRC ${test_src} CXXFLAGS "-DTOP_FUNC=conv2d_hls")
add_test_and_impl(conv2d_pipelined_v1_hls conv2d_pipelined_v1_hls HLS_SRC ${hls_src} TB_SRC ${test_src} CXXFLAGS "-DTOP_FUNC=conv2d_pipelined_v1_hls")
add_test_and_impl(conv2d_pipelined_v2_hls conv2d_pipelined_v2_hls HLS_SRC ${hls_src} TB_SRC ${test_src} CXXFLAGS "-DTOP_FUNC=conv2d_pipelined_v2_hls")
add_test_and_impl(conv2d_unrolled_v1_2_hls conv2d_unrolled_v1_2_hls HLS_SRC ${hls_src} TB_SRC ${test_src} CXXFLAGS "-DTOP_FUNC=conv2d_unrolled_v1_2_hls")
add_test_and_impl(conv2d_unrolled_v1_3_hls conv2d_unrolled_v1_3_hls HLS_SRC ${hls_src} TB_SRC ${test_src} CXXFLAGS "-DTOP_FUNC=conv2d_unrolled_v1_3_hls")
add_test_and_impl(conv2d_unrolled_v2_2_2_hls conv2d_unrolled_v2_2_2_hls HLS_SRC ${hls_src} TB_SRC ${test_src} CXXFLAGS "-DTOP_FUNC=conv2d_unrolled_v2_2_2_hls")
add_test_and_impl(conv2d_unrolled_v2_2_3_hls conv2d_unrolled_v2_2_3_hls HLS_SRC ${hls_src} TB_SRC ${test_src} CXXFLAGS "-DTOP_FUNC=conv2d_unrolled_v2_2_3_hls")
add_test_and_impl(conv2d_unrolled_v2_3_2_hls conv2d_unrolled_v2_3_2_hls HLS_SRC ${hls_src} TB_SRC ${test_src} CXXFLAGS "-DTOP_FUNC=conv2d_unrolled_v2_3_2_hls")
add_test_and_impl(conv2d_unrolled_v2_3_3_hls conv2d_unrolled_v2_3_3_hls HLS_SRC ${hls_src} TB_SRC ${test_src} CXXFLAGS "-DTOP_FUNC=conv2d_unrolled_v2_3_3_hls")

--------------------------------------------------------------------------------
/tests/hls/conv2d/conv2d_hls.cc:
--------------------------------------------------------------------------------
#include "dnn-kernel/conv2d.h"

#include <cstddef>
#include <cstdint>

static const std::size_t kMaxSize = 65536;

void conv2d_hls(const float x[kMaxSize], const float weight[kMaxSize], const float bias[kMaxSize],
                int32_t width, int32_t height, int32_t in_channels, int32_t out_channels, int32_t ksize, float y[kMaxSize]) {

  dnnk::conv2d(x, weight, bias, width, height, in_channels, out_channels, ksize, y);
}

void conv2d_pipelined_v1_hls(const float x[kMaxSize], const float weight[kMaxSize], const float bias[kMaxSize],
                             int32_t width, int32_t height, int32_t in_channels, int32_t out_channels, int32_t ksize, float y[kMaxSize]) {

  dnnk::conv2d_pipelined_v1(x, weight, bias, width, height, in_channels, out_channels, ksize, y);
}

void conv2d_pipelined_v2_hls(const float x[kMaxSize], const float weight[kMaxSize], const float bias[kMaxSize],
                             int32_t width, int32_t height, int32_t in_channels, int32_t out_channels, int32_t ksize, float y[kMaxSize]) {

  dnnk::conv2d_pipelined_v2(x, weight, bias, width, height, in_channels, out_channels, ksize, y);
}

void conv2d_unrolled_v1_2_hls(const float x[kMaxSize], const float weight[kMaxSize], const float bias[kMaxSize],
                              int32_t width, int32_t height, int32_t in_channels, int32_t out_channels, int32_t ksize, float y[kMaxSize]) {

  dnnk::conv2d_unrolled_v1<2>(x, weight, bias, width, height, in_channels, out_channels, ksize, y);
}

void conv2d_unrolled_v1_3_hls(const float x[kMaxSize], const float weight[kMaxSize], const float bias[kMaxSize],
                              int32_t width, int32_t height, int32_t in_channels, int32_t out_channels, int32_t ksize, float y[kMaxSize]) {

  dnnk::conv2d_unrolled_v1<3>(x, weight, bias, width, height, in_channels, out_channels, ksize, y);
}

void conv2d_unrolled_v2_2_2_hls(const float x[kMaxSize], const float weight[kMaxSize], const float bias[kMaxSize],
                                int32_t width, int32_t height, int32_t in_channels, int32_t out_channels, int32_t ksize, float y[kMaxSize]) {

  dnnk::conv2d_unrolled_v2<2, 2>(x, weight, bias, width, height, in_channels, out_channels, ksize, y);
}

void conv2d_unrolled_v2_3_2_hls(const float x[kMaxSize], const float weight[kMaxSize], const float bias[kMaxSize],
                                int32_t width, int32_t height, int32_t in_channels, int32_t out_channels, int32_t ksize, float y[kMaxSize]) {

  dnnk::conv2d_unrolled_v2<3, 2>(x, weight, bias, width, height, in_channels, out_channels, ksize, y);
}

void conv2d_unrolled_v2_2_3_hls(const float x[kMaxSize], const float weight[kMaxSize], const float bias[kMaxSize],
                                int32_t width, int32_t height, int32_t in_channels, int32_t out_channels, int32_t ksize, float y[kMaxSize]) {

  dnnk::conv2d_unrolled_v2<2, 3>(x, weight, bias, width, height, in_channels, out_channels, ksize, y);
}

void conv2d_unrolled_v2_3_3_hls(const float x[kMaxSize], const float weight[kMaxSize], const float bias[kMaxSize],
                                int32_t width, int32_t height, int32_t in_channels, int32_t out_channels, int32_t ksize, float y[kMaxSize]) {

  dnnk::conv2d_unrolled_v2<3, 3>(x, weight, bias, width, height, in_channels, out_channels, ksize, y);
}
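Each wrapper pins one template instantiation behind a plain function name so Vivado HLS can use it as a top function, and the `CMakeLists.txt` above selects it via `-DTOP_FUNC=...`. Adding a variant follows the same pattern; for example, a hypothetical x4 unroll (not present in the original sources) would sit alongside the wrappers above, plus a matching `add_test_and_impl` line:

```cpp
// Hypothetical additional wrapper (not in the original file):
// instantiates the v1 kernel with UNROLL_X = 4 under its own top name.
void conv2d_unrolled_v1_4_hls(const float x[kMaxSize], const float weight[kMaxSize],
                              const float bias[kMaxSize], int32_t width, int32_t height,
                              int32_t in_channels, int32_t out_channels, int32_t ksize,
                              float y[kMaxSize]) {
  dnnk::conv2d_unrolled_v1<4>(x, weight, bias, width, height, in_channels,
                              out_channels, ksize, y);
}
```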
int32_t height, int32_t in_channels, int32_t out_channels, int32_t ksize, float *y); 10 | void conv2d_unrolled_v1_3_hls(const float *x, const float* weight, const float* bias, int32_t width, int32_t height, int32_t in_channels, int32_t out_channels, int32_t ksize, float *y); 11 | void conv2d_unrolled_v2_2_2_hls(const float *x, const float* weight, const float* bias, int32_t width, int32_t height, int32_t in_channels, int32_t out_channels, int32_t ksize, float *y); 12 | void conv2d_unrolled_v2_2_3_hls(const float *x, const float* weight, const float* bias, int32_t width, int32_t height, int32_t in_channels, int32_t out_channels, int32_t ksize, float *y); 13 | void conv2d_unrolled_v2_3_2_hls(const float *x, const float* weight, const float* bias, int32_t width, int32_t height, int32_t in_channels, int32_t out_channels, int32_t ksize, float *y); 14 | void conv2d_unrolled_v2_3_3_hls(const float *x, const float* weight, const float* bias, int32_t width, int32_t height, int32_t in_channels, int32_t out_channels, int32_t ksize, float *y); 15 | 16 | #endif  // DNNKERNEL_TEST_CONV2D_HLS_H 17 | -------------------------------------------------------------------------------- /tests/hls/conv2d/conv2d_test.cc: -------------------------------------------------------------------------------- 1 | #include "conv2d_hls.h" 2 | 3 | #include <cstddef> 4 | #include <cstdint> 5 | #include <cstdio> 6 | #include <cstdlib> 7 | 8 | #include <torch/torch.h> 9 | 10 | #include <tests/util.h> 11 | 12 | #ifndef TOP_FUNC 13 | #error "TOP_FUNC is not defined" 14 | #endif 15 | 16 | static const std::size_t kMaxSize = 65536; 17 | 18 | using namespace dnnk; 19 | namespace F = torch::nn::functional; 20 | 21 | int main() { 22 | // Seeds must be fixed because the testbench is executed twice in 23 | // the cosimulation. 24 | torch::manual_seed(0); 25 | 26 | int h = 14, w = 14, in_channels = 4, out_channels = 8, ksize = 3; 27 | 28 | auto x_ref = torch::randn({1, in_channels, h, w}); 29 | auto weight_ref = torch::randn({out_channels, in_channels, ksize, ksize}); 30 | auto bias_ref = torch::randn({out_channels}); 31 | 32 | float x[kMaxSize], weight[kMaxSize], bias[kMaxSize], y[kMaxSize]; 33 | tensor2array(x_ref, x); 34 | tensor2array(weight_ref, weight); 35 | tensor2array(bias_ref, bias); 36 | 37 | auto y_ref = F::detail::conv2d(x_ref, weight_ref, bias_ref, 1, ksize/2, 1, 1); 38 | TOP_FUNC (x, weight, bias, w, h, in_channels, out_channels, ksize, y); 39 | 40 | if (!verify(y, y_ref)) { 41 | printf("%sFailed%s\n", Color::red, Color::reset); 42 | return 1; 43 | } 44 | 45 | printf("%sSucceed!%s\n", Color::green, Color::reset); 46 | return 0; 47 | } 48 | -------------------------------------------------------------------------------- /tests/hls/inference/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(hls_src inference_hls.cc) 2 | set(test_src inference_test.cc) 3 | 4 | add_test_and_impl(inference_hls inference_hls HLS_SRC ${hls_src} TB_SRC ${test_src} CXXFLAGS "-DPROJECT_ROOT=\"${CMAKE_SOURCE_DIR}\"" "-DTOP_FUNC=inference_hls") 5 | add_test_and_impl(inference_top inference_top HLS_SRC ${hls_src} TB_SRC ${test_src} CXXFLAGS "-DPROJECT_ROOT=\"${CMAKE_SOURCE_DIR}\"" "-DTOP_FUNC=inference_top") 6 | add_test_and_impl(inference_dataflow inference_dataflow HLS_SRC ${hls_src} TB_SRC ${test_src} CXXFLAGS "-DPROJECT_ROOT=\"${CMAKE_SOURCE_DIR}\"" "-DTOP_FUNC=inference_dataflow") 7 | add_test_and_impl(inference_with_local_buffer inference_with_local_buffer HLS_SRC ${hls_src} TB_SRC ${test_src} CXXFLAGS "-DPROJECT_ROOT=\"${CMAKE_SOURCE_DIR}\""
"-DTOP_FUNC=inference_with_local_buffer") 8 | add_test_and_impl(inference_pipelined_conv_v1 inference_pipelined_conv_v1 HLS_SRC ${hls_src} TB_SRC ${test_src} CXXFLAGS "-DPROJECT_ROOT=\"${CMAKE_SOURCE_DIR}\"" "-DTOP_FUNC=inference_pipelined_conv_v1") 9 | add_test_and_impl(inference_pipelined_conv_v2 inference_pipelined_conv_v2 HLS_SRC ${hls_src} TB_SRC ${test_src} CXXFLAGS "-DPROJECT_ROOT=\"${CMAKE_SOURCE_DIR}\"" "-DTOP_FUNC=inference_pipelined_conv_v2") 10 | add_test_and_impl(inference_unrolledx4_conv_v1 inference_unrolledx4_conv_v1 HLS_SRC ${hls_src} TB_SRC ${test_src} CXXFLAGS "-DPROJECT_ROOT=\"${CMAKE_SOURCE_DIR}\"" "-DTOP_FUNC=inference_unrolledx4_conv_v1") 11 | add_test_and_impl(inference_unrolledx4_conv_v2 inference_unrolledx4_conv_v2 HLS_SRC ${hls_src} TB_SRC ${test_src} CXXFLAGS "-DPROJECT_ROOT=\"${CMAKE_SOURCE_DIR}\"" "-DTOP_FUNC=inference_unrolledx4_conv_v2") 12 | add_test_and_impl(inference_final inference_final HLS_SRC ${hls_src} TB_SRC ${test_src} CXXFLAGS "-DPROJECT_ROOT=\"${CMAKE_SOURCE_DIR}\"" "-DTOP_FUNC=inference_final") 13 | -------------------------------------------------------------------------------- /tests/hls/inference/inference_hls.cc: -------------------------------------------------------------------------------- 1 | #include "dnn-kernel/inference.h" 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | static const std::size_t kMaxSize = 16384; 8 | 9 | void inference_hls(const float x[kMaxSize], 10 | const float weight0[kMaxSize], const float bias0[kMaxSize], 11 | const float weight1[kMaxSize], const float bias1[kMaxSize], 12 | const float weight2[kMaxSize], const float bias2[kMaxSize], 13 | const float weight3[kMaxSize], const float bias3[kMaxSize], 14 | float y[kMaxSize]) { 15 | dnnk::inference(x, 16 | weight0, bias0, 17 | weight1, bias1, 18 | weight2, bias2, 19 | weight3, bias3, 20 | y); 21 | } 22 | 23 | extern "C" { 24 | 25 | void inference_top(const float x[kMaxSize], 26 | const float weight0[kMaxSize], const float bias0[kMaxSize], 27 | const float weight1[kMaxSize], const float bias1[kMaxSize], 28 | const float weight2[kMaxSize], const float bias2[kMaxSize], 29 | const float weight3[kMaxSize], const float bias3[kMaxSize], 30 | float y[kMaxSize]) { 31 | #pragma HLS interface m_axi port=x offset=slave bundle=gmem0 32 | #pragma HLS interface m_axi port=weight0 offset=slave bundle=gmem1 33 | #pragma HLS interface m_axi port=weight1 offset=slave bundle=gmem2 34 | #pragma HLS interface m_axi port=weight2 offset=slave bundle=gmem3 35 | #pragma HLS interface m_axi port=weight3 offset=slave bundle=gmem4 36 | #pragma HLS interface m_axi port=bias0 offset=slave bundle=gmem5 37 | #pragma HLS interface m_axi port=bias1 offset=slave bundle=gmem6 38 | #pragma HLS interface m_axi port=bias2 offset=slave bundle=gmem7 39 | #pragma HLS interface m_axi port=bias3 offset=slave bundle=gmem8 40 | #pragma HLS interface m_axi port=y offset=slave bundle=gmem9 41 | #pragma HLS interface s_axilite port=x bundle=control 42 | #pragma HLS interface s_axilite port=weight0 bundle=control 43 | #pragma HLS interface s_axilite port=weight1 bundle=control 44 | #pragma HLS interface s_axilite port=weight2 bundle=control 45 | #pragma HLS interface s_axilite port=weight3 bundle=control 46 | #pragma HLS interface s_axilite port=bias0 bundle=control 47 | #pragma HLS interface s_axilite port=bias1 bundle=control 48 | #pragma HLS interface s_axilite port=bias2 bundle=control 49 | #pragma HLS interface s_axilite port=bias3 bundle=control 50 | #pragma HLS interface s_axilite port=y bundle=control 51 | 
#pragma HLS interface s_axilite port=return bundle=control 52 | 53 | dnnk::inference(x, 54 | weight0, bias0, 55 | weight1, bias1, 56 | weight2, bias2, 57 | weight3, bias3, 58 | y); 59 | } 60 | 61 | void inference_dataflow(const float x[kMaxSize], 62 | const float weight0[kMaxSize], const float bias0[kMaxSize], 63 | const float weight1[kMaxSize], const float bias1[kMaxSize], 64 | const float weight2[kMaxSize], const float bias2[kMaxSize], 65 | const float weight3[kMaxSize], const float bias3[kMaxSize], 66 | float y[kMaxSize]) { 67 | #pragma HLS dataflow 68 | #pragma HLS interface m_axi port=x offset=slave bundle=gmem0 69 | #pragma HLS interface m_axi port=weight0 offset=slave bundle=gmem1 70 | #pragma HLS interface m_axi port=weight1 offset=slave bundle=gmem2 71 | #pragma HLS interface m_axi port=weight2 offset=slave bundle=gmem3 72 | #pragma HLS interface m_axi port=weight3 offset=slave bundle=gmem4 73 | #pragma HLS interface m_axi port=bias0 offset=slave bundle=gmem5 74 | #pragma HLS interface m_axi port=bias1 offset=slave bundle=gmem6 75 | #pragma HLS interface m_axi port=bias2 offset=slave bundle=gmem7 76 | #pragma HLS interface m_axi port=bias3 offset=slave bundle=gmem8 77 | #pragma HLS interface m_axi port=y offset=slave bundle=gmem9 78 | #pragma HLS interface s_axilite port=x bundle=control 79 | #pragma HLS interface s_axilite port=weight0 bundle=control 80 | #pragma HLS interface s_axilite port=weight1 bundle=control 81 | #pragma HLS interface s_axilite port=weight2 bundle=control 82 | #pragma HLS interface s_axilite port=weight3 bundle=control 83 | #pragma HLS interface s_axilite port=bias0 bundle=control 84 | #pragma HLS interface s_axilite port=bias1 bundle=control 85 | #pragma HLS interface s_axilite port=bias2 bundle=control 86 | #pragma HLS interface s_axilite port=bias3 bundle=control 87 | #pragma HLS interface s_axilite port=y bundle=control 88 | #pragma HLS interface s_axilite port=return bundle=control 89 | #pragma HLS interface ap_ctrl_chain port=return bundle=control 90 | 91 | #pragma HLS stable variable=x 92 | #pragma HLS stable variable=weight0 93 | #pragma HLS stable variable=bias0 94 | #pragma HLS stable variable=weight1 95 | #pragma HLS stable variable=bias1 96 | #pragma HLS stable variable=weight2 97 | #pragma HLS stable variable=bias2 98 | #pragma HLS stable variable=weight3 99 | #pragma HLS stable variable=bias3 100 | #pragma HLS stable variable=y 101 | 102 | dnnk::inference(x, 103 | weight0, bias0, 104 | weight1, bias1, 105 | weight2, bias2, 106 | weight3, bias3, 107 | y); 108 | } 109 | 110 | 111 | void inference_with_local_buffer(const float x[kMaxSize], 112 | const float weight0[kMaxSize], const float bias0[kMaxSize], 113 | const float weight1[kMaxSize], const float bias1[kMaxSize], 114 | const float weight2[kMaxSize], const float bias2[kMaxSize], 115 | const float weight3[kMaxSize], const float bias3[kMaxSize], 116 | float y[kMaxSize]) { 117 | #pragma HLS dataflow 118 | #pragma HLS interface m_axi port=x offset=slave bundle=gmem0 119 | #pragma HLS interface m_axi port=weight0 offset=slave bundle=gmem1 120 | #pragma HLS interface m_axi port=weight1 offset=slave bundle=gmem2 121 | #pragma HLS interface m_axi port=weight2 offset=slave bundle=gmem3 122 | #pragma HLS interface m_axi port=weight3 offset=slave bundle=gmem4 123 | #pragma HLS interface m_axi port=bias0 offset=slave bundle=gmem5 124 | #pragma HLS interface m_axi port=bias1 offset=slave bundle=gmem6 125 | #pragma HLS interface m_axi port=bias2 offset=slave bundle=gmem7 126 | #pragma HLS interface m_axi 
port=bias3 offset=slave bundle=gmem8 127 | #pragma HLS interface m_axi port=y offset=slave bundle=gmem9 128 | #pragma HLS interface s_axilite port=x bundle=control 129 | #pragma HLS interface s_axilite port=weight0 bundle=control 130 | #pragma HLS interface s_axilite port=weight1 bundle=control 131 | #pragma HLS interface s_axilite port=weight2 bundle=control 132 | #pragma HLS interface s_axilite port=weight3 bundle=control 133 | #pragma HLS interface s_axilite port=bias0 bundle=control 134 | #pragma HLS interface s_axilite port=bias1 bundle=control 135 | #pragma HLS interface s_axilite port=bias2 bundle=control 136 | #pragma HLS interface s_axilite port=bias3 bundle=control 137 | #pragma HLS interface s_axilite port=y bundle=control 138 | #pragma HLS interface s_axilite port=return bundle=control 139 | #pragma HLS interface ap_ctrl_chain port=return bundle=control 140 | 141 | #pragma HLS stable variable=x 142 | #pragma HLS stable variable=weight0 143 | #pragma HLS stable variable=bias0 144 | #pragma HLS stable variable=weight1 145 | #pragma HLS stable variable=bias1 146 | #pragma HLS stable variable=weight2 147 | #pragma HLS stable variable=bias2 148 | #pragma HLS stable variable=weight3 149 | #pragma HLS stable variable=bias3 150 | #pragma HLS stable variable=y 151 | 152 | const std::size_t x_size = 1 * 28 * 28; 153 | const std::size_t w0_size = 4 * 1 * 3 * 3, b0_size = 4; 154 | const std::size_t w1_size = 8 * 4 * 3 * 3, b1_size = 8; 155 | const std::size_t w2_size = 32 * 392, b2_size = 32; 156 | const std::size_t w3_size = 10 * 32, b3_size = 10; 157 | const std::size_t y_size = 10; 158 | 159 | float x_local[x_size]; 160 | float w0_local[w0_size], b0_local[b0_size]; 161 | float w1_local[w1_size], b1_local[b1_size]; 162 | float w2_local[w2_size], b2_local[b2_size]; 163 | float w3_local[w3_size], b3_local[b3_size]; 164 | float y_local[y_size]; 165 | 166 | // fetch to local buffer 167 | std::memcpy(x_local, x, x_size * sizeof(float)); 168 | std::memcpy(w0_local, weight0, w0_size * sizeof(float)); 169 | std::memcpy(b0_local, bias0, b0_size * sizeof(float)); 170 | std::memcpy(w1_local, weight1, w1_size * sizeof(float)); 171 | std::memcpy(b1_local, bias1, b1_size * sizeof(float)); 172 | std::memcpy(w2_local, weight2, w2_size * sizeof(float)); 173 | std::memcpy(b2_local, bias2, b2_size * sizeof(float)); 174 | std::memcpy(w3_local, weight3, w3_size * sizeof(float)); 175 | std::memcpy(b3_local, bias3, b3_size * sizeof(float)); 176 | 177 | // run inference with local buffer 178 | dnnk::inference(x_local, 179 | w0_local, b0_local, 180 | w1_local, b1_local, 181 | w2_local, b2_local, 182 | w3_local, b3_local, 183 | y_local); 184 | 185 | // store to global buffer 186 | std::memcpy(y, y_local, y_size * sizeof(float)); 187 | } 188 | 189 | 190 | #define DECLARE_INFERENCE_WITH_LOCAL_BUFFER(NAME, CONV_FUNC, MAXPOOL_FUNC, RELU_FUNC, LINEAR_FUNC) \ 191 | void NAME(const float x[kMaxSize], \ 192 | const float weight0[kMaxSize], const float bias0[kMaxSize], \ 193 | const float weight1[kMaxSize], const float bias1[kMaxSize], \ 194 | const float weight2[kMaxSize], const float bias2[kMaxSize], \ 195 | const float weight3[kMaxSize], const float bias3[kMaxSize], \ 196 | float y[kMaxSize]) { \ 197 | _Pragma("HLS dataflow") \ 198 | _Pragma("HLS interface m_axi port=x offset=slave bundle=gmem0") \ 199 | _Pragma("HLS interface m_axi port=weight0 offset=slave bundle=gmem1") \ 200 | _Pragma("HLS interface m_axi port=weight1 offset=slave bundle=gmem2") \ 201 | _Pragma("HLS interface m_axi port=weight2 offset=slave 
bundle=gmem3") \ 202 | _Pragma("HLS interface m_axi port=weight3 offset=slave bundle=gmem4") \ 203 | _Pragma("HLS interface m_axi port=bias0 offset=slave bundle=gmem5") \ 204 | _Pragma("HLS interface m_axi port=bias1 offset=slave bundle=gmem6") \ 205 | _Pragma("HLS interface m_axi port=bias2 offset=slave bundle=gmem7") \ 206 | _Pragma("HLS interface m_axi port=bias3 offset=slave bundle=gmem8") \ 207 | _Pragma("HLS interface m_axi port=y offset=slave bundle=gmem9") \ 208 | _Pragma("HLS interface s_axilite port=x bundle=control") \ 209 | _Pragma("HLS interface s_axilite port=weight0 bundle=control") \ 210 | _Pragma("HLS interface s_axilite port=weight1 bundle=control") \ 211 | _Pragma("HLS interface s_axilite port=weight2 bundle=control") \ 212 | _Pragma("HLS interface s_axilite port=weight3 bundle=control") \ 213 | _Pragma("HLS interface s_axilite port=bias0 bundle=control") \ 214 | _Pragma("HLS interface s_axilite port=bias1 bundle=control") \ 215 | _Pragma("HLS interface s_axilite port=bias2 bundle=control") \ 216 | _Pragma("HLS interface s_axilite port=bias3 bundle=control") \ 217 | _Pragma("HLS interface s_axilite port=y bundle=control") \ 218 | _Pragma("HLS interface s_axilite port=return bundle=control") \ 219 | _Pragma("HLS interface ap_ctrl_chain port=return bundle=control") \ 220 | _Pragma("HLS stable variable=x") \ 221 | _Pragma("HLS stable variable=weight0") \ 222 | _Pragma("HLS stable variable=bias0") \ 223 | _Pragma("HLS stable variable=weight1") \ 224 | _Pragma("HLS stable variable=bias1") \ 225 | _Pragma("HLS stable variable=weight2") \ 226 | _Pragma("HLS stable variable=bias2") \ 227 | _Pragma("HLS stable variable=weight3") \ 228 | _Pragma("HLS stable variable=bias3") \ 229 | _Pragma("HLS stable variable=y") \ 230 | const std::size_t x_size = 1 * 28 * 28; \ 231 | const std::size_t w0_size = 4 * 1 * 3 * 3, b0_size = 4; \ 232 | const std::size_t w1_size = 8 * 4 * 3 * 3, b1_size = 8; \ 233 | const std::size_t w2_size = 32 * 392, b2_size = 32; \ 234 | const std::size_t w3_size = 10 * 32, b3_size = 10; \ 235 | const std::size_t y_size = 10; \ 236 | float x_local[x_size]; \ 237 | float w0_local[w0_size], b0_local[b0_size]; \ 238 | float w1_local[w1_size], b1_local[b1_size]; \ 239 | float w2_local[w2_size], b2_local[b2_size]; \ 240 | float w3_local[w3_size], b3_local[b3_size]; \ 241 | float y_local[y_size]; \ 242 | std::memcpy(x_local, x, x_size * sizeof(float)); \ 243 | std::memcpy(w0_local, weight0, w0_size * sizeof(float)); \ 244 | std::memcpy(b0_local, bias0, b0_size * sizeof(float)); \ 245 | std::memcpy(w1_local, weight1, w1_size * sizeof(float)); \ 246 | std::memcpy(b1_local, bias1, b1_size * sizeof(float)); \ 247 | std::memcpy(w2_local, weight2, w2_size * sizeof(float)); \ 248 | std::memcpy(b2_local, bias2, b2_size * sizeof(float)); \ 249 | std::memcpy(w3_local, weight3, w3_size * sizeof(float)); \ 250 | std::memcpy(b3_local, bias3, b3_size * sizeof(float)); \ 251 | \ 252 | dnnk::inference_custom(x_local, \ 253 | w0_local, b0_local, \ 254 | w1_local, b1_local, \ 255 | w2_local, b2_local, \ 256 | w3_local, b3_local, \ 257 | y_local, \ 258 | CONV_FUNC, MAXPOOL_FUNC, RELU_FUNC, LINEAR_FUNC); \ 259 | \ 260 | std::memcpy(y, y_local, y_size * sizeof(float)); \ 261 | } 262 | 263 | DECLARE_INFERENCE_WITH_LOCAL_BUFFER(inference_pipelined_conv_v1, dnnk::conv2d_pipelined_v1, dnnk::maxpool2d, dnnk::relu, dnnk::linear); 264 | DECLARE_INFERENCE_WITH_LOCAL_BUFFER(inference_pipelined_conv_v2, dnnk::conv2d_pipelined_v2, dnnk::maxpool2d, dnnk::relu, dnnk::linear); 265 | 
DECLARE_INFERENCE_WITH_LOCAL_BUFFER(inference_unrolledx4_conv_v1, dnnk::conv2d_unrolled_v1<4>, dnnk::maxpool2d, dnnk::relu, dnnk::linear); 266 | DECLARE_INFERENCE_WITH_LOCAL_BUFFER(inference_unrolledx4_conv_v2, (dnnk::conv2d_unrolled_v2<4, 4>), dnnk::maxpool2d, dnnk::relu, dnnk::linear); 267 | 268 | 269 | 270 | void inference_final(const float x[kMaxSize], 271 | const float weight0[kMaxSize], const float bias0[kMaxSize], 272 | const float weight1[kMaxSize], const float bias1[kMaxSize], 273 | const float weight2[kMaxSize], const float bias2[kMaxSize], 274 | const float weight3[kMaxSize], const float bias3[kMaxSize], 275 | float y[kMaxSize]) { 276 | #pragma HLS dataflow 277 | #pragma HLS interface m_axi port=x offset=slave bundle=gmem0 278 | #pragma HLS interface m_axi port=weight0 offset=slave bundle=gmem1 279 | #pragma HLS interface m_axi port=weight1 offset=slave bundle=gmem2 280 | #pragma HLS interface m_axi port=weight2 offset=slave bundle=gmem3 281 | #pragma HLS interface m_axi port=weight3 offset=slave bundle=gmem4 282 | #pragma HLS interface m_axi port=bias0 offset=slave bundle=gmem5 283 | #pragma HLS interface m_axi port=bias1 offset=slave bundle=gmem6 284 | #pragma HLS interface m_axi port=bias2 offset=slave bundle=gmem7 285 | #pragma HLS interface m_axi port=bias3 offset=slave bundle=gmem8 286 | #pragma HLS interface m_axi port=y offset=slave bundle=gmem9 287 | #pragma HLS interface s_axilite port=x bundle=control 288 | #pragma HLS interface s_axilite port=weight0 bundle=control 289 | #pragma HLS interface s_axilite port=weight1 bundle=control 290 | #pragma HLS interface s_axilite port=weight2 bundle=control 291 | #pragma HLS interface s_axilite port=weight3 bundle=control 292 | #pragma HLS interface s_axilite port=bias0 bundle=control 293 | #pragma HLS interface s_axilite port=bias1 bundle=control 294 | #pragma HLS interface s_axilite port=bias2 bundle=control 295 | #pragma HLS interface s_axilite port=bias3 bundle=control 296 | #pragma HLS interface s_axilite port=y bundle=control 297 | #pragma HLS interface s_axilite port=return bundle=control 298 | #pragma HLS interface ap_ctrl_chain port=return bundle=control 299 | 300 | #pragma HLS stable variable=x 301 | #pragma HLS stable variable=weight0 302 | #pragma HLS stable variable=bias0 303 | #pragma HLS stable variable=weight1 304 | #pragma HLS stable variable=bias1 305 | #pragma HLS stable variable=weight2 306 | #pragma HLS stable variable=bias2 307 | #pragma HLS stable variable=weight3 308 | #pragma HLS stable variable=bias3 309 | #pragma HLS stable variable=y 310 | 311 | const std::size_t x_size = 1 * 28 * 28; 312 | const std::size_t w0_size = 4 * 1 * 3 * 3, b0_size = 4; 313 | const std::size_t w1_size = 8 * 4 * 3 * 3, b1_size = 8; 314 | const std::size_t w2_size = 32 * 392, b2_size = 32; 315 | const std::size_t w3_size = 10 * 32, b3_size = 10; 316 | const std::size_t y_size = 10; 317 | 318 | float x_local[x_size]; 319 | float w0_local[w0_size], b0_local[b0_size]; 320 | float w1_local[w1_size], b1_local[b1_size]; 321 | float w2_local[w2_size], b2_local[b2_size]; 322 | float w3_local[w3_size], b3_local[b3_size]; 323 | float y_local[y_size]; 324 | 325 | // fetch to local buffer 326 | std::memcpy(x_local, x, x_size * sizeof(float)); 327 | std::memcpy(w0_local, weight0, w0_size * sizeof(float)); 328 | std::memcpy(b0_local, bias0, b0_size * sizeof(float)); 329 | std::memcpy(w1_local, weight1, w1_size * sizeof(float)); 330 | std::memcpy(b1_local, bias1, b1_size * sizeof(float)); 331 | std::memcpy(w2_local, weight2, w2_size * 
sizeof(float)); 332 | std::memcpy(b2_local, bias2, b2_size * sizeof(float)); 333 | std::memcpy(w3_local, weight3, w3_size * sizeof(float)); 334 | std::memcpy(b3_local, bias3, b3_size * sizeof(float)); 335 | 336 | // run inference with local buffer 337 | dnnk::inference_custom(x_local, 338 | w0_local, b0_local, 339 | w1_local, b1_local, 340 | w2_local, b2_local, 341 | w3_local, b3_local, 342 | y_local, 343 | dnnk::conv2d_unrolled_v2<4, 4>, 344 | dnnk::relu, 345 | dnnk::maxpool2d, 346 | dnnk::conv2d_unrolled_v2<8, 4>, 347 | dnnk::relu, 348 | dnnk::maxpool2d, 349 | dnnk::linear_opt<4>, 350 | dnnk::relu, 351 | dnnk::linear); 352 | 353 | // store to global buffer 354 | std::memcpy(y, y_local, y_size * sizeof(float)); 355 | } 356 | 357 | } 358 | -------------------------------------------------------------------------------- /tests/hls/inference/inference_hls.h: -------------------------------------------------------------------------------- 1 | #ifndef DNNKERNEL_TEST_INFERENCE_HLS_H 2 | #define DNNKERNEL_TEST_INFERENCE_HLS_H 3 | 4 | #include <cstdint> 5 | 6 | void inference_hls(const float *x, 7 | const float* weight0, const float* bias0, 8 | const float* weight1, const float* bias1, 9 | const float* weight2, const float* bias2, 10 | const float* weight3, const float* bias3, 11 | float *y); 12 | 13 | extern "C" { 14 | 15 | void inference_top(const float *x, 16 | const float* weight0, const float* bias0, 17 | const float* weight1, const float* bias1, 18 | const float* weight2, const float* bias2, 19 | const float* weight3, const float* bias3, 20 | float *y); 21 | 22 | void inference_dataflow(const float *x, 23 | const float* weight0, const float* bias0, 24 | const float* weight1, const float* bias1, 25 | const float* weight2, const float* bias2, 26 | const float* weight3, const float* bias3, 27 | float *y); 28 | 29 | void inference_with_local_buffer(const float *x, 30 | const float* weight0, const float* bias0, 31 | const float* weight1, const float* bias1, 32 | const float* weight2, const float* bias2, 33 | const float* weight3, const float* bias3, 34 | float *y); 35 | 36 | void inference_pipelined_conv_v1(const float *x, 37 | const float* weight0, const float* bias0, 38 | const float* weight1, const float* bias1, 39 | const float* weight2, const float* bias2, 40 | const float* weight3, const float* bias3, 41 | float *y); 42 | 43 | void inference_pipelined_conv_v2(const float *x, 44 | const float* weight0, const float* bias0, 45 | const float* weight1, const float* bias1, 46 | const float* weight2, const float* bias2, 47 | const float* weight3, const float* bias3, 48 | float *y); 49 | 50 | void inference_unrolledx4_conv_v1(const float *x, 51 | const float* weight0, const float* bias0, 52 | const float* weight1, const float* bias1, 53 | const float* weight2, const float* bias2, 54 | const float* weight3, const float* bias3, 55 | float *y); 56 | 57 | void inference_unrolledx4_conv_v2(const float *x, 58 | const float* weight0, const float* bias0, 59 | const float* weight1, const float* bias1, 60 | const float* weight2, const float* bias2, 61 | const float* weight3, const float* bias3, 62 | float *y); 63 | 64 | void inference_final(const float *x, 65 | const float* weight0, const float* bias0, 66 | const float* weight1, const float* bias1, 67 | const float* weight2, const float* bias2, 68 | const float* weight3, const float* bias3, 69 | float *y); 70 | 71 | } 72 | 73 | #endif  // DNNKERNEL_TEST_INFERENCE_HLS_H 74 | --------------------------------------------------------------------------------
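Note: the extern "C" block in inference_hls.cc and inference_hls.h keeps the kernel entry points unmangled, so the Vitis tools and the runtime can look them up by name (e.g. "inference_top"). The sketch below is a hypothetical, minimal host-side launcher using the XRT native C++ API; it is not the repository's host/run_inference.cc, and the xclbin file name and the buffer-filling step are placeholder assumptions. The buffer sizes are the ones hard-coded in inference_with_local_buffer above.

    // Hypothetical XRT host sketch for launching inference_top (not host/run_inference.cc).
    #include <cstddef>
    #include <vector>
    #include <xrt/xrt_bo.h>
    #include <xrt/xrt_device.h>
    #include <xrt/xrt_kernel.h>

    int main() {
      xrt::device device(0);                                   // first accelerator card
      auto uuid = device.load_xclbin("inference_top.xclbin");  // placeholder file name
      auto krnl = xrt::kernel(device, uuid, "inference_top");  // unmangled kernel symbol

      // Sizes (in floats) of x, the four (weight, bias) pairs, and y, taken from the code above.
      const std::size_t sizes[] = {1 * 28 * 28, 4 * 1 * 3 * 3, 4, 8 * 4 * 3 * 3, 8,
                                   32 * 392, 32, 10 * 32, 10, 10};
      std::vector<xrt::bo> bos;
      for (int i = 0; i < 10; ++i)
        bos.emplace_back(device, sizes[i] * sizeof(float), krnl.group_id(i));

      // ... write the input image and trained parameters into bos[0..8], then:
      for (int i = 0; i < 9; ++i) bos[i].sync(XCL_BO_SYNC_BO_TO_DEVICE);

      auto run = krnl(bos[0], bos[1], bos[2], bos[3], bos[4],
                      bos[5], bos[6], bos[7], bos[8], bos[9]);
      run.wait();

      bos[9].sync(XCL_BO_SYNC_BO_FROM_DEVICE);  // read back the 10 class scores
      return 0;
    }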
/tests/hls/inference/inference_test.cc: -------------------------------------------------------------------------------- 1 | #include "inference_hls.h" 2 | 3 | #include <cstddef> 4 | #include <cstdint> 5 | #include <cstdio> 6 | #include <cstdlib> 7 | #include <map> 8 | #include <string> 9 | #include <vector> 10 | 11 | #include <torch/script.h> 12 | #include <torch/torch.h> 13 | 14 | #include <tests/util.h> 15 | 16 | #ifndef PROJECT_ROOT 17 | #error "PROJECT_ROOT is not defined" 18 | #endif 19 | 20 | #ifndef TOP_FUNC 21 | #error "TOP_FUNC is not defined" 22 | #endif 23 | 24 | static const std::size_t kMaxSize = 16384; 25 | 26 | using namespace dnnk; 27 | namespace F = torch::nn::functional; 28 | 29 | int main() { 30 | // Seeds must be fixed because the testbench is executed twice in 31 | // the cosimulation. 32 | torch::manual_seed(0); 33 | 34 | 35 | float x[kMaxSize], y[kMaxSize]; 36 | std::map<std::string, std::vector<float> > params; 37 | 38 | // load model file 39 | auto model = torch::jit::load(PROJECT_ROOT "/learning/traced_model.pt"); 40 | 41 | // load parameter values from model 42 | for (const auto& param_ref : model.named_parameters()) { 43 | 44 | // use param_ref.name as key (ex: "conv1.weight") 45 | params[param_ref.name].resize(param_ref.value.numel()); 46 | 47 | // copy image data 48 | tensor2array(param_ref.value, params[param_ref.name].data()); 49 | } 50 | 51 | // read MNIST dataset 52 | auto dataset = torch::data::datasets::MNIST(PROJECT_ROOT "/learning/data/MNIST/raw") 53 | .map(torch::data::transforms::Stack<>()); 54 | 55 | // define loader and set batch_size to 1 56 | auto data_loader = 57 | torch::data::make_data_loader(std::move(dataset), 58 | torch::data::DataLoaderOptions().batch_size(1)); 59 | 60 | // iterate data_loader 61 | std::size_t niters = 0; 62 | for (auto& batch : *data_loader) { 63 | 64 | auto x_ref = batch.data; // shape = (1, 1, 28, 28) 65 | auto y_label = batch.target; // shape = (1) 66 | 67 | // run inference 68 | tensor2array(x_ref, x); 69 | TOP_FUNC (x, 70 | params.at("conv1.weight").data(), params.at("conv1.bias").data(), 71 | params.at("conv2.weight").data(), params.at("conv2.bias").data(), 72 | params.at("fc1.weight").data(), params.at("fc1.bias").data(), 73 | params.at("fc2.weight").data(), params.at("fc2.bias").data(), 74 | y); 75 | 76 | 77 | // run inference in pytorch 78 | std::vector<torch::jit::IValue> inputs; 79 | inputs.push_back(x_ref); 80 | auto y_ref = model.forward(inputs).toTensor(); 81 | 82 | if (!verify(y, y_ref)) { 83 | printf("%sFailed%s\n", Color::red, Color::reset); 84 | return 1; 85 | } 86 | 87 | if (++niters == 4) { 88 | break; 89 | } 90 | } 91 | 92 | printf("%sSucceed!%s\n", Color::green, Color::reset); 93 | return 0; 94 | } 95 | -------------------------------------------------------------------------------- /tests/hls/linear/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(hls_src linear_hls.cc) 2 | set(test_src linear_test.cc) 3 | 4 | add_test_and_impl(linear_hls linear_hls HLS_SRC ${hls_src} TB_SRC ${test_src} CXXFLAGS "-DTOP_FUNC=linear_hls") 5 | add_test_and_impl(linear_opt_2_hls linear_opt_2_hls HLS_SRC ${hls_src} TB_SRC ${test_src} CXXFLAGS "-DTOP_FUNC=linear_opt_2_hls") 6 | add_test_and_impl(linear_opt_3_hls linear_opt_3_hls HLS_SRC ${hls_src} TB_SRC ${test_src} CXXFLAGS "-DTOP_FUNC=linear_opt_3_hls") 7 | -------------------------------------------------------------------------------- /tests/hls/linear/linear_hls.cc: -------------------------------------------------------------------------------- 1 | #include "dnn-kernel/linear.h" 2 | 3 | #include <cstddef> 4 | #include <cstdint> 5 | 6 | static const std::size_t kMaxSize = 65536; 7 | 8 |
void linear_hls(const float x[kMaxSize], const float weight[kMaxSize], const float bias[kMaxSize], int32_t in_features, int32_t out_features, float y[kMaxSize]) { 9 | 10 | dnnk::linear(x, weight, bias, in_features, out_features, y); 11 | } 12 | 13 | void linear_opt_2_hls(const float x[kMaxSize], const float weight[kMaxSize], const float bias[kMaxSize], int32_t in_features, int32_t out_features, float y[kMaxSize]) { 14 | 15 | dnnk::linear_opt<2>(x, weight, bias, in_features, out_features, y); 16 | } 17 | 18 | void linear_opt_3_hls(const float x[kMaxSize], const float weight[kMaxSize], const float bias[kMaxSize], int32_t in_features, int32_t out_features, float y[kMaxSize]) { 19 | 20 | dnnk::linear_opt<3>(x, weight, bias, in_features, out_features, y); 21 | } 22 | -------------------------------------------------------------------------------- /tests/hls/linear/linear_hls.h: -------------------------------------------------------------------------------- 1 | #ifndef DNNKERNEL_TEST_LINEAR_HLS_H 2 | #define DNNKERNEL_TEST_LINEAR_HLS_H 3 | 4 | #include <cstdint> 5 | 6 | void linear_hls(const float *x, const float* weight, const float* bias, int32_t in_features, int32_t out_features, float *y); 7 | void linear_opt_2_hls(const float *x, const float* weight, const float* bias, int32_t in_features, int32_t out_features, float *y); 8 | void linear_opt_3_hls(const float *x, const float* weight, const float* bias, int32_t in_features, int32_t out_features, float *y); 9 | 10 | #endif  // DNNKERNEL_TEST_LINEAR_HLS_H 11 | -------------------------------------------------------------------------------- /tests/hls/linear/linear_test.cc: -------------------------------------------------------------------------------- 1 | #include "linear_hls.h" 2 | 3 | #include <cstddef> 4 | #include <cstdint> 5 | #include <cstdio> 6 | #include <cstdlib> 7 | 8 | #include <torch/torch.h> 9 | 10 | #include <tests/util.h> 11 | 12 | #ifndef TOP_FUNC 13 | #error "TOP_FUNC is not defined" 14 | #endif 15 | 16 | static const std::size_t kMaxSize = 65536; 17 | 18 | using namespace dnnk; 19 | namespace F = torch::nn::functional; 20 | 21 | int main() { 22 | // Seeds must be fixed because the testbench is executed twice in 23 | // the cosimulation.
24 | torch::manual_seed(0); 25 | 26 | int in_features = 32, out_features = 16; 27 | 28 | auto x_ref = torch::randn({1, in_features}); 29 | auto weight_ref = torch::randn({out_features, in_features}); 30 | auto bias_ref = torch::randn({out_features}); 31 | 32 | float x[kMaxSize], weight[kMaxSize], bias[kMaxSize], y[kMaxSize]; 33 | tensor2array(x_ref, x); 34 | tensor2array(weight_ref, weight); 35 | tensor2array(bias_ref, bias); 36 | 37 | auto y_ref = F::linear(x_ref, weight_ref, bias_ref); 38 | TOP_FUNC (x, weight, bias, in_features, out_features, y); 39 | 40 | if (!verify(y, y_ref)) { 41 | printf("%sFailed%s\n", Color::red, Color::reset); 42 | return 1; 43 | } 44 | 45 | printf("%sSucceed!%s\n", Color::green, Color::reset); 46 | return 0; 47 | } 48 | -------------------------------------------------------------------------------- /tests/hls/maxpool2d/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(hls_src maxpool2d_hls.cc) 2 | set(test_src maxpool2d_test.cc) 3 | 4 | add_test_and_impl(maxpool2d_hls maxpool2d_hls HLS_SRC ${hls_src} TB_SRC ${test_src}) 5 | -------------------------------------------------------------------------------- /tests/hls/maxpool2d/maxpool2d_hls.cc: -------------------------------------------------------------------------------- 1 | #include "dnn-kernel/maxpool2d.h" 2 | 3 | #include <cstddef> 4 | #include <cstdint> 5 | 6 | static const std::size_t kMaxSize = 65536; 7 | 8 | void maxpool2d_hls(const float x[kMaxSize], int32_t width, int32_t height, int32_t channels, int32_t stride, float y[kMaxSize]) { 9 | 10 | dnnk::maxpool2d(x, width, height, channels, stride, y); 11 | } 12 | -------------------------------------------------------------------------------- /tests/hls/maxpool2d/maxpool2d_hls.h: -------------------------------------------------------------------------------- 1 | #ifndef DNNKERNEL_TEST_MAXPOOL2D_HLS_H 2 | #define DNNKERNEL_TEST_MAXPOOL2D_HLS_H 3 | 4 | #include <cstdint> 5 | 6 | void maxpool2d_hls(const float* x, int32_t width, int32_t height, int32_t channels, int32_t stride, float* y); 7 | 8 | #endif  // DNNKERNEL_TEST_MAXPOOL2D_HLS_H 9 | -------------------------------------------------------------------------------- /tests/hls/maxpool2d/maxpool2d_test.cc: -------------------------------------------------------------------------------- 1 | #include "maxpool2d_hls.h" 2 | 3 | #include <cstddef> 4 | #include <cstdint> 5 | #include <cstdio> 6 | #include <cstdlib> 7 | 8 | #include <torch/torch.h> 9 | 10 | #include <tests/util.h> 11 | 12 | static const std::size_t kMaxSize = 65536; 13 | 14 | using namespace dnnk; 15 | namespace F = torch::nn::functional; 16 | 17 | int main() { 18 | // Seeds must be fixed because the testbench is executed twice in 19 | // the cosimulation.
20 | torch::manual_seed(0); 21 | 22 | int h = 32, w = 32, channels = 4, stride = 2; 23 | 24 | auto x_ref = torch::randn({1, channels, h, w}); 25 | 26 | float x[kMaxSize], y[kMaxSize]; 27 | tensor2array(x_ref, x); 28 | 29 | auto y_ref = F::detail::max_pool2d(x_ref, stride, stride, 0, 1, false); 30 | maxpool2d_hls(x, w, h, channels, stride, y); 31 | 32 | if (!verify(y, y_ref)) { 33 | printf("%sFailed%s\n", Color::red, Color::reset); 34 | return 1; 35 | } 36 | 37 | printf("%sSucceed!%s\n", Color::green, Color::reset); 38 | return 0; 39 | } 40 | -------------------------------------------------------------------------------- /tests/hls/relu/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(hls_src relu_hls.cc) 2 | set(test_src relu_test.cc) 3 | 4 | add_test_and_impl(relu_hls relu_hls HLS_SRC ${hls_src} TB_SRC ${test_src}) 5 | -------------------------------------------------------------------------------- /tests/hls/relu/relu_hls.cc: -------------------------------------------------------------------------------- 1 | #include "dnn-kernel/relu.h" 2 | 3 | #include <cstddef> 4 | #include <cstdint> 5 | 6 | void relu_hls(const float x[1000], int64_t size, float y[1000]) { 7 | 8 | dnnk::relu(x, size, y); 9 | } 10 | -------------------------------------------------------------------------------- /tests/hls/relu/relu_hls.h: -------------------------------------------------------------------------------- 1 | #ifndef DNNKERNEL_TEST_RELU_HLS_H 2 | #define DNNKERNEL_TEST_RELU_HLS_H 3 | 4 | #include <cstdint> 5 | 6 | void relu_hls(const float* x, int64_t size, float* y); 7 | 8 | #endif  // DNNKERNEL_TEST_RELU_HLS_H 9 | -------------------------------------------------------------------------------- /tests/hls/relu/relu_test.cc: -------------------------------------------------------------------------------- 1 | #include "relu_hls.h" 2 | 3 | #include <cstddef> 4 | #include <cstdint> 5 | #include <cstdio> 6 | 7 | #include <torch/torch.h> 8 | 9 | #include <tests/util.h> 10 | 11 | using namespace dnnk; 12 | namespace F = torch::nn::functional; 13 | 14 | int main() { 15 | // Seeds must be fixed because the testbench is executed twice in 16 | // the cosimulation.
17 | torch::manual_seed(0); 18 | 19 | const std::size_t size_max = 1000; 20 | auto x_ref = torch::randn({28, 28, 1}); 21 | float x[size_max], y[size_max]; 22 | tensor2array(x_ref, x); 23 | 24 | relu_hls(x, x_ref.numel(), y); 25 | auto y_ref = F::detail::relu(x_ref, false); 26 | 27 | if (!verify(y, y_ref)) { 28 | printf("%sFailed%s\n", Color::red, Color::reset); 29 | return 1; 30 | } 31 | 32 | printf("%sSucceed!%s\n", Color::green, Color::reset); 33 | return 0; 34 | } 35 | -------------------------------------------------------------------------------- /tests/hls/tb.tcl: -------------------------------------------------------------------------------- 1 | set mode [lindex $argv 2] 2 | set name [lindex $argv 3] 3 | set hls_srcs [lindex $argv 4] 4 | set top [lindex $argv 5] 5 | set chip_part [lindex $argv 6] 6 | set cxxflags [lindex $argv 7] 7 | set ldflags [lindex $argv 8] 8 | set test_srcs [lindex $argv 9] 9 | set test_args [lindex $argv 10] 10 | 11 | 12 | open_project -reset ${name} 13 | 14 | regsub "cxxflags=" $cxxflags {} cxxflags 15 | regsub "ldflags=" $ldflags {} ldflags 16 | set test_cxxflags "${cxxflags} -std=c++14 -fopenmp" 17 | 18 | set_top ${top} 19 | add_files ${hls_srcs} -cflags "${cxxflags}" 20 | 21 | open_solution "solution1" 22 | set_part ${chip_part} 23 | create_clock -period 3.33 -name default 24 | 25 | csynth_design 26 | 27 | if {${mode} == "cosim"} { 28 | add_files -tb ${test_srcs} -cflags "${test_cxxflags}" 29 | cosim_design -trace_level port -ldflags "${ldflags}" -argv "${test_args}" 30 | } 31 | 32 | if {${mode} == "impl"} { 33 | export_design -flow impl -rtl verilog -format ip_catalog 34 | } 35 | 36 | if {${mode} == "xo"} { 37 | config_rtl -kernel_profile 38 | config_sdx -target xocc -profile true 39 | export_design -flow impl -rtl verilog -format ip_catalog -xo ${name}.xo 40 | } 41 | 42 | exit 43 | -------------------------------------------------------------------------------- /tests/ref/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | file(GLOB TEST_SRCS *.cc) 2 | 3 | foreach(test_path ${TEST_SRCS}) 4 | get_filename_component(test_file ${test_path} NAME) 5 | string(REPLACE ".cc" "" test_name ${test_file}_ref) 6 | add_executable(${test_name} ${test_file}) 7 | add_dependencies(${test_name} googletest) 8 | target_include_directories(${test_name} PRIVATE ${DNNK_INCLUDE_DIRS} ${TORCH_INCLUDE_DIRS} ${GTEST_INCLUDE_DIRS}) 9 | target_compile_options(${test_name} PRIVATE ${DNNK_CXX_FLAGS} "-DPROJECT_ROOT=\"${CMAKE_SOURCE_DIR}\"") 10 | target_compile_features(${test_name} PRIVATE cxx_std_14) 11 | target_link_libraries(${test_name} PRIVATE ${TORCH_LIBRARIES} ${GTEST_LIBRARIES}) 12 | 13 | add_test( 14 | NAME ${test_name} 15 | COMMAND ${test_name} 16 | ) 17 | 18 | endforeach() 19 | -------------------------------------------------------------------------------- /tests/ref/conv2d.cc: -------------------------------------------------------------------------------- 1 | #include "dnn-kernel/conv2d.h" 2 | 3 | #include <cstddef> 4 | #include <cstdint> 5 | #include <vector> 6 | 7 | #include <gtest/gtest.h> 8 | 9 | #include <torch/torch.h> 10 | 11 | #include <tests/util.h> 12 | 13 | using namespace dnnk; 14 | namespace F = torch::nn::functional; 15 | 16 | TEST(CPUVerify, Conv2d) { 17 | torch::manual_seed(0); 18 | 19 | int h = 14, w = 14, in_channels = 4, out_channels = 8, ksize = 3; 20 | 21 | auto x_ref = torch::randn({1, in_channels, h, w}); 22 | auto weight_ref = torch::randn({out_channels, in_channels, ksize, ksize}); 23 | auto bias_ref = torch::randn({out_channels}); 24 | 25 | std::vector<float> x(x_ref.numel());
26 | std::vector<float> weight(weight_ref.numel()); 27 | std::vector<float> bias(out_channels); 28 | tensor2array(x_ref, x.data()); 29 | tensor2array(weight_ref, weight.data()); 30 | tensor2array(bias_ref, bias.data()); 31 | 32 | auto y_ref = F::detail::conv2d(x_ref, weight_ref, bias_ref, 1, ksize/2, 1, 1); 33 | std::vector<float> y(y_ref.numel()); 34 | conv2d(x.data(), weight.data(), bias.data(), w, h, in_channels, out_channels, ksize, y.data()); 35 | 36 | EXPECT_TRUE(verify(y.data(), y_ref)); 37 | } 38 | 39 | int main(int argc, char** argv) { 40 | ::testing::InitGoogleTest(&argc, argv); 41 | ::testing::FLAGS_gtest_death_test_style = "threadsafe"; 42 | return RUN_ALL_TESTS(); 43 | } 44 | -------------------------------------------------------------------------------- /tests/ref/inference.cc: -------------------------------------------------------------------------------- 1 | #include "dnn-kernel/inference.h" 2 | 3 | #include <algorithm> 4 | #include <map> 5 | #include <vector> 6 | 7 | #include <gtest/gtest.h> 8 | 9 | #include <torch/script.h> 10 | #include <torch/torch.h> 11 | 12 | #include <tests/util.h> 13 | 14 | #ifndef PROJECT_ROOT 15 | #error "PROJECT_ROOT is not defined" 16 | #endif 17 | 18 | using namespace dnnk; 19 | namespace F = torch::nn::functional; 20 | 21 | TEST(CPUVerify, Inference) { 22 | std::vector<float> x, y; 23 | std::map<std::string, std::vector<float> > params; 24 | 25 | // load model file 26 | auto model = torch::jit::load(PROJECT_ROOT "/learning/traced_model.pt"); 27 | 28 | // load parameter values from model 29 | for (const auto& param_ref : model.named_parameters()) { 30 | 31 | // use param_ref.name as key (ex: "conv1.weight") 32 | params[param_ref.name].resize(param_ref.value.numel()); 33 | 34 | // copy image data 35 | tensor2array(param_ref.value, params[param_ref.name].data()); 36 | } 37 | 38 | // read MNIST dataset 39 | auto dataset = torch::data::datasets::MNIST(PROJECT_ROOT "/learning/data/MNIST/raw") 40 | .map(torch::data::transforms::Stack<>()); 41 | 42 | // define loader and set batch_size to 1 43 | auto data_loader = 44 | torch::data::make_data_loader(std::move(dataset), 45 | torch::data::DataLoaderOptions().batch_size(1)); 46 | 47 | // iterate data_loader 48 | std::size_t num_data = 0; 49 | std::size_t num_corrects_ref = 0; 50 | std::size_t num_corrects = 0; 51 | for (auto& batch : *data_loader) { 52 | 53 | auto x_ref = batch.data; // shape = (1, 1, 28, 28) 54 | auto y_label = batch.target; // shape = (1) 55 | 56 | // run inference in pytorch 57 | std::vector<torch::jit::IValue> inputs; 58 | inputs.push_back(x_ref); 59 | auto y_ref = model.forward(inputs).toTensor(); 60 | 61 | x.resize(x_ref.numel()); 62 | y.resize(y_ref.numel()); 63 | 64 | // run inference 65 | tensor2array(x_ref, x.data()); 66 | inference(x.data(), 67 | params.at("conv1.weight").data(), params.at("conv1.bias").data(), 68 | params.at("conv2.weight").data(), params.at("conv2.bias").data(), 69 | params.at("fc1.weight").data(), params.at("fc1.bias").data(), 70 | params.at("fc2.weight").data(), params.at("fc2.bias").data(), 71 | y.data()); 72 | 73 | EXPECT_TRUE(verify(y.data(), y_ref)); 74 | 75 | // summarize 76 | num_data++; 77 | 78 | if (y_label.data_ptr<int64_t>()[0] == y_ref.argmax().data_ptr<int64_t>()[0]) { 79 | num_corrects_ref++; 80 | } 81 | 82 | auto argmax = [](const std::vector<float>& vec) { 83 | return std::distance(vec.begin(), std::max_element(vec.begin(), vec.end())); 84 | }; 85 | 86 | if (y_label.data_ptr<int64_t>()[0] == argmax(y)) { 87 | num_corrects++; 88 | } 89 | } 90 | 91 | std::cout << "accuracy (ref): " << double(num_corrects_ref) / num_data << std::endl; 92 | std::cout << "accuracy: " << double(num_corrects) / num_data << std::endl; 93 | } 94 | 95 | int main(int
argc, char** argv) { 96 | ::testing::InitGoogleTest(&argc, argv); 97 | ::testing::FLAGS_gtest_death_test_style = "threadsafe"; 98 | return RUN_ALL_TESTS(); 99 | } 100 | -------------------------------------------------------------------------------- /tests/ref/linear.cc: -------------------------------------------------------------------------------- 1 | #include "dnn-kernel/linear.h" 2 | 3 | #include <cstddef> 4 | #include <cstdint> 5 | #include <vector> 6 | 7 | #include <gtest/gtest.h> 8 | 9 | #include <torch/torch.h> 10 | 11 | #include <tests/util.h> 12 | 13 | using namespace dnnk; 14 | namespace F = torch::nn::functional; 15 | 16 | TEST(CPUVerify, Linear) { 17 | torch::manual_seed(0); 18 | 19 | int in_channels = 32, out_channels = 16; 20 | 21 | auto x_ref = torch::randn({1, in_channels}); 22 | auto weight_ref = torch::randn({out_channels, in_channels}); 23 | auto bias_ref = torch::randn({out_channels}); 24 | 25 | std::vector<float> x(x_ref.numel()); 26 | std::vector<float> weight(weight_ref.numel()); 27 | std::vector<float> bias(out_channels); 28 | tensor2array(x_ref, x.data()); 29 | tensor2array(weight_ref, weight.data()); 30 | tensor2array(bias_ref, bias.data()); 31 | 32 | auto y_ref = F::linear(x_ref, weight_ref, bias_ref); 33 | std::vector<float> y(y_ref.numel()); 34 | linear(x.data(), weight.data(), bias.data(), in_channels, out_channels, y.data()); 35 | 36 | EXPECT_TRUE(verify(y.data(), y_ref)); 37 | } 38 | 39 | int main(int argc, char** argv) { 40 | ::testing::InitGoogleTest(&argc, argv); 41 | ::testing::FLAGS_gtest_death_test_style = "threadsafe"; 42 | return RUN_ALL_TESTS(); 43 | } 44 | -------------------------------------------------------------------------------- /tests/ref/maxpool2d.cc: -------------------------------------------------------------------------------- 1 | #include "dnn-kernel/maxpool2d.h" 2 | 3 | #include <cstddef> 4 | #include <cstdint> 5 | #include <vector> 6 | 7 | #include <gtest/gtest.h> 8 | 9 | #include <torch/torch.h> 10 | 11 | #include <tests/util.h> 12 | 13 | using namespace dnnk; 14 | namespace F = torch::nn::functional; 15 | 16 | TEST(CPUVerify, Maxpool2d) { 17 | torch::manual_seed(0); 18 | 19 | int h = 32, w = 32, channels = 4, stride = 2; 20 | 21 | auto x_ref = torch::randn({1, channels, h, w}); 22 | 23 | std::vector<float> x(x_ref.numel()); 24 | tensor2array(x_ref, x.data()); 25 | 26 | auto y_ref = F::detail::max_pool2d(x_ref, stride, stride, 0, 1, false); 27 | std::vector<float> y(y_ref.numel()); 28 | maxpool2d(x.data(), w, h, channels, stride, y.data()); 29 | 30 | EXPECT_TRUE(verify(y.data(), y_ref)); 31 | } 32 | 33 | int main(int argc, char** argv) { 34 | ::testing::InitGoogleTest(&argc, argv); 35 | ::testing::FLAGS_gtest_death_test_style = "threadsafe"; 36 | return RUN_ALL_TESTS(); 37 | } 38 | -------------------------------------------------------------------------------- /tests/ref/relu.cc: -------------------------------------------------------------------------------- 1 | #include "dnn-kernel/relu.h" 2 | 3 | #include <cstddef> 4 | #include <cstdint> 5 | #include <vector> 6 | 7 | #include <gtest/gtest.h> 8 | 9 | #include <torch/torch.h> 10 | 11 | #include <tests/util.h> 12 | 13 | using namespace dnnk; 14 | namespace F = torch::nn::functional; 15 | 16 | TEST(CPUVerify, ReLU) { 17 | auto x_ref = torch::randn({28, 28, 1}); 18 | const float* x = tensor2array(x_ref); 19 | float* y = new float[x_ref.numel()]; 20 | 21 | dnnk::relu(x, x_ref.numel(), y); 22 | auto y_ref = F::detail::relu(x_ref, false); 23 | 24 | EXPECT_TRUE(verify(y, y_ref)); 25 | } 26 | 27 | int main(int argc, char** argv) { 28 | ::testing::InitGoogleTest(&argc, argv); 29 | ::testing::FLAGS_gtest_death_test_style = "threadsafe"; 30 | return RUN_ALL_TESTS(); 31 | } 32 |
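Note: the five reference tests above share one skeleton: seed PyTorch, build random inputs, copy them into plain arrays with tensor2array, run the dnnk kernel and the corresponding torch op, and compare the outputs with verify. The snippet below is a hypothetical template for adding another test in this style; the kernel under test is faked with an inline doubling loop (and a matching x_ref * 2 torch reference) purely so the snippet is self-contained, and the <tests/util.h> include path is an assumption.

    // Hypothetical skeleton for a new CPU reference test (op and names are placeholders).
    #include <cstddef>
    #include <vector>

    #include <gtest/gtest.h>

    #include <torch/torch.h>

    #include <tests/util.h>

    TEST(CPUVerify, Skeleton) {
      torch::manual_seed(0);  // deterministic inputs

      auto x_ref = torch::randn({16});
      std::vector<float> x(x_ref.numel()), y(x_ref.numel());
      dnnk::tensor2array(x_ref, x.data());

      // Stand-in for a dnnk kernel: y[i] = 2 * x[i].
      for (std::size_t i = 0; i < x.size(); ++i) y[i] = 2.0f * x[i];

      // Matching stand-in reference computed with torch.
      auto y_ref = x_ref * 2;

      EXPECT_TRUE(dnnk::verify(y.data(), y_ref));
    }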
-------------------------------------------------------------------------------- /tests/util.h: -------------------------------------------------------------------------------- 1 | #ifndef DNNKERNEL_TEST_UTIL_H 2 | #define DNNKERNEL_TEST_UTIL_H 3 | 4 | #include "torch/torch.h" 5 | 6 | namespace dnnk { 7 | namespace { 8 | 9 | struct Color { 10 | static constexpr const char* red = "\u001b[31m"; 11 | static constexpr const char* green = "\u001b[32m"; 12 | static constexpr const char* reset = "\u001b[0m"; 13 | }; 14 | 15 | float* tensor2array(const torch::Tensor& tensor) { 16 | float* ret = new float[tensor.numel()]; 17 | std::memcpy(ret, tensor.data_ptr(), tensor.nbytes()); 18 | return ret; 19 | } 20 | 21 | void tensor2array(const torch::Tensor& tensor, float* array) { 22 | std::memcpy(array, tensor.data_ptr(), tensor.nbytes()); 23 | } 24 | 25 | bool verify(const float* actual, const torch::Tensor& expect) { 26 | const float tolerance = 10e-5f; 27 | auto expect_ptr = expect.data_ptr<float>(); 28 | 29 | for (auto i = decltype(expect.numel())(0); i < expect.numel(); ++i) { 30 | if (std::abs(actual[i] - expect_ptr[i]) >= tolerance) { 31 | std::cout << i << " : " << actual[i] << " vs " << expect_ptr[i] 32 | << std::endl; 33 | return false; 34 | } 35 | } 36 | 37 | return true; 38 | } 39 | 40 | }  // namespace 41 | }  // namespace dnnk 42 | 43 | #endif  // DNNKERNEL_TEST_UTIL_H 44 | -------------------------------------------------------------------------------- /thirdparty/download.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | wget https://download.pytorch.org/libtorch/cpu/libtorch-cxx11-abi-shared-with-deps-1.4.0%2Bcpu.zip 4 | wget https://github.com/google/googletest/archive/release-1.10.0.zip 5 | wget https://github.com/Kitware/CMake/releases/download/v3.16.8/cmake-3.16.8-Linux-x86_64.tar.gz 6 | 7 | tar -xf cmake-3.16.8-Linux-x86_64.tar.gz 8 | --------------------------------------------------------------------------------
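Note: download.sh fetches pinned versions of libtorch (1.4.0, CPU-only), googletest (1.10.0), and CMake (3.16.8) but extracts only the CMake tarball. How the two zip archives are consumed is not shown in this file, so the manual unpack below is an assumption; the glob also sidesteps the fact that wget may keep the percent-encoded "%2B" in the libtorch file name.

    #!/bin/sh
    # Hypothetical manual unpack, run from thirdparty/ after download.sh.
    unzip -q ./libtorch-cxx11-abi-shared-with-deps-1.4.0*cpu.zip  # -> ./libtorch
    unzip -q ./release-1.10.0.zip              # -> ./googletest-release-1.10.0
    ./cmake-3.16.8-Linux-x86_64/bin/cmake --version  # bundled CMake, already extracted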