├── .clang-format
├── .gitignore
├── CMakeLists.txt
├── LICENSE
├── README.md
├── doc
│   └── acri-room-howto.md
├── host
│   ├── CMakeLists.txt
│   ├── host_util.h
│   ├── link_u200_template.ini
│   ├── link_u50_template.ini
│   └── run_inference.cc
├── include
│   └── dnn-kernel
│       ├── conv2d.h
│       ├── inference.h
│       ├── linear.h
│       ├── maxpool2d.h
│       └── relu.h
├── learning
│   ├── requirements.txt
│   └── train_mnist.py
├── tests
│   ├── CMakeLists.txt
│   ├── hls
│   │   ├── CMakeLists.txt
│   │   ├── conv2d
│   │   │   ├── CMakeLists.txt
│   │   │   ├── conv2d_hls.cc
│   │   │   ├── conv2d_hls.h
│   │   │   └── conv2d_test.cc
│   │   ├── inference
│   │   │   ├── CMakeLists.txt
│   │   │   ├── inference_hls.cc
│   │   │   ├── inference_hls.h
│   │   │   └── inference_test.cc
│   │   ├── linear
│   │   │   ├── CMakeLists.txt
│   │   │   ├── linear_hls.cc
│   │   │   ├── linear_hls.h
│   │   │   └── linear_test.cc
│   │   ├── maxpool2d
│   │   │   ├── CMakeLists.txt
│   │   │   ├── maxpool2d_hls.cc
│   │   │   ├── maxpool2d_hls.h
│   │   │   └── maxpool2d_test.cc
│   │   ├── relu
│   │   │   ├── CMakeLists.txt
│   │   │   ├── relu_hls.cc
│   │   │   ├── relu_hls.h
│   │   │   └── relu_test.cc
│   │   └── tb.tcl
│   ├── ref
│   │   ├── CMakeLists.txt
│   │   ├── conv2d.cc
│   │   ├── inference.cc
│   │   ├── linear.cc
│   │   ├── maxpool2d.cc
│   │   └── relu.cc
│   └── util.h
└── thirdparty
    └── download.sh

--------------------------------------------------------------------------------
/.clang-format:
--------------------------------------------------------------------------------
---
Language: Cpp
# BasedOnStyle: Google
AccessModifierOffset: -1
AlignAfterOpenBracket: Align
AlignConsecutiveAssignments: false
AlignConsecutiveDeclarations: false
AlignEscapedNewlines: Left
AlignOperands: true
AlignTrailingComments: true
AllowAllParametersOfDeclarationOnNextLine: true
AllowShortBlocksOnASingleLine: false
AllowShortCaseLabelsOnASingleLine: false
AllowShortFunctionsOnASingleLine: All
AllowShortIfStatementsOnASingleLine: true
AllowShortLoopsOnASingleLine: true
AlwaysBreakAfterDefinitionReturnType: None
AlwaysBreakAfterReturnType: None
AlwaysBreakBeforeMultilineStrings: true
AlwaysBreakTemplateDeclarations: true
BinPackArguments: true
BinPackParameters: true
BraceWrapping:
  AfterClass: false
  AfterControlStatement: false
  AfterEnum: false
  AfterFunction: false
  AfterNamespace: false
  AfterObjCDeclaration: false
  AfterStruct: false
  AfterUnion: false
  AfterExternBlock: false
  BeforeCatch: false
  BeforeElse: false
  IndentBraces: false
  SplitEmptyFunction: true
  SplitEmptyRecord: true
  SplitEmptyNamespace: true
BreakBeforeBinaryOperators: None
BreakBeforeBraces: Attach
BreakBeforeInheritanceComma: false
BreakBeforeTernaryOperators: true
BreakConstructorInitializersBeforeComma: false
BreakConstructorInitializers: BeforeColon
BreakAfterJavaFieldAnnotations: false
BreakStringLiterals: true
ColumnLimit: 80
CommentPragmas: '^ IWYU pragma:'
CompactNamespaces: false
ConstructorInitializerAllOnOneLineOrOnePerLine: true
ConstructorInitializerIndentWidth: 4
ContinuationIndentWidth: 4
Cpp11BracedListStyle: true
DerivePointerAlignment: true
DisableFormat: false
ExperimentalAutoDetectBinPacking: false
FixNamespaceComments: true
ForEachMacros:
  - foreach
  - Q_FOREACH
  - BOOST_FOREACH
IncludeBlocks: Preserve
IncludeCategories:
  - Regex: '^<ext/.*\.h>'
    Priority: 2
  - Regex: '^<.*\.h>'
    Priority: 1
  - Regex: '^<.*'
    Priority: 2
  - Regex: '.*'
    Priority: 3
IncludeIsMainRegex: '([-_](test|unittest))?$'
IndentCaseLabels: true
IndentPPDirectives: None
IndentWidth: 2
IndentWrappedFunctionNames: false
JavaScriptQuotes: Leave
JavaScriptWrapImports: true
KeepEmptyLinesAtTheStartOfBlocks: false
MacroBlockBegin: ''
MacroBlockEnd: ''
MaxEmptyLinesToKeep: 1
NamespaceIndentation: None
ObjCBlockIndentWidth: 2
ObjCSpaceAfterProperty: false
ObjCSpaceBeforeProtocolList: false
PenaltyBreakAssignment: 2
PenaltyBreakBeforeFirstCallParameter: 1
PenaltyBreakComment: 300
PenaltyBreakFirstLessLess: 120
PenaltyBreakString: 1000
PenaltyExcessCharacter: 1000000
PenaltyReturnTypeOnItsOwnLine: 200
PointerAlignment: Left
RawStringFormats:
  - Delimiter: pb
    Language: TextProto
    BasedOnStyle: google
ReflowComments: true
SortIncludes: true
SortUsingDeclarations: true
SpaceAfterCStyleCast: false
SpaceAfterTemplateKeyword: true
SpaceBeforeAssignmentOperators: true
SpaceBeforeParens: ControlStatements
SpaceInEmptyParentheses: false
SpacesBeforeTrailingComments: 2
SpacesInAngles: false
SpacesInContainerLiterals: true
SpacesInCStyleCastParentheses: false
SpacesInParentheses: false
SpacesInSquareBrackets: false
Standard: Auto
TabWidth: 8
UseTab: Never
...

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
build*

--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
cmake_minimum_required(VERSION 3.11 FATAL_ERROR)
enable_testing()

# Project
set(PROJECT_NAME dnn-kernel)
project(${PROJECT_NAME} LANGUAGES C CXX)


# Default to Debug build type
if(NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE Debug CACHE STRING "Build type" FORCE)
endif()

# Project settings
set(DNNK_INCLUDE_DIRS ${PROJECT_SOURCE_DIR} ${PROJECT_SOURCE_DIR}/include)
set(DNNK_CXX_FLAGS "-g" "-Wall" "-Wno-uninitialized" "-Wno-unused-function" "-Wno-unknown-pragmas")

# Vivado HLS
set(VIVADO_HLS_ROOT "/opt/Xilinx/VivadoHLS/2019.1" CACHE STRING "Path to Vivado HLS root directory")
set(VHLS_INCLUDE_DIRS ${VIVADO_HLS_ROOT}/include)
set(XILINX_XRT "/opt/xilinx/xrt")

# Target board
set(TARGET_BOARD "u200" CACHE STRING "Select target Alveo board (available: \"u200\", \"u250\", \"u280\", \"u50\")")
if (${TARGET_BOARD} STREQUAL "u200")
  set(CHIP_PART "xcu200-fsgd2104-2-e")
  set(VITIS_PLATFORM "/opt/xilinx/platforms/xilinx_u200_xdma_201830_2/xilinx_u200_xdma_201830_2.xpfm")
elseif (${TARGET_BOARD} STREQUAL "u250")
  set(CHIP_PART "xcu250-figd2104-2L-e")
  set(VITIS_PLATFORM "/opt/xilinx/platforms/xilinx_u250_xdma_201830_2/xilinx_u250_xdma_201830_2.xpfm")
elseif (${TARGET_BOARD} STREQUAL "u280")
  set(CHIP_PART "xcu280-fsvh2892-2L-e")
  set(VITIS_PLATFORM "/opt/xilinx/platforms/xilinx_u280-es1_xdma_201910_1/xilinx_u280-es1_xdma_201910_1.xpfm")
elseif (${TARGET_BOARD} STREQUAL "u50")
  set(CHIP_PART "xcu50-fsvh2104-2-e")
  set(VITIS_PLATFORM "/opt/xilinx/platforms/xilinx_u50_gen3x16_xdma_201920_3/xilinx_u50_gen3x16_xdma_201920_3.xpfm")
else()
  message(FATAL_ERROR "Unknown TARGET_BOARD value \"${TARGET_BOARD}\"")
endif()

# third-party dependencies
include(ExternalProject)
include(FetchContent)

## libtorch
set(LIBTORCH_LOCAL_PATH "file://${CMAKE_SOURCE_DIR}/thirdparty/libtorch-cxx11-abi-shared-with-deps-1.4.0+cpu.zip")
FetchContent_Declare(
  libtorch
  URL ${LIBTORCH_LOCAL_PATH} https://download.pytorch.org/libtorch/cpu/libtorch-cxx11-abi-shared-with-deps-1.4.0%2Bcpu.zip
)
FetchContent_GetProperties(libtorch)
if(NOT libtorch_POPULATED)
  FetchContent_Populate(libtorch)
endif()

list(APPEND CMAKE_PREFIX_PATH ${libtorch_SOURCE_DIR})
set(TORCH_LIBRARY_DIRS ${libtorch_SOURCE_DIR}/lib)
find_package(Torch REQUIRED)

## googletest
set(GTEST_PREFIX ${PROJECT_BINARY_DIR}/thirdparty/googletest)
set(GTEST_INSTALL ${GTEST_PREFIX}/install)
set(GTEST_INCLUDE_DIRS ${GTEST_INSTALL}/include)
set(GTEST_LIBRARY_DIRS ${GTEST_INSTALL}/lib)
set(GTEST_LIBRARIES ${GTEST_INSTALL}/lib/libgtest.a)
set(GTEST_LOCAL_PATH "file://${CMAKE_SOURCE_DIR}/thirdparty/release-1.10.0.zip")

ExternalProject_Add(
  googletest
  PREFIX ${GTEST_PREFIX}
  URL ${GTEST_LOCAL_PATH} https://github.com/google/googletest/archive/release-1.10.0.zip
  CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${GTEST_INSTALL} -DCMAKE_BUILD_TYPE=Release
  BUILD_BYPRODUCTS ${GTEST_LIBRARIES}
)

# Logging
message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
message(STATUS "Path to Vivado HLS: ${VIVADO_HLS_ROOT}")

# tests
add_subdirectory(tests)
add_subdirectory(host)

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2020 Fixstars

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# DNN-Kernel-FPGA

A from-scratch implementation of deep learning for FPGAs

## Overview

This project implements a small convolutional network on an FPGA.
Targeting the MNIST dataset, it runs a network model written entirely from scratch on an Alveo FPGA card.

If you use this code on the ACRi room servers in particular, see [doc/acri-room-howto.md](doc/acri-room-howto.md).

## Development environment
- Ubuntu (>= 18.04)
- Python (>= 3.5.2)
- CMake (>= 3.11)
- Vivado HLS (>= 2019.2)

## Training on MNIST

The steps below use virtualenv; if you use a different Python virtual environment, adjust them accordingly.

```sh
cd learning
virtualenv -p python3 venv
source venv/bin/activate
pip install -r requirements.txt
python train_mnist.py
```

## Build

#### Host application, etc.
```sh
mkdir build && cd build
cmake -DTARGET_BOARD=u200 ../
cmake --build .
```

#### FPGA image

Set the environment variables.
```
$ source /tools/Xilinx/Vitis/2019.2/settings64.sh
$ source /opt/xilinx/xrt/setup.sh
```

Synthesize the bitstream.
This step takes about two hours.
```sh
cmake --build . --target inference_top_hw_xo
cmake --build . --target inference_top_hw
```

The synthesis report can be inspected as follows:
```sh
vitis_analyzer host/inference_top_hw.xclbin.link_summary
```

## Inference

### Running inference

Create `xrt.ini` so that traces are captured.
```sh
echo -e "[Debug]\nprofile=true\ntimeline_trace=true" > xrt.ini
```

The following command runs the inference:
```sh
./host/run_inference ./host/inference_top_hw.xclbin inference_top
```

The run report can be inspected as follows:
```sh
vitis_analyzer inference_top_hw.xclbin.run_summary
```

## Tests

#### Unit tests

Unit tests can be run as follows (shown for ReLU):

```sh
ctest -V -R "relu_ref"        # Test of reference implementation
ctest -V -R "relu_hls_csim"   # C simulation test of HLS implementation
ctest -V -R "relu_hls_cosim"  # C/RTL co-simulation test of HLS implementation
```
--------------------------------------------------------------------------------
/doc/acri-room-howto.md:
--------------------------------------------------------------------------------

# How to run on the ACRi room servers

This document describes how to run the MNIST model built in this repository on an Alveo server in the ACRi room.
The target environment is the `as001` server.

Because the ACRi room servers have no access to external networks, some preparation on your own development machine is required first.


## Steps on your own development machine

1. Clone this repository
2. Run `download.sh` under `thirdparty`
3. Enter `learning` and run the training (the procedure is described in [README.md](../README.md))
4. Compress the whole repository and copy it to your home directory on the ACRi room server

## Steps on the ACRi room server

Log in following the official instructions.
- Servers in general: http://gw.acri.c.titech.ac.jp/wp/manual/how-to-reserve
- Alveo servers: http://gw.acri.c.titech.ac.jp/wp/manual/alveo-server

Because GUI features are used to display the reports, using remote desktop is recommended.

### Preparation

The following assumes the compressed code is located at `/home/<username>/dnn-kernel-fpga.zip`.

First, copy the data onto `/scratch`, which is a fast local directory.
```
$ cp /home/<username>/dnn-kernel-fpga.zip /scratch
```

Move the working directory to `/scratch` and extract the copied file.
```
$ cd /scratch
$ unzip dnn-kernel-fpga.zip
```

Add cmake 3.16.8 to your PATH.
```
$ export PATH=/scratch/dnn-kernel-fpga/thirdparty/cmake-3.16.8-Linux-x86_64/bin:${PATH}
```

From here on, follow the build and inference steps described in [README.md](../README.md).
The MNIST training and test steps cannot be performed on the ACRi room servers.

--------------------------------------------------------------------------------
/host/CMakeLists.txt:
--------------------------------------------------------------------------------

# host
file(GLOB HOST_SRCS *.cc)

find_library(XRT_LIBRARIES NAMES xrt_core PATHS ${XILINX_XRT}/lib)

set(VITIS_INCLUDE_DIRS ${XILINX_XRT}/include ${XILINX_VIVADO}/include)
set(VITIS_LIBRARIES OpenCL pthread ${XRT_LIBRARIES})

add_executable(run_inference ${HOST_SRCS})
target_include_directories(run_inference PRIVATE ${DNNK_INCLUDE_DIRS} ${TORCH_INCLUDE_DIRS} ${VITIS_INCLUDE_DIRS})
target_link_libraries(run_inference PRIVATE ${TORCH_LIBRARIES} ${VITIS_LIBRARIES})
target_compile_options(run_inference PRIVATE "-DPROJECT_ROOT=\"${CMAKE_SOURCE_DIR}\"")
target_compile_features(run_inference PRIVATE cxx_std_14)

# xo
function (add_xo name top target sources platform)
  set(include_dirs ${DNNK_INCLUDE_DIRS})
  prepend_option("${include_dirs}" "-I" include_options)

  add_custom_target(
    ${name}_xo
    COMMAND v++ -g --compile --target ${target} --kernel ${top} --platform ${platform} --profile_kernel data:all:all:all --profile_kernel stall:all:all:all --temp_dir build_${name} --save-temps ${include_options} ${sources} -o ${name}.xo
    WORKING_DIRECTORY ${WORK_DIR}
  )
endfunction()

# xclbin
function (add_xclbin name top target ini_file platform)

  abs_path(${ini_file} abs_ini_file)

  set(top_func ${top})
  set(target_ini_file ${name}_${TARGET_BOARD}.ini)

  configure_file(${abs_ini_file} ${target_ini_file})

  add_custom_target(
    ${name}
    COMMAND v++ -g --link --target ${target} --platform ${platform} --config ${target_ini_file} --temp_dir build_${name} --save-temps ${name}.xo -o ${name}.xclbin
    WORKING_DIRECTORY ${WORK_DIR}
  )
endfunction()

function (add_xo_and_xclbin name top sources ini_file platform)

  add_xo(${name}_hw ${top} hw ${sources} ${platform})
  add_xo(${name}_hw_emu ${top} hw_emu ${sources} ${platform})
  add_xo(${name}_sw_emu ${top} sw_emu ${sources} ${platform})
  add_xclbin(${name}_hw ${top} hw ${ini_file} ${platform})
  add_xclbin(${name}_hw_emu ${top} hw_emu ${ini_file} ${platform})
  add_xclbin(${name}_sw_emu ${top} sw_emu ${ini_file} ${platform})
endfunction()

get_filename_component(inference_src ../tests/hls/inference/inference_hls.cc ABSOLUTE)

add_xo_and_xclbin(inference_top inference_top ${inference_src} link_${TARGET_BOARD}_template.ini ${VITIS_PLATFORM})
add_xo_and_xclbin(inference_dataflow inference_dataflow ${inference_src} link_${TARGET_BOARD}_template.ini ${VITIS_PLATFORM})
add_xo_and_xclbin(inference_with_local_buffer inference_with_local_buffer ${inference_src} link_${TARGET_BOARD}_template.ini ${VITIS_PLATFORM})
add_xo_and_xclbin(inference_pipelined_conv_v1 inference_pipelined_conv_v1 ${inference_src} link_${TARGET_BOARD}_template.ini ${VITIS_PLATFORM})
add_xo_and_xclbin(inference_pipelined_conv_v2 inference_pipelined_conv_v2 ${inference_src} link_${TARGET_BOARD}_template.ini ${VITIS_PLATFORM})
add_xo_and_xclbin(inference_unrolledx4_conv_v1 inference_unrolledx4_conv_v1 ${inference_src} link_${TARGET_BOARD}_template.ini ${VITIS_PLATFORM})
add_xo_and_xclbin(inference_unrolledx4_conv_v2 inference_unrolledx4_conv_v2 ${inference_src} link_${TARGET_BOARD}_template.ini ${VITIS_PLATFORM})
add_xo_and_xclbin(inference_final inference_final ${inference_src} link_${TARGET_BOARD}_template.ini ${VITIS_PLATFORM})

--------------------------------------------------------------------------------
/host/host_util.h:
--------------------------------------------------------------------------------
#ifndef DNNKERNEL_HOST_UTIL_H
#define DNNKERNEL_HOST_UTIL_H

#include <CL/cl2.hpp>
#include <chrono>
#include <cstdlib>
#include <fstream>
#include <new>
#include <string>
#include <vector>

namespace dnnk {

class ClHelper {
 public:
  ClHelper(const std::string& xclbin_name) {

    cl::Platform::get(&platforms_);
    for (std::size_t i = 0; i < platforms_.size(); i++) {
      cl::Platform& platform = platforms_[i];
      std::string platform_name = platform.getInfo<CL_PLATFORM_NAME>();

      if (platform_name == "Xilinx") {
        platform.getDevices(CL_DEVICE_TYPE_ACCELERATOR, &devices_);
        break;
      }
    }

    cl::Device device = devices_[0];

    context_ = cl::Context(device);

    auto xclbin = read_binary_file(xclbin_name);
    cl::Program::Binaries binaries;
    binaries.push_back(xclbin);

    program_ = cl::Program(context_, devices_, binaries);
  }

  cl::Program& get_program() {
    return program_;
  }

  cl::Context& get_context() {
    return context_;
  }

  cl::Device& get_device() {
    return devices_[0];
  }

 private:
  std::vector<unsigned char> read_binary_file(const std::string& filename) {
    std::vector<unsigned char> ret;
    std::ifstream ifs(filename, std::ifstream::binary);

    ifs.seekg(0, ifs.end);
    std::size_t size = ifs.tellg();
    ifs.seekg(0, ifs.beg);

    ret.resize(size);
    ifs.read(reinterpret_cast<char*>(ret.data()), ret.size());

    return ret;
  }

  std::vector<cl::Platform> platforms_;
  std::vector<cl::Device> devices_;
  cl::Context context_;
  cl::Program program_;
};


template <typename T>
class aligned_allocator {
 public:
  using value_type = T;

  aligned_allocator() = default;

  template <typename U>
  constexpr aligned_allocator(const aligned_allocator<U>&) noexcept {}

  T* allocate(std::size_t size) {
    void* ptr = nullptr;

    // XRT expects 4096-byte-aligned host buffers for zero-copy transfers
    if (posix_memalign(&ptr, 4096, size * sizeof(T))) {
      throw std::bad_alloc();
    }

    return reinterpret_cast<T*>(ptr);
  }

  void deallocate(T* ptr, std::size_t size) {
    free(ptr);
  }
};

template <typename T>
using aligned_vector = std::vector<T, aligned_allocator<T>>;


class StopWatch {
 public:
  StopWatch() = default;

  void start() {
    tstart_ = clock::now();
  }

  void stop() {
    tstop_ = clock::now();
  }

  double elapsed_time_ms() const {
    auto elapsed_micro = std::chrono::duration_cast<std::chrono::microseconds>(tstop_ - tstart_).count();
    return elapsed_micro / 1000.0;
  }

 private:
  using clock = std::chrono::high_resolution_clock;
  using time_point = std::chrono::time_point<clock>;

  time_point tstart_;
  time_point tstop_;
};

}  // namespace dnnk

#endif  // DNNKERNEL_HOST_UTIL_H
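A brief note on how these helpers compose (a usage sketch, not code from the repository; the xclbin path and kernel name below are placeholders):

```cpp
// Usage sketch for the helpers above; error handling omitted for brevity.
#include "host_util.h"

int main() {
  dnnk::ClHelper helper("inference_top_hw.xclbin");  // selects the Xilinx platform
  cl::CommandQueue queue(helper.get_context(), helper.get_device());
  cl::Kernel kernel(helper.get_program(), "inference_top");

  // aligned_vector provides 4096-byte alignment, which lets XRT use the host
  // pointer directly (CL_MEM_USE_HOST_PTR) instead of making an extra copy.
  dnnk::aligned_vector<float> host_buf(1024, 0.0f);
  cl::Buffer dev_buf(helper.get_context(), CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
                     host_buf.size() * sizeof(float), host_buf.data());

  dnnk::StopWatch sw;
  sw.start();
  queue.enqueueMigrateMemObjects({dev_buf}, 0);  // host -> device transfer
  queue.finish();
  sw.stop();
  // sw.elapsed_time_ms() now holds the transfer time in milliseconds
  return 0;
}
```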
--------------------------------------------------------------------------------
/host/link_u200_template.ini:
--------------------------------------------------------------------------------
# Debug
dk=chipscope:${top_func}_1:M_AXI_GMEM0
dk=chipscope:${top_func}_1:M_AXI_GMEM1
dk=chipscope:${top_func}_1:M_AXI_GMEM2
dk=chipscope:${top_func}_1:M_AXI_GMEM3
dk=chipscope:${top_func}_1:M_AXI_GMEM4
dk=chipscope:${top_func}_1:M_AXI_GMEM5
dk=chipscope:${top_func}_1:M_AXI_GMEM6
dk=chipscope:${top_func}_1:M_AXI_GMEM7
dk=chipscope:${top_func}_1:M_AXI_GMEM8
dk=chipscope:${top_func}_1:M_AXI_GMEM9
dk=chipscope:${top_func}_1:S_AXI_CONTROL
dk=protocol:${top_func}_1:M_AXI_GMEM0
dk=protocol:${top_func}_1:M_AXI_GMEM1
dk=protocol:${top_func}_1:M_AXI_GMEM2
dk=protocol:${top_func}_1:M_AXI_GMEM3
dk=protocol:${top_func}_1:M_AXI_GMEM4
dk=protocol:${top_func}_1:M_AXI_GMEM5
dk=protocol:${top_func}_1:M_AXI_GMEM6
dk=protocol:${top_func}_1:M_AXI_GMEM7
dk=protocol:${top_func}_1:M_AXI_GMEM8
dk=protocol:${top_func}_1:M_AXI_GMEM9
dk=protocol:${top_func}_1:S_AXI_CONTROL

# Profile
profile_kernel=stall:all:all:all
profile_kernel=data:${top_func}:${top_func}_1:x:all
profile_kernel=data:${top_func}:${top_func}_1:weight0:all
profile_kernel=data:${top_func}:${top_func}_1:bias0:all
profile_kernel=data:${top_func}:${top_func}_1:weight1:all
profile_kernel=data:${top_func}:${top_func}_1:bias1:all
profile_kernel=data:${top_func}:${top_func}_1:weight2:all
profile_kernel=data:${top_func}:${top_func}_1:bias2:all
profile_kernel=data:${top_func}:${top_func}_1:weight3:all
profile_kernel=data:${top_func}:${top_func}_1:bias3:all
profile_kernel=data:${top_func}:${top_func}_1:y:all

[connectivity]
nk=${top_func}:1:${top_func}_1
sp=${top_func}_1.x:DDR[0]
sp=${top_func}_1.weight0:DDR[0]
sp=${top_func}_1.bias0:DDR[0]
sp=${top_func}_1.weight1:DDR[0]
sp=${top_func}_1.bias1:DDR[0]
sp=${top_func}_1.weight2:DDR[0]
sp=${top_func}_1.bias2:DDR[0]
sp=${top_func}_1.weight3:DDR[0]
sp=${top_func}_1.bias3:DDR[0]
sp=${top_func}_1.y:DDR[0]

--------------------------------------------------------------------------------
/host/link_u50_template.ini:
--------------------------------------------------------------------------------
# Debug
dk=chipscope:${top_func}_1:M_AXI_GMEM0
dk=chipscope:${top_func}_1:M_AXI_GMEM1
dk=chipscope:${top_func}_1:M_AXI_GMEM2
dk=chipscope:${top_func}_1:M_AXI_GMEM3
dk=chipscope:${top_func}_1:M_AXI_GMEM4
dk=chipscope:${top_func}_1:M_AXI_GMEM5
dk=chipscope:${top_func}_1:M_AXI_GMEM6
dk=chipscope:${top_func}_1:M_AXI_GMEM7
dk=chipscope:${top_func}_1:M_AXI_GMEM8
dk=chipscope:${top_func}_1:M_AXI_GMEM9
dk=chipscope:${top_func}_1:S_AXI_CONTROL
dk=protocol:${top_func}_1:M_AXI_GMEM0
dk=protocol:${top_func}_1:M_AXI_GMEM1
dk=protocol:${top_func}_1:M_AXI_GMEM2
dk=protocol:${top_func}_1:M_AXI_GMEM3
dk=protocol:${top_func}_1:M_AXI_GMEM4
dk=protocol:${top_func}_1:M_AXI_GMEM5
dk=protocol:${top_func}_1:M_AXI_GMEM6
dk=protocol:${top_func}_1:M_AXI_GMEM7
dk=protocol:${top_func}_1:M_AXI_GMEM8
dk=protocol:${top_func}_1:M_AXI_GMEM9
dk=protocol:${top_func}_1:S_AXI_CONTROL

# Profile
profile_kernel=stall:all:all:all
profile_kernel=data:${top_func}:${top_func}_1:x:all
profile_kernel=data:${top_func}:${top_func}_1:weight0:all
profile_kernel=data:${top_func}:${top_func}_1:bias0:all
profile_kernel=data:${top_func}:${top_func}_1:weight1:all
profile_kernel=data:${top_func}:${top_func}_1:bias1:all
profile_kernel=data:${top_func}:${top_func}_1:weight2:all
profile_kernel=data:${top_func}:${top_func}_1:bias2:all
profile_kernel=data:${top_func}:${top_func}_1:weight3:all
profile_kernel=data:${top_func}:${top_func}_1:bias3:all
profile_kernel=data:${top_func}:${top_func}_1:y:all

[connectivity]
nk=${top_func}:1:${top_func}_1
sp=${top_func}_1.x:HBM[0]
sp=${top_func}_1.weight0:HBM[0]
sp=${top_func}_1.bias0:HBM[0]
sp=${top_func}_1.weight1:HBM[0]
sp=${top_func}_1.bias1:HBM[0]
sp=${top_func}_1.weight2:HBM[0]
sp=${top_func}_1.bias2:HBM[0]
sp=${top_func}_1.weight3:HBM[0]
sp=${top_func}_1.bias3:HBM[0]
sp=${top_func}_1.y:HBM[0]

--------------------------------------------------------------------------------
/host/run_inference.cc:
--------------------------------------------------------------------------------

#define CL_HPP_CL_1_2_DEFAULT_BUILD
#define CL_HPP_TARGET_OPENCL_VERSION 120
#define CL_HPP_MINIMUM_OPENCL_VERSION 120

#include "host_util.h"

#include <algorithm>
#include <array>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <map>
#include <stdexcept>
#include <string>
#include <vector>

#include <torch/script.h>
#include <torch/torch.h>


static const std::size_t kNumTestImages = 1000;


void setup_parameters(cl::Context& context,
                      cl::CommandQueue& queue,
                      cl::Kernel& kernel,
                      std::map<std::string, cl::Buffer>& buf_params) {

  std::vector<std::string> kernel_args = {
      "-",
      "conv1.weight",
      "conv1.bias",
      "conv2.weight",
      "conv2.bias",
      "fc1.weight",
      "fc1.bias",
      "fc2.weight",
      "fc2.bias",
  };

  // load model file
  auto model = torch::jit::load(PROJECT_ROOT "/learning/traced_model.pt");

  // load parameter values from model and copy to the device memory
  for (const auto& param_ref : model.named_parameters()) {

    dnnk::aligned_vector<float> host_buf(param_ref.value.numel());

    float* ptr = param_ref.value.data_ptr<float>();
    std::copy(ptr, ptr + host_buf.size(), host_buf.begin());

    // use param_ref.name as key (ex: "conv1.weight"), and initialize device buffer
    {
      cl::Buffer buf(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, host_buf.size() * sizeof(float), host_buf.data(), nullptr);
      buf_params[param_ref.name] = std::move(buf);
    }

    // set kernel argument
    auto index = std::distance(kernel_args.begin(), std::find(kernel_args.begin(), kernel_args.end(), param_ref.name));
    if (index == static_cast<decltype(index)>(kernel_args.size())) {
      throw std::runtime_error("Unknown parameter name: " + param_ref.name);
    }
    kernel.setArg(index, buf_params[param_ref.name]);

    // copy parameter data into the device buffer
    queue.enqueueMigrateMemObjects({buf_params[param_ref.name]}, 0);
    queue.finish();
  }
}

void setup_inouts(cl::Context& context,
                  cl::CommandQueue& queue,
                  cl::Kernel& kernel,
                  std::vector<cl::Buffer>& buf_x,
                  std::vector<cl::Buffer>& buf_y,
                  std::vector<long>& answers) {
  // read MNIST dataset
  auto dataset = torch::data::datasets::MNIST(PROJECT_ROOT "/learning/data/MNIST/raw")
                     .map(torch::data::transforms::Stack<>());

  // define loader and set batch_size to 1
  auto data_loader =
      torch::data::make_data_loader(std::move(dataset),
                                    torch::data::DataLoaderOptions().batch_size(1));

  // create reference data
  std::size_t num_iter = 0;
  for (auto& batch : *data_loader) {
    auto& x_ref = batch.data;
    auto& y_ref = batch.target;

    auto x_size = x_ref.numel() * sizeof(float);
    auto y_size = 10 * sizeof(float);

    dnnk::aligned_vector<float> host_buf(x_ref.numel());
    float* x_ptr = x_ref.data_ptr<float>();
    std::copy(x_ptr, x_ptr + host_buf.size(), host_buf.begin());

    buf_x.emplace_back(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, x_size, host_buf.data(), nullptr);
    buf_y.emplace_back(context, CL_MEM_WRITE_ONLY, y_size);
    answers.push_back(*(y_ref.data_ptr<long>()));

    // copy to device
    cl::Buffer& target = buf_x[buf_x.size() - 1];
    kernel.setArg(0, target);
    queue.enqueueMigrateMemObjects({target}, 0);
    queue.finish();

    if (++num_iter == kNumTestImages) {
      break;
    }
  }
}


int main(int argc, char* argv[]) {
  if (argc != 3 && argc != 4) {
    printf("Usage: %s <xclbin_path> <kernel_name> [enable_OoO]\n", argv[0]);
    return 0;
  }

  dnnk::ClHelper clhelper(argv[1]);
  std::string kernel_name(argv[2]);
  bool enable_OoO = (argc == 3) ? false : (std::atoi(argv[3]) != 0);

  auto device = clhelper.get_device();
  auto context = clhelper.get_context();
  auto program = clhelper.get_program();

  auto queue_flag = (enable_OoO) ? (CL_QUEUE_PROFILING_ENABLE | CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE)
                                 : CL_QUEUE_PROFILING_ENABLE;
  cl::CommandQueue queue(context, device, queue_flag);

  // create kernel object
  cl::Kernel kernel(program, kernel_name.c_str());

  // define device buffer
  std::vector<cl::Buffer> buf_x;
  std::map<std::string, cl::Buffer> buf_params;
  std::vector<cl::Buffer> buf_y;

  // MNIST answers
  std::vector<long> answers;

  // setup device buffers
  setup_parameters(context, queue, kernel, buf_params);
  setup_inouts(context, queue, kernel, buf_x, buf_y, answers);

  // run
  dnnk::StopWatch sw;
  sw.start();
  for (std::size_t i = 0; i < buf_x.size(); i++) {
    kernel.setArg(0, buf_x[i]);
    kernel.setArg(9, buf_y[i]);

    queue.enqueueTask(kernel);
  }
  queue.finish();
  sw.stop();

  std::cout << "Elapsed time: " << sw.elapsed_time_ms() / kNumTestImages << " [ms/image]" << std::endl;

  // get results from device buffer
  std::vector<std::array<float, 10>> results(buf_x.size());
  for (std::size_t i = 0; i < results.size(); i++) {
    queue.enqueueReadBuffer(buf_y[i], false, 0, results[i].size() * sizeof(float), results[i].data());
  }
  queue.finish();

  // report
  auto argmax = [](const std::array<float, 10>& vec) {
    return std::distance(vec.begin(), std::max_element(vec.begin(), vec.end()));
  };

  std::size_t num_corrects = 0;
  for (std::size_t i = 0; i < results.size(); i++) {
    if (argmax(results[i]) == answers[i]) {
      num_corrects++;
    }
  }

  std::cout << "accuracy: " << double(num_corrects) / results.size() << std::endl;

  return 0;
}
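The host program synchronizes with `queue.finish()`, which is valid for both queue modes; with `CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE` the runtime is free to overlap the enqueued tasks, which is the point of the flag. If finer-grained ordering were ever needed on an out-of-order queue, standard OpenCL events would express it. A sketch (names follow the code above; this is not part of the original program):

```cpp
// Sketch: explicit dependency on an out-of-order queue via events.
cl::Event run_done;
queue.enqueueTask(kernel, nullptr, &run_done);  // launch, capture completion event

// Read back y only after this particular launch has finished.
std::vector<cl::Event> deps = {run_done};
queue.enqueueReadBuffer(buf_y[0], CL_FALSE, 0, 10 * sizeof(float),
                        results[0].data(), &deps);
queue.finish();
```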
--------------------------------------------------------------------------------
/include/dnn-kernel/conv2d.h:
--------------------------------------------------------------------------------
#ifndef DNNKERNEL_CONV2D_H
#define DNNKERNEL_CONV2D_H

#include <cstddef>
#include <cstdint>

namespace dnnk {

static void conv2d(const float* x, const float* weight, const float* bias, int32_t width, int32_t height,
                   int32_t in_channels, int32_t out_channels, int32_t ksize, float* y) {
  for (int32_t och = 0; och < out_channels; ++och) {
    for (int32_t h = 0; h < height; ++h) {
      for (int32_t w = 0; w < width; ++w) {
        float sum = 0.f;

        for (int32_t ich = 0; ich < in_channels; ++ich) {
          for (int32_t kh = 0; kh < ksize; ++kh) {
            for (int32_t kw = 0; kw < ksize; ++kw) {
              int32_t ph = h + kh - ksize / 2;
              int32_t pw = w + kw - ksize / 2;

              // zero padding
              if (ph < 0 || ph >= height || pw < 0 || pw >= width) {
                continue;
              }

              int64_t pix_idx = (ich * height + ph) * width + pw;
              int64_t weight_idx = ((och * in_channels + ich) * ksize + kh) * ksize + kw;

              sum += x[pix_idx] * weight[weight_idx];
            }
          }
        }

        // add bias
        sum += bias[och];

        y[(och * height + h) * width + w] = sum;
      }
    }
  }
}


static void conv2d_pipelined_v1(const float* x, const float* weight, const float* bias, int32_t width, int32_t height,
                                int32_t in_channels, int32_t out_channels, int32_t ksize, float* y) {
  for (int32_t och = 0; och < out_channels; ++och) {
    for (int32_t h = 0; h < height; ++h) {
      for (int32_t w = 0; w < width; ++w) {
        float sum = 0.f;

        for (int32_t ich = 0; ich < in_channels; ++ich) {
          for (int32_t kh = 0; kh < ksize; ++kh) {
            for (int32_t kw = 0; kw < ksize; ++kw) {
#pragma HLS pipeline II=1

              int32_t ph = h + kh - ksize / 2;
              int32_t pw = w + kw - ksize / 2;

              // zero padding
              if (ph < 0 || ph >= height || pw < 0 || pw >= width) {
                continue;
              }

              int64_t pix_idx = (ich * height + ph) * width + pw;
              int64_t weight_idx = ((och * in_channels + ich) * ksize + kh) * ksize + kw;

              sum += x[pix_idx] * weight[weight_idx];
            }
          }
        }

        // add bias
        sum += bias[och];

        y[(och * height + h) * width + w] = sum;
      }
    }
  }
}

static void conv2d_pipelined_v2(const float* x, const float* weight, const float* bias, int32_t width, int32_t height,
                                int32_t in_channels, int32_t out_channels, int32_t ksize, float* y) {
  static const int kShiftRegLength = 4;

  for (int32_t och = 0; och < out_channels; ++och) {
    for (int32_t h = 0; h < height; ++h) {
      for (int32_t w = 0; w < width; ++w) {
        float shift_reg[kShiftRegLength + 1];
#pragma HLS array_partition variable=shift_reg complete

        int32_t glob_idx = 0;
        for (int32_t ich = 0; ich < in_channels; ++ich) {
          for (int32_t kh = 0; kh < ksize; ++kh) {
            for (int32_t kw = 0; kw < ksize; ++kw) {
#pragma HLS pipeline II=1

              int32_t ph = h + kh - ksize / 2;
              int32_t pw = w + kw - ksize / 2;

              // zero padding
              if (ph < 0 || ph >= height || pw < 0 || pw >= width) {
                continue;
              }

              int64_t pix_idx = (ich * height + ph) * width + pw;
              int64_t weight_idx = ((och * in_channels + ich) * ksize + kh) * ksize + kw;

              float mul = x[pix_idx] * weight[weight_idx];

              // local sum
              for (int i = 0; i < kShiftRegLength; ++i) {
                if (i == 0) {
                  if (glob_idx < kShiftRegLength) {
                    shift_reg[kShiftRegLength] = mul;
                  } else {
                    shift_reg[kShiftRegLength] = shift_reg[0] + mul;
                  }
                }

                shift_reg[i] = shift_reg[i + 1];
              }

              ++glob_idx;
            }
          }
        }

        // global sum
        float sum = 0.f;
        for (int i = 0; i < kShiftRegLength; ++i) {
#pragma HLS pipeline II=1
          sum += shift_reg[i];
        }

        // add bias
        sum += bias[och];

        y[(och * height + h) * width + w] = sum;
      }
    }
  }
}


template <int UNROLL_X>
static void conv2d_unrolled_v1(const float* x, const float* weight, const float* bias, int32_t width, int32_t height,
                               int32_t in_channels, int32_t out_channels, int32_t ksize, float* y) {

  for (int32_t och = 0; och < out_channels; ++och) {
    for (int32_t h = 0; h < height; ++h) {
      for (int32_t block_w = 0; block_w < width; block_w += UNROLL_X) {
        float sum[UNROLL_X];
#pragma HLS array_partition variable=sum complete

        for (int32_t ich = 0; ich < in_channels; ++ich) {
          for (int32_t kh = 0; kh < ksize; ++kh) {
            for (int32_t kw = 0; kw < ksize; ++kw) {
#pragma HLS pipeline II=4
              for (int local_w = 0; local_w < UNROLL_X; local_w++) {
#pragma HLS unroll
                if (block_w + local_w < width) {

                  int32_t w = block_w + local_w;

                  int32_t ph = h + kh - ksize / 2;
                  int32_t pw = w + kw - ksize / 2;

                  float last = (ich == 0 && kh == 0 && kw == 0) ? 0 : sum[local_w];

                  // zero padding
                  if (ph < 0 || ph >= height || pw < 0 || pw >= width) {
                    sum[local_w] = last;
                    continue;
                  }

                  int64_t pix_idx = (ich * height + ph) * width + pw;
                  int64_t weight_idx = ((och * in_channels + ich) * ksize + kh) * ksize + kw;

                  sum[local_w] = last + x[pix_idx] * weight[weight_idx];
                }
              }
            }
          }
        }

        for (int local_w = 0; local_w < UNROLL_X; local_w++) {
#pragma HLS unroll
          if (block_w + local_w < width) {

            int32_t w = block_w + local_w;

            // add bias
            y[(och * height + h) * width + w] = sum[local_w] + bias[och];
          }
        }
      }
    }
  }
}


template <int UNROLL_OCH, int UNROLL_X>
static void conv2d_unrolled_v2(const float* x, const float* weight, const float* bias, int32_t width, int32_t height,
                               int32_t in_channels, int32_t out_channels, int32_t ksize, float* y) {

  for (int32_t block_och = 0; block_och < out_channels; block_och += UNROLL_OCH) {
    for (int32_t h = 0; h < height; ++h) {
      for (int32_t block_w = 0; block_w < width; block_w += UNROLL_X) {
        float sum[UNROLL_OCH][UNROLL_X];
#pragma HLS array_partition variable=sum complete dim=0

        for (int32_t ich = 0; ich < in_channels; ++ich) {
          for (int32_t kh = 0; kh < ksize; ++kh) {
            for (int32_t kw = 0; kw < ksize; ++kw) {
#pragma HLS pipeline II=4
              for (int local_och = 0; local_och < UNROLL_OCH; local_och++) {
#pragma HLS unroll
                for (int local_w = 0; local_w < UNROLL_X; local_w++) {
#pragma HLS unroll
                  if (block_w + local_w < width && block_och + local_och < out_channels) {

                    int32_t och = block_och + local_och;
                    int32_t w = block_w + local_w;

                    int32_t ph = h + kh - ksize / 2;
                    int32_t pw = w + kw - ksize / 2;

                    float last = (ich == 0 && kh == 0 && kw == 0) ? 0 : sum[local_och][local_w];

                    // zero padding
                    if (ph < 0 || ph >= height || pw < 0 || pw >= width) {
                      sum[local_och][local_w] = last;
                      continue;
                    }

                    int64_t pix_idx = (ich * height + ph) * width + pw;
                    int64_t weight_idx = ((och * in_channels + ich) * ksize + kh) * ksize + kw;

                    sum[local_och][local_w] = last + x[pix_idx] * weight[weight_idx];
                  }
                }
              }
            }
          }
        }

        for (int local_och = 0; local_och < UNROLL_OCH; local_och++) {
#pragma HLS unroll
          for (int local_w = 0; local_w < UNROLL_X; local_w++) {
#pragma HLS unroll
            if (block_w + local_w < width && block_och + local_och < out_channels) {
              int32_t och = block_och + local_och;
              int32_t w = block_w + local_w;

              // add bias
              y[(och * height + h) * width + w] = sum[local_och][local_w] + bias[och];
            }
          }
        }
      }
    }
  }
}

}  // namespace dnnk

#endif  // DNNKERNEL_CONV2D_H
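The shift register in `conv2d_pipelined_v2` exists to hide floating-point adder latency: each product is added to the partial sum that left the register `kShiftRegLength` iterations earlier, so consecutive loop iterations no longer depend on each other's accumulator and the inner loop can reach II=1. A self-contained host-side sketch of the same reduction scheme (illustrative only; assumes at least `kDepth` inputs):

```cpp
#include <cstdio>

// Accumulate n values through a 4-deep shift register, then reduce.
// Mirrors the scheme used in conv2d_pipelined_v2; assumes n >= kDepth.
float shift_register_sum(const float* vals, int n) {
  const int kDepth = 4;
  float reg[kDepth + 1] = {};  // reg[kDepth] is the insertion slot

  for (int i = 0; i < n; ++i) {
    // The new partial sum reuses a value produced kDepth iterations ago,
    // so there is no tight dependency on the previous iteration's result.
    reg[kDepth] = (i < kDepth) ? vals[i] : reg[0] + vals[i];
    for (int j = 0; j < kDepth; ++j) {
      reg[j] = reg[j + 1];
    }
  }

  // Final reduction of the kDepth independent partial sums.
  float sum = 0.f;
  for (int j = 0; j < kDepth; ++j) {
    sum += reg[j];
  }
  return sum;
}

int main() {
  float v[7] = {1, 2, 3, 4, 5, 6, 7};
  std::printf("%f\n", shift_register_sum(v, 7));  // prints 28.000000
  return 0;
}
```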
--------------------------------------------------------------------------------
/include/dnn-kernel/inference.h:
--------------------------------------------------------------------------------
#ifndef DNNKERNEL_INFERENCE_H
#define DNNKERNEL_INFERENCE_H

#include "conv2d.h"
#include "maxpool2d.h"
#include "relu.h"
#include "linear.h"

#include <cstddef>
#include <cstdint>

namespace dnnk {

template <typename CONV_FUNC, typename RELU_FUNC, typename MAXPOOL_FUNC, typename LINEAR_FUNC>
static void inference_custom(const float* x,
                             const float* weight0, const float* bias0,
                             const float* weight1, const float* bias1,
                             const float* weight2, const float* bias2,
                             const float* weight3, const float* bias3,
                             float* y,
                             CONV_FUNC* conv1_f,
                             RELU_FUNC* relu1_f,
                             MAXPOOL_FUNC* maxpool1_f,
                             CONV_FUNC* conv2_f,
                             RELU_FUNC* relu2_f,
                             MAXPOOL_FUNC* maxpool2_f,
                             LINEAR_FUNC* linear1_f,
                             RELU_FUNC* relu3_f,
                             LINEAR_FUNC* linear2_f) {
#pragma HLS inline

  static const int kWidths[] = {28, 14, 7};
  static const int kHeights[] = {28, 14, 7};
  static const int kChannels[] = {1, 4, 8, 32, 10};

  float x1[kWidths[0] * kHeights[0] * kChannels[1]];
  float x2[kWidths[0] * kHeights[0] * kChannels[1]];
  float x3[kWidths[1] * kHeights[1] * kChannels[1]];
  float x4[kWidths[1] * kHeights[1] * kChannels[2]];
  float x5[kWidths[1] * kHeights[1] * kChannels[2]];
  float x6[kWidths[2] * kHeights[2] * kChannels[2]];
  float x7[kChannels[3]];
  float x8[kChannels[3]];

  // 1st layer
  conv1_f(x, weight0, bias0, kWidths[0], kHeights[0], kChannels[0], kChannels[1], 3, x1);
  relu1_f(x1, kWidths[0] * kHeights[0] * kChannels[1], x2);
  maxpool1_f(x2, kWidths[0], kHeights[0], kChannels[1], 2, x3);

  // 2nd layer
  conv2_f(x3, weight1, bias1, kWidths[1], kHeights[1], kChannels[1], kChannels[2], 3, x4);
  relu2_f(x4, kWidths[1] * kHeights[1] * kChannels[2], x5);
  maxpool2_f(x5, kWidths[1], kHeights[1], kChannels[2], 2, x6);

  // 3rd layer
  linear1_f(x6, weight2, bias2, kWidths[2] * kHeights[2] * kChannels[2], kChannels[3], x7);
  relu3_f(x7, kChannels[3], x8);

  // 4th layer
  linear2_f(x8, weight3, bias3, kChannels[3], kChannels[4], y);
}

template <typename CONV_FUNC, typename MAXPOOL_FUNC, typename RELU_FUNC, typename LINEAR_FUNC>
static void inference_custom(const float* x,
                             const float* weight0, const float* bias0,
                             const float* weight1, const float* bias1,
                             const float* weight2, const float* bias2,
                             const float* weight3, const float* bias3,
                             float* y,
                             CONV_FUNC* conv2d_f,
                             MAXPOOL_FUNC* maxpool2d_f,
                             RELU_FUNC* relu_f,
                             LINEAR_FUNC* linear_f) {
#pragma HLS inline
  inference_custom(x,
                   weight0, bias0,
                   weight1, bias1,
                   weight2, bias2,
                   weight3, bias3,
                   y,
                   conv2d_f, relu_f, maxpool2d_f,
                   conv2d_f, relu_f, maxpool2d_f,
                   linear_f, relu_f,
                   linear_f);
}

static void inference(const float* x,
                      const float* weight0, const float* bias0,
                      const float* weight1, const float* bias1,
                      const float* weight2, const float* bias2,
                      const float* weight3, const float* bias3,
                      float* y) {
#pragma HLS inline

  inference_custom(x,
                   weight0, bias0,
                   weight1, bias1,
                   weight2, bias2,
                   weight3, bias3,
                   y,
                   conv2d, maxpool2d, relu, linear);
}

}  // namespace dnnk

#endif  // DNNKERNEL_INFERENCE_H
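`inference_custom` is the extension point behind the optimized variants built in `host/CMakeLists.txt`: each variant passes different kernel implementations for the individual layers. As a hypothetical illustration (the actual wrappers live in `tests/hls/inference/inference_hls.cc`, which also adds the HLS interface pragmas), a variant that swaps in the pipelined convolution might look like this:

```cpp
// Hypothetical variant sketch: same network, pipelined convolutions.
static void inference_pipelined_sketch(const float* x,
                                       const float* weight0, const float* bias0,
                                       const float* weight1, const float* bias1,
                                       const float* weight2, const float* bias2,
                                       const float* weight3, const float* bias3,
                                       float* y) {
  dnnk::inference_custom(x,
                         weight0, bias0,
                         weight1, bias1,
                         weight2, bias2,
                         weight3, bias3,
                         y,
                         dnnk::conv2d_pipelined_v1,  // used for both conv layers
                         dnnk::maxpool2d,
                         dnnk::relu,
                         dnnk::linear);
}
```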
--------------------------------------------------------------------------------
/include/dnn-kernel/linear.h:
--------------------------------------------------------------------------------
#ifndef DNNKERNEL_LINEAR_H
#define DNNKERNEL_LINEAR_H

#include <cstddef>
#include <cstdint>

namespace dnnk {

static void linear(const float* x, const float* weight, const float* bias, int64_t in_features, int64_t out_features, float* y) {
  for (int64_t i = 0; i < out_features; ++i) {
    float sum = 0.f;
    for (int64_t j = 0; j < in_features; ++j) {
      sum += x[j] * weight[i * in_features + j];
    }
    y[i] = sum + bias[i];
  }
}

template <int UNROLL_OCH>
static void linear_opt(const float* x, const float* weight, const float* bias, int64_t in_features, int64_t out_features, float* y) {

  for (int64_t block_i = 0; block_i < out_features; block_i += UNROLL_OCH) {
    float sum[UNROLL_OCH];
#pragma HLS array_partition variable=sum complete

    for (int64_t j = 0; j < in_features; ++j) {
#pragma HLS pipeline II=1
      for (int64_t local_i = 0; local_i < UNROLL_OCH; local_i++) {
#pragma HLS unroll
        int64_t i = block_i + local_i;
        if (i < out_features) {
          float last = (j == 0) ? 0 : sum[local_i];
          sum[local_i] = last + x[j] * weight[i * in_features + j];
        }
      }
    }

    for (int64_t local_i = 0; local_i < UNROLL_OCH; local_i++) {
#pragma HLS unroll
      int64_t i = block_i + local_i;
      if (i < out_features) {
        y[i] = sum[local_i] + bias[i];
      }
    }
  }
}

}  // namespace dnnk

#endif  // DNNKERNEL_LINEAR_H
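For a fixed output `i`, `linear_opt` walks `j` in the same order as the reference `linear`, so the two accumulate in the same order and should agree exactly, not merely within a tolerance. A quick host-side check (a sketch, not one of the project's tests):

```cpp
#include <cstdio>
#include <vector>

#include "dnn-kernel/linear.h"

// Compare linear_opt<4> against the reference linear on small random-ish data.
int main() {
  const int64_t in_f = 49, out_f = 10;
  std::vector<float> x(in_f), w(out_f * in_f), b(out_f);
  for (int64_t i = 0; i < in_f; ++i) x[i] = 0.01f * i;
  for (int64_t i = 0; i < out_f * in_f; ++i) w[i] = 0.001f * (i % 17);
  for (int64_t i = 0; i < out_f; ++i) b[i] = 0.1f * i;

  std::vector<float> y_ref(out_f), y_opt(out_f);
  dnnk::linear(x.data(), w.data(), b.data(), in_f, out_f, y_ref.data());
  dnnk::linear_opt<4>(x.data(), w.data(), b.data(), in_f, out_f, y_opt.data());

  for (int64_t i = 0; i < out_f; ++i) {
    if (y_ref[i] != y_opt[i]) {  // same accumulation order -> exact match
      std::printf("mismatch at %lld\n", (long long)i);
      return 1;
    }
  }
  std::printf("OK\n");
  return 0;
}
```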
--------------------------------------------------------------------------------
/include/dnn-kernel/maxpool2d.h:
--------------------------------------------------------------------------------
#ifndef DNNKERNEL_MAXPOOL2D_H
#define DNNKERNEL_MAXPOOL2D_H

#include <algorithm>
#include <cfloat>
#include <cstddef>
#include <cstdint>

namespace dnnk {

static void maxpool2d(const float* x, int32_t width, int32_t height, int32_t channels, int32_t stride, float* y) {
  for (int ch = 0; ch < channels; ++ch) {
    for (int32_t h = 0; h < height; h += stride) {
      for (int32_t w = 0; w < width; w += stride) {
        float maxval = -FLT_MAX;

        for (int bh = 0; bh < stride; ++bh) {
          for (int bw = 0; bw < stride; ++bw) {
            maxval = std::max(maxval, x[(ch * height + h + bh) * width + w + bw]);
          }
        }

        y[(ch * (height / stride) + (h / stride)) * (width / stride) + w / stride] = maxval;
      }
    }
  }
}

}  // namespace dnnk

#endif  // DNNKERNEL_MAXPOOL2D_H

--------------------------------------------------------------------------------
/include/dnn-kernel/relu.h:
--------------------------------------------------------------------------------
#ifndef DNNKERNEL_RELU_H
#define DNNKERNEL_RELU_H

#include <algorithm>
#include <cstdint>

namespace dnnk {

void relu(const float* x, int64_t size, float* y) {
  for (int64_t i = 0; i < size; ++i) {
    y[i] = std::max(x[i], .0f);
  }
}

}  // namespace dnnk

#endif  // DNNKERNEL_RELU_H

--------------------------------------------------------------------------------
/learning/requirements.txt:
--------------------------------------------------------------------------------
joblib==0.14.1
numpy==1.18.5
Pillow==7.1.2
scikit-learn==0.22.2.post1
scipy==1.4.1
six==1.15.0
sklearn==0.0
torch==1.4.0
torchvision==0.5.0

--------------------------------------------------------------------------------
/learning/train_mnist.py:
--------------------------------------------------------------------------------

import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
from sklearn.metrics import accuracy_score, confusion_matrix


# 1. Define the network model
class Net(nn.Module):
    def __init__(self, num_output_classes=10):
        super(Net, self).__init__()

        # The input is a 28x28 grayscale image (1 channel).
        # Apply a convolution that produces 4 output channels.
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=4, kernel_size=3, padding=1)

        # The activation function is ReLU
        self.relu1 = nn.ReLU(inplace=True)

        # Shrink the image from 28x28 to 14x14
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)

        # 4ch -> 8ch, 14x14 -> 7x7
        self.conv2 = nn.Conv2d(in_channels=4, out_channels=8, kernel_size=3, padding=1)
        self.relu2 = nn.ReLU(inplace=True)
        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)

        # Fully connected layer:
        # treat the 8-channel 7x7 image as a single vector and reduce it to a
        # 32-element vector
        self.fc1 = nn.Linear(8 * 7 * 7, 32)
        self.relu3 = nn.ReLU(inplace=True)

        # Second fully connected layer:
        # reduce to the number of output classes
        self.fc2 = nn.Linear(32, num_output_classes)

    def forward(self, x):
        # First convolution layer
        # The activation function is ReLU
        x = self.conv1(x)
        x = self.relu1(x)

        # Downsampling
        x = self.pool1(x)

        # Second layer + downsampling
        x = self.conv2(x)
        x = self.relu2(x)
        x = self.pool2(x)

        # Reshape (Batch, Ch, Height, Width) -> (Batch, Ch * Height * Width)
        x = x.view(x.shape[0], -1)

        # Fully connected layers
        x = self.fc1(x)
        x = self.relu3(x)
        x = self.fc2(x)

        return x


net = Net()

# 2. Define how the dataset is read
# Fetch the MNIST training and test data
trainset = torchvision.datasets.MNIST(root='./data', train=True, download=True, transform=transforms.ToTensor())
testset = torchvision.datasets.MNIST(root='./data', train=False, download=True, transform=transforms.ToTensor())

# Define the data loaders
# Read 16 images per training/test step
trainloader = torch.utils.data.DataLoader(trainset, batch_size=16, shuffle=True)
testloader = torch.utils.data.DataLoader(testset, batch_size=16, shuffle=False)

# Define the loss function and the optimizer
loss_func = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=0.0001)

# 3. Training
# Loop until every image in the dataset has been used 10 times
for epoch in range(10):
    running_loss = 0

    # Loop over the dataset
    for i, data in enumerate(trainloader, 0):
        # Read one input batch (images, ground-truth labels)
        inputs, labels = data

        # Zero the gradients held by the optimizer
        optimizer.zero_grad()

        # Run the input images through the model to get output labels
        outputs = net(inputs)

        # Compute the error against the ground truth + backpropagation
        loss = loss_func(outputs, labels)
        loss.backward()

        # Optimize the model using the error
        optimizer.step()
        running_loss += loss.item()
        if i % 1000 == 999:
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 1000))
            running_loss = 0.0

# 4. Test
ans = []
pred = []
for i, data in enumerate(testloader, 0):
    inputs, labels = data

    outputs = net(inputs)

    ans += labels.tolist()
    pred += torch.argmax(outputs, 1).tolist()

print('accuracy:', accuracy_score(ans, pred))
print('confusion matrix:')
print(confusion_matrix(ans, pred))

# 5. Save the models
# Model file for loading from PyTorch in the usual way
torch.save(net.state_dict(), 'model.pt')

# Save a Torch Script module for loading from libtorch (the C++ API)
example = torch.rand(1, 1, 28, 28)
traced_script_module = torch.jit.trace(net, example)
traced_script_module.save('traced_model.pt')
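The traced module saved in step 5 is the file that `host/run_inference.cc` consumes. A minimal libtorch sketch of reading the parameters back out of it, using the same `torch::jit::load` / `named_parameters` calls as the host code:

```cpp
#include <cstdio>

#include <torch/script.h>

// Minimal libtorch-side view of the file written by train_mnist.py.
int main() {
  auto module = torch::jit::load("traced_model.pt");

  // Iterate named parameters exactly as host/run_inference.cc does.
  for (const auto& param : module.named_parameters()) {
    std::printf("%s: %lld elements\n", param.name.c_str(),
                static_cast<long long>(param.value.numel()));
  }
  return 0;
}
```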
--------------------------------------------------------------------------------
/tests/CMakeLists.txt:
--------------------------------------------------------------------------------
add_subdirectory(ref)
add_subdirectory(hls)

--------------------------------------------------------------------------------
/tests/hls/CMakeLists.txt:
--------------------------------------------------------------------------------
include(CMakeParseArguments)

set(TB_TCL ${CMAKE_CURRENT_SOURCE_DIR}/tb.tcl)

macro(abs_path files abs_paths)
  foreach(f ${files})
    list(APPEND ${abs_paths} ${CMAKE_CURRENT_SOURCE_DIR}/${f})
  endforeach()
endmacro()

macro(append_include_option src dst)
  foreach(p ${src})
    list(APPEND ${dst} "includepath=${p}")
  endforeach()
endmacro()

macro(list2str l str)
  string(REPLACE ";" " " ${str} "${l}")
endmacro()

macro(prepend_option srcs option dsts)
  foreach(src ${srcs})
    list(APPEND ${dsts} "${option}${src}")
  endforeach()
endmacro()


function(add_csim name)
  cmake_parse_arguments(ARG "" "" "HLS_SRC;TB_SRC;CXXFLAGS" ${ARGN})

  add_executable(${name} ${ARG_HLS_SRC} ${ARG_TB_SRC})

  # For checking C++ standards
  set_source_files_properties("${ARG_TB_SRC}" PROPERTIES COMPILE_FLAGS "-std=c++14")
  set_source_files_properties("${ARG_HLS_SRC}" PROPERTIES COMPILE_FLAGS "-std=c++98")

  target_include_directories(${name} PRIVATE ${DNNK_INCLUDE_DIRS} ${VHLS_INCLUDE_DIRS} ${TORCH_INCLUDE_DIRS} ${GTEST_INCLUDE_DIRS})
  target_compile_options(${name} PRIVATE ${DNNK_CXX_FLAGS} ${ARG_CXXFLAGS})
  target_link_libraries(${name} PRIVATE ${TORCH_LIBRARIES} ${GTEST_LIBRARIES})

  add_test(
    NAME ${name}
    COMMAND ${name} ${ARG_UNPARSED_ARGUMENTS}
  )

endfunction()

function(add_cosim name top)
  cmake_parse_arguments(ARG "" "" "HLS_SRC;TB_SRC;CXXFLAGS" ${ARGN})

  abs_path(${ARG_HLS_SRC} abs_hls_srcs)
  abs_path(${ARG_TB_SRC} abs_tb_srcs)

  set(include_dirs ${DNNK_INCLUDE_DIRS} ${TORCH_INCLUDE_DIRS})
  prepend_option("${include_dirs}" "-I" include_options)

  set(library_dirs ${TORCH_LIBRARY_DIRS})
  prepend_option("${library_dirs}" "-L" library_dir_options)
  prepend_option("${library_dirs}" "-Wl,-rpath," rpath_options)

  set(libraries "torch" "c10" "gtest" "pthread")
  prepend_option("${libraries}" "-l" library_options)

  set(cxxflags ${ARG_CXXFLAGS} ${include_options})
  list2str("${cxxflags}" cxxflags_str)

  set(ldflags ${library_dir_options} ${library_options} ${rpath_options})
  list2str("${ldflags}" ldflags_str)

  add_test(
    NAME ${name}
    COMMAND vivado_hls -f ${TB_TCL} "cosim" ${name} "${abs_hls_srcs}" ${top} ${CHIP_PART} "cxxflags=${cxxflags_str}" "ldflags=${ldflags_str}" "${abs_tb_srcs}" "${ARG_UNPARSED_ARGUMENTS}"
    WORKING_DIRECTORY ${WORK_DIR}
  )
endfunction()

function(add_impl name top)
  cmake_parse_arguments(ARG "" "" "HLS_SRC;CXXFLAGS" ${ARGN})

  abs_path(${ARG_HLS_SRC} abs_hls_srcs)

  set(include_dirs ${DNNK_INCLUDE_DIRS} ${TORCH_INCLUDE_DIRS})
  prepend_option("${include_dirs}" "-I" include_options)
  set(cxxflags ${ARG_CXXFLAGS} ${include_options})
  list2str("${cxxflags}" cxxflags_str)

  add_custom_target(
    ${name}
    COMMAND vivado_hls -f ${TB_TCL} "impl" ${name} "${abs_hls_srcs}" ${top} ${CHIP_PART} "cxxflags=${cxxflags_str}" ${ARG_UNPARSED_ARGUMENTS}
    WORKING_DIRECTORY ${WORK_DIR}
  )
endfunction()

function(add_test_and_impl name top)
  cmake_parse_arguments(ARG "" "" "HLS_SRC;TB_SRC;CXXFLAGS" ${ARGN})

  add_csim(${name}_csim CXXFLAGS ${ARG_CXXFLAGS} HLS_SRC ${ARG_HLS_SRC} TB_SRC ${ARG_TB_SRC})
  add_cosim(${name}_cosim ${top} CXXFLAGS ${ARG_CXXFLAGS} HLS_SRC ${ARG_HLS_SRC} TB_SRC ${ARG_TB_SRC})
  add_impl(${name}_impl ${top} CXXFLAGS ${ARG_CXXFLAGS} HLS_SRC ${ARG_HLS_SRC})
endfunction()

add_subdirectory(relu)
add_subdirectory(conv2d)
add_subdirectory(maxpool2d)
add_subdirectory(linear)
add_subdirectory(inference)

--------------------------------------------------------------------------------
/tests/hls/conv2d/CMakeLists.txt:
--------------------------------------------------------------------------------
set(hls_src conv2d_hls.cc)
set(test_src conv2d_test.cc)

add_test_and_impl(conv2d_hls conv2d_hls HLS_SRC ${hls_src} TB_SRC ${test_src} CXXFLAGS "-DTOP_FUNC=conv2d_hls")
add_test_and_impl(conv2d_pipelined_v1_hls conv2d_pipelined_v1_hls HLS_SRC ${hls_src} TB_SRC ${test_src} CXXFLAGS "-DTOP_FUNC=conv2d_pipelined_v1_hls")
add_test_and_impl(conv2d_pipelined_v2_hls conv2d_pipelined_v2_hls HLS_SRC ${hls_src} TB_SRC ${test_src} CXXFLAGS "-DTOP_FUNC=conv2d_pipelined_v2_hls")
add_test_and_impl(conv2d_unrolled_v1_2_hls conv2d_unrolled_v1_2_hls HLS_SRC ${hls_src} TB_SRC ${test_src} CXXFLAGS "-DTOP_FUNC=conv2d_unrolled_v1_2_hls")
add_test_and_impl(conv2d_unrolled_v1_3_hls conv2d_unrolled_v1_3_hls HLS_SRC ${hls_src} TB_SRC ${test_src} CXXFLAGS "-DTOP_FUNC=conv2d_unrolled_v1_3_hls")
add_test_and_impl(conv2d_unrolled_v2_2_2_hls conv2d_unrolled_v2_2_2_hls HLS_SRC ${hls_src} TB_SRC ${test_src} CXXFLAGS "-DTOP_FUNC=conv2d_unrolled_v2_2_2_hls")
add_test_and_impl(conv2d_unrolled_v2_2_3_hls conv2d_unrolled_v2_2_3_hls HLS_SRC ${hls_src} TB_SRC ${test_src} CXXFLAGS "-DTOP_FUNC=conv2d_unrolled_v2_2_3_hls")
add_test_and_impl(conv2d_unrolled_v2_3_2_hls conv2d_unrolled_v2_3_2_hls HLS_SRC ${hls_src} TB_SRC ${test_src} CXXFLAGS "-DTOP_FUNC=conv2d_unrolled_v2_3_2_hls")
add_test_and_impl(conv2d_unrolled_v2_3_3_hls conv2d_unrolled_v2_3_3_hls HLS_SRC ${hls_src} TB_SRC ${test_src} CXXFLAGS "-DTOP_FUNC=conv2d_unrolled_v2_3_3_hls")

--------------------------------------------------------------------------------
/tests/hls/conv2d/conv2d_hls.cc:
--------------------------------------------------------------------------------
#include "dnn-kernel/conv2d.h"

#include <cstddef>
#include <cstdint>

static const std::size_t kMaxSize = 65536;

void conv2d_hls(const float x[kMaxSize], const float weight[kMaxSize], const float bias[kMaxSize],
                int32_t width, int32_t height, int32_t in_channels, int32_t out_channels, int32_t ksize, float y[kMaxSize]) {

  dnnk::conv2d(x, weight, bias, width, height, in_channels, out_channels, ksize, y);
}

void conv2d_pipelined_v1_hls(const float x[kMaxSize], const float weight[kMaxSize], const float bias[kMaxSize],
                             int32_t width, int32_t height, int32_t in_channels, int32_t out_channels, int32_t ksize, float y[kMaxSize]) {

  dnnk::conv2d_pipelined_v1(x, weight, bias, width, height, in_channels, out_channels, ksize, y);
}

void conv2d_pipelined_v2_hls(const float x[kMaxSize], const float weight[kMaxSize], const float bias[kMaxSize],
                             int32_t width, int32_t height, int32_t in_channels, int32_t out_channels, int32_t ksize, float y[kMaxSize]) {

  dnnk::conv2d_pipelined_v2(x, weight, bias, width, height, in_channels, out_channels, ksize, y);
}

void conv2d_unrolled_v1_2_hls(const float x[kMaxSize], const float weight[kMaxSize], const float bias[kMaxSize],
                              int32_t width, int32_t height, int32_t in_channels, int32_t out_channels, int32_t ksize, float y[kMaxSize]) {

  dnnk::conv2d_unrolled_v1<2>(x, weight, bias, width, height, in_channels, out_channels, ksize, y);
}

void conv2d_unrolled_v1_3_hls(const float x[kMaxSize], const float weight[kMaxSize], const float bias[kMaxSize],
                              int32_t width, int32_t height, int32_t in_channels, int32_t out_channels, int32_t ksize, float y[kMaxSize]) {

  dnnk::conv2d_unrolled_v1<3>(x, weight, bias, width, height, in_channels, out_channels, ksize, y);
}

void conv2d_unrolled_v2_2_2_hls(const float x[kMaxSize], const float weight[kMaxSize], const float bias[kMaxSize],
                                int32_t width, int32_t height, int32_t in_channels, int32_t out_channels, int32_t ksize, float y[kMaxSize]) {

  dnnk::conv2d_unrolled_v2<2, 2>(x, weight, bias, width, height, in_channels, out_channels, ksize, y);
}

void conv2d_unrolled_v2_3_2_hls(const float x[kMaxSize], const float weight[kMaxSize], const float bias[kMaxSize],
                                int32_t width, int32_t height, int32_t in_channels, int32_t out_channels, int32_t ksize, float y[kMaxSize]) {

  dnnk::conv2d_unrolled_v2<3, 2>(x, weight, bias, width, height, in_channels, out_channels, ksize, y);
}

void conv2d_unrolled_v2_2_3_hls(const float x[kMaxSize], const float weight[kMaxSize], const float bias[kMaxSize],
                                int32_t width, int32_t height, int32_t in_channels, int32_t out_channels, int32_t ksize, float y[kMaxSize]) {

  dnnk::conv2d_unrolled_v2<2, 3>(x, weight, bias, width, height, in_channels, out_channels, ksize, y);
}

void conv2d_unrolled_v2_3_3_hls(const float x[kMaxSize], const float weight[kMaxSize], const float bias[kMaxSize],
                                int32_t width, int32_t height, int32_t in_channels, int32_t out_channels, int32_t ksize, float y[kMaxSize]) {

  dnnk::conv2d_unrolled_v2<3, 3>(x, weight, bias, width, height, in_channels, out_channels, ksize, y);
}
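Each wrapper pins one template instantiation behind a plain function name so Vivado HLS can use it as a top function, and the `CMakeLists.txt` above selects it via `-DTOP_FUNC=...`. Adding a variant follows the same pattern; for example, a hypothetical x4 unroll (not present in the original sources) would sit alongside the wrappers above, plus a matching `add_test_and_impl` line:

```cpp
// Hypothetical additional wrapper (not in the original file):
// instantiates the v1 kernel with UNROLL_X = 4 under its own top name.
void conv2d_unrolled_v1_4_hls(const float x[kMaxSize], const float weight[kMaxSize],
                              const float bias[kMaxSize], int32_t width, int32_t height,
                              int32_t in_channels, int32_t out_channels, int32_t ksize,
                              float y[kMaxSize]) {
  dnnk::conv2d_unrolled_v1<4>(x, weight, bias, width, height, in_channels,
                              out_channels, ksize, y);
}
```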
int32_t height, int32_t in_channels, int32_t out_channels, int32_t ksize, float *y); 10 | void conv2d_unrolled_v1_3_hls(const float *x, const float* weight, const float* bias, int32_t width, int32_t height, int32_t in_channels, int32_t out_channels, int32_t ksize, float *y); 11 | void conv2d_unrolled_v2_2_2_hls(const float *x, const float* weight, const float* bias, int32_t width, int32_t height, int32_t in_channels, int32_t out_channels, int32_t ksize, float *y); 12 | void conv2d_unrolled_v2_2_3_hls(const float *x, const float* weight, const float* bias, int32_t width, int32_t height, int32_t in_channels, int32_t out_channels, int32_t ksize, float *y); 13 | void conv2d_unrolled_v2_3_2_hls(const float *x, const float* weight, const float* bias, int32_t width, int32_t height, int32_t in_channels, int32_t out_channels, int32_t ksize, float *y); 14 | void conv2d_unrolled_v2_3_3_hls(const float *x, const float* weight, const float* bias, int32_t width, int32_t height, int32_t in_channels, int32_t out_channels, int32_t ksize, float *y); 15 | 16 | #endif  // DNNKERNEL_TEST_CONV2D_HLS_H 17 | -------------------------------------------------------------------------------- /tests/hls/conv2d/conv2d_test.cc: -------------------------------------------------------------------------------- 1 | #include "conv2d_hls.h" 2 | 3 | #include <cstddef> 4 | #include <cstdint> 5 | #include <cstdio> 6 | #include <cstdlib> 7 | 8 | #include <torch/torch.h> 9 | 10 | #include <tests/util.h> 11 | 12 | #ifndef TOP_FUNC 13 | #error "TOP_FUNC is not defined" 14 | #endif 15 | 16 | static const std::size_t kMaxSize = 65536; 17 | 18 | using namespace dnnk; 19 | namespace F = torch::nn::functional; 20 | 21 | int main() { 22 | // Seeds must be fixed because the testbench is executed twice in 23 | // the cosimulation. 24 | torch::manual_seed(0); 25 | 26 | int h = 14, w = 14, in_channels = 4, out_channels = 8, ksize = 3; 27 | 28 | auto x_ref = torch::randn({1, in_channels, h, w}); 29 | auto weight_ref = torch::randn({out_channels, in_channels, ksize, ksize}); 30 | auto bias_ref = torch::randn({out_channels}); 31 | 32 | float x[kMaxSize], weight[kMaxSize], bias[kMaxSize], y[kMaxSize]; 33 | tensor2array(x_ref, x); 34 | tensor2array(weight_ref, weight); 35 | tensor2array(bias_ref, bias); 36 | 37 | auto y_ref = F::detail::conv2d(x_ref, weight_ref, bias_ref, 1, ksize/2, 1, 1); 38 | TOP_FUNC (x, weight, bias, w, h, in_channels, out_channels, ksize, y); 39 | 40 | if (!verify(y, y_ref)) { 41 | printf("%sFailed%s\n", Color::red, Color::reset); 42 | return 1; 43 | } 44 | 45 | printf("%sSucceed!%s\n", Color::green, Color::reset); 46 | return 0; 47 | } 48 | -------------------------------------------------------------------------------- /tests/hls/inference/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(hls_src inference_hls.cc) 2 | set(test_src inference_test.cc) 3 | 4 | add_test_and_impl(inference_hls inference_hls HLS_SRC ${hls_src} TB_SRC ${test_src} CXXFLAGS "-DPROJECT_ROOT=\"${CMAKE_SOURCE_DIR}\"" "-DTOP_FUNC=inference_hls") 5 | add_test_and_impl(inference_top inference_top HLS_SRC ${hls_src} TB_SRC ${test_src} CXXFLAGS "-DPROJECT_ROOT=\"${CMAKE_SOURCE_DIR}\"" "-DTOP_FUNC=inference_top") 6 | add_test_and_impl(inference_dataflow inference_dataflow HLS_SRC ${hls_src} TB_SRC ${test_src} CXXFLAGS "-DPROJECT_ROOT=\"${CMAKE_SOURCE_DIR}\"" "-DTOP_FUNC=inference_dataflow") 7 | add_test_and_impl(inference_with_local_buffer inference_with_local_buffer HLS_SRC ${hls_src} TB_SRC ${test_src} CXXFLAGS "-DPROJECT_ROOT=\"${CMAKE_SOURCE_DIR}\""
"-DTOP_FUNC=inference_with_local_buffer") 8 | add_test_and_impl(inference_pipelined_conv_v1 inference_pipelined_conv_v1 HLS_SRC ${hls_src} TB_SRC ${test_src} CXXFLAGS "-DPROJECT_ROOT=\"${CMAKE_SOURCE_DIR}\"" "-DTOP_FUNC=inference_pipelined_conv_v1") 9 | add_test_and_impl(inference_pipelined_conv_v2 inference_pipelined_conv_v2 HLS_SRC ${hls_src} TB_SRC ${test_src} CXXFLAGS "-DPROJECT_ROOT=\"${CMAKE_SOURCE_DIR}\"" "-DTOP_FUNC=inference_pipelined_conv_v2") 10 | add_test_and_impl(inference_unrolledx4_conv_v1 inference_unrolledx4_conv_v1 HLS_SRC ${hls_src} TB_SRC ${test_src} CXXFLAGS "-DPROJECT_ROOT=\"${CMAKE_SOURCE_DIR}\"" "-DTOP_FUNC=inference_unrolledx4_conv_v1") 11 | add_test_and_impl(inference_unrolledx4_conv_v2 inference_unrolledx4_conv_v2 HLS_SRC ${hls_src} TB_SRC ${test_src} CXXFLAGS "-DPROJECT_ROOT=\"${CMAKE_SOURCE_DIR}\"" "-DTOP_FUNC=inference_unrolledx4_conv_v2") 12 | add_test_and_impl(inference_final inference_final HLS_SRC ${hls_src} TB_SRC ${test_src} CXXFLAGS "-DPROJECT_ROOT=\"${CMAKE_SOURCE_DIR}\"" "-DTOP_FUNC=inference_final") 13 | -------------------------------------------------------------------------------- /tests/hls/inference/inference_hls.cc: -------------------------------------------------------------------------------- 1 | #include "dnn-kernel/inference.h" 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | static const std::size_t kMaxSize = 16384; 8 | 9 | void inference_hls(const float x[kMaxSize], 10 | const float weight0[kMaxSize], const float bias0[kMaxSize], 11 | const float weight1[kMaxSize], const float bias1[kMaxSize], 12 | const float weight2[kMaxSize], const float bias2[kMaxSize], 13 | const float weight3[kMaxSize], const float bias3[kMaxSize], 14 | float y[kMaxSize]) { 15 | dnnk::inference(x, 16 | weight0, bias0, 17 | weight1, bias1, 18 | weight2, bias2, 19 | weight3, bias3, 20 | y); 21 | } 22 | 23 | extern "C" { 24 | 25 | void inference_top(const float x[kMaxSize], 26 | const float weight0[kMaxSize], const float bias0[kMaxSize], 27 | const float weight1[kMaxSize], const float bias1[kMaxSize], 28 | const float weight2[kMaxSize], const float bias2[kMaxSize], 29 | const float weight3[kMaxSize], const float bias3[kMaxSize], 30 | float y[kMaxSize]) { 31 | #pragma HLS interface m_axi port=x offset=slave bundle=gmem0 32 | #pragma HLS interface m_axi port=weight0 offset=slave bundle=gmem1 33 | #pragma HLS interface m_axi port=weight1 offset=slave bundle=gmem2 34 | #pragma HLS interface m_axi port=weight2 offset=slave bundle=gmem3 35 | #pragma HLS interface m_axi port=weight3 offset=slave bundle=gmem4 36 | #pragma HLS interface m_axi port=bias0 offset=slave bundle=gmem5 37 | #pragma HLS interface m_axi port=bias1 offset=slave bundle=gmem6 38 | #pragma HLS interface m_axi port=bias2 offset=slave bundle=gmem7 39 | #pragma HLS interface m_axi port=bias3 offset=slave bundle=gmem8 40 | #pragma HLS interface m_axi port=y offset=slave bundle=gmem9 41 | #pragma HLS interface s_axilite port=x bundle=control 42 | #pragma HLS interface s_axilite port=weight0 bundle=control 43 | #pragma HLS interface s_axilite port=weight1 bundle=control 44 | #pragma HLS interface s_axilite port=weight2 bundle=control 45 | #pragma HLS interface s_axilite port=weight3 bundle=control 46 | #pragma HLS interface s_axilite port=bias0 bundle=control 47 | #pragma HLS interface s_axilite port=bias1 bundle=control 48 | #pragma HLS interface s_axilite port=bias2 bundle=control 49 | #pragma HLS interface s_axilite port=bias3 bundle=control 50 | #pragma HLS interface s_axilite port=y bundle=control 51 | 
#pragma HLS interface s_axilite port=return bundle=control 52 | 53 | dnnk::inference(x, 54 | weight0, bias0, 55 | weight1, bias1, 56 | weight2, bias2, 57 | weight3, bias3, 58 | y); 59 | } 60 | 61 | void inference_dataflow(const float x[kMaxSize], 62 | const float weight0[kMaxSize], const float bias0[kMaxSize], 63 | const float weight1[kMaxSize], const float bias1[kMaxSize], 64 | const float weight2[kMaxSize], const float bias2[kMaxSize], 65 | const float weight3[kMaxSize], const float bias3[kMaxSize], 66 | float y[kMaxSize]) { 67 | #pragma HLS dataflow 68 | #pragma HLS interface m_axi port=x offset=slave bundle=gmem0 69 | #pragma HLS interface m_axi port=weight0 offset=slave bundle=gmem1 70 | #pragma HLS interface m_axi port=weight1 offset=slave bundle=gmem2 71 | #pragma HLS interface m_axi port=weight2 offset=slave bundle=gmem3 72 | #pragma HLS interface m_axi port=weight3 offset=slave bundle=gmem4 73 | #pragma HLS interface m_axi port=bias0 offset=slave bundle=gmem5 74 | #pragma HLS interface m_axi port=bias1 offset=slave bundle=gmem6 75 | #pragma HLS interface m_axi port=bias2 offset=slave bundle=gmem7 76 | #pragma HLS interface m_axi port=bias3 offset=slave bundle=gmem8 77 | #pragma HLS interface m_axi port=y offset=slave bundle=gmem9 78 | #pragma HLS interface s_axilite port=x bundle=control 79 | #pragma HLS interface s_axilite port=weight0 bundle=control 80 | #pragma HLS interface s_axilite port=weight1 bundle=control 81 | #pragma HLS interface s_axilite port=weight2 bundle=control 82 | #pragma HLS interface s_axilite port=weight3 bundle=control 83 | #pragma HLS interface s_axilite port=bias0 bundle=control 84 | #pragma HLS interface s_axilite port=bias1 bundle=control 85 | #pragma HLS interface s_axilite port=bias2 bundle=control 86 | #pragma HLS interface s_axilite port=bias3 bundle=control 87 | #pragma HLS interface s_axilite port=y bundle=control 88 | #pragma HLS interface s_axilite port=return bundle=control 89 | #pragma HLS interface ap_ctrl_chain port=return bundle=control 90 | 91 | #pragma HLS stable variable=x 92 | #pragma HLS stable variable=weight0 93 | #pragma HLS stable variable=bias0 94 | #pragma HLS stable variable=weight1 95 | #pragma HLS stable variable=bias1 96 | #pragma HLS stable variable=weight2 97 | #pragma HLS stable variable=bias2 98 | #pragma HLS stable variable=weight3 99 | #pragma HLS stable variable=bias3 100 | #pragma HLS stable variable=y 101 | 102 | dnnk::inference(x, 103 | weight0, bias0, 104 | weight1, bias1, 105 | weight2, bias2, 106 | weight3, bias3, 107 | y); 108 | } 109 | 110 | 111 | void inference_with_local_buffer(const float x[kMaxSize], 112 | const float weight0[kMaxSize], const float bias0[kMaxSize], 113 | const float weight1[kMaxSize], const float bias1[kMaxSize], 114 | const float weight2[kMaxSize], const float bias2[kMaxSize], 115 | const float weight3[kMaxSize], const float bias3[kMaxSize], 116 | float y[kMaxSize]) { 117 | #pragma HLS dataflow 118 | #pragma HLS interface m_axi port=x offset=slave bundle=gmem0 119 | #pragma HLS interface m_axi port=weight0 offset=slave bundle=gmem1 120 | #pragma HLS interface m_axi port=weight1 offset=slave bundle=gmem2 121 | #pragma HLS interface m_axi port=weight2 offset=slave bundle=gmem3 122 | #pragma HLS interface m_axi port=weight3 offset=slave bundle=gmem4 123 | #pragma HLS interface m_axi port=bias0 offset=slave bundle=gmem5 124 | #pragma HLS interface m_axi port=bias1 offset=slave bundle=gmem6 125 | #pragma HLS interface m_axi port=bias2 offset=slave bundle=gmem7 126 | #pragma HLS interface m_axi 
port=bias3 offset=slave bundle=gmem8 127 | #pragma HLS interface m_axi port=y offset=slave bundle=gmem9 128 | #pragma HLS interface s_axilite port=x bundle=control 129 | #pragma HLS interface s_axilite port=weight0 bundle=control 130 | #pragma HLS interface s_axilite port=weight1 bundle=control 131 | #pragma HLS interface s_axilite port=weight2 bundle=control 132 | #pragma HLS interface s_axilite port=weight3 bundle=control 133 | #pragma HLS interface s_axilite port=bias0 bundle=control 134 | #pragma HLS interface s_axilite port=bias1 bundle=control 135 | #pragma HLS interface s_axilite port=bias2 bundle=control 136 | #pragma HLS interface s_axilite port=bias3 bundle=control 137 | #pragma HLS interface s_axilite port=y bundle=control 138 | #pragma HLS interface s_axilite port=return bundle=control 139 | #pragma HLS interface ap_ctrl_chain port=return bundle=control 140 | 141 | #pragma HLS stable variable=x 142 | #pragma HLS stable variable=weight0 143 | #pragma HLS stable variable=bias0 144 | #pragma HLS stable variable=weight1 145 | #pragma HLS stable variable=bias1 146 | #pragma HLS stable variable=weight2 147 | #pragma HLS stable variable=bias2 148 | #pragma HLS stable variable=weight3 149 | #pragma HLS stable variable=bias3 150 | #pragma HLS stable variable=y 151 | 152 | const std::size_t x_size = 1 * 28 * 28; 153 | const std::size_t w0_size = 4 * 1 * 3 * 3, b0_size = 4; 154 | const std::size_t w1_size = 8 * 4 * 3 * 3, b1_size = 8; 155 | const std::size_t w2_size = 32 * 392, b2_size = 32; 156 | const std::size_t w3_size = 10 * 32, b3_size = 10; 157 | const std::size_t y_size = 10; 158 | 159 | float x_local[x_size]; 160 | float w0_local[w0_size], b0_local[b0_size]; 161 | float w1_local[w1_size], b1_local[b1_size]; 162 | float w2_local[w2_size], b2_local[b2_size]; 163 | float w3_local[w3_size], b3_local[b3_size]; 164 | float y_local[y_size]; 165 | 166 | // fetch to local buffer 167 | std::memcpy(x_local, x, x_size * sizeof(float)); 168 | std::memcpy(w0_local, weight0, w0_size * sizeof(float)); 169 | std::memcpy(b0_local, bias0, b0_size * sizeof(float)); 170 | std::memcpy(w1_local, weight1, w1_size * sizeof(float)); 171 | std::memcpy(b1_local, bias1, b1_size * sizeof(float)); 172 | std::memcpy(w2_local, weight2, w2_size * sizeof(float)); 173 | std::memcpy(b2_local, bias2, b2_size * sizeof(float)); 174 | std::memcpy(w3_local, weight3, w3_size * sizeof(float)); 175 | std::memcpy(b3_local, bias3, b3_size * sizeof(float)); 176 | 177 | // run inference with local buffer 178 | dnnk::inference(x_local, 179 | w0_local, b0_local, 180 | w1_local, b1_local, 181 | w2_local, b2_local, 182 | w3_local, b3_local, 183 | y_local); 184 | 185 | // store to global buffer 186 | std::memcpy(y, y_local, y_size * sizeof(float)); 187 | } 188 | 189 | 190 | #define DECLARE_INFERENCE_WITH_LOCAL_BUFFER(NAME, CONV_FUNC, MAXPOOL_FUNC, RELU_FUNC, LINEAR_FUNC) \ 191 | void NAME(const float x[kMaxSize], \ 192 | const float weight0[kMaxSize], const float bias0[kMaxSize], \ 193 | const float weight1[kMaxSize], const float bias1[kMaxSize], \ 194 | const float weight2[kMaxSize], const float bias2[kMaxSize], \ 195 | const float weight3[kMaxSize], const float bias3[kMaxSize], \ 196 | float y[kMaxSize]) { \ 197 | _Pragma("HLS dataflow") \ 198 | _Pragma("HLS interface m_axi port=x offset=slave bundle=gmem0") \ 199 | _Pragma("HLS interface m_axi port=weight0 offset=slave bundle=gmem1") \ 200 | _Pragma("HLS interface m_axi port=weight1 offset=slave bundle=gmem2") \ 201 | _Pragma("HLS interface m_axi port=weight2 offset=slave 
bundle=gmem3") \ 202 | _Pragma("HLS interface m_axi port=weight3 offset=slave bundle=gmem4") \ 203 | _Pragma("HLS interface m_axi port=bias0 offset=slave bundle=gmem5") \ 204 | _Pragma("HLS interface m_axi port=bias1 offset=slave bundle=gmem6") \ 205 | _Pragma("HLS interface m_axi port=bias2 offset=slave bundle=gmem7") \ 206 | _Pragma("HLS interface m_axi port=bias3 offset=slave bundle=gmem8") \ 207 | _Pragma("HLS interface m_axi port=y offset=slave bundle=gmem9") \ 208 | _Pragma("HLS interface s_axilite port=x bundle=control") \ 209 | _Pragma("HLS interface s_axilite port=weight0 bundle=control") \ 210 | _Pragma("HLS interface s_axilite port=weight1 bundle=control") \ 211 | _Pragma("HLS interface s_axilite port=weight2 bundle=control") \ 212 | _Pragma("HLS interface s_axilite port=weight3 bundle=control") \ 213 | _Pragma("HLS interface s_axilite port=bias0 bundle=control") \ 214 | _Pragma("HLS interface s_axilite port=bias1 bundle=control") \ 215 | _Pragma("HLS interface s_axilite port=bias2 bundle=control") \ 216 | _Pragma("HLS interface s_axilite port=bias3 bundle=control") \ 217 | _Pragma("HLS interface s_axilite port=y bundle=control") \ 218 | _Pragma("HLS interface s_axilite port=return bundle=control") \ 219 | _Pragma("HLS interface ap_ctrl_chain port=return bundle=control") \ 220 | _Pragma("HLS stable variable=x") \ 221 | _Pragma("HLS stable variable=weight0") \ 222 | _Pragma("HLS stable variable=bias0") \ 223 | _Pragma("HLS stable variable=weight1") \ 224 | _Pragma("HLS stable variable=bias1") \ 225 | _Pragma("HLS stable variable=weight2") \ 226 | _Pragma("HLS stable variable=bias2") \ 227 | _Pragma("HLS stable variable=weight3") \ 228 | _Pragma("HLS stable variable=bias3") \ 229 | _Pragma("HLS stable variable=y") \ 230 | const std::size_t x_size = 1 * 28 * 28; \ 231 | const std::size_t w0_size = 4 * 1 * 3 * 3, b0_size = 4; \ 232 | const std::size_t w1_size = 8 * 4 * 3 * 3, b1_size = 8; \ 233 | const std::size_t w2_size = 32 * 392, b2_size = 32; \ 234 | const std::size_t w3_size = 10 * 32, b3_size = 10; \ 235 | const std::size_t y_size = 10; \ 236 | float x_local[x_size]; \ 237 | float w0_local[w0_size], b0_local[b0_size]; \ 238 | float w1_local[w1_size], b1_local[b1_size]; \ 239 | float w2_local[w2_size], b2_local[b2_size]; \ 240 | float w3_local[w3_size], b3_local[b3_size]; \ 241 | float y_local[y_size]; \ 242 | std::memcpy(x_local, x, x_size * sizeof(float)); \ 243 | std::memcpy(w0_local, weight0, w0_size * sizeof(float)); \ 244 | std::memcpy(b0_local, bias0, b0_size * sizeof(float)); \ 245 | std::memcpy(w1_local, weight1, w1_size * sizeof(float)); \ 246 | std::memcpy(b1_local, bias1, b1_size * sizeof(float)); \ 247 | std::memcpy(w2_local, weight2, w2_size * sizeof(float)); \ 248 | std::memcpy(b2_local, bias2, b2_size * sizeof(float)); \ 249 | std::memcpy(w3_local, weight3, w3_size * sizeof(float)); \ 250 | std::memcpy(b3_local, bias3, b3_size * sizeof(float)); \ 251 | \ 252 | dnnk::inference_custom(x_local, \ 253 | w0_local, b0_local, \ 254 | w1_local, b1_local, \ 255 | w2_local, b2_local, \ 256 | w3_local, b3_local, \ 257 | y_local, \ 258 | CONV_FUNC, MAXPOOL_FUNC, RELU_FUNC, LINEAR_FUNC); \ 259 | \ 260 | std::memcpy(y, y_local, y_size * sizeof(float)); \ 261 | } 262 | 263 | DECLARE_INFERENCE_WITH_LOCAL_BUFFER(inference_pipelined_conv_v1, dnnk::conv2d_pipelined_v1, dnnk::maxpool2d, dnnk::relu, dnnk::linear); 264 | DECLARE_INFERENCE_WITH_LOCAL_BUFFER(inference_pipelined_conv_v2, dnnk::conv2d_pipelined_v2, dnnk::maxpool2d, dnnk::relu, dnnk::linear); 265 | 
DECLARE_INFERENCE_WITH_LOCAL_BUFFER(inference_unrolledx4_conv_v1, dnnk::conv2d_unrolled_v1<4>, dnnk::maxpool2d, dnnk::relu, dnnk::linear); 266 | DECLARE_INFERENCE_WITH_LOCAL_BUFFER(inference_unrolledx4_conv_v2, (dnnk::conv2d_unrolled_v2<4, 4>), dnnk::maxpool2d, dnnk::relu, dnnk::linear); 267 | 268 | 269 | 270 | void inference_final(const float x[kMaxSize], 271 | const float weight0[kMaxSize], const float bias0[kMaxSize], 272 | const float weight1[kMaxSize], const float bias1[kMaxSize], 273 | const float weight2[kMaxSize], const float bias2[kMaxSize], 274 | const float weight3[kMaxSize], const float bias3[kMaxSize], 275 | float y[kMaxSize]) { 276 | #pragma HLS dataflow 277 | #pragma HLS interface m_axi port=x offset=slave bundle=gmem0 278 | #pragma HLS interface m_axi port=weight0 offset=slave bundle=gmem1 279 | #pragma HLS interface m_axi port=weight1 offset=slave bundle=gmem2 280 | #pragma HLS interface m_axi port=weight2 offset=slave bundle=gmem3 281 | #pragma HLS interface m_axi port=weight3 offset=slave bundle=gmem4 282 | #pragma HLS interface m_axi port=bias0 offset=slave bundle=gmem5 283 | #pragma HLS interface m_axi port=bias1 offset=slave bundle=gmem6 284 | #pragma HLS interface m_axi port=bias2 offset=slave bundle=gmem7 285 | #pragma HLS interface m_axi port=bias3 offset=slave bundle=gmem8 286 | #pragma HLS interface m_axi port=y offset=slave bundle=gmem9 287 | #pragma HLS interface s_axilite port=x bundle=control 288 | #pragma HLS interface s_axilite port=weight0 bundle=control 289 | #pragma HLS interface s_axilite port=weight1 bundle=control 290 | #pragma HLS interface s_axilite port=weight2 bundle=control 291 | #pragma HLS interface s_axilite port=weight3 bundle=control 292 | #pragma HLS interface s_axilite port=bias0 bundle=control 293 | #pragma HLS interface s_axilite port=bias1 bundle=control 294 | #pragma HLS interface s_axilite port=bias2 bundle=control 295 | #pragma HLS interface s_axilite port=bias3 bundle=control 296 | #pragma HLS interface s_axilite port=y bundle=control 297 | #pragma HLS interface s_axilite port=return bundle=control 298 | #pragma HLS interface ap_ctrl_chain port=return bundle=control 299 | 300 | #pragma HLS stable variable=x 301 | #pragma HLS stable variable=weight0 302 | #pragma HLS stable variable=bias0 303 | #pragma HLS stable variable=weight1 304 | #pragma HLS stable variable=bias1 305 | #pragma HLS stable variable=weight2 306 | #pragma HLS stable variable=bias2 307 | #pragma HLS stable variable=weight3 308 | #pragma HLS stable variable=bias3 309 | #pragma HLS stable variable=y 310 | 311 | const std::size_t x_size = 1 * 28 * 28; 312 | const std::size_t w0_size = 4 * 1 * 3 * 3, b0_size = 4; 313 | const std::size_t w1_size = 8 * 4 * 3 * 3, b1_size = 8; 314 | const std::size_t w2_size = 32 * 392, b2_size = 32; 315 | const std::size_t w3_size = 10 * 32, b3_size = 10; 316 | const std::size_t y_size = 10; 317 | 318 | float x_local[x_size]; 319 | float w0_local[w0_size], b0_local[b0_size]; 320 | float w1_local[w1_size], b1_local[b1_size]; 321 | float w2_local[w2_size], b2_local[b2_size]; 322 | float w3_local[w3_size], b3_local[b3_size]; 323 | float y_local[y_size]; 324 | 325 | // fetch to local buffer 326 | std::memcpy(x_local, x, x_size * sizeof(float)); 327 | std::memcpy(w0_local, weight0, w0_size * sizeof(float)); 328 | std::memcpy(b0_local, bias0, b0_size * sizeof(float)); 329 | std::memcpy(w1_local, weight1, w1_size * sizeof(float)); 330 | std::memcpy(b1_local, bias1, b1_size * sizeof(float)); 331 | std::memcpy(w2_local, weight2, w2_size * 
sizeof(float)); 332 | std::memcpy(b2_local, bias2, b2_size * sizeof(float)); 333 | std::memcpy(w3_local, weight3, w3_size * sizeof(float)); 334 | std::memcpy(b3_local, bias3, b3_size * sizeof(float)); 335 | 336 | // run inference with local buffer 337 | dnnk::inference_custom(x_local, 338 | w0_local, b0_local, 339 | w1_local, b1_local, 340 | w2_local, b2_local, 341 | w3_local, b3_local, 342 | y_local, 343 | dnnk::conv2d_unrolled_v2<4, 4>, 344 | dnnk::relu, 345 | dnnk::maxpool2d, 346 | dnnk::conv2d_unrolled_v2<8, 4>, 347 | dnnk::relu, 348 | dnnk::maxpool2d, 349 | dnnk::linear_opt<4>, 350 | dnnk::relu, 351 | dnnk::linear); 352 | 353 | // store to global buffer 354 | std::memcpy(y, y_local, y_size * sizeof(float)); 355 | } 356 | 357 | } 358 | -------------------------------------------------------------------------------- /tests/hls/inference/inference_hls.h: -------------------------------------------------------------------------------- 1 | #ifndef DNNKERNEL_TEST_INFERENCE_HLS_H 2 | #define DNNKERNEL_TEST_INFERENCE_HLS_H 3 | 4 | #include <cstdint> 5 | 6 | void inference_hls(const float *x, 7 | const float* weight0, const float* bias0, 8 | const float* weight1, const float* bias1, 9 | const float* weight2, const float* bias2, 10 | const float* weight3, const float* bias3, 11 | float *y); 12 | 13 | extern "C" { 14 | 15 | void inference_top(const float *x, 16 | const float* weight0, const float* bias0, 17 | const float* weight1, const float* bias1, 18 | const float* weight2, const float* bias2, 19 | const float* weight3, const float* bias3, 20 | float *y); 21 | 22 | void inference_dataflow(const float *x, 23 | const float* weight0, const float* bias0, 24 | const float* weight1, const float* bias1, 25 | const float* weight2, const float* bias2, 26 | const float* weight3, const float* bias3, 27 | float *y); 28 | 29 | void inference_with_local_buffer(const float *x, 30 | const float* weight0, const float* bias0, 31 | const float* weight1, const float* bias1, 32 | const float* weight2, const float* bias2, 33 | const float* weight3, const float* bias3, 34 | float *y); 35 | 36 | void inference_pipelined_conv_v1(const float *x, 37 | const float* weight0, const float* bias0, 38 | const float* weight1, const float* bias1, 39 | const float* weight2, const float* bias2, 40 | const float* weight3, const float* bias3, 41 | float *y); 42 | 43 | void inference_pipelined_conv_v2(const float *x, 44 | const float* weight0, const float* bias0, 45 | const float* weight1, const float* bias1, 46 | const float* weight2, const float* bias2, 47 | const float* weight3, const float* bias3, 48 | float *y); 49 | 50 | void inference_unrolledx4_conv_v1(const float *x, 51 | const float* weight0, const float* bias0, 52 | const float* weight1, const float* bias1, 53 | const float* weight2, const float* bias2, 54 | const float* weight3, const float* bias3, 55 | float *y); 56 | 57 | void inference_unrolledx4_conv_v2(const float *x, 58 | const float* weight0, const float* bias0, 59 | const float* weight1, const float* bias1, 60 | const float* weight2, const float* bias2, 61 | const float* weight3, const float* bias3, 62 | float *y); 63 | 64 | void inference_final(const float *x, 65 | const float* weight0, const float* bias0, 66 | const float* weight1, const float* bias1, 67 | const float* weight2, const float* bias2, 68 | const float* weight3, const float* bias3, 69 | float *y); 70 | 71 | } 72 | 73 | #endif  // DNNKERNEL_TEST_INFERENCE_HLS_H 74 | --------------------------------------------------------------------------------
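Note: the extern "C" block in inference_hls.cc and inference_hls.h keeps the kernel entry points unmangled, so the Vitis tools and the runtime can look them up by name (e.g. "inference_top"). The sketch below is a hypothetical, minimal host-side launcher using the XRT native C++ API; it is not the repository's host/run_inference.cc, and the xclbin file name and the buffer-filling step are placeholder assumptions. The buffer sizes are the ones hard-coded in inference_with_local_buffer above.

    // Hypothetical XRT host sketch for launching inference_top (not host/run_inference.cc).
    #include <cstddef>
    #include <vector>
    #include <xrt/xrt_bo.h>
    #include <xrt/xrt_device.h>
    #include <xrt/xrt_kernel.h>

    int main() {
      xrt::device device(0);                                   // first accelerator card
      auto uuid = device.load_xclbin("inference_top.xclbin");  // placeholder file name
      auto krnl = xrt::kernel(device, uuid, "inference_top");  // unmangled kernel symbol

      // Sizes (in floats) of x, the four (weight, bias) pairs, and y, taken from the code above.
      const std::size_t sizes[] = {1 * 28 * 28, 4 * 1 * 3 * 3, 4, 8 * 4 * 3 * 3, 8,
                                   32 * 392, 32, 10 * 32, 10, 10};
      std::vector<xrt::bo> bos;
      for (int i = 0; i < 10; ++i)
        bos.emplace_back(device, sizes[i] * sizeof(float), krnl.group_id(i));

      // ... write the input image and trained parameters into bos[0..8], then:
      for (int i = 0; i < 9; ++i) bos[i].sync(XCL_BO_SYNC_BO_TO_DEVICE);

      auto run = krnl(bos[0], bos[1], bos[2], bos[3], bos[4],
                      bos[5], bos[6], bos[7], bos[8], bos[9]);
      run.wait();

      bos[9].sync(XCL_BO_SYNC_BO_FROM_DEVICE);  // read back the 10 class scores
      return 0;
    }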
/tests/hls/inference/inference_test.cc: -------------------------------------------------------------------------------- 1 | #include "inference_hls.h" 2 | 3 | #include <cstddef> 4 | #include <cstdint> 5 | #include <cstdio> 6 | #include <cstdlib> 7 | #include <map> 8 | #include <string> 9 | #include <vector> 10 | 11 | #include <torch/script.h> 12 | #include <torch/torch.h> 13 | 14 | #include <tests/util.h> 15 | 16 | #ifndef PROJECT_ROOT 17 | #error "PROJECT_ROOT is not defined" 18 | #endif 19 | 20 | #ifndef TOP_FUNC 21 | #error "TOP_FUNC is not defined" 22 | #endif 23 | 24 | static const std::size_t kMaxSize = 16384; 25 | 26 | using namespace dnnk; 27 | namespace F = torch::nn::functional; 28 | 29 | int main() { 30 | // Seeds must be fixed because the testbench is executed twice in 31 | // the cosimulation. 32 | torch::manual_seed(0); 33 | 34 | 35 | float x[kMaxSize], y[kMaxSize]; 36 | std::map<std::string, std::vector<float> > params; 37 | 38 | // load model file 39 | auto model = torch::jit::load(PROJECT_ROOT "/learning/traced_model.pt"); 40 | 41 | // load parameter values from model 42 | for (const auto& param_ref : model.named_parameters()) { 43 | 44 | // use param_ref.name as key (ex: "conv1.weight") 45 | params[param_ref.name].resize(param_ref.value.numel()); 46 | 47 | // copy image data 48 | tensor2array(param_ref.value, params[param_ref.name].data()); 49 | } 50 | 51 | // read MNIST dataset 52 | auto dataset = torch::data::datasets::MNIST(PROJECT_ROOT "/learning/data/MNIST/raw") 53 | .map(torch::data::transforms::Stack<>()); 54 | 55 | // define loader and set batch_size to 1 56 | auto data_loader = 57 | torch::data::make_data_loader(std::move(dataset), 58 | torch::data::DataLoaderOptions().batch_size(1)); 59 | 60 | // iterate data_loader 61 | std::size_t niters = 0; 62 | for (auto& batch : *data_loader) { 63 | 64 | auto x_ref = batch.data; // shape = (1, 1, 28, 28) 65 | auto y_label = batch.target; // shape = (1) 66 | 67 | // run inference 68 | tensor2array(x_ref, x); 69 | TOP_FUNC (x, 70 | params.at("conv1.weight").data(), params.at("conv1.bias").data(), 71 | params.at("conv2.weight").data(), params.at("conv2.bias").data(), 72 | params.at("fc1.weight").data(), params.at("fc1.bias").data(), 73 | params.at("fc2.weight").data(), params.at("fc2.bias").data(), 74 | y); 75 | 76 | 77 | // run inference in pytorch 78 | std::vector<torch::jit::IValue> inputs; 79 | inputs.push_back(x_ref); 80 | auto y_ref = model.forward(inputs).toTensor(); 81 | 82 | if (!verify(y, y_ref)) { 83 | printf("%sFailed%s\n", Color::red, Color::reset); 84 | return 1; 85 | } 86 | 87 | if (++niters == 4) { 88 | break; 89 | } 90 | } 91 | 92 | printf("%sSucceed!%s\n", Color::green, Color::reset); 93 | return 0; 94 | } 95 | -------------------------------------------------------------------------------- /tests/hls/linear/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(hls_src linear_hls.cc) 2 | set(test_src linear_test.cc) 3 | 4 | add_test_and_impl(linear_hls linear_hls HLS_SRC ${hls_src} TB_SRC ${test_src} CXXFLAGS "-DTOP_FUNC=linear_hls") 5 | add_test_and_impl(linear_opt_2_hls linear_opt_2_hls HLS_SRC ${hls_src} TB_SRC ${test_src} CXXFLAGS "-DTOP_FUNC=linear_opt_2_hls") 6 | add_test_and_impl(linear_opt_3_hls linear_opt_3_hls HLS_SRC ${hls_src} TB_SRC ${test_src} CXXFLAGS "-DTOP_FUNC=linear_opt_3_hls") 7 | -------------------------------------------------------------------------------- /tests/hls/linear/linear_hls.cc: -------------------------------------------------------------------------------- 1 | #include "dnn-kernel/linear.h" 2 | 3 | #include <cstddef> 4 | #include <cstdint> 5 | 6 | static const std::size_t kMaxSize = 65536; 7 | 8 |
void linear_hls(const float x[kMaxSize], const float weight[kMaxSize], const float bias[kMaxSize], int32_t in_features, int32_t out_features, float y[kMaxSize]) { 9 | 10 | dnnk::linear(x, weight, bias, in_features, out_features, y); 11 | } 12 | 13 | void linear_opt_2_hls(const float x[kMaxSize], const float weight[kMaxSize], const float bias[kMaxSize], int32_t in_features, int32_t out_features, float y[kMaxSize]) { 14 | 15 | dnnk::linear_opt<2>(x, weight, bias, in_features, out_features, y); 16 | } 17 | 18 | void linear_opt_3_hls(const float x[kMaxSize], const float weight[kMaxSize], const float bias[kMaxSize], int32_t in_features, int32_t out_features, float y[kMaxSize]) { 19 | 20 | dnnk::linear_opt<3>(x, weight, bias, in_features, out_features, y); 21 | } 22 | -------------------------------------------------------------------------------- /tests/hls/linear/linear_hls.h: -------------------------------------------------------------------------------- 1 | #ifndef DNNKERNEL_TEST_LINEAR_HLS_H 2 | #define DNNKERNEL_TEST_LINEAR_HLS_H 3 | 4 | #include <cstdint> 5 | 6 | void linear_hls(const float *x, const float* weight, const float* bias, int32_t in_features, int32_t out_features, float *y); 7 | void linear_opt_2_hls(const float *x, const float* weight, const float* bias, int32_t in_features, int32_t out_features, float *y); 8 | void linear_opt_3_hls(const float *x, const float* weight, const float* bias, int32_t in_features, int32_t out_features, float *y); 9 | 10 | #endif  // DNNKERNEL_TEST_LINEAR_HLS_H 11 | -------------------------------------------------------------------------------- /tests/hls/linear/linear_test.cc: -------------------------------------------------------------------------------- 1 | #include "linear_hls.h" 2 | 3 | #include <cstddef> 4 | #include <cstdint> 5 | #include <cstdio> 6 | #include <cstdlib> 7 | 8 | #include <torch/torch.h> 9 | 10 | #include <tests/util.h> 11 | 12 | #ifndef TOP_FUNC 13 | #error "TOP_FUNC is not defined" 14 | #endif 15 | 16 | static const std::size_t kMaxSize = 65536; 17 | 18 | using namespace dnnk; 19 | namespace F = torch::nn::functional; 20 | 21 | int main() { 22 | // Seeds must be fixed because the testbench is executed twice in 23 | // the cosimulation.
24 | torch::manual_seed(0); 25 | 26 | int in_features = 32, out_features = 16; 27 | 28 | auto x_ref = torch::randn({1, in_features}); 29 | auto weight_ref = torch::randn({out_features, in_features}); 30 | auto bias_ref = torch::randn({out_features}); 31 | 32 | float x[kMaxSize], weight[kMaxSize], bias[kMaxSize], y[kMaxSize]; 33 | tensor2array(x_ref, x); 34 | tensor2array(weight_ref, weight); 35 | tensor2array(bias_ref, bias); 36 | 37 | auto y_ref = F::linear(x_ref, weight_ref, bias_ref); 38 | TOP_FUNC (x, weight, bias, in_features, out_features, y); 39 | 40 | if (!verify(y, y_ref)) { 41 | printf("%sFailed%s\n", Color::red, Color::reset); 42 | return 1; 43 | } 44 | 45 | printf("%sSucceed!%s\n", Color::green, Color::reset); 46 | return 0; 47 | } 48 | -------------------------------------------------------------------------------- /tests/hls/maxpool2d/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(hls_src maxpool2d_hls.cc) 2 | set(test_src maxpool2d_test.cc) 3 | 4 | add_test_and_impl(maxpool2d_hls maxpool2d_hls HLS_SRC ${hls_src} TB_SRC ${test_src}) 5 | -------------------------------------------------------------------------------- /tests/hls/maxpool2d/maxpool2d_hls.cc: -------------------------------------------------------------------------------- 1 | #include "dnn-kernel/maxpool2d.h" 2 | 3 | #include <cstddef> 4 | #include <cstdint> 5 | 6 | static const std::size_t kMaxSize = 65536; 7 | 8 | void maxpool2d_hls(const float x[kMaxSize], int32_t width, int32_t height, int32_t channels, int32_t stride, float y[kMaxSize]) { 9 | 10 | dnnk::maxpool2d(x, width, height, channels, stride, y); 11 | } 12 | -------------------------------------------------------------------------------- /tests/hls/maxpool2d/maxpool2d_hls.h: -------------------------------------------------------------------------------- 1 | #ifndef DNNKERNEL_TEST_MAXPOOL2D_HLS_H 2 | #define DNNKERNEL_TEST_MAXPOOL2D_HLS_H 3 | 4 | #include <cstdint> 5 | 6 | void maxpool2d_hls(const float* x, int32_t width, int32_t height, int32_t channels, int32_t stride, float* y); 7 | 8 | #endif  // DNNKERNEL_TEST_MAXPOOL2D_HLS_H 9 | -------------------------------------------------------------------------------- /tests/hls/maxpool2d/maxpool2d_test.cc: -------------------------------------------------------------------------------- 1 | #include "maxpool2d_hls.h" 2 | 3 | #include <cstddef> 4 | #include <cstdint> 5 | #include <cstdio> 6 | #include <cstdlib> 7 | 8 | #include <torch/torch.h> 9 | 10 | #include <tests/util.h> 11 | 12 | static const std::size_t kMaxSize = 65536; 13 | 14 | using namespace dnnk; 15 | namespace F = torch::nn::functional; 16 | 17 | int main() { 18 | // Seeds must be fixed because the testbench is executed twice in 19 | // the cosimulation.
20 | torch::manual_seed(0); 21 | 22 | int h = 32, w = 32, channels = 4, stride = 2; 23 | 24 | auto x_ref = torch::randn({1, channels, h, w}); 25 | 26 | float x[kMaxSize], y[kMaxSize]; 27 | tensor2array(x_ref, x); 28 | 29 | auto y_ref = F::detail::max_pool2d(x_ref, stride, stride, 0, 1, false); 30 | maxpool2d_hls(x, w, h, channels, stride, y); 31 | 32 | if (!verify(y, y_ref)) { 33 | printf("%sFailed%s\n", Color::red, Color::reset); 34 | return 1; 35 | } 36 | 37 | printf("%sSucceed!%s\n", Color::green, Color::reset); 38 | return 0; 39 | } 40 | -------------------------------------------------------------------------------- /tests/hls/relu/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(hls_src relu_hls.cc) 2 | set(test_src relu_test.cc) 3 | 4 | add_test_and_impl(relu_hls relu_hls HLS_SRC ${hls_src} TB_SRC ${test_src}) 5 | -------------------------------------------------------------------------------- /tests/hls/relu/relu_hls.cc: -------------------------------------------------------------------------------- 1 | #include "dnn-kernel/relu.h" 2 | 3 | #include <cstddef> 4 | #include <cstdint> 5 | 6 | void relu_hls(const float x[1000], int64_t size, float y[1000]) { 7 | 8 | dnnk::relu(x, size, y); 9 | } 10 | -------------------------------------------------------------------------------- /tests/hls/relu/relu_hls.h: -------------------------------------------------------------------------------- 1 | #ifndef DNNKERNEL_TEST_RELU_HLS_H 2 | #define DNNKERNEL_TEST_RELU_HLS_H 3 | 4 | #include <cstdint> 5 | 6 | void relu_hls(const float* x, int64_t size, float* y); 7 | 8 | #endif  // DNNKERNEL_TEST_RELU_HLS_H 9 | -------------------------------------------------------------------------------- /tests/hls/relu/relu_test.cc: -------------------------------------------------------------------------------- 1 | #include "relu_hls.h" 2 | 3 | #include <cstddef> 4 | #include <cstdint> 5 | #include <cstdio> 6 | 7 | #include <torch/torch.h> 8 | 9 | #include <tests/util.h> 10 | 11 | using namespace dnnk; 12 | namespace F = torch::nn::functional; 13 | 14 | int main() { 15 | // Seeds must be fixed because the testbench is executed twice in 16 | // the cosimulation.
17 | torch::manual_seed(0); 18 | 19 | const std::size_t size_max = 1000; 20 | auto x_ref = torch::randn({28, 28, 1}); 21 | float x[size_max], y[size_max]; 22 | tensor2array(x_ref, x); 23 | 24 | relu_hls(x, x_ref.numel(), y); 25 | auto y_ref = F::detail::relu(x_ref, false); 26 | 27 | if (!verify(y, y_ref)) { 28 | printf("%sFailed%s\n", Color::red, Color::reset); 29 | return 1; 30 | } 31 | 32 | printf("%sSucceed!%s\n", Color::green, Color::reset); 33 | return 0; 34 | } 35 | -------------------------------------------------------------------------------- /tests/hls/tb.tcl: -------------------------------------------------------------------------------- 1 | set mode [lindex $argv 2] 2 | set name [lindex $argv 3] 3 | set hls_srcs [lindex $argv 4] 4 | set top [lindex $argv 5] 5 | set chip_part [lindex $argv 6] 6 | set cxxflags [lindex $argv 7] 7 | set ldflags [lindex $argv 8] 8 | set test_srcs [lindex $argv 9] 9 | set test_args [lindex $argv 10] 10 | 11 | 12 | open_project -reset ${name} 13 | 14 | regsub "cxxflags=" $cxxflags {} cxxflags 15 | regsub "ldflags=" $ldflags {} ldflags 16 | set test_cxxflags "${cxxflags} -std=c++14 -fopenmp" 17 | 18 | set_top ${top} 19 | add_files ${hls_srcs} -cflags "${cxxflags}" 20 | 21 | open_solution "solution1" 22 | set_part ${chip_part} 23 | create_clock -period 3.33 -name default 24 | 25 | csynth_design 26 | 27 | if {${mode} == "cosim"} { 28 | add_files -tb ${test_srcs} -cflags "${test_cxxflags}" 29 | cosim_design -trace_level port -ldflags "${ldflags}" -argv "${test_args}" 30 | } 31 | 32 | if {${mode} == "impl"} { 33 | export_design -flow impl -rtl verilog -format ip_catalog 34 | } 35 | 36 | if {${mode} == "xo"} { 37 | config_rtl -kernel_profile 38 | config_sdx -target xocc -profile true 39 | export_design -flow impl -rtl verilog -format ip_catalog -xo ${name}.xo 40 | } 41 | 42 | exit 43 | -------------------------------------------------------------------------------- /tests/ref/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | file(GLOB TEST_SRCS *.cc) 2 | 3 | foreach(test_path ${TEST_SRCS}) 4 | get_filename_component(test_file ${test_path} NAME) 5 | string(REPLACE ".cc" "" test_name ${test_file}_ref) 6 | add_executable(${test_name} ${test_file}) 7 | add_dependencies(${test_name} googletest) 8 | target_include_directories(${test_name} PRIVATE ${DNNK_INCLUDE_DIRS} ${TORCH_INCLUDE_DIRS} ${GTEST_INCLUDE_DIRS}) 9 | target_compile_options(${test_name} PRIVATE ${DNNK_CXX_FLAGS} "-DPROJECT_ROOT=\"${CMAKE_SOURCE_DIR}\"") 10 | target_compile_features(${test_name} PRIVATE cxx_std_14) 11 | target_link_libraries(${test_name} PRIVATE ${TORCH_LIBRARIES} ${GTEST_LIBRARIES}) 12 | 13 | add_test( 14 | NAME ${test_name} 15 | COMMAND ${test_name} 16 | ) 17 | 18 | endforeach() 19 | -------------------------------------------------------------------------------- /tests/ref/conv2d.cc: -------------------------------------------------------------------------------- 1 | #include "dnn-kernel/conv2d.h" 2 | 3 | #include <cstddef> 4 | #include <cstdint> 5 | #include <vector> 6 | 7 | #include <gtest/gtest.h> 8 | 9 | #include <torch/torch.h> 10 | 11 | #include <tests/util.h> 12 | 13 | using namespace dnnk; 14 | namespace F = torch::nn::functional; 15 | 16 | TEST(CPUVerify, Conv2d) { 17 | torch::manual_seed(0); 18 | 19 | int h = 14, w = 14, in_channels = 4, out_channels = 8, ksize = 3; 20 | 21 | auto x_ref = torch::randn({1, in_channels, h, w}); 22 | auto weight_ref = torch::randn({out_channels, in_channels, ksize, ksize}); 23 | auto bias_ref = torch::randn({out_channels}); 24 | 25 | std::vector<float> x(x_ref.numel());
26 | std::vector<float> weight(weight_ref.numel()); 27 | std::vector<float> bias(out_channels); 28 | tensor2array(x_ref, x.data()); 29 | tensor2array(weight_ref, weight.data()); 30 | tensor2array(bias_ref, bias.data()); 31 | 32 | auto y_ref = F::detail::conv2d(x_ref, weight_ref, bias_ref, 1, ksize/2, 1, 1); 33 | std::vector<float> y(y_ref.numel()); 34 | conv2d(x.data(), weight.data(), bias.data(), w, h, in_channels, out_channels, ksize, y.data()); 35 | 36 | EXPECT_TRUE(verify(y.data(), y_ref)); 37 | } 38 | 39 | int main(int argc, char** argv) { 40 | ::testing::InitGoogleTest(&argc, argv); 41 | ::testing::FLAGS_gtest_death_test_style = "threadsafe"; 42 | return RUN_ALL_TESTS(); 43 | } 44 | -------------------------------------------------------------------------------- /tests/ref/inference.cc: -------------------------------------------------------------------------------- 1 | #include "dnn-kernel/inference.h" 2 | 3 | #include <algorithm> 4 | #include <map> 5 | #include <vector> 6 | 7 | #include <gtest/gtest.h> 8 | 9 | #include <torch/script.h> 10 | #include <torch/torch.h> 11 | 12 | #include <tests/util.h> 13 | 14 | #ifndef PROJECT_ROOT 15 | #error "PROJECT_ROOT is not defined" 16 | #endif 17 | 18 | using namespace dnnk; 19 | namespace F = torch::nn::functional; 20 | 21 | TEST(CPUVerify, Inference) { 22 | std::vector<float> x, y; 23 | std::map<std::string, std::vector<float> > params; 24 | 25 | // load model file 26 | auto model = torch::jit::load(PROJECT_ROOT "/learning/traced_model.pt"); 27 | 28 | // load parameter values from model 29 | for (const auto& param_ref : model.named_parameters()) { 30 | 31 | // use param_ref.name as key (ex: "conv1.weight") 32 | params[param_ref.name].resize(param_ref.value.numel()); 33 | 34 | // copy image data 35 | tensor2array(param_ref.value, params[param_ref.name].data()); 36 | } 37 | 38 | // read MNIST dataset 39 | auto dataset = torch::data::datasets::MNIST(PROJECT_ROOT "/learning/data/MNIST/raw") 40 | .map(torch::data::transforms::Stack<>()); 41 | 42 | // define loader and set batch_size to 1 43 | auto data_loader = 44 | torch::data::make_data_loader(std::move(dataset), 45 | torch::data::DataLoaderOptions().batch_size(1)); 46 | 47 | // iterate data_loader 48 | std::size_t num_data = 0; 49 | std::size_t num_corrects_ref = 0; 50 | std::size_t num_corrects = 0; 51 | for (auto& batch : *data_loader) { 52 | 53 | auto x_ref = batch.data; // shape = (1, 1, 28, 28) 54 | auto y_label = batch.target; // shape = (1) 55 | 56 | // run inference in pytorch 57 | std::vector<torch::jit::IValue> inputs; 58 | inputs.push_back(x_ref); 59 | auto y_ref = model.forward(inputs).toTensor(); 60 | 61 | x.resize(x_ref.numel()); 62 | y.resize(y_ref.numel()); 63 | 64 | // run inference 65 | tensor2array(x_ref, x.data()); 66 | inference(x.data(), 67 | params.at("conv1.weight").data(), params.at("conv1.bias").data(), 68 | params.at("conv2.weight").data(), params.at("conv2.bias").data(), 69 | params.at("fc1.weight").data(), params.at("fc1.bias").data(), 70 | params.at("fc2.weight").data(), params.at("fc2.bias").data(), 71 | y.data()); 72 | 73 | EXPECT_TRUE(verify(y.data(), y_ref)); 74 | 75 | // summarize 76 | num_data++; 77 | 78 | if (y_label.data_ptr<int64_t>()[0] == y_ref.argmax().data_ptr<int64_t>()[0]) { 79 | num_corrects_ref++; 80 | } 81 | 82 | auto argmax = [](const std::vector<float>& vec) { 83 | return std::distance(vec.begin(), std::max_element(vec.begin(), vec.end())); 84 | }; 85 | 86 | if (y_label.data_ptr<int64_t>()[0] == argmax(y)) { 87 | num_corrects++; 88 | } 89 | } 90 | 91 | std::cout << "accuracy (ref): " << double(num_corrects_ref) / num_data << std::endl; 92 | std::cout << "accuracy: " << double(num_corrects) / num_data << std::endl; 93 | } 94 | 95 | int main(int
argc, char** argv) { 96 | ::testing::InitGoogleTest(&argc, argv); 97 | ::testing::FLAGS_gtest_death_test_style = "threadsafe"; 98 | return RUN_ALL_TESTS(); 99 | } 100 | -------------------------------------------------------------------------------- /tests/ref/linear.cc: -------------------------------------------------------------------------------- 1 | #include "dnn-kernel/linear.h" 2 | 3 | #include <cstddef> 4 | #include <cstdint> 5 | #include <vector> 6 | 7 | #include <gtest/gtest.h> 8 | 9 | #include <torch/torch.h> 10 | 11 | #include <tests/util.h> 12 | 13 | using namespace dnnk; 14 | namespace F = torch::nn::functional; 15 | 16 | TEST(CPUVerify, Linear) { 17 | torch::manual_seed(0); 18 | 19 | int in_channels = 32, out_channels = 16; 20 | 21 | auto x_ref = torch::randn({1, in_channels}); 22 | auto weight_ref = torch::randn({out_channels, in_channels}); 23 | auto bias_ref = torch::randn({out_channels}); 24 | 25 | std::vector<float> x(x_ref.numel()); 26 | std::vector<float> weight(weight_ref.numel()); 27 | std::vector<float> bias(out_channels); 28 | tensor2array(x_ref, x.data()); 29 | tensor2array(weight_ref, weight.data()); 30 | tensor2array(bias_ref, bias.data()); 31 | 32 | auto y_ref = F::linear(x_ref, weight_ref, bias_ref); 33 | std::vector<float> y(y_ref.numel()); 34 | linear(x.data(), weight.data(), bias.data(), in_channels, out_channels, y.data()); 35 | 36 | EXPECT_TRUE(verify(y.data(), y_ref)); 37 | } 38 | 39 | int main(int argc, char** argv) { 40 | ::testing::InitGoogleTest(&argc, argv); 41 | ::testing::FLAGS_gtest_death_test_style = "threadsafe"; 42 | return RUN_ALL_TESTS(); 43 | } 44 | -------------------------------------------------------------------------------- /tests/ref/maxpool2d.cc: -------------------------------------------------------------------------------- 1 | #include "dnn-kernel/maxpool2d.h" 2 | 3 | #include <cstddef> 4 | #include <cstdint> 5 | #include <vector> 6 | 7 | #include <gtest/gtest.h> 8 | 9 | #include <torch/torch.h> 10 | 11 | #include <tests/util.h> 12 | 13 | using namespace dnnk; 14 | namespace F = torch::nn::functional; 15 | 16 | TEST(CPUVerify, Maxpool2d) { 17 | torch::manual_seed(0); 18 | 19 | int h = 32, w = 32, channels = 4, stride = 2; 20 | 21 | auto x_ref = torch::randn({1, channels, h, w}); 22 | 23 | std::vector<float> x(x_ref.numel()); 24 | tensor2array(x_ref, x.data()); 25 | 26 | auto y_ref = F::detail::max_pool2d(x_ref, stride, stride, 0, 1, false); 27 | std::vector<float> y(y_ref.numel()); 28 | maxpool2d(x.data(), w, h, channels, stride, y.data()); 29 | 30 | EXPECT_TRUE(verify(y.data(), y_ref)); 31 | } 32 | 33 | int main(int argc, char** argv) { 34 | ::testing::InitGoogleTest(&argc, argv); 35 | ::testing::FLAGS_gtest_death_test_style = "threadsafe"; 36 | return RUN_ALL_TESTS(); 37 | } 38 | -------------------------------------------------------------------------------- /tests/ref/relu.cc: -------------------------------------------------------------------------------- 1 | #include "dnn-kernel/relu.h" 2 | 3 | #include <cstddef> 4 | #include <cstdint> 5 | #include <vector> 6 | 7 | #include <gtest/gtest.h> 8 | 9 | #include <torch/torch.h> 10 | 11 | #include <tests/util.h> 12 | 13 | using namespace dnnk; 14 | namespace F = torch::nn::functional; 15 | 16 | TEST(CPUVerify, ReLU) { 17 | auto x_ref = torch::randn({28, 28, 1}); 18 | const float* x = tensor2array(x_ref); 19 | float* y = new float[x_ref.numel()]; 20 | 21 | dnnk::relu(x, x_ref.numel(), y); 22 | auto y_ref = F::detail::relu(x_ref, false); 23 | 24 | EXPECT_TRUE(verify(y, y_ref)); 25 | } 26 | 27 | int main(int argc, char** argv) { 28 | ::testing::InitGoogleTest(&argc, argv); 29 | ::testing::FLAGS_gtest_death_test_style = "threadsafe"; 30 | return RUN_ALL_TESTS(); 31 | } 32 |
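Note: the five reference tests above share one skeleton: seed PyTorch, build random inputs, copy them into plain arrays with tensor2array, run the dnnk kernel and the corresponding torch op, and compare the outputs with verify. The snippet below is a hypothetical template for adding another test in this style; the kernel under test is faked with an inline doubling loop (and a matching x_ref * 2 torch reference) purely so the snippet is self-contained, and the <tests/util.h> include path is an assumption.

    // Hypothetical skeleton for a new CPU reference test (op and names are placeholders).
    #include <cstddef>
    #include <vector>

    #include <gtest/gtest.h>

    #include <torch/torch.h>

    #include <tests/util.h>

    TEST(CPUVerify, Skeleton) {
      torch::manual_seed(0);  // deterministic inputs

      auto x_ref = torch::randn({16});
      std::vector<float> x(x_ref.numel()), y(x_ref.numel());
      dnnk::tensor2array(x_ref, x.data());

      // Stand-in for a dnnk kernel: y[i] = 2 * x[i].
      for (std::size_t i = 0; i < x.size(); ++i) y[i] = 2.0f * x[i];

      // Matching stand-in reference computed with torch.
      auto y_ref = x_ref * 2;

      EXPECT_TRUE(dnnk::verify(y.data(), y_ref));
    }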
-------------------------------------------------------------------------------- /tests/util.h: -------------------------------------------------------------------------------- 1 | #ifndef DNNKERNEL_TEST_UTIL_H 2 | #define DNNKERNEL_TEST_UTIL_H 3 | 4 | #include "torch/torch.h" 5 | 6 | namespace dnnk { 7 | namespace { 8 | 9 | struct Color { 10 | static constexpr const char* red = "\u001b[31m"; 11 | static constexpr const char* green = "\u001b[32m"; 12 | static constexpr const char* reset = "\u001b[0m"; 13 | }; 14 | 15 | float* tensor2array(const torch::Tensor& tensor) { 16 | float* ret = new float[tensor.numel()]; 17 | std::memcpy(ret, tensor.data_ptr(), tensor.nbytes()); 18 | return ret; 19 | } 20 | 21 | void tensor2array(const torch::Tensor& tensor, float* array) { 22 | std::memcpy(array, tensor.data_ptr(), tensor.nbytes()); 23 | } 24 | 25 | bool verify(const float* actual, const torch::Tensor& expect) { 26 | const float tolerance = 10e-5f; 27 | auto expect_ptr = expect.data_ptr<float>(); 28 | 29 | for (auto i = decltype(expect.numel())(0); i < expect.numel(); ++i) { 30 | if (std::abs(actual[i] - expect_ptr[i]) >= tolerance) { 31 | std::cout << i << " : " << actual[i] << " vs " << expect_ptr[i] 32 | << std::endl; 33 | return false; 34 | } 35 | } 36 | 37 | return true; 38 | } 39 | 40 | }  // namespace 41 | }  // namespace dnnk 42 | 43 | #endif  // DNNKERNEL_TEST_UTIL_H 44 | -------------------------------------------------------------------------------- /thirdparty/download.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | wget https://download.pytorch.org/libtorch/cpu/libtorch-cxx11-abi-shared-with-deps-1.4.0%2Bcpu.zip 4 | wget https://github.com/google/googletest/archive/release-1.10.0.zip 5 | wget https://github.com/Kitware/CMake/releases/download/v3.16.8/cmake-3.16.8-Linux-x86_64.tar.gz 6 | 7 | tar -xf cmake-3.16.8-Linux-x86_64.tar.gz 8 | --------------------------------------------------------------------------------
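Note: download.sh fetches pinned versions of libtorch (1.4.0, CPU-only), googletest (1.10.0), and CMake (3.16.8) but extracts only the CMake tarball. How the two zip archives are consumed is not shown in this file, so the manual unpack below is an assumption; the glob also sidesteps the fact that wget may keep the percent-encoded "%2B" in the libtorch file name.

    #!/bin/sh
    # Hypothetical manual unpack, run from thirdparty/ after download.sh.
    unzip -q ./libtorch-cxx11-abi-shared-with-deps-1.4.0*cpu.zip  # -> ./libtorch
    unzip -q ./release-1.10.0.zip              # -> ./googletest-release-1.10.0
    ./cmake-3.16.8-Linux-x86_64/bin/cmake --version  # bundled CMake, already extracted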