├── ISCA2020-slides ├── IISCA2020_ONNC_Software_Architecture_Overview.pdf ├── ISCA2020_Graph_Level_Optimization.pdf ├── ISCA2020_Hardware_Optimization_Pass.pdf ├── ISCA2020_Introduction_of_ONNC_C_Backend.pdf ├── ISCA2020_Lab_ONNC_Working_Environment_Setup.pdf ├── ISCA2020_Nvdla_Overview.pdf ├── ISCA2020_ONNC_CIM.pdf ├── ISCA2020_ONNC_Software_Architecture_Overview.pdf ├── ISCA2020_ONNC_WASM_Project.pdf ├── ISCA2020_Porting_ONNC_To_NVDLA.pdf └── ISCA2020_Programming_Tips.pdf ├── README.md ├── figures ├── add-data_flow.png ├── bad_mapping_Add_Mul_Relu.png ├── compute_graph.png ├── cortexm_code_snapshot.png ├── cortexm_flow.png ├── ir_graph_get_input.png ├── ir_graph_get_output.png ├── loadable-file.png ├── loadable_and_driver.png ├── loadable_code_emit.png ├── loadable_sdp.png ├── loadable_task_info.png ├── loadable_tasks.png ├── mnist_demo.gif ├── mnist_demo_setup.png ├── nvdla-architecture.png ├── onnc-software-architecture-diagram.png ├── processing_open_file.png ├── processing_run.png ├── resnet50-partial.png ├── rubik_split_and_merge.png ├── runOnModule.png ├── runtime_env.png ├── sdp_x1_condition_1.png ├── sdp_x1_datapath.png ├── shuffle_visualization.png ├── shufflenet_partial.png ├── softmax_dataflow.png ├── test_Add.png ├── test_Conv_Relu.png ├── test_Conv_Relu_onnc_ir.png ├── test_Log.png ├── test_Mul_Add_Relu.png ├── test_Mul_Add_Relu_compound_IR.png ├── test_Mul_Add_Relu_original_IR.png ├── test_Mul_Add_Relu_reordered_IR.png ├── test_Relu.png ├── test_Relu_Log_Relu.png ├── test_Shuffle.png ├── test_Shuffle_adjusted_ONNC_IR.png └── test_Shuffle_original_ONNC_IR.png ├── lab_1_Environment_Setup └── lab_1.md ├── lab_2_Digit_Recognition_with_ARM_CortexM ├── lab_2.md ├── mnist_demo_gui │ └── mnist_demo_gui.pde └── onnc-cmsis-example │ ├── .gitignore │ ├── .mbed │ ├── .mbedignore │ ├── CMSIS_5.lib │ ├── README.md │ ├── add.cpp │ ├── add.h │ ├── main.cpp │ ├── matmul.cpp │ ├── matmul.h │ ├── mbed-os.lib │ ├── mbed_settings.py │ └── run.sh ├── lab_3_Starting_New_Backend └── lab_3.md ├── lab_4_Code_Emitting ├── lab_4.md └── src │ ├── FooNvdla.tar.gz │ └── visit_Add.cpp ├── lab_5_CPU_Fallback ├── lab_5.md └── src │ ├── emu_interface.h │ ├── visit_Log.cpp │ └── visit_Relu.cpp ├── lab_6_Manipulating_ONNC_IR ├── lab_6.md └── src │ ├── FooNvdlaBackend.cpp │ ├── GraphvizONNCIRPass.cpp │ ├── GraphvizONNCIRPass.h │ └── test_Conv_Relu.dot ├── lab_7_ONNC_IR_Extension ├── lab_7.md └── src │ ├── CodeEmitVisitor.cpp │ ├── CodeEmitVisitor.h │ ├── FooNvdlaBackend.cpp │ ├── NvDlaIdentifyShufflePass.cpp │ ├── NvDlaIdentifyShufflePass.h │ ├── NvDlaShuffle.cpp │ ├── NvDlaShuffle.h │ ├── PrintONNCIRPass.cpp │ └── PrintONNCIRPass.h ├── lab_8_Mul_Add_Reordering_and_Fusion ├── lab_8.md └── src │ ├── CMakeLists.txt │ ├── CodeEmitVisitor.cpp │ ├── CodeEmitVisitor.h │ ├── FooNvdlaBackend.cpp │ ├── FooNvdlaBackend.h │ ├── Makefile.am │ ├── NvDlaAddMulRelu.cpp │ ├── NvDlaAddMulRelu.h │ ├── NvDlaFuseAddMulReluPass.cpp │ ├── NvDlaFuseAddMulReluPass.h │ ├── NvDlaReorderMulAddPass.cpp │ ├── NvDlaReorderMulAddPass.h │ ├── PrintONNCIRPass.cpp │ └── PrintONNCIRPass.h └── models ├── lenet ├── input0.output.dimg ├── input0.pgm ├── input1.pgm ├── input2.pgm ├── input4.pgm ├── input5.pgm ├── input6.pgm ├── input7.output.dimg ├── input7.pgm ├── input8.pgm ├── input9.pgm ├── lenet.nvdla └── lenet.onnx ├── quantized_mnist ├── mnist_calibration.txt └── quantized_mnist.onnx ├── test_Add ├── input1x5x7.pgm ├── out.nvdla ├── test_Add.log ├── test_Add.nvdla ├── test_Add.onnx └── test_Add.output.dimg ├── test_Conv_Relu └── 
test_Conv_Relu.onnx ├── test_Log ├── input1x5x7.pgm ├── test_Log.log ├── test_Log.nvdla ├── test_Log.onnx └── test_Log.output.dimg ├── test_Mul_Add_Relu ├── input1x5x5.pgm └── test_Mul_Add_Relu.onnx ├── test_Relu └── test_Relu.onnx ├── test_Relu_Log_Relu ├── input1x5x7.pgm ├── test_Relu_Log_Relu.log ├── test_Relu_Log_Relu.nvdla ├── test_Relu_Log_Relu.onnx └── test_Relu_Log_Relu.output.dimg ├── test_Shuffle ├── input.pgm ├── test_Shuffle.onnx └── test_Shuffle.output.dimg └── test_group_Conv ├── test_group_Conv.onnx └── test_group_Conv.py /ISCA2020-slides/IISCA2020_ONNC_Software_Architecture_Overview.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/ISCA2020-slides/IISCA2020_ONNC_Software_Architecture_Overview.pdf -------------------------------------------------------------------------------- /ISCA2020-slides/ISCA2020_Graph_Level_Optimization.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/ISCA2020-slides/ISCA2020_Graph_Level_Optimization.pdf -------------------------------------------------------------------------------- /ISCA2020-slides/ISCA2020_Hardware_Optimization_Pass.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/ISCA2020-slides/ISCA2020_Hardware_Optimization_Pass.pdf -------------------------------------------------------------------------------- /ISCA2020-slides/ISCA2020_Introduction_of_ONNC_C_Backend.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/ISCA2020-slides/ISCA2020_Introduction_of_ONNC_C_Backend.pdf -------------------------------------------------------------------------------- /ISCA2020-slides/ISCA2020_Lab_ONNC_Working_Environment_Setup.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/ISCA2020-slides/ISCA2020_Lab_ONNC_Working_Environment_Setup.pdf -------------------------------------------------------------------------------- /ISCA2020-slides/ISCA2020_Nvdla_Overview.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/ISCA2020-slides/ISCA2020_Nvdla_Overview.pdf -------------------------------------------------------------------------------- /ISCA2020-slides/ISCA2020_ONNC_CIM.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/ISCA2020-slides/ISCA2020_ONNC_CIM.pdf -------------------------------------------------------------------------------- /ISCA2020-slides/ISCA2020_ONNC_Software_Architecture_Overview.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/ISCA2020-slides/ISCA2020_ONNC_Software_Architecture_Overview.pdf -------------------------------------------------------------------------------- /ISCA2020-slides/ISCA2020_ONNC_WASM_Project.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/ISCA2020-slides/ISCA2020_ONNC_WASM_Project.pdf -------------------------------------------------------------------------------- /ISCA2020-slides/ISCA2020_Porting_ONNC_To_NVDLA.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/ISCA2020-slides/ISCA2020_Porting_ONNC_To_NVDLA.pdf -------------------------------------------------------------------------------- /ISCA2020-slides/ISCA2020_Programming_Tips.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/ISCA2020-slides/ISCA2020_Programming_Tips.pdf -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Introduction 2 | 3 | The NVIDIA Deep Learning Accelerator provides free intellectual property licensing to anyone wanting to build a chip that uses deep neural networks for inference applications. With extensive documentation and tools, many business proposals and research projects choose NVDLA as their inference engine design. However, the lack of extensible compiler support has become the major bottleneck for supporting more AI models and optimizations. This tutorial presents the first open source compiler that supports NVDLA-based designs. The ONNC compiler has broader model support than the official NVDLA compiler and relieves programmers of manually specifying the low-level details of models that are not supported by the official NVDLA compiler. It also enables opportunities for hardware customization and proprietary optimization. We will cover the overview, porting, and optimizations in three subsections. In each subsection, we will have hands-on labs to demonstrate how to run and customize the NVDLA backend in ONNC for product development and research projects. 4 | 5 | ONNC (Open Neural Network Compiler) is a retargetable compilation framework designed specifically for proprietary deep learning accelerators. Its software architecture expedites porting ONNC to any Deep Learning Accelerator (DLA) design that supports [ONNX (Open Neural Network Exchange)](https://onnx.ai/) operators. ONNC guarantees executability across every DLA by transforming ONNX models into DLA-specific binary forms and leveraging the intermediate representation (IR) design of ONNX along with effective algorithms to eliminate the overhead of data movement. **ONNC is the first open source compiler available for NVDLA-based hardware designs**. Its NVDLA backend can compile a model into an executable NVDLA Loadable file. Integrating ONNC with the NVDLA software stack opens up opportunities for developers and researchers to explore the NVDLA-based inference design at system level. 6 | 7 | This tutorial was presented at [MICRO 2019: The 52nd IEEE/ACM International Symposium on Microarchitecture (October 12th)](https://www.microarch.org/micro52/program/workshops.html#onnc), Columbus, Ohio. 8 | 9 | ## Intended Audience 10 | 11 | Researchers and practitioners in academia or industry looking for an open-source AI compiler for NVDLA-based neural network inference engines.
12 | 13 | ## Contributors 14 | 15 | * Wei-Fen Lin (weifen@skymizer.com) 16 | * Cheng-Tao Hsieh (cthsieh@skymizer.com) 17 | 18 | ## Hands-on Labs 19 | 20 | * Lab 1. [ONNC Working Environment Setup](https://github.com/ONNC/onnc-tutorial/blob/master/lab_1_Environment_Setup/lab_1.md) 21 | * Lab 2. [Digit Recognition with ARM Cortex-M](https://github.com/ONNC/onnc-tutorial/blob/master/lab_2_Digit_Recognition_with_ARM_CortexM/lab_2.md) 22 | * Lab 3. [Starting a New Backend](https://github.com/ONNC/onnc-tutorial/blob/master/lab_3_Starting_New_Backend/lab_3.md) 23 | * Lab 4. [Code Emitting](https://github.com/ONNC/onnc-tutorial/blob/master/lab_4_Code_Emitting/lab_4.md) 24 | * Lab 5. [CPU Fallback Support](https://github.com/ONNC/onnc-tutorial/blob/master/lab_5_CPU_Fallback/lab_5.md) 25 | * Lab 6. [Manipulating ONNC IR and Optimization](https://github.com/ONNC/onnc-tutorial/blob/master/lab_6_Manipulating_ONNC_IR/lab_6.md) 26 | * Lab 7. [ONNC IR Extension](https://github.com/ONNC/onnc-tutorial/blob/master/lab_7_ONNC_IR_Extension/lab_7.md) 27 | * Lab 8. [Hardware-specific Optimization](https://github.com/ONNC/onnc-tutorial/blob/master/lab_8_Mul_Add_Reordering_and_Fusion/lab_8.md) 28 | 29 | ## References 30 | 31 | ### Papers 32 | 33 | * W. F. Lin, D. Y. Tsai, L. Tang, C. T. Hsieh, C. Y. Chou, P. H. Chang, and L. Hsu, “ONNC: A compilation framework connecting ONNX to proprietary deep learning accelerators,” in IEEE International Conference on Artificial Intelligence Circuits and Systems (AICAS 2019). IEEE, 2019. 34 | Download PDF: [Link](https://skymizer.com/publications/Skymizer-AICAS2019.pdf) 35 | 36 | 37 | * W.F. Lin, C. T. Hsieh, C. Y. Chou, "ONNC-based Software Development Platform for Configurable NVDLA Designs", to appear in IEEE International Symposium on VLSI Design, Automation and Test (VLSI-DAT 2019). 
IEEE, 2019 38 | Download PDF: [Link](https://skymizer.com/publications/Skymizer-VLSIDAT2019.pdf) 39 | 40 | ### Documentation 41 | 42 | - [ONNC Utilities](https://github.com/ONNC/onnc/blob/master/docs/ONNC-Utilities.md) 43 | - [ONNC Pass Manager Getting Started Guide](https://github.com/ONNC/onnc/blob/master/docs/ONNC-Pass-Manager-Getting-Started-Guide.md) 44 | - [ONNC Backend Developer Guide](https://github.com/ONNC/onnc/blob/master/docs/ONNC-Backend-Porting-Guide.md) 45 | - [The Code Emitting Pass User Guide](https://github.com/ONNC/onnc/blob/master/docs/The-Code-Emitting-Pass-User-Guide.md) 46 | - [ONNC IR Extension Guide](https://github.com/ONNC/onnc/blob/master/docs/ONNC-IR-Extension-Guide.md) 47 | 48 | 49 | -------------------------------------------------------------------------------- /figures/add-data_flow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/figures/add-data_flow.png -------------------------------------------------------------------------------- /figures/bad_mapping_Add_Mul_Relu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/figures/bad_mapping_Add_Mul_Relu.png -------------------------------------------------------------------------------- /figures/compute_graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/figures/compute_graph.png -------------------------------------------------------------------------------- /figures/cortexm_code_snapshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/figures/cortexm_code_snapshot.png -------------------------------------------------------------------------------- /figures/cortexm_flow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/figures/cortexm_flow.png -------------------------------------------------------------------------------- /figures/ir_graph_get_input.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/figures/ir_graph_get_input.png -------------------------------------------------------------------------------- /figures/ir_graph_get_output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/figures/ir_graph_get_output.png -------------------------------------------------------------------------------- /figures/loadable-file.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/figures/loadable-file.png -------------------------------------------------------------------------------- /figures/loadable_and_driver.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/figures/loadable_and_driver.png -------------------------------------------------------------------------------- /figures/loadable_code_emit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/figures/loadable_code_emit.png -------------------------------------------------------------------------------- /figures/loadable_sdp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/figures/loadable_sdp.png -------------------------------------------------------------------------------- /figures/loadable_task_info.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/figures/loadable_task_info.png -------------------------------------------------------------------------------- /figures/loadable_tasks.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/figures/loadable_tasks.png -------------------------------------------------------------------------------- /figures/mnist_demo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/figures/mnist_demo.gif -------------------------------------------------------------------------------- /figures/mnist_demo_setup.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/figures/mnist_demo_setup.png -------------------------------------------------------------------------------- /figures/nvdla-architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/figures/nvdla-architecture.png -------------------------------------------------------------------------------- /figures/onnc-software-architecture-diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/figures/onnc-software-architecture-diagram.png -------------------------------------------------------------------------------- /figures/processing_open_file.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/figures/processing_open_file.png -------------------------------------------------------------------------------- /figures/processing_run.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/figures/processing_run.png -------------------------------------------------------------------------------- /figures/resnet50-partial.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/figures/resnet50-partial.png -------------------------------------------------------------------------------- /figures/rubik_split_and_merge.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/figures/rubik_split_and_merge.png -------------------------------------------------------------------------------- /figures/runOnModule.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/figures/runOnModule.png -------------------------------------------------------------------------------- /figures/runtime_env.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/figures/runtime_env.png -------------------------------------------------------------------------------- /figures/sdp_x1_condition_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/figures/sdp_x1_condition_1.png -------------------------------------------------------------------------------- /figures/sdp_x1_datapath.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/figures/sdp_x1_datapath.png -------------------------------------------------------------------------------- /figures/shuffle_visualization.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/figures/shuffle_visualization.png -------------------------------------------------------------------------------- /figures/shufflenet_partial.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/figures/shufflenet_partial.png -------------------------------------------------------------------------------- /figures/softmax_dataflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/figures/softmax_dataflow.png -------------------------------------------------------------------------------- /figures/test_Add.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/figures/test_Add.png -------------------------------------------------------------------------------- /figures/test_Conv_Relu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/figures/test_Conv_Relu.png -------------------------------------------------------------------------------- /figures/test_Conv_Relu_onnc_ir.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/figures/test_Conv_Relu_onnc_ir.png -------------------------------------------------------------------------------- /figures/test_Log.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/figures/test_Log.png -------------------------------------------------------------------------------- /figures/test_Mul_Add_Relu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/figures/test_Mul_Add_Relu.png -------------------------------------------------------------------------------- /figures/test_Mul_Add_Relu_compound_IR.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/figures/test_Mul_Add_Relu_compound_IR.png -------------------------------------------------------------------------------- /figures/test_Mul_Add_Relu_original_IR.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/figures/test_Mul_Add_Relu_original_IR.png -------------------------------------------------------------------------------- /figures/test_Mul_Add_Relu_reordered_IR.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/figures/test_Mul_Add_Relu_reordered_IR.png -------------------------------------------------------------------------------- /figures/test_Relu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/figures/test_Relu.png -------------------------------------------------------------------------------- /figures/test_Relu_Log_Relu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/figures/test_Relu_Log_Relu.png -------------------------------------------------------------------------------- /figures/test_Shuffle.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/figures/test_Shuffle.png -------------------------------------------------------------------------------- /figures/test_Shuffle_adjusted_ONNC_IR.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/figures/test_Shuffle_adjusted_ONNC_IR.png -------------------------------------------------------------------------------- /figures/test_Shuffle_original_ONNC_IR.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/figures/test_Shuffle_original_ONNC_IR.png -------------------------------------------------------------------------------- /lab_1_Environment_Setup/lab_1.md: 
-------------------------------------------------------------------------------- 1 | # ONNC Working Environment Setup 2 | 3 | ## Preface 4 | 5 | This tutorial focuses on using ONNC to generate Loadables that contain DNN model graph information for running inference on NVDLA-based SoCs. Most information in this tutorial is specifically tailored to NVDLA backend porting. 6 | To facilitate the software development process, an ONNC Docker image is available on the Docker Hub for fast deployment. It has pre-installed dependent libraries and is a ready-to-run working environment. Users can mount the ONNC source code into the Docker container and build the source code inside the container. In addition, the built ONNC binary can be executed to compile deep neural network (DNN) models inside the container. ONNC currently provides two backend implementations in the GitHub release v1.2. For the x86 backend, users may run model inference using the embedded interpreter, ONNI. For the NVDLA backend, a Loadable file that contains model graph information is generated after compilation. Users may simulate the model inference by running the Loadable files on an NVDLA virtual platform. The [NVIDIA Deep Learning Accelerator (NVDLA)](http://nvdla.org/index.html) release provides a full-featured virtual platform for full-system software simulation. We leverage the officially released virtual platform and make small changes for this tutorial. 7 | 8 | In the first lab, we will describe and demonstrate how to build ONNC, compile models using ONNC, and simulate the model inference on our pre-packed virtual platform. 9 | 10 | ## Prerequisite 11 | 12 | If Docker is not installed in your system, please download Docker (http://www.docker.com) and install it first. 13 | You also need to install Git (https://git-scm.com/) to retrieve the source code from Git servers. 14 | 15 | ## Preparing Source Code and Docker Images 16 | 17 | The latest ONNC source code is available on GitHub. Use the following commands to download the source code. 18 | 19 | ```sh 20 | $ git clone https://github.com/ONNC/onnc.git 21 | $ cd onnc 22 | $ git checkout tags/1.3.0 23 | $ cd .. 24 | ``` 25 | 26 | Use the following command to download the tutorial material. There are some example DNN models and code snippets you will use in the subsequent labs. 27 | 28 | ```sh 29 | $ git clone https://github.com/ONNC/onnc-tutorial.git 30 | ``` 31 | 32 | Pull the Docker images from the Docker Hub using the following commands. 33 | 34 | ```sh 35 | # We need two Docker images. 36 | 37 | $ docker pull onnc/onnc-community 38 | $ docker pull onnc/vp 39 | ``` 40 | 41 | To verify that the Docker images were downloaded successfully, use the following command to show all available Docker images. You should see both `onnc/onnc-community` and `onnc/vp` images. 42 | 43 | ```sh 44 | $ docker images 45 | REPOSITORY TAG IMAGE ID CREATED SIZE 46 | onnc/onnc-community latest fdd06c76c519 2 days ago 5.58GB 47 | onnc/vp latest 889c00396ea1 2 days ago 2.16GB 48 | ``` 49 | 50 | 51 | ## Building ONNC and Compiling DNN Models 52 | 53 | Use the following command to bring up the ONNC-community Docker. 54 | 55 | ```sh 56 | $ docker run -ti --rm -v <absolute path to onnc>:/onnc/onnc -v <absolute path to onnc-tutorial>:/tutorial onnc/onnc-community 57 | ``` 58 | 59 | * `<absolute path to onnc>` is the directory where you cloned the ONNC source code. Note that it must be an absolute path rather than a relative path. 60 | * `<absolute path to onnc-tutorial>` is the directory where you cloned the ONNC tutorial material.
61 | * The `-ti` option provides an interactive interface for the container. 62 | * The `--rm` option will automatically clean up the container when the container exits. 63 | * The `-v` option mounts the directory to the Docker container. With this option, you can make changes to the source code (`<absolute path to onnc>`) outside the Docker container with your favorite editor, and the changes can be seen inside the Docker container and get compiled. 64 | 65 | Within the Docker container, use the following commands to build ONNC. 66 | 67 | ```sh 68 | # Within onnc/onnc-community Docker container 69 | 70 | $ cd /onnc/onnc-umbrella/build-normal 71 | 72 | # Build ONNC. 73 | $ smake -j8 install 74 | ``` 75 | 76 | * The `smake` command synchronizes the build directory with `/onnc` and invokes the make command to build ONNC. 77 | * The `-j8` option is to parallelize compilation with 8 CPU cores. 78 | * This command will automatically install the compiled binary in this container environment. 79 | 80 | ```sh 81 | # Run ONNC to compile a DNN model. 82 | $ onnc -mquadruple nvdla /tutorial/models/lenet/lenet.onnx 83 | 84 | # Prepare the compiled output file for the virtual platform to run. 85 | $ sudo mv out.nvdla /tutorial/models/lenet/ 86 | ``` 87 | 88 | You may use the following command to exit the Docker prompt. 89 | 90 | ```sh 91 | # Within the onnc/onnc-community Docker container 92 | $ exit 93 | ``` 94 | 95 | ## Performing Model Inference on Virtual Platform 96 | 97 | When you finish building ONNC and compiling a DNN model, you do not need the `onnc/onnc-community` Docker anymore. Start another console/terminal on your computer to enter the other Docker image called `onnc/vp` for model inference. 98 | 99 | ```sh 100 | # Within your computer console 101 | 102 | $ docker run -ti --rm -v <absolute path to onnc-tutorial>:/tutorial onnc/vp 103 | ``` 104 | 105 | The virtual platform in this Docker is used to simulate the NVDLA runtime environment. As the following figure shows, the virtual platform contains a SystemC model for the NVDLA hardware as well as a CPU emulator, where a Linux OS and NVDLA drivers are running to drive the NVDLA hardware. 106 | 107 | <img src="../figures/runtime_env.png"> 108 | 109 | Within the VP Docker container, use the following commands to activate the virtual platform. 110 | 111 | ```sh 112 | # Within onnc/vp Docker container 113 | 114 | $ cd /usr/local/nvdla 115 | 116 | # Prepare loadable, input, and golden output for future use. 117 | $ cp /tutorial/models/lenet/* . 118 | 119 | # Run the virtual platform. 120 | $ aarch64_toplevel -c aarch64_nvdla.lua 121 | 122 | SystemC 2.3.0-ASI --- Oct 9 2017 04:21:14 123 | Copyright (c) 1996-2012 by all Contributors, 124 | ALL RIGHTS RESERVED 125 | 126 | No sc_log specified, will use the default setting 127 | verbosity_level = SC_MEDIUM 128 | bridge: tlm2c_elaborate.. 129 | [ 0.000000] Booting Linux on physical CPU 0x0 130 | # ... 131 | Initializing random number generator... done. 132 | Starting network: udhcpc: started, v1.27.2 133 | udhcpc: sending discover 134 | udhcpc: sending select for 10.0.2.15 135 | udhcpc: lease of 10.0.2.15 obtained, lease time 86400 136 | deleting routers 137 | adding dns 10.0.2.3 138 | OK 139 | Starting sshd: [ 4.590433] NET: Registered protocol family 10 140 | [ 4.606182] Segment Routing with IPv6 141 | OK 142 | 143 | Welcome to Buildroot 144 | nvdla login: 145 | ``` 146 | 147 | By starting the virtual platform, a Linux kernel is brought up and stops at the login prompt.
148 | 149 | * nvdla login: root 150 | * Password: nvdla 151 | 152 | After logging in to the Linux prompt, use the following commands to install the drivers. 153 | 154 | ```sh 155 | # Within the virtual platform 156 | 157 | $ mount -t 9p -o trans=virtio r /mnt && cd /mnt 158 | 159 | # Install KMD. 160 | $ insmod drm.ko && insmod opendla.ko 161 | [ 469.730339] opendla: loading out-of-tree module taints kernel. 162 | [ 469.734509] reset engine done 163 | [ 469.737998] [drm] Initialized nvdla 0.0.0 20171017 for 10200000.nvdla on minor 0 164 | ``` 165 | 166 | Up to this point, everything is ready for running model inference. In this lab, we demonstrate with a real-world model, LeNet, which is used for hand-written digit recognition. We have prepared some 28x28 images (`.pgm` files) to represent digit numbers 0 to 9. We begin with running model inference to recognize digit number 0 with input file `input0.pgm`. The inference simulation will take a few minutes. 167 | 168 | ```sh 169 | # Within the virtual platform 170 | 171 | # Run the NVDLA runtime (containing UMD) to do model inference. 172 | $ ./nvdla_runtime --loadable out.nvdla --image input0.pgm --rawdump 173 | creating new runtime context... 174 | Emulator starting 175 | # ... 176 | [ 126.029817] Enter:dla_handle_events, processor:CDP 177 | [ 126.029995] Exit:dla_handle_events, ret:0 178 | [ 126.030146] Enter:dla_handle_events, processor:RUBIK 179 | [ 126.030323] Exit:dla_handle_events, ret:0 180 | [ 126.032432] reset engine done 181 | Shutdown signal received, exiting 182 | Test pass 183 | ``` 184 | 185 | After the simulation is done, we obtain an output file `output.dimg` containing the model output values. 186 | In this example, the output file should look as follows: 187 | 188 | ```sh 189 | $ more output.dimg 190 | 149.25 -49.625 13.875 11.2344 -59.8125 -2.61523 7.80078 -44.7188 30.8594 17.3594 191 | ``` 192 | 193 | In the file, there are ten numbers indicating the confidence levels of the 10 digits from 0 to 9, respectively. 194 | For example, the first number 149.25 indicates the confidence level of digit 0, the next -49.625 of digit 1, and so on. Among those numbers, the largest one implies the recognition result. In this case, the first number 149.25 is the largest one, so the corresponding digit 0 is the recognition result. 195 | 196 | After the experiment, you can use the following command to exit the virtual platform. 197 | 198 | ```sh 199 | # Within the virtual platform 200 | $ poweroff 201 | ``` 202 | 203 | Use the following command to exit the `onnc/vp` Docker prompt. 204 | 205 | ```sh 206 | # Within the onnc/vp Docker container 207 | $ exit 208 | ``` 209 | -------------------------------------------------------------------------------- /lab_2_Digit_Recognition_with_ARM_CortexM/lab_2.md: -------------------------------------------------------------------------------- 1 | # Digit Recognition with ARM Cortex-M 2 | 3 | 4 | ## Preface 5 | 6 | Machine learning is moving to the edge. People want to have edge computing capability on embedded devices to provide more advanced services, like voice recognition for smart speakers and face detection for surveillance cameras. The Arm Cortex-M processor family is a range of scalable, energy-efficient and easy-to-use processors that meet the needs of smart and connected embedded applications. Cortex Microcontroller Software Interface Standard (CMSIS) is a vendor-independent hardware abstraction layer for the Cortex-M processor series.
Research (https://arxiv.org/abs/1801.06601) has shown that machine learning workloads gain a proven 4.6X performance boost on the Cortex-M platform with the new CMSIS-NN software framework. In this lab, we will introduce an ONNC backend, the `CortexM` backend, for the ARM Cortex-M microprocessor and demonstrate an end-to-end application, handwriting recognition. The CortexM backend integrates the [CMSIS-NN library](https://github.com/ARM-software/CMSIS_5) that provides a set of computing functions for several popular operators in deep neural network (DNN) models, such as convolution, maximum pooling, etc. The library is optimized for speeding up the model inference on ARM Cortex-M CPUs. The following figure shows an example of the mapping between the model operators and the CMSIS-NN function calls. 7 | 8 | <img src="../figures/cortexm_code_snapshot.png"> 9 | 10 | In this lab, we will use an end-to-end application to demonstrate how easily the ONNC framework supports AI inference for a target hardware. 11 | 12 | ## Deploying MNIST Model Inference on an Embedded System 13 | 14 | The following diagram depicts how we deploy MNIST model inference on a Cortex-M platform. 15 | 16 | <img src="../figures/cortexm_flow.png"> 17 | 18 | Typically, machine learning models are trained with floating-point data on GPU graphic cards or servers, but running inference in lower precision is preferred on embedded devices due to limited computation power. Fortunately, several research papers have shown that quantizing the data into integers can usually be done without any loss of performance (i.e., accuracy). In this lab, we have prepared a quantized MNIST model in ONNX format. The input data and the weights are all 8-bit integers. When running inference, the internal computation datapath might have a higher precision than 8 bits to avoid accuracy loss, but the activation data precision is converted back to 8 bits in the implementation. Many CMSIS-NN functions simply use "shift-right" logic to perform the bit-width conversion. The amount of shift-right is typically determined together with the weight quantization, so we leave it as one user input in the Cortex-M backend. The ONNX model format does not contain calibration information on the activation data. We have prepared a separate file, called the calibration file, to store the shift-right information. 19 | 20 | After compiling the MNIST model inference application (as a `.cpp` file) using ONNC, we use the ARM cross-compiler to compile and link the application and the CMSIS-NN library together. The application software depends on the underlying embedded system and the target application. Users may find hardware-dependent information from vendors. Once the firmware binary is ready, we upload the binary file into the target board with an ISP tool that should be provided by the board vendor. 21 | 22 | ## Prerequisite 23 | 24 | If Docker is not installed in your system, please download Docker (http://www.docker.com) and install it first. In addition, you need to install Git (https://git-scm.com/) to fetch the source code from the GitHub server. Furthermore, the demonstration uses a popular GUI programming framework, Processing; please install it (https://processing.org/) as well. Lastly, you need to prepare a development board equipped with an ARM Cortex-M CPU. We suggest using [Mbed compatible boards](https://os.mbed.com/platforms/) because we use the [Mbed framework](https://www.mbed.com/en/) for the firmware compilation. If your board is not compatible with Mbed, you might need to rewrite some demonstration code following the guidelines from the board vendor.
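Before moving on, the "shift-right" conversion described in the deployment section above can be made concrete with a small sketch. The code below is illustrative only: `requantize_add` and its parameters are hypothetical names rather than the CMSIS-NN API, and we assume a real kernel would saturate the result to the q7 range as shown.

```cpp
#include <algorithm>
#include <cstdint>

// Hypothetical sketch: add two 8-bit activations in a wider datapath,
// then shift right to convert the result back to 8 bits.
int8_t requantize_add(int8_t a, int8_t b, int right_shift)
{
  int32_t acc = static_cast<int32_t>(a) + static_cast<int32_t>(b); // higher-precision accumulate
  acc >>= right_shift;                                             // bit-width conversion by shift-right
  acc = std::min<int32_t>(127, std::max<int32_t>(-128, acc));      // saturate to the q7 range
  return static_cast<int8_t>(acc);
}
```

The `right_shift` amount here plays the same role as the per-layer shift-right values stored in the calibration file mentioned above.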
25 | 26 | ## Preparing Source Code and Docker Images 27 | 28 | The ONNC source code for Cortex-M is available online. Use the following command to download the ONNC source code. 29 | 30 | ```sh 31 | $ git clone -b CortexM https://github.com/ONNC/onnc.git 32 | ``` 33 | 34 | Next, use the following command to download the tutorial source code. There are some example DNN models you will use in this lab. 35 | 36 | ```sh 37 | $ git clone https://github.com/ONNC/onnc-tutorial.git 38 | ``` 39 | 40 | Pull the Docker images from the Docker Hub using the following commands. 41 | 42 | ```sh 43 | # Obtain the ONNC compilation environment. 44 | $ docker pull onnc/onnc-community 45 | 46 | # Obtain the ARM cross-compilation environment. 47 | $ docker pull misegr/mbed-cli 48 | ``` 49 | 50 | To verify that the Docker images were downloaded successfully, use the following command to show all available Docker images. You should see both `onnc/onnc-community` and `misegr/mbed-cli` images. 51 | 52 | 53 | ```sh 54 | $ docker images 55 | REPOSITORY TAG IMAGE ID CREATED SIZE 56 | onnc/onnc-community latest fdd06c76c519 2 days ago 5.58GB 57 | misegr/mbed-cli latest a708c25bd4d9 2 weeks ago 2.85GB 58 | ``` 59 | 60 | ## Building ONNC and Compiling Digit-Recognition Models 61 | 62 | Use this command to bring up the ONNC-community Docker. 63 | 64 | ```sh 65 | $ docker run -ti --rm -v <absolute path to onnc>:/onnc/onnc -v <absolute path to onnc-tutorial>:/tutorial onnc/onnc-community 66 | ``` 67 | Please refer to [lab 1: Environment Setup](../lab_1_Environment_Setup/lab_1.md) for the Docker command usage. Within the Docker container, use the following commands to build ONNC. 68 | 69 | ```sh 70 | ############################################## 71 | # Within onnc/onnc-community Docker container 72 | ############################################## 73 | 74 | $ cd /onnc/onnc-umbrella/build-normal 75 | 76 | # Build ONNC. 77 | $ smake -j8 install 78 | ``` 79 | 80 | Up to this point, you should have the ONNC binary ready to compile DNN models. As we have mentioned earlier, the DNN model used for this lab must have been quantized, and all of its weights are 8-bit integers. In addition, a calibration file with shift-right values on all activation data must be prepared as well. In this lab, we obtained the [mnist model](https://github.com/onnx/models/tree/master/vision/classification/mnist) from the ONNX model zoo, and performed post-training quantization to derive its quantized version. Once all files are ready (you may find a copy in the `<absolute path to onnc-tutorial>/models/quantized_mnist/` folder), use the following commands to compile the model and generate C code. 81 | 82 | ```sh 83 | ############################################## 84 | # Within onnc/onnc-community Docker container 85 | ############################################## 86 | 87 | # Run ONNC to compile a quantized model with calibration information. 88 | $ onnc -mquadruple cortexm /tutorial/models/quantized_mnist/quantized_mnist.onnx \ 89 | --load-calibration-file=/tutorial/models/quantized_mnist/mnist_calibration.txt 90 | 91 | # Check the output files of the Cortex-M backend. 92 | $ ls cortexm* 93 | cortexm_main.cpp cortexm_main.h cortexm_weight.h 94 | 95 | # Prepare the resulting files for the later cross-compilation. 96 | $ sudo mv cortexm* /tutorial/models/quantized_mnist 97 | ``` 98 | By now, you may find the generated files in the `<absolute path to onnc-tutorial>/models/quantized_mnist/` folder. If you want to exit the Docker prompt, use the following command.
99 | 100 | ```sh 101 | ############################################## 102 | # Within onnc/onnc-community Docker container 103 | ############################################## 104 | 105 | $ exit 106 | ``` 107 | 108 | ## Cross-Compilation of Cortex-M Machine Code 109 | 110 | When you finish the previous steps of building ONNC and compiling a DNN model, you do not need the `onnc/onnc-community` Docker anymore. You need to enter the other Docker image `misegr/mbed-cli` to compile the generated C code for the Cortex-M platform. 111 | 112 | ```sh 113 | ############################### 114 | # Within your computer console 115 | ############################### 116 | 117 | # Move the CortexM files to the onnc-cmsis-example folder. 118 | $ cd <path to onnc-cmsis-example> 119 | $ cp <absolute path to onnc-tutorial>/models/quantized_mnist/cortexm* . 120 | 121 | # Enter the cross-compilation Docker. 122 | $ docker run -ti --rm -v <path to onnc-cmsis-example>:/src misegr/mbed-cli bash 123 | ``` 124 | 125 | ```sh 126 | ########################################## 127 | # Within misegr/mbed-cli Docker container 128 | ########################################## 129 | 130 | $ cd /src 131 | $ mbed deploy 132 | [mbed] Working path "/src" (program) 133 | [mbed] Adding library "mbed-os" from "https://github.com/ARMmbed/mbed-os" at rev #367dbdf5145f 134 | [mbed] Adding library "CMSIS_5" from "https://github.com/ARM-software/CMSIS_5" at rev #c4c089d6333d 135 | [mbed] WARNING: File "RTX_V8MMF.lib" in "/src/CMSIS_5/CMSIS/RTOS2/RTX/Library/ARM" uses a non-standard .lib file extension, which is not compatible with the mbed build tools. 136 | ... 137 | [mbed] Auto-installing missing Python modules (fuzzywuzzy)... 138 | 139 | # Compile the firmware for a specific target by specifying the --target option. 140 | # Here we use NuMaker_PFM_NUC472 as an example. 141 | # Another example is DISCO_L475VG_IOT01A by STM. 142 | $ mbed compile --target NuMaker_PFM_NUC472 143 | [mbed] Working path "/src" (program) 144 | Building project src (NUMAKER_PFM_NUC472, GCC_ARM) 145 | Scan: . 146 | Scan: env 147 | ... 148 | Compile [ 99.7%]: serial_api.c 149 | Compile [ 99.9%]: spi_api.c 150 | Compile [100.0%]: test_env.cpp 151 | Link: src 152 | Elf2Bin: src 153 | +------------------+--------+-------+-------+ 154 | | Module | .text | .data | .bss | 155 | +------------------+--------+-------+-------+ 156 | | CMSIS_5/CMSIS | 1748 | 0 | 0 | 157 | | [fill] | 471 | 25 | 23 | 158 | | [lib]/c.a | 63801 | 2548 | 127 | 159 | | [lib]/gcc.a | 7200 | 0 | 0 | 160 | | [lib]/misc | 252 | 12 | 28 | 161 | | [lib]/nosys.a | 32 | 0 | 0 | 162 | | [lib]/stdc++.a | 171534 | 165 | 5676 | 163 | | add.o | 192 | 4 | 1 | 164 | | cortexm_main.o | 384 | 6082 | 15768 | 165 | | main.o | 344 | 4 | 4200 | 166 | | matmul.o | 118 | 0 | 0 | 167 | | mbed-os/drivers | 1219 | 0 | 0 | 168 | | mbed-os/features | 112 | 0 | 12345 | 169 | | mbed-os/hal | 1720 | 4 | 68 | 170 | | mbed-os/platform | 3934 | 256 | 105 | 171 | | mbed-os/rtos | 10917 | 168 | 6073 | 172 | | mbed-os/targets | 5656 | 212 | 142 | 173 | | Subtotals | 269634 | 9480 | 44556 | 174 | +------------------+--------+-------+-------+ 175 | Total Static RAM memory (data + bss): 54036 bytes 176 | Total Flash memory (text + data): 279114 bytes 177 | 178 | Image: ./BUILD/NUMAKER_PFM_NUC472/GCC_ARM/src.bin 179 | ``` 180 | 181 | The generated firmware binary file is located at `<path to onnc-cmsis-example>/BUILD/NUMAKER_PFM_NUC472/GCC_ARM/src.bin`. You can upload it to the board following the suggestions from the board vendor. The procedure is simple for an Mbed-compatible board.
Connect the target board to a (Mac, Linux, or Windows) computer via a USB cable. Then you should see an Mbed drive appear in the file browser window. Copy the `src.bin` file into that drive. 182 | 183 | ## Digit Recognition Demo 184 | 185 | The demo setup is shown below. 186 | 187 | <img src="../figures/mnist_demo_setup.png"> 188 | 189 | The board is connected to a PC via the UART connection. On the PC, we have prepared a GUI program on which you can draw digits. Please open the [GUI program](mnist_demo_gui/mnist_demo_gui.pde) with Processing as shown in the following diagram. The file path is `<absolute path to onnc-tutorial>/lab_2_Digit_Recognition_with_ARM_CortexM/mnist_demo_gui/mnist_demo_gui.pde`. 190 | 191 | <img src="../figures/processing_open_file.png"> 192 | 193 | Then run the program by clicking the "run" button as shown below. 194 | 195 | <img src="../figures/processing_run.png"> 196 | 197 | This demo accepts only one single-digit number at a time. Once you are done and click the "Submit" button on the GUI, the software will take a screenshot, transform it into a 28x28 image, and send the image to the board via the UART connection. The board will perform the model inference, and then send the classification answer back to the PC. 198 | A screenshot of the demo is shown below. 199 | 200 | <img src="../figures/mnist_demo.gif"> 201 | 202 | -------------------------------------------------------------------------------- /lab_2_Digit_Recognition_with_ARM_CortexM/mnist_demo_gui/mnist_demo_gui.pde: -------------------------------------------------------------------------------- 1 | import processing.serial.Serial; 2 | 3 | Serial _port; 4 | 5 | @Override 6 | void setup() 7 | { 8 | size(640, 280, P2D); 9 | background(0xFF); 10 | 11 | textSize(144); // Workaround P2D 12 | textAlign(CENTER); 13 | 14 | strokeWeight(20); 15 | noStroke(); 16 | 17 | // Print all available Serial ports. 18 | printArray(Serial.list()); 19 | // According to the Serial port list, choose the one with a connection to the board.
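// If the board's port is not the first entry printed by printArray() above, change portNo on the next line to the matching index.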
20 | int portNo = 0; 21 | _port = new Serial(this, Serial.list()[portNo], 115200); 22 | 23 | _clear(); 24 | _redraw(Button.NONE); 25 | } 26 | 27 | @Override 28 | void draw() {} 29 | 30 | @Override 31 | void mouseDragged() 32 | { 33 | if (mouseX < 280) { 34 | stroke(0xFF); 35 | line(pmouseX, pmouseY, mouseX, mouseY); 36 | noStroke(); 37 | } 38 | } 39 | 40 | @Override 41 | void mouseMoved() 42 | { 43 | _redraw(_button()); 44 | } 45 | 46 | @Override 47 | void mouseClicked() 48 | { 49 | switch (_button()) { 50 | case SUBMIT: 51 | _say(_recognize()); 52 | break; 53 | case CLEAR: 54 | _clear(); 55 | break; 56 | } 57 | } 58 | 59 | @Override 60 | void keyPressed() 61 | { 62 | switch (keyCode) { 63 | case ENTER: 64 | case RETURN: 65 | case ' ': 66 | case 'S': 67 | _say(_recognize()); 68 | break; 69 | case BACKSPACE: 70 | case DELETE: 71 | case 'C': 72 | _clear(); 73 | break; 74 | } 75 | } 76 | 77 | private void _say(int number) 78 | { 79 | fill(0xFF); 80 | rect(400, 0, 100, 160); 81 | fill(0); 82 | textSize(144); 83 | 84 | if (number >= 0) { 85 | text(number, 460, 140); 86 | } 87 | else { 88 | text('-', 460, 140); 89 | } 90 | } 91 | 92 | boolean result_returned; 93 | 94 | void serialEvent(Serial p) { 95 | result_returned = true; 96 | } 97 | 98 | private int _recognize() 99 | { 100 | final PImage img = get(0, 0, 280, 280); 101 | img.resize(28, 28); 102 | img.loadPixels(); 103 | 104 | final int size = img.pixels.length; 105 | final byte data[] = new byte[size]; 106 | 107 | for (int i = 0; i < size; ++i) 108 | data[i] = (byte)(img.pixels[i] >> 1 & 0x7F); 109 | 110 | result_returned = false; 111 | _port.write(data); 112 | while (!result_returned) { delay(1); } 113 | return _port.read(); 114 | } 115 | 116 | private void _clear() 117 | { 118 | fill(0); 119 | rect(0, 0, 280, 280); 120 | } 121 | 122 | private void _redraw(Button hover) 123 | { 124 | fill(hover == Button.SUBMIT ? #007ACC : 0); 125 | rect(310, 190, 140, 50, 5); 126 | 127 | fill(hover == Button.CLEAR ? #007ACC : 0); 128 | rect(470, 190, 140, 50, 5); 129 | 130 | fill(0xFF); 131 | textSize(32); 132 | text("Submit", 380, 225); 133 | text("Clear", 540, 225); 134 | } 135 | 136 | private Button _button() 137 | { 138 | if (mouseY >= 190 && mouseY < 240) { 139 | if (mouseX >= 310 && mouseX < 450) 140 | return Button.SUBMIT; 141 | else if (mouseX >= 470 && mouseX < 610) 142 | return Button.CLEAR; 143 | } 144 | return Button.NONE; 145 | } 146 | 147 | private enum Button 148 | { 149 | NONE, 150 | SUBMIT, 151 | CLEAR, 152 | } -------------------------------------------------------------------------------- /lab_2_Digit_Recognition_with_ARM_CortexM/onnc-cmsis-example/.gitignore: -------------------------------------------------------------------------------- 1 | BUILD 2 | CMSIS_5 3 | mbed-os 4 | mbed_settings.pyc 5 | -------------------------------------------------------------------------------- /lab_2_Digit_Recognition_with_ARM_CortexM/onnc-cmsis-example/.mbed: -------------------------------------------------------------------------------- 1 | TOOLCHAIN=GCC_ARM 2 | TARGET=NUMAKER_PFM_NUC472 3 | ROOT=. 
4 | -------------------------------------------------------------------------------- /lab_2_Digit_Recognition_with_ARM_CortexM/onnc-cmsis-example/.mbedignore: -------------------------------------------------------------------------------- 1 | CMSIS_5/CMSIS/CoreValidation/* 2 | CMSIS_5/CMSIS/Core/Template/* 3 | CMSIS_5/CMSIS/Driver/* 4 | CMSIS_5/CMSIS/DoxyGen/* 5 | CMSIS_5/CMSIS/Documentation/* 6 | CMSIS_5/CMSIS/DSP/Examples/* 7 | CMSIS_5/CMSIS/DSP/DSP_Lib_TestSuite/* 8 | CMSIS_5/CMSIS/DSP/Projects/* 9 | CMSIS_5/CMSIS/DAP/* 10 | CMSIS_5/CMSIS/Core_A/* 11 | CMSIS_5/CMSIS/RTOS/* 12 | CMSIS_5/CMSIS/Pack/* 13 | CMSIS_5/CMSIS/NN/Scripts/* 14 | CMSIS_5/CMSIS/NN/NN_Lib_Tests/* 15 | CMSIS_5/CMSIS/NN/Examples/* 16 | CMSIS_5/CMSIS/Utilities/* 17 | CMSIS_5/CMSIS/RTOS2/* 18 | CMSIS_5/CMSIS/Lib/* 19 | CMSIS_5/Device/* -------------------------------------------------------------------------------- /lab_2_Digit_Recognition_with_ARM_CortexM/onnc-cmsis-example/CMSIS_5.lib: -------------------------------------------------------------------------------- 1 | https://github.com/ARM-software/CMSIS_5/#c4c089d6333d5b4f2069b5287c26e2ccf74f373d 2 | -------------------------------------------------------------------------------- /lab_2_Digit_Recognition_with_ARM_CortexM/onnc-cmsis-example/README.md: -------------------------------------------------------------------------------- 1 | 2 | $ mbed deploy 3 | $ mbed compile 4 | 5 | -------------------------------------------------------------------------------- /lab_2_Digit_Recognition_with_ARM_CortexM/onnc-cmsis-example/add.cpp: -------------------------------------------------------------------------------- 1 | #include <stdio.h> /* assumed: the bracketed header names were lost in extraction */ 2 | #include <stdlib.h> /* assumed */ 3 | #include "add.h" 4 | 5 | using namespace std; 6 | 7 | void MatAdd(q7_t* input,int* input_dim,q7_t* add,int* add_dim,q7_t* output,int number_of_input_dim,int right_shift,int add_shift) {//q7_t (&answer)[K*C*H*W],q7_t (&add)[fm*H*W]; 4-D case: add[] is broadcast along the innermost (channel) dimension, 2-D case: element-wise; the addend is pre-shifted by add_shift and the sum is shifted right by right_shift 8 | //printf("length = %d\n",_msize(input_dim) / sizeof(input_dim[0])); 9 | if( number_of_input_dim == 4 ){ 10 | for(int loop_k = 0 ; loop_k < input_dim[0] ; loop_k++){ 11 | for(int loop_h = 0 ; loop_h < input_dim[1] ; loop_h++){ 12 | for(int loop_w = 0 ; loop_w < input_dim[2] ; loop_w++){ 13 | for(int loop_c = 0 ; loop_c < input_dim[3] ; loop_c++){ 14 | output[loop_k * input_dim[1] * input_dim[2] * input_dim[3] + loop_h * input_dim[2] * input_dim[3] + loop_w * input_dim[3] + loop_c] = 15 | (input[loop_k * input_dim[1] * input_dim[2] * input_dim[3] + loop_h * input_dim[2] * input_dim[3] + loop_w * input_dim[3] + loop_c] + (add[loop_c]>>add_shift) ) >> right_shift; 16 | } 17 | } 18 | } 19 | } 20 | }else{ 21 | for(int loop_h = 0 ; loop_h < input_dim[0] ; loop_h++){ 22 | for(int loop_w = 0 ; loop_w < input_dim[1] ; loop_w++){ 23 | output[ loop_h * input_dim[1] + loop_w ] = (input[ loop_h * input_dim[1] + loop_w ] + (add[ loop_h * input_dim[1] + loop_w ]>>add_shift)) >> right_shift ; 24 | //printf("index = %d,",loop_h * input_dim[1] + loop_w); 25 | } 26 | } 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /lab_2_Digit_Recognition_with_ARM_CortexM/onnc-cmsis-example/add.h: -------------------------------------------------------------------------------- 1 | #ifndef ADD_H 2 | #define ADD_H 3 | 4 | #include "arm_math.h" 5 | 6 | void MatAdd(q7_t* input,int* input_dim,q7_t* add,int* add_dim,q7_t* output,int number_of_input_dim,int right_shift,int add_shift);//input , add , output -> q7_t 7 | 8 | #endif 9 |
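As a quick aside before the next file, the standalone sketch below shows one way `MatAdd` above can be exercised in isolation. The tensor shapes and shift amounts are made-up example values, and in the actual demo the corresponding calls would come from the backend-generated `cortexm_main.cpp`. Link it against `add.cpp`, or read it simply as a signature illustration.

```cpp
#include <stdint.h>

typedef int8_t q7_t; // local stand-in for the CMSIS-DSP q7_t type

// Signature from add.h above; the implementation lives in add.cpp.
void MatAdd(q7_t* input, int* input_dim, q7_t* add, int* add_dim, q7_t* output,
            int number_of_input_dim, int right_shift, int add_shift);

int main()
{
  // 2-D case: element-wise add of two 4x3 tensors, then a right shift by 1.
  q7_t input[12]  = {10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120};
  q7_t addend[12] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
  q7_t output[12];
  int  dim[2] = {4, 3};
  MatAdd(input, dim, addend, dim, output,
         /*number_of_input_dim=*/2, /*right_shift=*/1, /*add_shift=*/0);
  // output[0] is now (10 + (2 >> 0)) >> 1 = 6.
  return 0;
}
```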
-------------------------------------------------------------------------------- /lab_2_Digit_Recognition_with_ARM_CortexM/onnc-cmsis-example/main.cpp: -------------------------------------------------------------------------------- 1 | #include "mbed.h" 2 | #include "arm_math.h" 3 | #include "cortexm_main.h" 4 | #include 5 | 6 | Serial port(USBTX, USBRX, 115200); 7 | 8 | static const int IMAGE_SIZE = 28 * 28; 9 | int input[IMAGE_SIZE]; 10 | unsigned char buffer[IMAGE_SIZE]; 11 | 12 | 13 | void 14 | pre_processing(int* image_data){ 15 | for(int i = 0 ; i < IMAGE_SIZE; i++) { 16 | image_data[i] = (image_data[i] >> 1) & 0x7f; 17 | } 18 | } 19 | 20 | int 21 | maximunloop(q7_t* img_buffer2) 22 | { 23 | int return_type = 0; 24 | int type_value = 0; 25 | for (int i = 0; i < 10 ; i++){ 26 | if(type_value < img_buffer2[i]){ 27 | type_value = img_buffer2[i]; 28 | return_type = i; 29 | } 30 | } 31 | return return_type; 32 | } 33 | 34 | 35 | void read(void){ 36 | int i; 37 | while(port.readable()==0){}; 38 | for(i=0;i> right_shift; 13 | index++; 14 | } 15 | } 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /lab_2_Digit_Recognition_with_ARM_CortexM/onnc-cmsis-example/matmul.h: -------------------------------------------------------------------------------- 1 | #ifndef MATMUL_H 2 | #define MATMUL_H 3 | #include "arm_math.h" 4 | void matmul(q7_t* input_1,int input_1_dim[2],q7_t* input_2,int input_2_dim[2],q7_t* output,int right_shift);//dim[0] is high , dim[1] is weight 5 | 6 | #endif 7 | -------------------------------------------------------------------------------- /lab_2_Digit_Recognition_with_ARM_CortexM/onnc-cmsis-example/mbed-os.lib: -------------------------------------------------------------------------------- 1 | https://github.com/ARMmbed/mbed-os/#367dbdf5145f4d6aa3e483c147fe7bda1ce23a36 2 | -------------------------------------------------------------------------------- /lab_2_Digit_Recognition_with_ARM_CortexM/onnc-cmsis-example/mbed_settings.py: -------------------------------------------------------------------------------- 1 | """ 2 | mbed SDK 3 | Copyright (c) 2016 ARM Limited 4 | 5 | Licensed under the Apache License, Version 2.0 (the "License"); 6 | you may not use this file except in compliance with the License. 7 | You may obtain a copy of the License at 8 | 9 | http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, 13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | See the License for the specific language governing permissions and 15 | limitations under the License. 16 | """ 17 | 18 | from os.path import join, abspath, dirname 19 | 20 | #ROOT = abspath(join(dirname(__file__), ".")) 21 | 22 | ############################################################################## 23 | # Build System Settings 24 | ############################################################################## 25 | #BUILD_DIR = abspath(join(ROOT, "build")) 26 | 27 | # ARM 28 | #ARM_PATH = "C:/Program Files/ARM" 29 | 30 | # GCC ARM 31 | #GCC_ARM_PATH = "" 32 | 33 | # IAR 34 | #IAR_PATH = "C:/Program Files (x86)/IAR Systems/Embedded Workbench 7.0/arm" 35 | 36 | # Goanna static analyser. 
Please overload it in private_settings.py 37 | #GOANNA_PATH = "c:/Program Files (x86)/RedLizards/Goanna Central 3.2.3/bin" 38 | 39 | #BUILD_OPTIONS = [] 40 | 41 | # mbed.org username 42 | #MBED_ORG_USER = "" 43 | 44 | # Print compiler warnings and errors as link format 45 | #PRINT_COMPILER_OUTPUT_AS_LINK = False 46 | -------------------------------------------------------------------------------- /lab_2_Digit_Recognition_with_ARM_CortexM/onnc-cmsis-example/run.sh: -------------------------------------------------------------------------------- 1 | docker run -ti --rm -v /Users/weifen/work/onnc_projects/onnc-tutorial/lab_2_Digit_Recognition_with_ARM_CortexM/onnc-cmsis-example:/src misegr/mbed-cli bash 2 | -------------------------------------------------------------------------------- /lab_3_Starting_New_Backend/lab_3.md: -------------------------------------------------------------------------------- 1 | # Starting a New Backend 2 | 3 | ## Preface 4 | 5 | ONNC, as an AI compiler framework, intends to be flexible and to make it easy to incorporate a variety of deep learning accelerator (DLA) hardware. 6 | The following figure shows the software architecture of ONNC. 7 | 8 | 9 | 10 | The ONNC compiler has a general frontend framework to parse AI models and lower their representation to the ONNX IR graph. For each target hardware platform, the compiler has a corresponding backend to deal with target-dependent tasks. There are two possible paths in porting a new backend. Processor-type targets follow the left path in the above diagram to emit LLVM IR, which is then compiled to target machine code using the LLVM cross-compiler. Other proprietary DLA designs follow the right path and have a customized backend. In the case of NVDLA, we take the right path to port ONNC to the NVDLA hardware. Each backend in ONNC performs target-specific conversion, and ONNC can have multiple backends for supporting different DLAs. ONNC provides a script to generate a code skeleton for a new backend. In this tutorial, we will describe how to use the script to get a jump start in backend porting. 11 | 12 | In terms of file structure, all backend code is placed inside the directory `/lib/Target`. There are two backends available in that directory, NVDLA and X86. As the above figure shows, there are a couple of default stages in each backend, including TensorSel, TensorScheduling, MemoryAllocation, and CodeEmit. The backend design in the ONNC framework gives developers significant control over the compilation process. Developers may decide whether and how to design each stage on their own. We recommend generating a new backend using the provided backend-creating script and making the necessary modifications based on your own needs. This lab will demonstrate how to generate a new backend, how to compile ONNC, and how to run ONNC to compile an AI model. 13 | 14 | 15 | ## Lab: Creating a Backend -- FooNvdla 16 | 17 | ### Step 1: Set up environment. 18 | 19 | Please finish the following labs before continuing this lab. 20 | * [lab 1: Environment Setup](../lab_1_Environment_Setup/lab_1.md) for preparing the Docker images and ONNC source code. 21 | 22 | The backend-creating script is included in the ONNC source code and can be run within the ONNC-community Docker container. 23 | 24 | ### Step 2: Run the backend-creating script. 25 | 26 | Running the script requires certain packages to be installed in your working environment. We have prepared a pre-built working environment, the ONNC-community Docker, for fast setup.
Please run the script within the Docker container. 27 | 28 | ```sh 29 | # Use the interactive mode to enter the Docker prompt. You will run the script inside. 30 | $ docker run -ti --rm -v :/onnc/onnc onnc/onnc-community 31 | ``` 32 | 33 | We have described how to set up a pre-built working environment in [Lab 1](../lab_1_Environment_Setup/lab_1.md). If you are not familiar with the ONNC-community Docker container, please go through Lab 1 first to setup your working environment. Once you enter the Docker container, type the following commands in the prompt to create a new backend called FooNvdla. 34 | 35 | ```sh 36 | # Within the onnc/onnc-community Docker container 37 | 38 | # Go to the path where the ONNC source codes are mounted to. 39 | $ cd /onnc/onnc 40 | 41 | # Run the script to create a new backend called FooNvdla. 42 | $ ./scripts/create-new-backend.sh FooNvdla 43 | ``` 44 | 45 | The new backend FooNvdla will be placed inside the folder `/lib/Target/FooNvdla`. Since we mount the ONNC source code to the Docker, any change inside the Docker container can be seen outside the container as well. You can find the generated files on your computer outside the Docker container. 46 | 47 | ### Step 3: Compile the new backend 48 | 49 | After creating the new backend with that script, you have a runnable backend that just dumps the model information by default. In this step, let's rebuild ONNC and compile a DNN model. 50 | 51 | Use the following commands to compile ONNC with the new backend. 52 | 53 | ```sh 54 | # Within the onnc/onnc-community Docker container 55 | 56 | $ cd /onnc/onnc-umbrella/build-normal/ 57 | 58 | # Use “-j8” to invoke 8 CPU cores to do the parallel compilation. 59 | $ smake -j8 install 60 | # ... 61 | -- Up-to-date: /onnc/onnc-umbrella/install-normal/include/onnc/Diagnostic/Bits/header.h 62 | -- Up-to-date: /onnc/onnc-umbrella/install-normal/include/onnc/Diagnostic/OFStreamLog.h 63 | -- Up-to-date: /onnc/onnc-umbrella/install-normal/include/onnc/Diagnostic/Diagnostic.h 64 | -- Up-to-date: /onnc/onnc-umbrella/install-normal/include/onnc/Diagnostic/MsgHandler.h 65 | -- Up-to-date: /onnc/onnc-umbrella/install-normal/include/onnc/Diagnostic/EngineFwd.h 66 | -- Up-to-date: /onnc/onnc-umbrella/install-normal/include/onnc/Diagnostic/StreamLog.h 67 | -- Up-to-date: /onnc/onnc-umbrella/install-normal/include/onnc/Diagnostic/MsgHandling.h 68 | -- Up-to-date: /onnc/onnc-umbrella/install-normal/include/onnc 69 | -- Up-to-date: /onnc/onnc-umbrella/install-normal/include/onnc/Support 70 | -- Up-to-date: /onnc/onnc-umbrella/install-normal/include/onnc/Support/DataTypes.h 71 | -- Up-to-date: /onnc/onnc-umbrella/install-normal/include/onnc/Config 72 | -- Up-to-date: /onnc/onnc-umbrella/install-normal/include/onnc/Config/ONNX.h 73 | -- Installing: /onnc/onnc-umbrella/install-normal/include/onnc/Config/Platforms.def 74 | -- Installing: /onnc/onnc-umbrella/install-normal/include/onnc/Config/Backends.def 75 | -- Installing: /onnc/onnc-umbrella/install-normal/include/onnc/Config/Config.h 76 | ``` 77 | 78 | ### Step 4: Compile an AI model 79 | 80 | The following commands demonstrate how to compile the `AlexNet` model with the ONNC binary. 81 | 82 | ```sh 83 | # Within the onnc/onnc-community Docker container 84 | 85 | $ onnc -mquadruple foonvdla /models/bvlc_alexnet/model.onnx 86 | ``` 87 | 88 | The option `-mquadruple foonvdla` is for invoking the new backend FooNvdla. Note that ONNC only accepts lowercase letters as the backend name in this option. 
When you use uppercase letters for the new backend name, the `create-new-backend.sh` script will convert them to lowercase letters automatically. 89 | 90 | The following log shows the compilation result; it dumps the model graph information of the `AlexNet` model. 91 | 92 | ```sh 93 | FooNvdla is invoked 94 | %conv1_w_0[96, 3, 11, 11] = Initializer() 95 | %conv1_b_0[96] = Initializer() 96 | %conv2_w_0[256, 48, 5, 5] = Initializer() 97 | %conv2_b_0[256] = Initializer() 98 | %conv3_w_0[384, 256, 3, 3] = Initializer() 99 | %conv3_b_0[384] = Initializer() 100 | %conv4_w_0[384, 192, 3, 3] = Initializer() 101 | %conv4_b_0[384] = Initializer() 102 | %conv5_w_0[256, 192, 3, 3] = Initializer() 103 | %conv5_b_0[256] = Initializer() 104 | %fc6_w_0[4096, 9216] = Initializer() 105 | %fc6_b_0[4096] = Initializer() 106 | %fc7_w_0[4096, 4096] = Initializer() 107 | %fc7_b_0[4096] = Initializer() 108 | %fc8_w_0[1000, 4096] = Initializer() 109 | %fc8_b_0[1000] = Initializer() 110 | %OC2_DUMMY_1[2] = Initializer() 111 | %data_0[1, 3, 224, 224] = InputOperator() 112 | %conv1_1[1, 96, 54, 54] = Conv(%data_0[1, 3, 224, 224], %conv1_w_0[96, 3, 11, 11], %conv1_b_0[96]) 113 | %conv2_1[1, 256, 26, 26] = Conv(%pool1_1[1, 96, 26, 26], %conv2_w_0[256, 48, 5, 5], %conv2_b_0[256]) 114 | %conv3_1[1, 384, 12, 12] = Conv(%pool2_1[1, 256, 12, 12], %conv3_w_0[384, 256, 3, 3], %conv3_b_0[384]) 115 | %conv4_1[1, 384, 12, 12] = Conv(%conv3_2[1, 384, 12, 12], %conv4_w_0[384, 192, 3, 3], %conv4_b_0[384]) 116 | %conv5_1[1, 256, 12, 12] = Conv(%conv4_2[1, 384, 12, 12], %conv5_w_0[256, 192, 3, 3], %conv5_b_0[256]) 117 | = OutputOperator(%prob_1[1, 1000]) 118 | ``` 119 | 120 | Congratulations! Now you have your new backend ready. In the subsequent tutorial labs, you are going to add more functionality to the new backend. 121 | 122 | ## Files within a new backend 123 | 124 | By following the commands in the previous section, we have created a new backend FooNvdla and all the files are generated in the `lib/Target/FooNvdla` directory. The following table lists the files in the created folder. 125 | 126 | | File | Purpose | 127 | | ---- | ------- | 128 | | `FooNvdlaBackend.cpp & .h` | The main file of a backend. Developers need to modify this file to add optimization passes. | 129 | | `CodeEmitVisitor.cpp & .h` | Implementation of the `CodeEmitVisitor` class. Developers need to modify this file to handle the code generation for each operator. | 130 | | `TargetInfo/FooNvdlaTargetInfo.cpp & .h` | This file contains functions for registering this backend to the ONNC framework. | 131 | | `TargetInfo/FooNvdlaTargetMemInfo.cpp & .h` | The file for configuring memory size and alignment for each data type in neural network models. Developers need to modify this file based on the target hardware attributes to optimize memory allocation. | 132 | | `CMakeLists.txt` | Configuration file for the CMake building system. | 133 | | `Makefile.am` | Configuration file for the Autotools building system.
| 134 | 135 | 136 | -------------------------------------------------------------------------------- /lab_4_Code_Emitting/src/FooNvdla.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/lab_4_Code_Emitting/src/FooNvdla.tar.gz -------------------------------------------------------------------------------- /lab_4_Code_Emitting/src/visit_Add.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | 4 | void CodeEmitVisitor::visit(const Add& pOp) 5 | { 6 | printf("visit(Add) is called\n"); 7 | 8 | // Get tensor attributes. 9 | const Tensor& first = *(pOp.getInput(0)); 10 | const Tensor& second = *(pOp.getInput(1)); 11 | const Tensor& output = *(pOp.getOutput(0)); 12 | 13 | // For this example, we only support a special case where the first tensor is activation data 14 | // stored in memory and the 2nd tensor is a constant 15 | assert( (!isConstant(first) && isConstant(second)) && 16 | "support only the case that the first tensor is activation data and the second constant"); 17 | 18 | //-------------------------------- 19 | // Configure hardware block 20 | //-------------------------------- 21 | 22 | NvDlaDlaOperation* operation = new NvDlaDlaOperation(); 23 | // Set hardware block type. 24 | operation->op_dep.op_type = DLA_OP_SDP; 25 | 26 | struct dla_sdp_op_desc& desc = (struct dla_sdp_op_desc&)(operation->op_desc); 27 | desc.src_precision = PRECISION_FP16; 28 | desc.dst_precision = PRECISION_FP16; 29 | // No look up table is required. 30 | desc.lut_index = -1; 31 | 32 | // For this example, we only support batch == 1. 33 | desc.batch_num = 1; 34 | desc.batch_stride = 0; 35 | 36 | // Enable X1 block. 37 | desc.x1_op.enable = 1; 38 | 39 | // X1 operation Options: Disable (SDP_OP_NONE) / ALU only (SDP_OP_ADD) / 40 | // Multiplier only (SDP_OP_MUL) / ALU+MUL (SDP_OP_BOTH) 41 | desc.x1_op.type = SDP_OP_ADD; 42 | 43 | // ALU type options: SUM/MIN/MAX 44 | desc.x1_op.alu_type = SDP_ALU_OP_SUM; 45 | 46 | // Disable ReLU 47 | desc.x1_op.act = ACTIVATION_NONE; 48 | 49 | // Set per_layer/per_channel/per_point mode based on the broadcasting type. 50 | // For this example we only support per_point mode. 51 | desc.x1_op.mode = SDP_OP_PER_POINT; 52 | 53 | // Set the datapath precision to be fp16. 54 | desc.x1_op.precision = PRECISION_FP16; 55 | 56 | //---------------------------------------- 57 | // Setup dataflow sources and destination 58 | //---------------------------------------- 59 | 60 | struct dla_sdp_surface_desc& surface = (struct dla_sdp_surface_desc&)(operation->op_surf); 61 | 62 | // Setup 1st tensor source. 63 | const NvDlaCubeInfo firstCubeInfo = makeCubeInfo(*this, NVDLA_CUBE_FEATURE, first); 64 | // The 1st input tensor can be read from: 65 | // external DRAM via the interface of MCIF: DLA_MEM_MC 66 | // SRAM via the interface of CVIF: DLA_MEM_CV 67 | // the output of CONV hardware block: DLA_MEM_HW 68 | // In this example, we only support the 1st input tensor is stored at external DRAM. 69 | surface.src_data.type = DLA_MEM_MC; 70 | // Setup memory allocation and DMA configuration for 1st input tensor. 
71 | surface.src_data.address = issueDlaAddr(first, firstCubeInfo); 72 | surface.src_data.size = m_pMeta.getMemoryListEntrySize(first); 73 | surface.src_data.width = firstCubeInfo.dim_w; 74 | surface.src_data.height = firstCubeInfo.dim_h; 75 | surface.src_data.channel = firstCubeInfo.dim_c; 76 | surface.src_data.line_stride = firstCubeInfo.stride_line; 77 | surface.src_data.surf_stride = firstCubeInfo.stride_surface; 78 | 79 | // Setup 2nd tensor source. 80 | MemoryListEntryId memoryId; 81 | const NvDlaCubeInfo secondCubeInfo = makeCubeInfo(*this, getSdpXSingleCubeType(second, DLA_PRECISION), second); 82 | // The 2nd input tensor is stored at DRAM and accessed through the interface of MCIF. 83 | surface.x1_data.type = DLA_MEM_MC; 84 | // Setup memory allocation and DMA configuration for 2nd input tensor. 85 | // In addition, the 2nd tensor is constant so need be packed into a blob and becomes a part of loadable. 86 | surface.x1_data.address = issueSDPOperand(second, secondCubeInfo, memoryId); 87 | surface.x1_data.size = m_pMeta.getMemoryListEntrySize(memoryId); 88 | surface.x1_data.width = secondCubeInfo.dim_w; 89 | surface.x1_data.height = secondCubeInfo.dim_h; 90 | surface.x1_data.channel = secondCubeInfo.dim_c; 91 | surface.x1_data.line_stride = secondCubeInfo.stride_line; 92 | surface.x1_data.surf_stride = secondCubeInfo.stride_surface; 93 | 94 | // Setup output tensor destination. 95 | const NvDlaCubeInfo outputCubeInfo = makeCubeInfo(*this, NVDLA_CUBE_FEATURE, output); 96 | // The output tensor is stored at DRAM. 97 | surface.dst_data.type = DLA_MEM_MC; 98 | surface.dst_data.address = issueDlaAddr(output, outputCubeInfo); 99 | surface.dst_data.size = m_pMeta.getMemoryListEntrySize(output); 100 | surface.dst_data.width = outputCubeInfo.dim_w; 101 | surface.dst_data.height = outputCubeInfo.dim_h; 102 | surface.dst_data.channel = outputCubeInfo.dim_c; 103 | surface.dst_data.line_stride = outputCubeInfo.stride_line; 104 | surface.dst_data.surf_stride = outputCubeInfo.stride_surface; 105 | 106 | //---------------------------------------- 107 | // enlist the operation 108 | //---------------------------------------- 109 | issueDlaOp(operation, NULL, m_pMeta.m_pPrevOp); 110 | } 111 | 112 | -------------------------------------------------------------------------------- /lab_5_CPU_Fallback/src/emu_interface.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions 6 | * are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of NVIDIA CORPORATION nor the names of its 13 | * contributors may be used to endorse or promote products derived 14 | * from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 17 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 19 | * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR 20 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 21 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 22 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 23 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 24 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | */ 28 | 29 | #ifndef NVDLA_PRIV_EMU_EMU1_A_EMU_INTERFACE_H 30 | #define NVDLA_PRIV_EMU_EMU1_A_EMU_INTERFACE_H 31 | 32 | #include "dlatypes.h" 33 | 34 | #define NVDLA_EMU_MAX_BUFFERS_PER_TASK (6144) 35 | 36 | /** 37 | * @name Op Type 38 | * Network is formed using a list of these operations 39 | * @{ 40 | */ 41 | #define NVDLA_EMU_OP_POWER 0 42 | #define NVDLA_EMU_OP_SOFTMAX 1 43 | #define NVDLA_EMU_OP_LOG 2 44 | /** @} */ 45 | 46 | /** 47 | * Address 48 | */ 49 | struct emu_address 50 | { 51 | void *hMem; 52 | NvU32 offset; 53 | }; 54 | 55 | /** 56 | * Task Descriptor 57 | */ 58 | struct emu_task_desc 59 | { 60 | NvU32 num_addresses; 61 | emu_address address_list[NVDLA_EMU_MAX_BUFFERS_PER_TASK]; 62 | } __attribute__ ((packed, aligned(256))); 63 | 64 | /** 65 | * Network Descriptor 66 | * 67 | * Contains all information to execute a network 68 | * 69 | * @num_operations: Number of operations in the lists 70 | */ 71 | struct emu_network_desc 72 | { 73 | NvS16 operation_desc_index; 74 | NvS16 operation_buffer_desc_index; 75 | NvU16 num_operations; 76 | } __attribute__ ((packed, aligned(256))); 77 | 78 | struct emu_common_op_desc 79 | { 80 | NvU8 op_type; 81 | }; 82 | 83 | struct emu_power_op_desc 84 | { 85 | emu_common_op_desc common; 86 | NvF32 power; 87 | NvF32 scale; 88 | NvF32 shift; 89 | } __attribute__ ((packed, aligned(4))); 90 | 91 | struct emu_softmax_op_desc 92 | { 93 | emu_common_op_desc common; 94 | NvU8 axis; 95 | } __attribute__ ((packed, aligned(4))); 96 | 97 | struct emu_log_op_desc 98 | { 99 | emu_common_op_desc common; 100 | } __attribute__ ((packed, aligned(4))); 101 | 102 | union emu_operation_container 103 | { 104 | struct emu_power_op_desc power_op; 105 | struct emu_softmax_op_desc softmax_op; 106 | struct emu_log_op_desc log_op; 107 | }; 108 | 109 | struct emu_buffer_desc 110 | { 111 | /* offset to the actual IOVA in task.address_list */ 112 | NvS16 addressIndex; 113 | NvU32 size; 114 | 115 | /* surface format */ 116 | NvU16 format; 117 | 118 | /* cube dimensions */ 119 | NvU16 width; 120 | NvU16 height; 121 | NvU16 channel; 122 | 123 | /* stride information */ 124 | NvU32 line_stride; 125 | NvU32 surf_stride; 126 | } __attribute__ ((packed, aligned(256))); 127 | 128 | struct emu_power_buffer_descs 129 | { 130 | /* Buffer Descriptors */ 131 | struct emu_buffer_desc src_data; 132 | struct emu_buffer_desc dst_data; 133 | } __attribute__ ((packed, aligned(4))); 134 | 135 | struct emu_softmax_buffer_descs 136 | { 137 | /* Buffer Descriptors */ 138 | struct emu_buffer_desc src_data; 139 | struct emu_buffer_desc dst_data; 140 | } __attribute__ ((packed, aligned(4))); 141 | 142 | struct emu_log_buffer_descs 143 | { 144 | /* Buffer Descriptors */ 145 | struct emu_buffer_desc src_data; 146 | struct emu_buffer_desc dst_data; 147 | } __attribute__ ((packed, aligned(4))); 148 | 149 | union emu_operation_buffer_container 150 | { 151 | struct emu_power_buffer_descs power_buffers; 152 | struct emu_softmax_buffer_descs 
softmax_buffers; 153 | struct emu_log_buffer_descs log_buffers; 154 | }; 155 | 156 | 157 | #endif // NVDLA_PRIV_EMU_EMU1_A_EMU_INTERFACE_H 158 | -------------------------------------------------------------------------------- /lab_5_CPU_Fallback/src/visit_Log.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | void CodeEmitVisitor::visit(const Log& pOp) 4 | { 5 | printf("visit(Log) is called\n"); 6 | 7 | // Get tensor attributes. 8 | const Tensor& input = *(pOp.getInput(0)); 9 | const Tensor& output = *(pOp.getOutput(0)); 10 | 11 | //-------------------------------- 12 | // Configure emulator engine 13 | //-------------------------------- 14 | 15 | // Use the class NvDlaEmuOperation rather than class NvDlaDlaOperation used in the DLA case. 16 | NvDlaEmuOperation* operation = new NvDlaEmuOperation(); 17 | 18 | struct emu_log_op_desc& desc = (struct emu_log_op_desc&)(operation->op_desc); 19 | desc.common.op_type = NVDLA_EMU_OP_LOG; 20 | 21 | //---------------------------------------- 22 | // Setup dataflow sources and destination 23 | //---------------------------------------- 24 | 25 | struct emu_log_buffer_descs& surface = (struct emu_log_buffer_descs&)(operation->op_buf); 26 | 27 | // Setup input tensor source. 28 | const NvDlaCubeInfo inputCubeInfo = makeCubeInfo(*this, NVDLA_CUBE_FEATURE, input); 29 | int input_mid = m_pMeta.getMemoryListEntryId(input); 30 | surface.src_data.addressIndex = issueEmuAddr(input_mid); 31 | surface.src_data.size = m_pMeta.getMemoryListEntrySize(input_mid); 32 | surface.src_data.format = PRECISION_FP16; 33 | surface.src_data.width = inputCubeInfo.dim_w; 34 | surface.src_data.height = inputCubeInfo.dim_h; 35 | surface.src_data.channel = inputCubeInfo.dim_c; 36 | surface.src_data.line_stride = inputCubeInfo.stride_line; 37 | surface.src_data.surf_stride = inputCubeInfo.stride_surface; 38 | 39 | // Setup output tensor destination. 
40 | const NvDlaCubeInfo outputCubeInfo = makeCubeInfo(*this, NVDLA_CUBE_FEATURE, output); 41 | int output_mid = m_pMeta.getMemoryListEntryId(output); 42 | surface.dst_data.addressIndex = issueEmuAddr(output_mid); 43 | surface.dst_data.size = m_pMeta.getMemoryListEntrySize(output_mid); 44 | surface.dst_data.format = PRECISION_FP16; 45 | surface.dst_data.width = outputCubeInfo.dim_w; 46 | surface.dst_data.height = outputCubeInfo.dim_h; 47 | surface.dst_data.channel = outputCubeInfo.dim_c; 48 | surface.dst_data.line_stride = outputCubeInfo.stride_line; 49 | surface.dst_data.surf_stride = outputCubeInfo.stride_surface; 50 | 51 | //---------------------------------------- 52 | // enlist the operation 53 | //---------------------------------------- 54 | issueEmuOp(operation); 55 | } 56 | -------------------------------------------------------------------------------- /lab_5_CPU_Fallback/src/visit_Relu.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | void CodeEmitVisitor::visit(const Relu& pOp) 4 | { 5 | printf("visit(Relu) is called\n"); 6 | 7 | const Tensor* input_X_t = pOp.getInput(0); 8 | int32_t input_X_ndim = input_X_t->getNumOfDimensions(); 9 | int32_t input_X_dims[4] = {1, 1, 1, 1}; 10 | for (int i = 0; i < input_X_ndim; ++i) 11 | input_X_dims[i] = input_X_t->dimension(i); 12 | NvDlaCubeInfo X_cube(*this, NVDLA_CUBE_FEATURE, input_X_dims[0], input_X_dims[1], input_X_dims[2], input_X_dims[3]); 13 | 14 | const Tensor* output_Y_t = pOp.getOutput(0); 15 | int32_t output_Y_ndim = output_Y_t->getNumOfDimensions(); 16 | int32_t output_Y_dims[4] = {1, 1, 1, 1}; 17 | for (int i = 0; i < output_Y_ndim; ++i) 18 | output_Y_dims[i] = output_Y_t->dimension(i); 19 | 20 | NvDlaCubeInfo Y_cube(*this, NVDLA_CUBE_FEATURE, output_Y_dims[0], output_Y_dims[1], output_Y_dims[2], 21 | output_Y_dims[3]); 22 | 23 | NvDlaDlaOperation* relu_op = new NvDlaDlaOperation(); 24 | relu_op->op_dep.op_type = DLA_OP_SDP; 25 | 26 | struct dla_sdp_op_desc* relu_desc = (struct dla_sdp_op_desc*)(&(relu_op->op_desc)); 27 | relu_desc->src_precision = DLA_PRECISION; 28 | relu_desc->dst_precision = DLA_PRECISION; 29 | relu_desc->lut_index = -1; 30 | relu_desc->conv_mode = 0; 31 | relu_desc->out_cvt.scale = 1; 32 | relu_desc->out_cvt.truncate = 0; 33 | relu_desc->out_cvt.enable = 1; 34 | relu_desc->out_cvt.offset = 0; 35 | relu_desc->conv_mode = CONV_MODE_DIRECT; 36 | relu_desc->batch_num = 1; 37 | relu_desc->batch_stride = 0; 38 | relu_desc->x1_op.enable = 1; 39 | relu_desc->x1_op.alu_type = SDP_ALU_OP_SUM; 40 | relu_desc->x1_op.type = SDP_OP_NONE; 41 | relu_desc->x1_op.mode = SDP_OP_PER_LAYER; 42 | relu_desc->x1_op.act = ACTIVATION_RELU; 43 | relu_desc->x1_op.shift_value = 0; 44 | relu_desc->x1_op.truncate = 0; 45 | relu_desc->x1_op.precision = DLA_PRECISION; 46 | relu_desc->x1_op.alu_operand = 0; 47 | relu_desc->x1_op.mul_operand = 1; 48 | relu_desc->x1_op.cvt.alu_cvt.scale = 0; 49 | relu_desc->x1_op.cvt.alu_cvt.truncate = 0; 50 | relu_desc->x1_op.cvt.alu_cvt.enable = 0; 51 | relu_desc->x1_op.cvt.alu_cvt.offset = 0; 52 | relu_desc->x1_op.cvt.mul_cvt.scale = 0; 53 | relu_desc->x1_op.cvt.mul_cvt.truncate = 0; 54 | relu_desc->x1_op.cvt.mul_cvt.enable = 0; 55 | relu_desc->x1_op.cvt.mul_cvt.offset = 0; 56 | 57 | struct dla_sdp_surface_desc* relu_surf = (struct dla_sdp_surface_desc*)(&(relu_op->op_surf)); 58 | relu_surf->src_data.type = DLA_MEM_MC; 59 | relu_surf->src_data.address = issueDlaAddr(*input_X_t, X_cube); 60 | relu_surf->src_data.size = 
m_pMeta.getMemoryListEntrySize(*input_X_t); 61 | relu_surf->src_data.width = X_cube.dim_w; 62 | relu_surf->src_data.height = X_cube.dim_h; 63 | relu_surf->src_data.channel = X_cube.dim_c; 64 | relu_surf->src_data.line_stride = X_cube.stride_line; 65 | relu_surf->src_data.surf_stride = X_cube.stride_surface; 66 | relu_surf->src_data.plane_stride = X_cube.stride_plane; 67 | 68 | relu_surf->dst_data.type = DLA_MEM_MC; 69 | relu_surf->dst_data.address = issueDlaAddr(*output_Y_t, Y_cube); 70 | relu_surf->dst_data.size = m_pMeta.getMemoryListEntrySize(*output_Y_t); 71 | relu_surf->dst_data.width = Y_cube.dim_w; 72 | relu_surf->dst_data.height = Y_cube.dim_h; 73 | relu_surf->dst_data.channel = Y_cube.dim_c; 74 | relu_surf->dst_data.line_stride = Y_cube.stride_line; 75 | relu_surf->dst_data.surf_stride = Y_cube.stride_surface; 76 | relu_surf->dst_data.plane_stride = Y_cube.stride_plane; 77 | 78 | issueDlaOp(relu_op, NULL, m_pMeta.m_pPrevOp); 79 | } 80 | -------------------------------------------------------------------------------- /lab_6_Manipulating_ONNC_IR/lab_6.md: -------------------------------------------------------------------------------- 1 | # Manipulating ONNC IR and Optimization 2 | 3 | ## Preface 4 | 5 | ONNC inherits the concept of pass management from the LLVM infrastructure and 6 | the pass manager is one of the most important features in ONNC as well. Any analysis or transformation on a target program can be implemented as a pass in the ONNC framework. 7 | 8 | 9 | 10 | The above figure depicts the top-level block diagram of ONNC software stacks. The software stack illustrates the functional blocks from importing an ONNX computation graph model to emitting a hardware-executable form. Each stack is implemented as a collection of passes. In addition to leveraging the LLVM backend, ONNC paves another fast track for proprietary DLAs to execute ONNX models by defining ONNC IR, an intermediate representation (IR) that has one-to-one mapping to the ONNX IR. The `TensorSel` pass translates the ONNX IR into the ONNC IR. The subsequent passes analyze and manipulate the ONNC IR for model optimization and transformation. Many deep learning accelerators (DLAs) have limited support for the ONNX operators and only a subset of ONNX/ONNC IRs can be directly mapped to their hardware operations. In that case, some optimization passes are designed to decompose an ONNC IR into a sequence of ONNC IRs that have direct hardware operation support. For example, in the case of the NVDLA backend, a BatchNormalization operator in the ONNC IR is decomposed into a CONV followed by an ADD IR in an optimization pass before the `CodeEmit` pass. For those who intend to develop a backend, it is essential to understand the data structure and APIs for ONNC IR manipulation. In this lab, we discuss and demonstrate how to write a pass that traverses the ONNC IR of a given model. 11 | 12 | 13 | ## ONNC IR Graph 14 | 15 | Take the following model as an example. 16 | 17 | 18 | 19 | This model contains a CONV followed by a Relu. Its ONNC IR graph is depicted in the following diagram. 20 | 21 | 22 | 23 | The ONNC IR graph represents the data flow in a model. There are two types of nodes in the graph: circle-shaped nodes for compute operators and rectangle-shaped nodes for all types of values. They are implemented as the C++ classes `ComputeOperator` and `Value`, respectively. In this example, there are five `ComputeOperator`s in the graph.
Besides `Conv` and `Relu`, there are three other special ComputeOperators not shown in the original model. They are InputOperator, OutputOperator, and Initializer. InputOperator and OutputOperator, as their names suggest, represent the input and output of the model. Initializer is used to represent constants in the model such as weights. Between many pairs of ComputeOperators, there are rectangle-shaped nodes that store values for computation. Take the *Conv* node in the graph as an example. It has two input Values named *W* and *INPUT0*. They are kernel weights and input data respectively. It also has one output Value named *conv_out*, which is the output data of the convolution. 24 | 25 | ## Data Structures for ONNC IR 26 | 27 | ONNC provides a group of data structures to describe the ONNC IR. The following figure shows the overview of the data structures in the UML form. 28 | 29 | 30 | 31 | There are four major classes in the ONNC IR implementation: `class Module`, `class ComputeGraph`, `class ComputeOperator`, and `class Value`. `Class ComputeOperator` and `class Value` are already described in the previous section. `Class ComputeGraph` encloses a single **connected** DAG (Directed Acyclic Graph) of ComputeOperators and Values, whereas `class Module` encloses a set of independent ComputeGraphs. Although it is rare for a DNN model to have multiple separate, disconnected data flows in reality, ONNC introduces the concept of Module as a higher and broader abstraction for extensibility. 32 | 33 | ## Lab: Visualizing the ONNC IR Graph of a Given Model 34 | 35 | In this lab, we will write a pass to traverse the ONNC IR graph of a given model and print the graph in the [Graphviz](https://www.graphviz.org/) format. Graphviz is a formal language for describing graphs and networks. There are open source software tools available to compile the textual description into image formats such as PNG. The following code snippet shows an example of the Graphviz "script" and you may find the complete source file in [test_Conv_Relu.dot](src/test_Conv_Relu.dot). 36 | 37 | ``` 38 | digraph { 39 | ... 40 | 41 | Initializer_94153827516736 -> W 42 | InputOperator_94153828180800 -> INPUT0 43 | INPUT0 -> Conv_94153828038720 44 | W -> Conv_94153828038720 45 | Conv_94153828038720 -> conv_out 46 | conv_out -> Relu_94153827458336 47 | Relu_94153827458336 -> OUTPUT0 48 | OUTPUT0 -> OutputOperator_94153828212384 49 | } 50 | ``` 51 | 52 | Graphviz is supported by many Markdown-capable readers, and the above script may be rendered as the following picture. 53 | 54 | 55 | 56 | Let's write a pass to generate the above Graphviz script in ONNC. 57 | 58 | ### Step 1: Set up environment. 59 | 60 | Please finish the following labs before continuing this lab. 61 | 62 | * [lab 1: Environment Setup](../lab_1_Environment_Setup/lab_1.md) for preparing the Docker images and ONNC source code. 63 | * [lab 3: Starting New Backend](../lab_3_Starting_New_Backend/lab_3.md) for preparing the experimental backend `FooNvdla` for the exercise in this lab. 64 | * [lab 4: Code Emitting](../lab_4_Code_Emitting/lab_4.md) for setting up the utilities needed by the example ONNX model in this lab. 65 | 66 | After the preparation, you should have the `FooNvdla` backend ready, and its source code can be found in the `/lib/Target/FooNvdla` directory.
67 | For the rest of this lab, when we talk about modifying the source code of the NVDLA backend, we are referring to the code in the `FooNvdla` directory. 68 | 69 | ```sh 70 | $ cd /lib/Target/FooNvdla 71 | ``` 72 | 73 | ### Step 2: Create a new pass 74 | 75 | A pass is an abstraction of a unit of execution in the ONNC framework. It is designed for manipulating an ONNC IR graph to achieve a specific goal. Users may define customized pass types, register a pass with the pass manager, and let the pass manager administer its execution. 76 | 77 | First, create a pass by inheriting from the `CustomPass` abstract class. 78 | 79 | ```cpp 80 | #include <onnc/Core/CustomPass.h> 81 | 82 | class GraphvizONNCIRPass : public CustomPass<GraphvizONNCIRPass> 83 | { 84 | public: 85 | GraphvizONNCIRPass() = default; 86 | 87 | ReturnType runOnModule(Module& pModule) override; 88 | }; 89 | ``` 90 | 91 | The `CustomPass` abstract class defines several virtual functions. These member functions are invoked by the pass manager on each execution. Their prototypes are listed below: 92 | 93 | | Prototype | 94 | | --------- | 95 | | `virtual ReturnType doInitialization(Module&);` | 96 | | `virtual ReturnType runOnModule(Module&);` | 97 | | `virtual ReturnType doFinalization(Module&);` | 98 | 99 | | Method | Description | 100 | | ------ | ----------- | 101 | | `doInitialization` | The first-invoked method in a pass. Acquire resources such as files, network connections, etc. | 102 | | `runOnModule` | Implement module manipulations in this method. | 103 | | `doFinalization` | The last-called method in a pass. Release resources and prepare for the next run. | 104 | 105 | The above three methods are invoked exactly once per execution. Users can assemble meaningful values and return an informative result to the pass manager. ONNC uses an enumeration type `PassResult` for execution results. 106 | `PassResult` is usually encoded like a bit mask, and the following table lists all possible values. 107 | 108 | | Value | Description | 109 | | ----- | ----------- | 110 | | `kModuleNoChanged` | No update to the module content; nothing was done. | 111 | | `kModuleChanged` | The module was modified, or the invocation succeeded. | 112 | | `kPassRetry` | The invocation could not finish for some reason; a retry is needed. | 113 | | `kPassFailure` | The action failed. | 114 | 115 | In the following code snippet, we show the typical implementation of the function `runOnModule`. 116 | 117 | ```cpp 118 | // GraphvizONNCIRPass.cpp 119 | 120 | Pass::ReturnType GraphvizONNCIRPass::runOnModule(Module& pModule) 121 | { 122 | Pass::ReturnType ret = kModuleNoChanged; 123 | 124 | // ... 125 | // Change the value of variable `ret` if necessary. 126 | 127 | if (ret != kModuleNoChanged) { 128 | pModule.eraseUnusedValues(); 129 | } 130 | 131 | return ret; 132 | } 133 | ``` 134 | 135 | ### Step 3: Implement `GraphvizONNCIRPass` to traverse the ONNC IR 136 | 137 | In this pass, we mainly override the `runOnModule` function of GraphvizONNCIRPass, and here we utilize the ONNC framework to simplify the implementation of `runOnModule`, as the following figure shows. 138 | 139 | 140 | 141 | The actual implementation of `runOnModule` is as follows. 142 | 143 | ```cpp 144 | // GraphvizONNCIRPass.cpp 145 | 146 | Pass::ReturnType GraphvizONNCIRPass::runOnModule(Module& pModule) 147 | { 148 | Pass::ReturnType ret = kModuleNoChanged; 149 | 150 | // Call the default implementation of runOnModule(). It subsequently invokes 151 | // runOnComputeGraph() to handle each of the ComputeGraphs in the module.
152 | ret = BaseType::runOnModule(pModule); 153 | 154 | if (ret != kModuleNoChanged) { 155 | pModule.eraseUnusedValues(); 156 | } 157 | 158 | return ret; 159 | } 160 | 161 | // Use the following function to handle every ComputeGraph. 162 | Pass::ReturnType GraphvizONNCIRPass::runOnComputeGraph(ComputeGraph& pCG) 163 | { 164 | std::cout << "digraph {\n"; 165 | 166 | // Traverse ComputeOperators in the topological order. 167 | for (ComputeOperator& op : pCG) { 168 | std::string opName = op.name().str() + "_" + std::to_string((long)&op); 169 | std::cout << " " << opName << " [label=" << op.name() << "]\n"; 170 | 171 | // Traverse the input of this ComputeOperator. 172 | int numInputs = op.getNumOfInputs(); 173 | for (int i = 0; i < numInputs; ++i) { 174 | Value* input = op.getInput(i); 175 | 176 | std::cout << " " << input->getName() << " -> " << opName << "\n"; 177 | } 178 | 179 | // Traverse the output of this ComputeOperator. 180 | int numOutputs = op.getNumOfOutputs(); 181 | for (int i = 0; i < numOutputs; ++i) { 182 | Value* output = op.getOutput(i); 183 | 184 | std::cout << " " << opName << " -> " << output->getName() << "\n"; 185 | std::cout << " " << output->getName() << " [shape=rect]\n"; 186 | } 187 | } 188 | 189 | std::cout << "}\n"; 190 | 191 | // This pass does not modify the graph topology. Just return kModuleNoChanged. 192 | return Pass::kModuleNoChanged; 193 | } 194 | ``` 195 | 196 | You may copy the complete source code of [GraphvizONNCIRPass.cpp](src/GraphvizONNCIRPass.cpp) and [GraphvizONNCIRPass.h](src/GraphvizONNCIRPass.h) from the `lab_6_Manipulating_ONNC_IR/src` directory to your backend directory, `/lib/Target/FooNvdla`. 197 | 198 | ### Step 4: Register GraphvizONNCIRPass in the target backend. 199 | 200 | The following code snippet shows how to register `GraphvizONNCIRPass` in the FooNvdla backend. 201 | 202 | ```diff 203 | // FooNvdlaBackend.cpp 204 | 205 | #include "NvDlaFileGenPass.h" 206 | +#include "GraphvizONNCIRPass.h" 207 | 208 | #include 209 | +#include 210 | 211 | void FooNvdlaBackend::addOnncIrOptimization(PassManager& pPM, OptimizationOptions& options) 212 | { 213 | TargetBackend::addOnncIrOptimization(pPM, options); 214 | 215 | + // Register the pass into the pass manager, so that it can get called during the backend's execution. 216 | + pPM.add<GraphvizONNCIRPass>(); 217 | } 218 | 219 | void FooNvdlaBackend::RegisterLowers(LowerRegistry& pRegistry) const 220 | { 221 | pRegistry.emplace(); 222 | + // We need to register operator Relu because the example model in this lab contains this type of operator. 223 | + // Only with this registration can Relu be present in the ONNC IR. 224 | + pRegistry.emplace<ReluLower>(); 225 | } 226 | ``` 227 | 228 | The complete source code of [`FooNvdlaBackend.cpp`](src/FooNvdlaBackend.cpp) is also available in the `lab_6_Manipulating_ONNC_IR/src` directory. 229 | 230 | Since we created a new file, `GraphvizONNCIRPass.cpp`, for the backend, we need to declare the file addition in the building system so that it can get compiled. Modify the related cmake files as below.
231 | 232 | ```diff 233 | // CMakeLists.txt 234 | 235 | add_libonnc_src( 236 | NvDlaMemInfoPass.cpp 237 | NvDlaTaskSubmitPass.cpp 238 | NvDlaFileGenPass.cpp 239 | + GraphvizONNCIRPass.cpp 240 | ``` 241 | 242 | ```diff 243 | // Makefile.am 244 | 245 | ONNC_TARGET_SOURCES += \ 246 | Target/FooNvdla/NvDlaMemInfoPass.cpp \ 247 | Target/FooNvdla/NvDlaTaskSubmitPass.cpp \ 248 | Target/FooNvdla/NvDlaFileGenPass.cpp \ 249 | + Target/FooNvdla/GraphvizONNCIRPass.cpp \ 250 | ``` 251 | 252 | ### Step 5: Re-build ONNC and check the result 253 | 254 | Follow the instructions in Lab 1 to rebuild the ONNC source code within the ONNC-community Docker. 255 | Use the following command to bring up the ONNC-community Docker. 256 | 257 | ```sh 258 | $ docker run -ti --rm -v :/onnc/onnc -v /models:/tutorial/models onnc/onnc-community 259 | ``` 260 | 261 | Within the Docker container, use the following commands to rebuild ONNC and then use the new ONNC binary to compile the target DNN model. 262 | 263 | ```sh 264 | # Within onnc/onnc-community Docker container 265 | 266 | $ cd /onnc/onnc-umbrella/build-normal 267 | 268 | # Rebuild ONNC. 269 | $ smake -j8 install 270 | 271 | # Run ONNC to compile the DNN model. 272 | $ onnc -mquadruple foonvdla /tutorial/models/test_Conv_Relu/test_Conv_Relu.onnx 273 | FooNvdla is invoked 274 | === GraphvizONNCIRPass ====== 275 | digraph { 276 | Initializer_94890678153536 [label=Initializer] 277 | Initializer_94890678153536 -> W 278 | W [shape=rect] 279 | InputOperator_94890678817600 [label=InputOperator] 280 | InputOperator_94890678817600 -> INPUT0 281 | INPUT0 [shape=rect] 282 | Conv_94890678675520 [label=Conv] 283 | INPUT0 -> Conv_94890678675520 284 | W -> Conv_94890678675520 285 | Conv_94890678675520 -> conv_out 286 | conv_out [shape=rect] 287 | Relu_94890678095136 [label=Relu] 288 | conv_out -> Relu_94890678095136 289 | Relu_94890678095136 -> OUTPUT0 290 | OUTPUT0 [shape=rect] 291 | OutputOperator_94890678849184 [label=OutputOperator] 292 | OUTPUT0 -> OutputOperator_94890678849184 293 | } 294 | ========================== 295 | ``` 296 | 297 | Note that the number appended to each ComputeOperator is a **random** unique number such as `Initializer_94890678153536`. Since it is random, it is no surprise that your log messages will contain different random numbers. However, you should get a similar image for your script, as in the following picture. 298 | 299 | 300 | 301 | ## Summary 302 | 303 | In this lab, you have learned: 304 | 305 | * How to write a pass, and 306 | * How to traverse the ONNC IR graph and operate on each `ComputeOperator` and `Value` object. 307 | -------------------------------------------------------------------------------- /lab_6_Manipulating_ONNC_IR/src/FooNvdlaBackend.cpp: -------------------------------------------------------------------------------- 1 | //===- FooNvdlaBackend.cpp -----------------------------------------------------===// 2 | // 3 | // The ONNC Project 4 | // 5 | // See LICENSE.TXT for details.
6 | // 7 | //===----------------------------------------------------------------------===// 8 | #include 9 | 10 | #include "FooNvdlaBackend.h" 11 | #include "TargetInfo/FooNvdlaTargetInfo.h" 12 | #include "TargetInfo/FooNvdlaTargetMemInfo.h" 13 | #include "CodeEmitVisitor.h" 14 | #include "NvDlaMemInfoPass.h" 15 | #include "NvDlaTaskSubmitPass.h" 16 | #include "NvDlaFileGenPass.h" 17 | #include "GraphvizONNCIRPass.h" 18 | 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include 37 | #include 38 | #include 39 | #include 40 | #include 41 | #include 42 | #include 43 | #include 44 | #include 45 | #include 46 | #include 47 | #include 48 | #include 49 | #include 50 | #include 51 | #include 52 | #include 53 | #include 54 | #include 55 | #include 56 | #include 57 | #include 58 | #include 59 | #include 60 | #include 61 | 62 | #include 63 | 64 | using namespace onnc; 65 | 66 | //===----------------------------------------------------------------------===// 67 | // FooNvdlaBackend 68 | //===----------------------------------------------------------------------===// 69 | const Version FooNvdlaBackend::LOADABLE_VERSION = Version(1, 1, 255); 70 | const Version FooNvdlaBackend::BLOB_DLA_VERSION = Version(1, 3, 0); 71 | const Version FooNvdlaBackend::BLOB_EMU_VERSION = Version(1, 3, 0); 72 | 73 | FooNvdlaBackend::FooNvdlaBackend(const TargetOptions& pOptions) 74 | : TargetBackend(pOptions) 75 | , NvDlaConstants(getConfig(::nvdla::ConfigSet::nv_full, ::nvdla::ExecutionMode::direct, false)) 76 | , m_pMeta(*this) { 77 | m_pMemInfo = std::make_unique(); 78 | } 79 | 80 | void FooNvdlaBackend::addTensorSel(PassManager& pPM) 81 | { 82 | errs() << "FooNvdla is invoked\n"; 83 | 84 | // Do ONNX graph IR optimization here. 85 | 86 | // Translate from ONNX graph IR into ONNC IR 87 | addStandardTensorSel(pPM, *this); 88 | 89 | // Now ONNC IR is ready. 90 | // If you need to extend ONNC IR, here is the place to add your pass that 91 | // adds your ONNC IR operators. 92 | } 93 | 94 | void FooNvdlaBackend::addOnncIrOptimization(PassManager& pPM, OptimizationOptions& options) 95 | { 96 | TargetBackend::addOnncIrOptimization(pPM, options); 97 | 98 | pPM.add(); 99 | } 100 | 101 | void FooNvdlaBackend::addTensorSched(PassManager& pPM) 102 | { 103 | // After method AddTensorSel, operators have been scheduled in an 104 | // topological order, which totally respects the data dependency. 105 | // However, that might not be an optimized order for certain objective. 106 | // Add a scheduling optimization pass here. 107 | } 108 | 109 | void FooNvdlaBackend::addMemAlloc(PassManager& pPM) 110 | { 111 | // Input: Module 112 | // Output: LiveIntervals 113 | addStandardCreateLiveIntervals(pPM); 114 | 115 | // Input: LiveIntervals 116 | // Output: MemAllocs 117 | addStandardMemoryAllocation(pPM, *this); 118 | 119 | // Input: MemAllocs 120 | // Output: Virtual memory address for each memory operands. 
121 | addStandardSetMemOperands(pPM); 122 | 123 | const NvDlaConstants& constants = *this; 124 | pPM.add(constants, &m_pMeta); 125 | } 126 | 127 | void FooNvdlaBackend::addCodeEmit(PassManager& pPM, const Path& pOutput) 128 | { 129 | static foonvdla::CodeEmitVisitor ceVisitor(*this, m_pMeta); 130 | pPM.add(ceVisitor) 131 | .add(&m_pMeta, BLOB_DLA_VERSION, BLOB_EMU_VERSION) 132 | .add(&m_pMeta, LOADABLE_VERSION) 133 | ; 134 | } 135 | 136 | void FooNvdlaBackend::RegisterLowers(LowerRegistry& pRegistry) const 137 | { 138 | pRegistry.emplace(); 139 | pRegistry.emplace(); 140 | pRegistry.emplace(); 141 | pRegistry.emplace(); 142 | pRegistry.emplace(); 143 | pRegistry.emplace(); 144 | pRegistry.emplace(); 145 | pRegistry.emplace(); 146 | pRegistry.emplace(); 147 | pRegistry.emplace(); 148 | pRegistry.emplace(); 149 | pRegistry.emplace(); 150 | pRegistry.emplace(); 151 | pRegistry.emplace(); 152 | pRegistry.emplace(); 153 | pRegistry.emplace(); 154 | pRegistry.emplace(); 155 | pRegistry.emplace(); 156 | } 157 | 158 | 159 | //===----------------------------------------------------------------------===// 160 | // Non member functions 161 | //===----------------------------------------------------------------------===// 162 | TargetBackend* CreateFooNvdlaBackend(const TargetOptions& pOptions) 163 | { 164 | return new FooNvdlaBackend(pOptions); 165 | } 166 | 167 | extern "C" void InitializeFooNvdlaONNCBackend() 168 | { 169 | onnc::TargetRegistry::RegisterTargetBackend(getTheFooNvdlaTarget(), 170 | CreateFooNvdlaBackend); 171 | } 172 | 173 | -------------------------------------------------------------------------------- /lab_6_Manipulating_ONNC_IR/src/GraphvizONNCIRPass.cpp: -------------------------------------------------------------------------------- 1 | //===- GraphvizONNCIRPass.cpp ---------------------------------------------===// 2 | // 3 | // The ONNC Project 4 | // 5 | // See LICENSE.TXT for details. 6 | // 7 | //===----------------------------------------------------------------------===// 8 | #include "GraphvizONNCIRPass.h" 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | namespace onnc { 17 | namespace foonvdla { 18 | 19 | //===----------------------------------------------------------------------===// 20 | // GraphvizONNCIRPass 21 | //===----------------------------------------------------------------------===// 22 | 23 | Pass::ReturnType GraphvizONNCIRPass::runOnModule(Module& pModule) 24 | { 25 | Pass::ReturnType ret = kModuleNoChanged; 26 | 27 | ret = BaseType::runOnModule(pModule); 28 | 29 | if (ret != kModuleNoChanged) { 30 | pModule.eraseUnusedValues(); 31 | } 32 | 33 | return ret; 34 | } 35 | 36 | Pass::ReturnType GraphvizONNCIRPass::runOnComputeGraph(ComputeGraph& pCG) 37 | { 38 | std::cout << "=== GraphvizONNCIRPass ======\n"; 39 | std::cout << "digraph {\n"; 40 | 41 | // Loop over every operator in this ComputeGraph. 42 | for (ComputeOperator& op : pCG) { 43 | 44 | //------------------------------------------------------------------------------------ 45 | // Print the decleration of this operator's name according to Graphviz's requirement. 46 | //------------------------------------------------------------------------------------ 47 | 48 | std::string opName = op.name().str() + "_" + std::to_string((long)&op); 49 | std::cout << " " << opName << " [label=" << op.name() << "]\n"; 50 | 51 | //----------------------------------------------------------------- 52 | // Print the edges between this operator and all its input tensors. 
53 | //----------------------------------------------------------------- 54 | int numInputs = op.getNumOfInputs(); 55 | for (int i = 0; i < numInputs; ++i) { 56 | Value* input = op.getInput(i); 57 | 58 | std::cout << " " << input->getName() << " -> " << opName << "\n"; 59 | } 60 | 61 | //------------------------------------------------------------------- 62 | // Print the edges between this operator and all its output tensors. 63 | //------------------------------------------------------------------- 64 | int numOutputs = op.getNumOfOutputs(); 65 | for (int i = 0; i < numOutputs; ++i) { 66 | Value* output = op.getOutput(i); 67 | 68 | std::cout << " " << opName << " -> " << output->getName() << "\n"; 69 | std::cout << " " << output->getName() << " [shape=rect]\n"; 70 | } 71 | } 72 | 73 | std::cout << "}\n"; 74 | std::cout << "==========================\n"; 75 | 76 | return Pass::kModuleNoChanged; 77 | } 78 | 79 | } // namespace foonvdla 80 | } // namespace onnc 81 | -------------------------------------------------------------------------------- /lab_6_Manipulating_ONNC_IR/src/GraphvizONNCIRPass.h: -------------------------------------------------------------------------------- 1 | //===- GraphvizONNCIRPass.h -----------------------------------------------===// 2 | // 3 | // The ONNC Project 4 | // 5 | // See LICENSE.TXT for details. 6 | // 7 | //===----------------------------------------------------------------------===// 8 | #ifndef ONNC_FOONVDLA_GRAPHVIZ_ONNC_IR_PASS_H 9 | #define ONNC_FOONVDLA_GRAPHVIZ_ONNC_IR_PASS_H 10 | #include 11 | //#include 12 | 13 | namespace onnc { 14 | namespace foonvdla { 15 | 16 | class GraphvizONNCIRPass : public CustomPass 17 | { 18 | public: 19 | GraphvizONNCIRPass() = default; 20 | 21 | ReturnType runOnModule(Module& pModule) override; 22 | 23 | ReturnType runOnComputeGraph(ComputeGraph& pCG) override; 24 | }; 25 | 26 | } // namespace foonvdla 27 | } // namespace onnc 28 | 29 | #endif 30 | -------------------------------------------------------------------------------- /lab_6_Manipulating_ONNC_IR/src/test_Conv_Relu.dot: -------------------------------------------------------------------------------- 1 | digraph { 2 | Initializer_94228347773248 [label=Initializer] 3 | Initializer_94228347773248 -> W 4 | W [shape=rect] 5 | InputOperator_94228348437312 [label=InputOperator] 6 | InputOperator_94228348437312 -> INPUT0 7 | INPUT0 [shape=rect] 8 | Conv_94228348295232 [label=Conv] 9 | INPUT0 -> Conv_94228348295232 10 | W -> Conv_94228348295232 11 | Conv_94228348295232 -> conv_out 12 | conv_out [shape=rect] 13 | Relu_94228347714848 [label=Relu] 14 | conv_out -> Relu_94228347714848 15 | Relu_94228347714848 -> OUTPUT0 16 | OUTPUT0 [shape=rect] 17 | OutputOperator_94228348468896 [label=OutputOperator] 18 | OUTPUT0 -> OutputOperator_94228348468896 19 | } -------------------------------------------------------------------------------- /lab_7_ONNC_IR_Extension/lab_7.md: -------------------------------------------------------------------------------- 1 | # ONNC IR Extension 2 | 3 | ## Preface 4 | 5 | ONNC has implemented a set of ONNC IR operators in the latest release such as [Conv](https://github.com/onnx/onnx/blob/rel-1.3.0/docs/Operators.md#conv), [Relu](https://github.com/onnx/onnx/blob/rel-1.3.0/docs/Operators.md#relu), [MaxPool](https://github.com/onnx/onnx/blob/rel-1.3.0/docs/Operators.md#maxpool), etc. Many of supported operators are directly-mapped to corresponding ONNX operators. 
You may find each operator's description in the ONNX official site (https://github.com/onnx/onnx/blob/rel-1.3.0/docs/Operators.md). However, on some occasions, we may need additional tailor-made ONNC IR operators to support specific target hardware features. An example from NVDLA is the "channel shuffle" operator, which is widely utilized by one famous image classification model, [ShuffleNet](https://arxiv.org/abs/1707.01083). The following figure shows a partial graph of ShuffleNet and there is a Reshape-Transpose-Reshape concatenation highlighted in a box. 6 | 7 | 8 | 9 | This three-operator concatenation performs the channel shuffle operation as visualized in the following figure. 10 | 11 | 12 | 13 | The concatenated operation is equivalent to reordering the channels in an interleaved way. When mapping the Shuffle operator to the NVDLA hardware, we prefer to map the three concatenated operators into a sequence of RUBIK operations in NVDLA. As the following figure (from the [NVDLA official site](http://nvdla.org/hw/v1/ias/unit_description.html#split-and-merge)) shows, the RUBIK engine provides split and merge modes to reorder memory layout. 14 | 15 | 16 | 17 | With a sequence of RUBIK operations, the `Shuffle` operator can be implemented in a mathematically-equivalent software pipeline. It is critical to fuse the three model-layer operators into a single ONNC IR operator so that the compiler gets a chance to map the operator onto the target hardware efficiently. This is a good example of why we need to define proprietary IRs in some cases to support hardware-specific features. 18 | 19 | In this lab, we will discuss and demonstrate the method to extend the built-in ONNC IR with the hardware-specific operator, `Shuffle`, and then collapse the Reshape-Transpose-Reshape concatenation into a single `Shuffle` operator in the model description. 20 | 21 | ## Lab: Adding a Hardware-Specific Operator, `Shuffle` 22 | 23 | The following figure shows the example model, [`test_Shuffle.onnx`](../models/test_Shuffle/test_Shuffle.onnx), used in this lab. It contains a Reshape-Transpose-Reshape concatenation that equivalently performs a `Shuffle` operation. 24 | 25 | 26 | 27 | Given the above model, ONNC initially transforms the model into an ONNC IR graph as depicted in the following diagram. The `Reshape` and `Transpose` operators in the given model are directly mapped to the `Reshape` and `Transpose` ONNC IRs respectively. 28 | 29 | 30 | 31 | The goal of this lab is to demonstrate how to define a new ONNC IR and use it for an optimization pass. The optimization pass will convert the above graph to the following graph, where the Reshape-Transpose-Reshape concatenation is replaced by a single `Shuffle`. 32 | 33 | 34 | 35 | ### Step 1: Set up environment. 36 | 37 | Please finish the following labs before continuing this lab. 38 | 39 | * [lab 1: Environment Setup](../lab_1_Environment_Setup/lab_1.md) for preparing the Docker images and ONNC source code. 40 | * [lab 3: Starting New Backend](../lab_3_Starting_New_Backend/lab_3.md) for preparing the experimental backend `FooNvdla` for the exercise in this lab. 41 | * [lab 4: Code Emitting](../lab_4_Code_Emitting/lab_4.md) for setting up the utilities needed by this lab. 42 | 43 | After the preparation, you should have the `FooNvdla` backend ready in `/lib/Target/FooNvdla`. 44 | For the rest of this lab, all code modifications are made in the `FooNvdla` directory.
45 | 46 | ```sh 47 | $ cd /lib/Target/FooNvdla 48 | ``` 49 | 50 | ### Step 2: Define a new ONNC IR operator 51 | 52 | To define a new ONNC IR, you need to create a new IR class inheriting from the `class ComputeOperator`. In this lab, a new `class NvDlaShuffle` is declared for the new `Shuffle` operator. There are two sets of methods and variables in this class. One set is associated with the operator attributes. For example, the `Shuffle` operator needs a variable for the attribute, "group", which indicates how to interleave the channels. The other set is mandatory and common to all operators in order to meet the ONNC framework requirement. For example, the `accept()` method is used by the [visitor design pattern](https://en.wikipedia.org/wiki/Visitor_pattern) in ONNC for performing optimization on every operator. 53 | 54 | ```cpp 55 | // Compute/NvDlaShuffle.h 56 | 57 | class NvDlaShuffle : public ComputeOperator 58 | { 59 | public: 60 | // This variable is mandatory for all operators. Do not omit it. 61 | static char ID; 62 | 63 | public: 64 | NvDlaShuffle(int group) 65 | : ComputeOperator("Shuffle", ID) // Set "Shuffle" as the operator's type name. 66 | , m_Group(group) // Set the "group" attribute of this operator. 67 | {} 68 | 69 | virtual ~NvDlaShuffle() {} 70 | 71 | // Operator-specific methods. 72 | const IntAttr& getGroup() const { return m_Group; } 73 | 74 | // Mandatory utility methods. Do not omit them. 75 | Tensor* getInput(unsigned int pIdx) override { return static_cast<Tensor*>(m_Inputs[pIdx]); } 76 | // ... 77 | void printAttributes(std::ostream& pOS) const override; 78 | void accept(ComputeVisitor& pV) override; 79 | void accept(ComputeVisitor& pV) const override; 80 | static bool classof(const ComputeOperator* pOp); 81 | 82 | private: 83 | IntAttr m_Group; // Operator-specific attribute 84 | }; 85 | ``` 86 | 87 | After the class declaration in the header file, its class implementation is shown in the following code snippet. 88 | 89 | ```cpp 90 | // Compute/NvDlaShuffle.cpp 91 | 92 | // Mandatory implementation. Every ONNC IR operator follows the same coding. 93 | char NvDlaShuffle::ID = 0; 94 | 95 | // Operator-specific implementation 96 | void NvDlaShuffle::printAttributes(std::ostream& pOS) const 97 | { 98 | pOS << ""; 99 | } 100 | 101 | // Mandatory implementation. Every ONNC IR operator follows the same coding. 102 | void NvDlaShuffle::accept(ComputeVisitor& pV) 103 | { 104 | CodeEmitVisitor* visitor = dyn_cast<CodeEmitVisitor>(&pV); 105 | if (nullptr != visitor) 106 | visitor->visit(*this); 107 | } 108 | 109 | // Mandatory implementation. Every ONNC IR operator follows the same coding. 110 | void NvDlaShuffle::accept(ComputeVisitor& pV) const 111 | { 112 | CodeEmitVisitor* visitor = dyn_cast<CodeEmitVisitor>(&pV); 113 | if (nullptr != visitor) 114 | visitor->visit(*this); 115 | } 116 | 117 | // Mandatory implementation. Every ONNC IR operator follows the same coding. 118 | bool NvDlaShuffle::classof(const ComputeOperator* pOp) 119 | { 120 | if (nullptr == pOp) 121 | return false; 122 | return (pOp->getID() == &ID); 123 | } 124 | ``` 125 | 126 | The complete source code of [NvDlaShuffle.cpp](src/NvDlaShuffle.cpp) and [NvDlaShuffle.h](src/NvDlaShuffle.h) can be found in the lab `src` directory. Note that files related to the extended ONNC IR are conventionally located in the `Compute/` directory in a backend. Specifically, they may be found in `/lib/Target/FooNvdla/Compute` by default.
Once the new ONNC IR class is created, we need to add its corresponding code-emitting function in `CodeEmitVisitor.h` and `CodeEmitVisitor.cpp`. The code change is shown in the following snippet. You may refer to [lab 4: Code Emitting](../lab_4_Code_Emitting/lab_4.md) for more details.

```diff
// CodeEmitVisitor.h

#include "NvDlaMeta.h"
+#include "Compute/NvDlaShuffle.h"

class CodeEmitVisitor : public CustomVisitor<CodeEmitVisitor>, private NvDlaConstants
{
  void visit(const Conv& pConv) override;
+ void visit(const NvDlaShuffle& pOp);

  void visit(Conv& pConv) override;
+ void visit(NvDlaShuffle& pOp) { visit(const_cast<const NvDlaShuffle&>(pOp)); }

};
```

The complete source code of [CodeEmitVisitor.cpp](src/CodeEmitVisitor.cpp) and [CodeEmitVisitor.h](src/CodeEmitVisitor.h) can be found in the `src` directory for your reference. You may copy them into `<path to onnc>/lib/Target/FooNvdla` directly.


### Step 3: Use the new ONNC IR to replace the matched pattern in the model

With the new ONNC IR, `Shuffle`, we will show how to write a pass that replaces the Reshape-Transpose-Reshape pattern with the new `Shuffle` IR. We have elaborated on how to develop a pass and manipulate a model graph in [lab 6: Manipulating ONNC IR](../lab_6_Manipulating_ONNC_IR/lab_6.md). In this lab, we first create a pass, `class NvDlaIdentifyShufflePass`, inherited from `class CustomPass<NvDlaIdentifyShufflePass>`, to search for the Reshape-Transpose-Reshape pattern.

```cpp
// NvDlaIdentifyShufflePass.h

class NvDlaIdentifyShufflePass : public CustomPass<NvDlaIdentifyShufflePass>
{
public:
  NvDlaIdentifyShufflePass() = default;

  ReturnType runOnModule(Module& pModule) override;

  // ...
};
```

We need to implement the `runOnModule()` function of this pass. Please refer to the file [NvDlaIdentifyShufflePass.cpp](src/NvDlaIdentifyShufflePass.cpp) for the complete source code. In the reference implementation, you may find several APIs available for traversing a model graph. In [lab 6: Manipulating ONNC IR](../lab_6_Manipulating_ONNC_IR/lab_6.md), we introduced the two classes `class ComputeOperator` and `class Value` for operators and input/output tensors, respectively. To search for a specific operator concatenation pattern, we further need to know the connectivity of operators in the model graph. However, in the data structure of `ComputeOperator`, there is no variable directly pointing to other operators.

Accessing another operator from a given operator can be done indirectly through the linked data structures of `ComputeOperator` and `Value`. The following figure shows how to access an operator's downstream operators.

To get an output tensor of an operator, we first call `getOutput()`; an index argument is required because an operator can have multiple outputs. `getOutput()` returns a pointer to a `Value` object that might have multiple usages stored in an array. You may call `getUses()` to get the usage array. By retrieving a specific entry of the usage array and then calling `getUser()`, you eventually get the pointer to the target `ComputeOperator`.
For example, given an operator `op`, we can access its first downstream operator using `op.getOutput(0)->getUses()[0].getUser()`.
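As a concrete illustration, here is a short helper that visits every downstream user of an operator. This is a sketch only: it assumes the ONNC classes and accessors introduced above (`ComputeOperator`, `Value`, `getNumOfOutputs()`, `getOutput()`, `getUses()`, `getUser()`) and is not a standalone program.

```cpp
// Sketch: enumerate all downstream operators of `op`, assuming the
// ONNC ComputeOperator/Value API described above.
void forEachDownstreamUser(ComputeOperator& op)
{
  for (unsigned int i = 0; i < op.getNumOfOutputs(); ++i) {
    Value* output = op.getOutput(i);          // the i-th output tensor
    for (auto& use : output->getUses()) {     // every consumer of this tensor
      ComputeOperator* user = use.getUser();  // one downstream operator
      if (Transpose* transpose = dyn_cast<Transpose>(user)) {
        // `user` turned out to be a Transpose; a pattern matcher would
        // continue walking the graph from here.
        (void)transpose;
      }
    }
  }
}
```

This is essentially the shape of the checks performed by `is_shuffle()` in the reference `NvDlaIdentifyShufflePass.cpp`.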

Similarly, if you need to access an upstream operator of a given operator, follow the flow shown in the following figure.

You may call `getInput()` with a specific index to get the corresponding input tensor, and then call `getDefine()` to get the upstream operator that produces this tensor. Note that there is no index argument for `getDefine()` because each tensor has exactly one producing operator. `getDefine()` returns a pointer to a `Define` object. In fact, `class Define` is one of the parent classes of `ComputeOperator`, so you can use `static_cast` to cast the returned object to a `ComputeOperator`. For example, given an operator `op`, we can access its first upstream operator using `static_cast<ComputeOperator*>(op.getInput(0)->getDefine())`.

We have prepared the complete source code of [NvDlaIdentifyShufflePass.cpp](src/NvDlaIdentifyShufflePass.cpp) and [NvDlaIdentifyShufflePass.h](src/NvDlaIdentifyShufflePass.h) for your reference. You may copy them into `<path to onnc>/lib/Target/FooNvdla` if you do not want to code from scratch. Lastly, register this new pass with the pass manager to make it effective. There is a utility pass, `PrintONNCIRPass`, available in the tutorial `src` directory that dumps the whole ONNC IR graph in text format. We can use it to validate the optimization effect.


```diff
// FooNvdlaBackend.cpp

#include "NvDlaFileGenPass.h"
+#include "NvDlaIdentifyShufflePass.h"
+#include "PrintONNCIRPass.h"

@@ -74,6 +75,10 @@ void FooNvdlaBackend::addTensorSel(PassManager& pPM)
void FooNvdlaBackend::addOnncIrOptimization(PassManager& pPM, OptimizationOptions& options)
{
  TargetBackend::addOnncIrOptimization(pPM, options);
+
+ pPM.add<PrintONNCIRPass>();
+ pPM.add<NvDlaIdentifyShufflePass>();
+ pPM.add<PrintONNCIRPass>();
}

```

You may copy [FooNvdlaBackend.cpp](src/FooNvdlaBackend.cpp), [PrintONNCIRPass.cpp](src/PrintONNCIRPass.cpp), and [PrintONNCIRPass.h](src/PrintONNCIRPass.h) from the `src` directory to `<path to onnc>/lib/Target/FooNvdla` to save time. Since we created a few new files for the backend, we need to declare the file additions in the build scripts as follows.

```diff
// CMakeLists.txt

add_libonnc_src(
  NvDlaMemInfoPass.cpp
  NvDlaTaskSubmitPass.cpp
  NvDlaFileGenPass.cpp
+ Compute/NvDlaShuffle.cpp
+ NvDlaIdentifyShufflePass.cpp
+ PrintONNCIRPass.cpp
```

```diff
// Makefile.am

ONNC_TARGET_SOURCES += \
  Target/FooNvdla/NvDlaMemInfoPass.cpp \
  Target/FooNvdla/NvDlaTaskSubmitPass.cpp \
  Target/FooNvdla/NvDlaFileGenPass.cpp \
+ Target/FooNvdla/Compute/NvDlaShuffle.cpp \
+ Target/FooNvdla/NvDlaIdentifyShufflePass.cpp \
+ Target/FooNvdla/PrintONNCIRPass.cpp \
```

### Step 4: Re-build ONNC to test.

Follow the instructions in Lab 1 to rebuild the ONNC source code within the ONNC-community Docker.
Use the following command to bring up the ONNC-community Docker.

```sh
$ docker run -ti --rm -v <path to onnc>:/onnc/onnc -v <path to onnc-tutorial>/models:/tutorial/models onnc/onnc-community
```

Within the Docker container, use the following commands to rebuild ONNC and then use the new ONNC binary to compile the target DNN model.

```sh
# Within onnc/onnc-community Docker container

$ cd /onnc/onnc-umbrella/build-normal

# Rebuild ONNC.
$ smake -j8 install

# Run ONNC to compile the DNN model.
$ onnc -mquadruple foonvdla /tutorial/models/test_Shuffle/test_Shuffle.onnx
FooNvdla is invoked
=== PrintONNCIRPass ======
%W0[12, 1, 1, 1] = Initializer()
%SHAPE1[5] = Initializer()
%SHAPE2[4] = Initializer()
%W3[12, 1, 1, 1] = Initializer()
%W4[3, 4, 1, 1] = Initializer()
%IMAGE[1, 1, 5, 5] = InputOperator()
%INPUT0[1, 12, 5, 5] = Conv(%IMAGE[1, 1, 5, 5], %W0[12, 1, 1, 1])
%RESHAPED1[1, 3, 4, 5, 5] = Reshape(%INPUT0[1, 12, 5, 5], %SHAPE1[5])
%TRANSPOSED[1, 4, 3, 5, 5] = Transpose(%RESHAPED1[1, 3, 4, 5, 5])
%RESHAPED2[1, 12, 5, 5] = Reshape(%TRANSPOSED[1, 4, 3, 5, 5], %SHAPE2[4])
%CONV2[1, 12, 5, 5] = Conv(%RESHAPED2[1, 12, 5, 5], %W3[12, 1, 1, 1])
%Y[1, 3, 5, 5] = Conv(%CONV2[1, 12, 5, 5], %W4[3, 4, 1, 1])
= OutputOperator(%Y[1, 3, 5, 5])
==========================
=== PrintONNCIRPass ======
%W0[12, 1, 1, 1] = Initializer()
%W3[12, 1, 1, 1] = Initializer()
%W4[3, 4, 1, 1] = Initializer()
%IMAGE[1, 1, 5, 5] = InputOperator()
%INPUT0[1, 12, 5, 5] = Conv(%IMAGE[1, 1, 5, 5], %W0[12, 1, 1, 1])
%RESHAPED2[1, 12, 5, 5] = Shuffle(%INPUT0[1, 12, 5, 5])
%CONV2[1, 12, 5, 5] = Conv(%RESHAPED2[1, 12, 5, 5], %W3[12, 1, 1, 1])
%Y[1, 3, 5, 5] = Conv(%CONV2[1, 12, 5, 5], %W4[3, 4, 1, 1])
= OutputOperator(%Y[1, 3, 5, 5])
==========================
```

In the above output messages, there are two "PrintONNCIRPass" blocks. The first block prints the ONNC IR before the optimization takes effect. The IR-printing format is described by the following grammar rules.

```console
IRStatement:
  OutputList = IRType( InputList )

OutputList:
  Tensor, OutputList

InputList:
  Tensor, InputList

Tensor:
  %OutputName[DataShape]
```

We can see that there is a Reshape-Transpose-Reshape concatenation in the printout. The second block prints the ONNC IR after the optimization takes effect. The Reshape-Transpose-Reshape concatenation disappears, and a single `Shuffle` operator replaces it. With this optimization in place, the code-emitting phase can easily map the operator to the NVDLA RUBIK operations.

## Summary

In this lab, you have learned:

* Extending the built-in ONNC IR to introduce a hardware-specific IR, and
* Developing a pass to translate the original model graph into one that uses the new IR.

--------------------------------------------------------------------------------
/lab_7_ONNC_IR_Extension/src/CodeEmitVisitor.h:
--------------------------------------------------------------------------------
//===- CodeEmitVisitor.h --------------------------------------------------===//
//
// The ONNC Project
//
// See LICENSE.TXT for details.
6 | // 7 | //===----------------------------------------------------------------------===// 8 | #ifndef TARGET_FOONVDLA_CODE_EMIT_VISITOR_H 9 | #define TARGET_FOONVDLA_CODE_EMIT_VISITOR_H 10 | 11 | #include "NvDlaDefine.h" 12 | #include "NvDlaMeta.h" 13 | #include "Compute/NvDlaShuffle.h" 14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | 23 | #include 24 | 25 | namespace onnc { 26 | namespace foonvdla { 27 | 28 | class CodeEmitVisitor : public CustomVisitor, private NvDlaConstants 29 | { 30 | public: 31 | CodeEmitVisitor(const NvDlaConstants& constants, NvDlaBackendMeta& meta) noexcept 32 | : NvDlaConstants{constants} 33 | , m_pMeta{meta} 34 | {} 35 | 36 | /// ONNC defined operators @{ 37 | void visit(const Initializer& pInitializer) override; 38 | void visit(const InputOperator& pInputOperator) override; 39 | void visit(const OutputOperator& pOutputOperator) override; 40 | /// @} 41 | 42 | /// ONNX defined operators @{ 43 | void visit(const Conv& pConv) override; 44 | void visit(const NvDlaShuffle& pOp); 45 | /// @} 46 | 47 | /// ONNC defined operators @{ 48 | void visit(Initializer& pInitializer) override; 49 | void visit(InputOperator& pInputOperator) override; 50 | void visit(OutputOperator& pOutputOperator) override; 51 | /// @} 52 | 53 | /// ONNX defined operators @{ 54 | void visit(Conv& pConv) override; 55 | void visit(NvDlaShuffle& pOp) { visit(const_cast(pOp)); } 56 | /// @} 57 | 58 | private: 59 | MemoryListEntryId packWeight(const Tensor& weight, NvDlaDims destDims, Tensor::Dimension numFrontPaddingChannels, 60 | Tensor::Dimension outputChannelOffset); 61 | MemoryListEntryId packImageWeight(const Tensor& weight, NvDlaDims destDims, Tensor::Dimension outputChannelOffset); 62 | MemoryListEntryId packBias(const Tensor& bias, Tensor::Dimension numDestChannels, 63 | Tensor::Dimension srcChannelOffset = 0); 64 | MemoryListEntryId packSDPOperand(const Tensor* aluTensor, const Tensor* mulTensor, const NvDlaCubeInfo& cubeInfo); 65 | 66 | MemoryListEntryId packFeature(const Tensor& tensor, const NvDlaCubeInfo& cube); 67 | void issueEmuOp(NvDlaEmuOperation* op); 68 | AddressListEntryId issueEmuAddr(MemoryListEntryId mid); 69 | void issueDlaOp(NvDlaDlaOperation* op, NvDlaDlaOperation* op_fuse, NvDlaDlaOperation* op_prev); 70 | void issueDlaOp(std::unique_ptr op); 71 | AddressListEntryId issueDlaAddr(const Tensor& tensor, const NvDlaCubeInfo& cube, Tensor::Dimension channelOffset, 72 | NvDlaBackendMeta::Offset hOffset); 73 | AddressListEntryId issueDlaAddr(const Tensor& tensor, const NvDlaCubeInfo& cube); 74 | AddressListEntryId issueDlaAddr(MemoryListEntryId memoryId, const NvDlaCubeInfo& cube); 75 | AddressListEntryId issueSDPOperand(const Tensor& tensor, const NvDlaCubeInfo& cube, MemoryListEntryId& memoryId); 76 | 77 | void SetLUTParam(dla_lut_param* lut_param, float alpha, float beta, float bias, int size, float outdata_scale, float outdata_offset); 78 | 79 | // Perform SDP for 2 input tensors and an output tensor, 80 | // the possible value for parameter 'opType' is: 81 | // 82 | // 1. SDP_OP_ADD 83 | // 2. 
SDP_OP_MUL 84 | // 85 | void emitSdp(std::uint8_t opType, const Tensor& firstInput, const Tensor& secondInput, const Tensor& output); 86 | 87 | private: 88 | MemoryListEntryId packWeight(span weight, const Tensor* weightTensor, NvDlaDims srcDims, 89 | NvDlaDims destDims, Tensor::Dimension numFrontPaddingChannels, 90 | Tensor::Dimension outputChannelOffset); 91 | 92 | MemoryListEntryId packWeight(const Tensor& weight, NvDlaDims srcDims, NvDlaDims destDims, 93 | Tensor::Dimension numFrontPaddingChannels, Tensor::Dimension outputChannelOffset); 94 | 95 | template 96 | void packWeightImpl(Type* destData, NvDlaDims destDimsWithFrontPadding, const Tensor* tensor, const float* srcData, 97 | NvDlaDims srcDims, Tensor::Dimension numFrontPaddingChannels, 98 | Tensor::Dimension outputChannelOffset); 99 | 100 | template 101 | void packImageWeightImpl(Type* blob, NvDlaDims blobDims, const Tensor* tensor, const float* srcData, 102 | NvDlaDims srcDims, Tensor::Dimension outputChannelOffset); 103 | template 104 | void packBiasImpl(Type* destData, Tensor::Dimension numDestChannels, const Tensor* tensor, const float* srcData, 105 | Tensor::Dimension srcChannelOffset); 106 | void packSDPOperandImpl(NvU8* blob, const Tensor* aluTensor, const float* aluData, const Tensor* mulTensor, 107 | const float* mulData, const NvDlaCubeInfo& cubeInfo); 108 | 109 | private: 110 | NvDlaBackendMeta& m_pMeta; 111 | }; 112 | 113 | } // namespace nvdla 114 | } // namespace onnc 115 | 116 | #undef PP_DECL_VISIT 117 | #undef PP_NVDLA_OP_LIST 118 | 119 | #endif 120 | -------------------------------------------------------------------------------- /lab_7_ONNC_IR_Extension/src/FooNvdlaBackend.cpp: -------------------------------------------------------------------------------- 1 | //===- FooNvdlaBackend.cpp -----------------------------------------------------===// 2 | // 3 | // The ONNC Project 4 | // 5 | // See LICENSE.TXT for details. 
6 | // 7 | //===----------------------------------------------------------------------===// 8 | #include 9 | 10 | #include "FooNvdlaBackend.h" 11 | #include "TargetInfo/FooNvdlaTargetInfo.h" 12 | #include "TargetInfo/FooNvdlaTargetMemInfo.h" 13 | #include "CodeEmitVisitor.h" 14 | #include "NvDlaMemInfoPass.h" 15 | #include "NvDlaTaskSubmitPass.h" 16 | #include "NvDlaFileGenPass.h" 17 | #include "NvDlaIdentifyShufflePass.h" 18 | #include "PrintONNCIRPass.h" 19 | 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include 37 | #include 38 | #include 39 | #include 40 | #include 41 | #include 42 | 43 | #include 44 | 45 | using namespace onnc; 46 | 47 | //===----------------------------------------------------------------------===// 48 | // FooNvdlaBackend 49 | //===----------------------------------------------------------------------===// 50 | const Version FooNvdlaBackend::LOADABLE_VERSION = Version(1, 1, 255); 51 | const Version FooNvdlaBackend::BLOB_DLA_VERSION = Version(1, 3, 0); 52 | const Version FooNvdlaBackend::BLOB_EMU_VERSION = Version(1, 3, 0); 53 | 54 | FooNvdlaBackend::FooNvdlaBackend(const TargetOptions& pOptions) 55 | : TargetBackend(pOptions) 56 | , NvDlaConstants(getConfig(::nvdla::ConfigSet::nv_full, ::nvdla::ExecutionMode::direct, false)) 57 | , m_pMeta(*this) { 58 | m_pMemInfo = std::make_unique(); 59 | } 60 | 61 | void FooNvdlaBackend::addTensorSel(PassManager& pPM) 62 | { 63 | errs() << "FooNvdla is invoked\n"; 64 | 65 | // Do ONNX graph IR optimization here. 66 | 67 | // Translate from ONNX graph IR into ONNC IR 68 | addStandardTensorSel(pPM, *this); 69 | 70 | // Now ONNC IR is ready. 71 | // If you need to extend ONNC IR, here is the place to add your pass that 72 | // adds your ONNC IR operators. 73 | } 74 | 75 | void FooNvdlaBackend::addOnncIrOptimization(PassManager& pPM, OptimizationOptions& options) 76 | { 77 | TargetBackend::addOnncIrOptimization(pPM, options); 78 | 79 | pPM.add(); 80 | pPM.add(); 81 | pPM.add(); 82 | } 83 | 84 | void FooNvdlaBackend::addTensorSched(PassManager& pPM) 85 | { 86 | // After method AddTensorSel, operators have been scheduled in an 87 | // topological order, which totally respects the data dependency. 88 | // However, that might not be an optimized order for certain objective. 89 | // Add a scheduling optimization pass here. 90 | } 91 | 92 | void FooNvdlaBackend::addMemAlloc(PassManager& pPM) 93 | { 94 | // Input: Module 95 | // Output: LiveIntervals 96 | addStandardCreateLiveIntervals(pPM); 97 | 98 | // Input: LiveIntervals 99 | // Output: MemAllocs 100 | addStandardMemoryAllocation(pPM, *this); 101 | 102 | // Input: MemAllocs 103 | // Output: Virtual memory address for each memory operands. 
104 | addStandardSetMemOperands(pPM); 105 | 106 | const NvDlaConstants& constants = *this; 107 | pPM.add(constants, &m_pMeta); 108 | } 109 | 110 | void FooNvdlaBackend::addCodeEmit(PassManager& pPM, const Path& pOutput) 111 | { 112 | static foonvdla::CodeEmitVisitor ceVisitor(*this, m_pMeta); 113 | pPM.add(ceVisitor) 114 | .add(&m_pMeta, BLOB_DLA_VERSION, BLOB_EMU_VERSION) 115 | .add(&m_pMeta, LOADABLE_VERSION) 116 | ; 117 | } 118 | 119 | void FooNvdlaBackend::RegisterLowers(LowerRegistry& pRegistry) const 120 | { 121 | pRegistry.emplace(); 122 | pRegistry.emplace(); 123 | pRegistry.emplace(); 124 | } 125 | 126 | 127 | //===----------------------------------------------------------------------===// 128 | // Non member functions 129 | //===----------------------------------------------------------------------===// 130 | TargetBackend* CreateFooNvdlaBackend(const TargetOptions& pOptions) 131 | { 132 | return new FooNvdlaBackend(pOptions); 133 | } 134 | 135 | extern "C" void InitializeFooNvdlaONNCBackend() 136 | { 137 | onnc::TargetRegistry::RegisterTargetBackend(getTheFooNvdlaTarget(), 138 | CreateFooNvdlaBackend); 139 | } 140 | 141 | -------------------------------------------------------------------------------- /lab_7_ONNC_IR_Extension/src/NvDlaIdentifyShufflePass.cpp: -------------------------------------------------------------------------------- 1 | //===- NvDlaIdentifyShufflePass.cpp ---------------------------------------===// 2 | // 3 | // The ONNC Project 4 | // 5 | // See LICENSE.TXT for details. 6 | // 7 | //===----------------------------------------------------------------------===// 8 | #include "NvDlaIdentifyShufflePass.h" 9 | 10 | #include "Compute/NvDlaShuffle.h" 11 | #include "NvDlaDefine.h" 12 | 13 | #include 14 | #include 15 | #include 16 | 17 | using namespace onnc; 18 | using namespace foonvdla; 19 | 20 | //===----------------------------------------------------------------------===// 21 | // NvDlaIdentifyShufflePass 22 | //===----------------------------------------------------------------------===// 23 | Pass::ReturnType NvDlaIdentifyShufflePass::runOnModule(Module& pModule) 24 | { 25 | Pass::ReturnType ret = kModuleNoChanged; 26 | 27 | ret = BaseType::runOnModule(pModule); 28 | 29 | if (ret != kModuleNoChanged) { 30 | pModule.eraseUnusedValues(); 31 | } 32 | 33 | return ret; 34 | } 35 | 36 | Pass::ReturnType NvDlaIdentifyShufflePass::runOnComputeGraph(ComputeGraph& pCG) 37 | { 38 | Pass::ReturnType ret = Pass::kModuleNoChanged; 39 | 40 | //--------------------------------------------------------------------- 41 | // Find out all Reshape-Transpose-Reshape patterns in the model graph. 42 | //--------------------------------------------------------------------- 43 | 44 | std::vector reshapes; 45 | for (auto& op : pCG) { 46 | if (Reshape* reshape1 = dyn_cast(&op)) { 47 | if (is_shuffle(reshape1)) { // A channel-shuffle pattern is detected. 48 | // Save the first node of this pattern into a queue. 49 | // We will replace this pattern by a single Shuffle IR later on. 50 | reshapes.push_back(reshape1); 51 | 52 | // Since a node replacement will happen in the model, the model graph 53 | // will be changed and thus this function should return kModuleChanged. 54 | ret |= Pass::kModuleChanged; 55 | } 56 | } 57 | } 58 | 59 | //--------------------------------------------------------------------------- 60 | // Replace every Reshape-Transpose-Reshape pattern with a single Shuffle IR. 
61 | //--------------------------------------------------------------------------- 62 | 63 | for (Reshape* reshape1 : reshapes) { 64 | 65 | // Derive the Tranpose and the second Reshape. 66 | auto* transpose = dyn_cast(reshape1->getOutput(0)->getUses()[0].getUser()); 67 | auto* reshape2 = dyn_cast(transpose->getOutput(0)->getUses()[0].getUser()); 68 | 69 | Tensor* input_tensor = reshape1->getInput(0); 70 | Tensor* shape1_tensor = reshape1->getInput(1); 71 | auto shape1_initializer = static_cast(shape1_tensor->getDefine()); 72 | Tensor* reshape1_out_tensor = reshape1->getOutput(0); 73 | Tensor* transpose_out = transpose->getOutput(0); 74 | Tensor* shape2_tensor = reshape2->getInput(1); 75 | auto shape2_initializer = static_cast(shape2_tensor->getDefine()); 76 | Tensor* output_tensor = reshape2->getOutput(0); 77 | 78 | // The current ONNC IR graph status 79 | // ================================ 80 | // 81 | // (shape1_initializer) 82 | // | | 83 | // input_tensor shape1_tensor 84 | // \ / 85 | // (reshape1) 86 | // | 87 | // reshape1_out_tensor 88 | // | 89 | // (transpose) (shape2_initializer) 90 | // | | 91 | // transpose_out shape2_tensor 92 | // \ / 93 | // (reshape2) 94 | // | 95 | // output_tensor 96 | // | 97 | 98 | // Create a new Shuffle. 99 | const auto& reshape_shape = static_cast(reshape1->getInput(1))->getValues(); 100 | auto* shuffle = pCG.addOperator(reshape_shape[1]); 101 | 102 | // The current ONNC IR graph status 103 | // ================================ 104 | // 105 | // (shape1_initializer) 106 | // | | 107 | // input_tensor shape1_tensor 108 | // \ / 109 | // (reshape1) 110 | // | 111 | // reshape1_out_tensor 112 | // | 113 | // (transpose) (shape2_initializer) 114 | // | | 115 | // transpose_out shape2_tensor 116 | // \ / 117 | // (reshape2) (shuffle) 118 | // | 119 | // output_tensor 120 | // | 121 | 122 | // Remove the edges between some operators and their input/output tensors. 123 | // Remove an edge means to erase the records within an operator's data structure about its input tensors. 124 | reshape1->removeAllInputs(); 125 | reshape1->removeAllOutputs(); 126 | transpose->removeAllInputs(); 127 | transpose->removeAllOutputs(); 128 | reshape2->removeAllInputs(); 129 | reshape2->removeAllOutputs(); 130 | shape1_initializer->removeAllOutputs(); 131 | shape2_initializer->removeAllOutputs(); 132 | 133 | // The current ONNC IR graph status 134 | // ================================ 135 | // 136 | // (shape1_initializer) 137 | // | | 138 | // input_tensor shape1_tensor 139 | // 140 | // (reshape1) 141 | // 142 | // reshape1_out_tensor 143 | // 144 | // (transpose) (shape2_initializer) 145 | // | 146 | // transpose_out shape2_tensor 147 | // 148 | // (reshape2) (shuffle) 149 | // 150 | // output_tensor 151 | // | 152 | 153 | // Remove some un-used nodes in the ONNC IR graph. 
154 | pCG.erase(*reshape1); 155 | pCG.erase(*transpose); 156 | pCG.erase(*reshape2); 157 | pCG.erase(*shape1_initializer); 158 | pCG.erase(*shape2_initializer); 159 | pCG.erase(*shape1_tensor); 160 | pCG.erase(*reshape1_out_tensor); 161 | pCG.erase(*transpose_out); 162 | pCG.erase(*shape2_tensor); 163 | 164 | // The current ONNC IR graph status 165 | // ================================ 166 | // 167 | // | 168 | // input_tensor 169 | // 170 | // (shuffle) 171 | // 172 | // output_tensor 173 | // | 174 | 175 | shuffle->addInput(*input_tensor); 176 | shuffle->addOutput(*output_tensor); 177 | 178 | // The current ONNC IR graph status 179 | // ================================ 180 | // 181 | // | 182 | // input_tensor 183 | // | 184 | // (shuffle) 185 | // | 186 | // output_tensor 187 | // | 188 | 189 | } 190 | 191 | pCG.topologicalSort(); 192 | 193 | return ret; 194 | } 195 | 196 | bool NvDlaIdentifyShufflePass::is_shuffle(Reshape* reshape1) 197 | { 198 | // We are going to detect the following pattern. 199 | // 200 | // | 201 | // input_tensor 202 | // \ 203 | // (reshape1) 204 | // | 205 | // reshape1_out_tensor 206 | // | // This tensor must have only one user. 207 | // (transpose) 208 | // | 209 | // transpose_out 210 | // \ // This tensor must have only one user. 211 | // (reshape2) 212 | // | 213 | // output_tensor 214 | // | 215 | // 216 | 217 | #define SHUFFLE_ASSERT(cond) if (! (cond)) return false; 218 | 219 | //-------------------------- 220 | // Check the first Reshape. 221 | //-------------------------- 222 | 223 | SHUFFLE_ASSERT( reshape1->getNumOfOutputs() == 1 ); 224 | 225 | // the output tensor of the Reshape has only one user. 226 | SHUFFLE_ASSERT( reshape1->getOutput(0)->getUses().size() == 1 ); 227 | 228 | // The Reshape attribute must satisfy certain constraints. 229 | // The input dimension must be 4, and this Reshape splits the second dimension into two, 230 | // thus causing the output dimension to be 5. 231 | // e.g. input: 1x12x5x6, shape: [1,3,4,5,6] 232 | // output: 1x3x4x5x6 233 | SHUFFLE_ASSERT( reshape1->getInput(0)->getNumOfDimensions() == 4 ); 234 | SHUFFLE_ASSERT( reshape1->getInput(1)->getNumOfDimensions() == 1 ); // shape tensor must be array 235 | 236 | const auto& reshape1_shape = static_cast(reshape1->getInput(1))->getValues(); 237 | SHUFFLE_ASSERT( reshape1_shape.size() == 5 ); 238 | SHUFFLE_ASSERT( reshape1->getInput(0)->dimension(1) == reshape1_shape[1] * reshape1_shape[2] ); 239 | SHUFFLE_ASSERT( reshape1->getInput(0)->dimension(2) == reshape1_shape[3] && 240 | reshape1->getInput(0)->dimension(3) == reshape1_shape[4]); 241 | 242 | //----------------------------- 243 | // Check the middle Transpose. 244 | //----------------------------- 245 | 246 | // the output tensor of the first Reshape has the user to be a Transpose. 247 | Transpose* transpose = dyn_cast(reshape1->getOutput(0)->getUses()[0].getUser()); 248 | SHUFFLE_ASSERT( transpose ); 249 | 250 | // the output tensor of the Transpose has only one user. 251 | SHUFFLE_ASSERT( transpose->getNumOfOutputs() == 1 ); 252 | SHUFFLE_ASSERT( transpose->getOutput(0)->getUses().size() == 1 ); 253 | 254 | // the attribute of Tranpose, perm, must be [0, 2, 1, 3, 4], ie. swap the 1st and 2nd dimensions. 255 | // e.g. 
input: 1x3x4x5x6 256 | // output: 1x4x3x5x6 257 | SHUFFLE_ASSERT( transpose->getInput(0)->getNumOfDimensions() == 5 ); 258 | SHUFFLE_ASSERT( transpose->getPerm().at(0) == 0 && 259 | transpose->getPerm().at(1) == 2 && 260 | transpose->getPerm().at(2) == 1 && 261 | transpose->getPerm().at(3) == 3 && 262 | transpose->getPerm().at(4) == 4); 263 | 264 | //----------------------------- 265 | // Check the last Reshape. 266 | //----------------------------- 267 | 268 | // the output tensor of the middle Transpose has the user to be a Reshape. 269 | Reshape* reshape2 = dyn_cast(transpose->getOutput(0)->getUses()[0].getUser()); 270 | SHUFFLE_ASSERT( reshape2 ); 271 | 272 | // The Reshape attribute must satisfy certain constraints. 273 | // The input dimension must be 5, and this Reshape merges the 2nd and 3rd dimension into one, 274 | // thus causing the output dimension to be 4. 275 | // e.g. input: 1x4x3x5x6, shape: [1,12,5,6] 276 | // output: 1x12x5x6 277 | SHUFFLE_ASSERT( reshape2->getInput(0)->getNumOfDimensions() == 5 ); 278 | SHUFFLE_ASSERT( reshape2->getInput(1)->getNumOfDimensions() == 1 ); // shape tensor must be array 279 | 280 | const auto& reshape2_shape = static_cast(reshape2->getInput(1))->getValues(); 281 | SHUFFLE_ASSERT( reshape2_shape.size() == 4 ); 282 | SHUFFLE_ASSERT( reshape2->getInput(0)->dimension(1) * reshape2->getInput(0)->dimension(2) == 283 | reshape2_shape[1] ); 284 | SHUFFLE_ASSERT( reshape2->getInput(0)->dimension(3) == reshape2_shape[2] && 285 | reshape2->getInput(0)->dimension(4) == reshape2_shape[3]); 286 | 287 | #undef SHUFFLE_ASSERT 288 | 289 | return true; 290 | } 291 | -------------------------------------------------------------------------------- /lab_7_ONNC_IR_Extension/src/NvDlaIdentifyShufflePass.h: -------------------------------------------------------------------------------- 1 | //===- NvDlaIdentifyShufflePass.h -------------------------------------------------===// 2 | // 3 | // The ONNC Project 4 | // 5 | // See LICENSE.TXT for details. 6 | // 7 | //===------------------------------------------------------------------------------===// 8 | #ifndef NVDLA_IDENTIFY_SHUFFLE_PASS_H 9 | #define NVDLA_IDENTIFY_SHUFFLE_PASS_H 10 | 11 | #include "NvDlaMeta.h" 12 | 13 | #include 14 | 15 | namespace onnc { 16 | namespace foonvdla { 17 | 18 | class NvDlaIdentifyShufflePass : public CustomPass 19 | { 20 | public: 21 | NvDlaIdentifyShufflePass() = default; 22 | 23 | ReturnType runOnModule(Module& pModule) override; 24 | ReturnType runOnComputeGraph(ComputeGraph& pCG) override; 25 | 26 | private: 27 | bool is_shuffle(Reshape* reshape1); 28 | }; 29 | 30 | } // namespace foonvdla 31 | } // namespace onnc 32 | 33 | #endif // MODELSIM_IDENTIFY_SHUFFLE_PASS_H 34 | -------------------------------------------------------------------------------- /lab_7_ONNC_IR_Extension/src/NvDlaShuffle.cpp: -------------------------------------------------------------------------------- 1 | //===- NvDlaShuffle.cpp ----------------------------------------------------===// 2 | // 3 | // The ONNC Project 4 | // 5 | // See LICENSE.TXT for details. 
6 | // 7 | //===----------------------------------------------------------------------===// 8 | #include "NvDlaShuffle.h" 9 | 10 | #include "../CodeEmitVisitor.h" 11 | #include "../NvDlaDefine.h" 12 | 13 | using namespace onnc; 14 | using namespace onnc::foonvdla; 15 | 16 | char NvDlaShuffle::ID = 0; 17 | 18 | //===----------------------------------------------------------------------===// 19 | // NvDlaShuffle 20 | //===----------------------------------------------------------------------===// 21 | void NvDlaShuffle::printAttributes(std::ostream& pOS) const 22 | { 23 | pOS << ""; 24 | } 25 | 26 | void NvDlaShuffle::accept(ComputeVisitor& pV) 27 | { 28 | CodeEmitVisitor* visitor = dyn_cast(&pV); 29 | if (nullptr != visitor) 30 | visitor->visit(*this); 31 | } 32 | 33 | void NvDlaShuffle::accept(ComputeVisitor& pV) const 34 | { 35 | CodeEmitVisitor* visitor = dyn_cast(&pV); 36 | if (nullptr != visitor) 37 | visitor->visit(*this); 38 | } 39 | 40 | bool NvDlaShuffle::classof(const ComputeOperator* pOp) 41 | { 42 | if (nullptr == pOp) 43 | return false; 44 | return (pOp->getID() == &ID); 45 | } 46 | -------------------------------------------------------------------------------- /lab_7_ONNC_IR_Extension/src/NvDlaShuffle.h: -------------------------------------------------------------------------------- 1 | //===- NvDlaShuffle.h ------------------------------------------------------===// 2 | // 3 | // The ONNC Project 4 | // 5 | // See LICENSE.TXT for details. 6 | // 7 | //===--------------------------------------------------------------------------===// 8 | #ifndef TARGET_NVDLA_NVDLA_SHUFFLE_H 9 | #define TARGET_NVDLA_NVDLA_SHUFFLE_H 10 | 11 | #include 12 | #include 13 | #include 14 | 15 | namespace onnc { 16 | namespace foonvdla { 17 | 18 | class NvDlaShuffle : public ComputeOperator 19 | { 20 | public: 21 | static char ID; 22 | 23 | public: 24 | NvDlaShuffle(int group) 25 | : ComputeOperator("Shuffle", ID) 26 | , m_Group(group) 27 | {} 28 | 29 | virtual ~NvDlaShuffle() {} 30 | 31 | // Paramater 32 | const IntAttr& getGroup() const { return m_Group; } 33 | 34 | // Input & Ouput Tensor 35 | Tensor* getInput(unsigned int pIdx) override { return static_cast(m_Inputs[pIdx]); } 36 | 37 | const Tensor* getInput(unsigned int pIdx) const override { return static_cast(m_Inputs[pIdx]); } 38 | 39 | Tensor* getOutput(unsigned int pIdx) override { return static_cast(m_Outputs[pIdx]); } 40 | 41 | const Tensor* getOutput(unsigned int pIdx) const override { return static_cast(m_Outputs[pIdx]); } 42 | 43 | void printAttributes(std::ostream& pOS) const override; 44 | 45 | void accept(ComputeVisitor& pV) override; 46 | 47 | void accept(ComputeVisitor& pV) const override; 48 | 49 | static bool classof(const ComputeOperator* pOp); 50 | 51 | private: 52 | IntAttr m_Group; 53 | }; 54 | 55 | } // namespace foonvdla 56 | } // namespace onnc 57 | 58 | #endif 59 | -------------------------------------------------------------------------------- /lab_7_ONNC_IR_Extension/src/PrintONNCIRPass.cpp: -------------------------------------------------------------------------------- 1 | //===- PrintONNCIRPass.cpp ------------------------------------------------===// 2 | // 3 | // The ONNC Project 4 | // 5 | // See LICENSE.TXT for details. 
//
//===----------------------------------------------------------------------===//
#include "PrintONNCIRPass.h"

#include
#include
#include
#include
#include

namespace onnc {
namespace foonvdla {

//===----------------------------------------------------------------------===//
// PrintONNCIRPass
//===----------------------------------------------------------------------===//

Pass::ReturnType PrintONNCIRPass::runOnModule(Module& pModule)
{
  const Pass::ReturnType ret = BaseType::runOnModule(pModule);

  if (ret != kModuleNoChanged) {
    pModule.eraseUnusedValues();
  }

  return ret;
}

Pass::ReturnType PrintONNCIRPass::runOnComputeGraph(ComputeGraph& pCG)
{
  Pass::ReturnType ret = Pass::kModuleNoChanged;

  std::cout << "=== PrintONNCIRPass ======\n";
  for (ComputeOperator& node : pCG) {
    node.print(std::cout);
    std::cout << "\n";
  }
  std::cout << "==========================\n";

  return ret;
}

} // namespace foonvdla
} // namespace onnc
--------------------------------------------------------------------------------
/lab_7_ONNC_IR_Extension/src/PrintONNCIRPass.h:
--------------------------------------------------------------------------------
//===- PrintONNCIRPass.h --------------------------------------------------===//
//
// The ONNC Project
//
// See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
#ifndef ONNC_FOONVDLA_PRINT_ONNC_IR_PASS_H
#define ONNC_FOONVDLA_PRINT_ONNC_IR_PASS_H
#include
#include

namespace onnc {
namespace foonvdla {

class PrintONNCIRPass : public CustomPass<PrintONNCIRPass>
{
public:
  PrintONNCIRPass() = default;

  ReturnType runOnModule(Module& pModule) override;

  ReturnType runOnComputeGraph(ComputeGraph& pCG) override;
};

} // namespace foonvdla
} // namespace onnc

#endif
--------------------------------------------------------------------------------
/lab_8_Mul_Add_Reordering_and_Fusion/lab_8.md:
--------------------------------------------------------------------------------
# Hardware-specific Optimization

## Preface

Many optimizations that operate on the ONNX/ONNC model IR graph are independent of the DLA hardware design, but the most effective optimizations are often those that best exploit special features of the target platform. In the case of NVDLA, for a specific ONNX operator, there is more than one way to map the operator into a sequence of hardware operations. We may also transform a part of the model graph into another mathematically-equivalent subgraph and then derive a hardware execution sequence that is better in terms of performance and power. In this tutorial, we will illustrate a hardware-specific optimization in the NVDLA backend.

The following diagram shows one of the computation modules in NVDLA called SDP-X1.

The Single Point Data Processor (SDP) in NVDLA performs post-processing operations at the single-data-element level. It has several function blocks, each of which targets a different purpose. The X1 block has architectural support for bias addition, BatchNorm, PReLU, ReLU, and element-wise (Eltwise) operations. Its datapath is composed of ALU, multiplier, and ReLU sub-blocks.
Each sub-block can be programmed to be enabled or bypassed. More sub-blocks in action result in better performance. For example, if a model contains an Add-Mul-Relu sequence, we prefer to map those three operators into a single SDP-X1 operation, rather than mapping them separately into three SDP-X1 operations, as the following figure shows.

Mapping to multiple SDP-X1 operations incurs additional memory accesses, degrades performance, and consumes more power compared to mapping to a single SDP-X1 operation. Intuitively, if we can identify an Add-Mul-Relu pattern in a model, we can take advantage of the SDP-X1 pipeline to optimize performance. In addition, we can transform the original model graph to create more Add-Mul-Relu patterns. For example, we can convert a Mul-Add pair into an Add-Mul pair with some adjustment of the computation constants. The basic idea is that the original `Y = (X * a) + b` is mathematically equivalent to `Y = (X + c) * a`, where `c = b / a`. Note that this translation must meet the following pre-conditions to be a valid conversion and executable on NVDLA hardware:

1. The result of the `Mul` operator has only one consumer.
2. The values of `a` and `b` can be determined at compile time.

The first requirement is due to the fact that, in the SDP-X1 pipeline, the ALU output is consumed by the multiplier without any path to write the intermediate result back to memory. Let's look at an invalid example in the following figure.

The Add in the model violates the first condition because it has two consumers. The result of the Add operation has no way to be passed to the other consumer, Conv. Although this pattern exists in the model graph, we cannot convert it for optimization.

The second requirement comes from the fact that `c = b / a` introduces a division that has no corresponding hardware to execute it. Therefore, we need the values of `a` and `b` to be determined at compile time so that the ALU input will be a single constant. This usually implies that `a` and `b` are constant values in the given model.

In this lab, we will show how to implement such an optimization pass within the ONNC framework.


## Lab: Mul and Add Re-ordering and Fusion

The following figure shows the example model, [test_Mul_Add_Relu.onnx](../models/test_Mul_Add_Relu/test_Mul_Add_Relu.onnx), used in this lab. It contains the Mul-Add-Relu pattern mentioned in the previous section.

Given the above model, ONNC initially transforms the model into an ONNC IR graph as depicted in the following diagram.

In the ONNC IR graph, you can see that the Mul, Add, and Relu operators are ordered in the same way as in the model graph. We aim at re-ordering the Mul and Add operations, as shown in the following ONNC IR graph, to take advantage of the SDP-X1 pipeline for better performance.

Lastly, we will create a new compound ONNC IR, `AddMulRelu`, and convert the Add-Mul-Relu IR sequence into this single IR so that the original three operators are considered as a whole during the code-emitting phase and issued as a single SDP hardware operation.


### Step 1: Set up environment.

We recommend finishing the following labs before continuing with this lab.

* [lab 1: Environment Setup](../lab_1_Environment_Setup/lab_1.md) for preparing the Docker images and ONNC source codes.
* [lab 3: Starting New Backend](../lab_3_Starting_New_Backend/lab_3.md) for preparing the experimental backend `FooNvdla` for the exercise in this lab.
* [lab 4: Code Emitting](../lab_4_Code_Emitting/lab_4.md) for setting up the utilities needed by this lab.

The following note enables you to jump-start on this lab if you forget the details of the above labs.
It is recommended to have two terminal consoles: one for running the ONNC-community Docker container, and the other for running commands on your host machine.

```sh
###################################################
# Within your computer console (outside Docker container)
###################################################

# Skip these commands if you already have the Docker images.
$ docker pull onnc/onnc-community
$ docker pull onnc/vp

# Prepare ONNC and tutorial source code.
$ git clone https://github.com/ONNC/onnc.git
$ cd onnc; git checkout tags/1.2.0; cd ..
$ git clone https://github.com/ONNC/onnc-tutorial.git

# Start the onnc/onnc-community Docker.
$ docker run -ti --rm -v <path to onnc>:/onnc/onnc -v <path to onnc-tutorial>:/tutorial onnc/onnc-community

###################################################
# Within the onnc/onnc-community Docker container.
###################################################

$ cd /onnc/onnc
$ ./scripts/create-new-backend.sh FooNvdla

###################################################
# Within your computer console (outside Docker container)
###################################################

# Install the pre-built FooNvdla backend.
$ tar -zxvf <path to onnc-tutorial>/lab_4_Code_Emitting/src/FooNvdla.tar.gz -C <path to onnc>/lib/Target

###################################################
# Within the onnc/onnc-community Docker container.
###################################################

$ cd /onnc/onnc-umbrella/build-normal
# Rebuild ONNC.
$ smake -j8 install

# Run ONNC to compile the DNN model. Make sure all the previous preparation is good.
$ onnc -mquadruple foonvdla /tutorial/models/test_group_Conv/test_group_Conv.onnx
FooNvdla is invoked
```
By now, you should have the `FooNvdla` backend ready in `<path to onnc>/lib/Target/FooNvdla`.
For the rest of this lab, all code modification is made in the `FooNvdla` directory.

```sh
# Within your computer console (outside Docker container)
$ cd <path to onnc>/lib/Target/FooNvdla
```

### Step 2: Search for the Mul-Add pattern and re-order the two operators.

The re-ordering optimization is done in a pass named `NvDlaReorderMulAddPass`. We have introduced how to create a pass in [lab 6: Manipulating ONNC IR](../lab_6_Manipulating_ONNC_IR/lab_6.md). You may find all the related files for this lab in the `<path to onnc-tutorial>/lab_8_Mul_Add_Reordering_and_Fusion/src/` directory. The implementation traverses the ONNC IR graph, finds the matched pattern, and converts the graph accordingly. The details are similar to what was done in the previous lab, so we skip them here.

```sh
$ cp <path to onnc-tutorial>/lab_8_Mul_Add_Reordering_and_Fusion/src/NvDlaReorderMulAddPass.* <path to onnc>/lib/Target/FooNvdla
```


### Step 3: Fuse Add-Mul-Relu into a single IR.

In this lab, we need to define a new ONNC IR, `AddMulRelu`, to represent the Add-Mul-Relu concatenation.
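Before moving on, it may help to convince yourself of the re-ordering identity from the preface. The following standalone C++ check (a minimal sketch, not ONNC code) verifies numerically that `(X * a) + b` equals `(X + c) * a` with `c = b / a`; judging from the compilation log later in this lab, this constant adjustment is presumably what produces the `B__gamma_0` tensor.

```cpp
// Standalone sanity check (not ONNC code) of the re-ordering identity:
//   (X * a) + b  ==  (X + c) * a,  where  c = b / a  and  a != 0.
#include <cassert>
#include <cmath>

int main()
{
  const float a = 0.5f;
  const float b = 3.0f;
  const float c = b / a; // the adjusted constant, computable at compile time

  for (float x = -4.0f; x <= 4.0f; x += 0.25f) {
    const float mulAdd = (x * a) + b; // original Mul-Add order
    const float addMul = (x + c) * a; // re-ordered Add-Mul order
    assert(std::fabs(mulAdd - addMul) < 1e-5f);
  }
  return 0;
}
```

The division also illustrates pre-condition 2 above: `a` and `b` must be compile-time constants (with `a` non-zero) so that `c` can be folded into a single constant tensor.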
For details, please refer to how to define a new ONNC IR operator in [lab 7: ONNC IR Extension](../lab_7_ONNC_IR_Extension/lab_7.md).

```sh
$ mkdir -p <path to onnc>/lib/Target/FooNvdla/Compute

# These files contain the new IR's definition.
$ cp <path to onnc-tutorial>/lab_8_Mul_Add_Reordering_and_Fusion/src/NvDlaAddMulRelu.* <path to onnc>/lib/Target/FooNvdla/Compute

# These files deploy the new IR into the model graph.
$ cp <path to onnc-tutorial>/lab_8_Mul_Add_Reordering_and_Fusion/src/NvDlaFuseAddMulReluPass.* <path to onnc>/lib/Target/FooNvdla

# These files contain the code-emitting functions for the new IR.
$ cp <path to onnc-tutorial>/lab_8_Mul_Add_Reordering_and_Fusion/src/CodeEmitVisitor.* <path to onnc>/lib/Target/FooNvdla
```

In addition, to visualize the optimization effect, we introduce a utility pass, `PrintONNCIRPass`, to print out the ONNC IR.

```sh
$ cp <path to onnc-tutorial>/lab_8_Mul_Add_Reordering_and_Fusion/src/PrintONNCIRPass.* <path to onnc>/lib/Target/FooNvdla
```

We have introduced a few optimization passes; remember to enable those passes in the backend.

```sh
$ cp <path to onnc-tutorial>/lab_8_Mul_Add_Reordering_and_Fusion/src/FooNvdlaBackend.cpp <path to onnc>/lib/Target/FooNvdla
```

Lastly, since we created a few new files for the backend, we need to declare the file additions in the build scripts so that they get compiled. You may find the related files in the tutorial `src` directory and simply update the build scripts with the following commands.

```sh
$ cp <path to onnc-tutorial>/lab_8_Mul_Add_Reordering_and_Fusion/src/CMakeLists.txt <path to onnc>/lib/Target/FooNvdla
$ cp <path to onnc-tutorial>/lab_8_Mul_Add_Reordering_and_Fusion/src/Makefile.am <path to onnc>/lib/Target/FooNvdla
```


### Step 4: Re-build ONNC and compile the example model.

Follow the instructions in Lab 1 to bring up the ONNC-community Docker.
Within the container, use the following commands to rebuild ONNC and compile the example model.

```sh
# Within onnc/onnc-community Docker container

$ cd /onnc/onnc-umbrella/build-normal

# Rebuild ONNC.
$ smake -j8 install

# Execute ONNC to compile the model.
$ onnc -mquadruple foonvdla /tutorial/models/test_Mul_Add_Relu/test_Mul_Add_Relu.onnx
FooNvdla is invoked
=== PrintONNCIRPass ======
%A[1, 1, 5, 5] = Initializer()
%B[1, 1, 5, 5] = Initializer()
%INPUT0[1, 1, 5, 5] = InputOperator()
%mul_out[1, 1, 5, 5] = Mul(%INPUT0[1, 1, 5, 5], %A[1, 1, 5, 5])
%add_out[1, 1, 5, 5] = Add(%mul_out[1, 1, 5, 5], %B[1, 1, 5, 5])
%OUTPUT0[1, 1, 5, 5] = Relu(%add_out[1, 1, 5, 5])
= OutputOperator(%OUTPUT0[1, 1, 5, 5])
==========================
NvDlaReorderMulAddPass is called...
=== PrintONNCIRPass ======
%A[1, 1, 5, 5] = Initializer()
%INPUT0[1, 1, 5, 5] = InputOperator()
%B__gamma_0)[1, 1, 5, 5] = Initializer()
%add_out[1, 1, 5, 5] = Add(%INPUT0[1, 1, 5, 5], %B__gamma_0)[1, 1, 5, 5])
%mul_out[1, 1, 5, 5] = Mul(%add_out[1, 1, 5, 5], %A[1, 1, 5, 5])
%OUTPUT0[1, 1, 5, 5] = Relu(%mul_out[1, 1, 5, 5])
= OutputOperator(%OUTPUT0[1, 1, 5, 5])
==========================
NvDlaFuseAddMulReluPass is called...
=== PrintONNCIRPass ======
%A[1, 1, 5, 5] = Initializer()
%INPUT0[1, 1, 5, 5] = InputOperator()
%B__gamma_0)[1, 1, 5, 5] = Initializer()
%OUTPUT0[1, 1, 5, 5] = AddMulRelu<>(%INPUT0[1, 1, 5, 5], %B__gamma_0)[1, 1, 5, 5], %A[1, 1, 5, 5])
= OutputOperator(%OUTPUT0[1, 1, 5, 5])
==========================
visit(NvDlaAddMulRelu) is called
```

In the above output log, there are three `PrintONNCIRPass` blocks. The first one prints the initial ONNC IR graph before the re-ordering optimization takes effect; there is a Mul-Add pair in the initial graph. After `NvDlaReorderMulAddPass` is applied, the Mul-Add pair is converted into an Add-Mul pair, where the Add occurs before the Mul. In addition, one of the Add's inputs is connected to a newly-created tensor called `B__gamma_0`, which contains the adjusted coefficients. After another pass, `NvDlaFuseAddMulReluPass`, is applied, the ONNC IR graph changes again: a new ONNC IR called `AddMulRelu` replaces the Add-Mul-Relu sequence in the previous ONNC IR graph. With these optimization passes on the model graph, we can easily map the three model operations into a single SDP-X1 operation in NVDLA.

## Summary

In this lab, you have learned:

* How to fully utilize the pipelined SDP-X1 datapath by searching for Add-Mul-Relu patterns in a given model and mapping each of them into a single SDP-X1 operation.
* How to create a pass to manipulate the model graph for achieving the above optimization.

--------------------------------------------------------------------------------
/lab_8_Mul_Add_Reordering_and_Fusion/src/CMakeLists.txt:
--------------------------------------------------------------------------------

include_directories(.)
include_directories(include)

add_libonnc_src(
  CodeEmitVisitor.cpp
  FooNvdlaBackend.cpp
  Loadable.cpp
  NvDlaDefine.cpp
  NvDlaMeta.cpp
  NvDlaUtil.cpp
  NvDlaMemInfoPass.cpp
  NvDlaTaskSubmitPass.cpp
  NvDlaFileGenPass.cpp
  NvDlaReorderMulAddPass.cpp
  Compute/NvDlaAddMulRelu.cpp
  NvDlaFuseAddMulReluPass.cpp
  PrintONNCIRPass.cpp
  Config/NvFull.cpp
  TargetInfo/FooNvdlaTargetInfo.cpp
  TargetInfo/FooNvdlaTargetMemInfo.cpp)
--------------------------------------------------------------------------------
/lab_8_Mul_Add_Reordering_and_Fusion/src/CodeEmitVisitor.h:
--------------------------------------------------------------------------------
//===- CodeEmitVisitor.h --------------------------------------------------===//
//
// The ONNC Project
//
// See LICENSE.TXT for details.
6 | // 7 | //===----------------------------------------------------------------------===// 8 | #ifndef TARGET_FOONVDLA_CODE_EMIT_VISITOR_H 9 | #define TARGET_FOONVDLA_CODE_EMIT_VISITOR_H 10 | 11 | #include "NvDlaDefine.h" 12 | #include "NvDlaMeta.h" 13 | #include "Compute/NvDlaAddMulRelu.h" 14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | 23 | #include 24 | 25 | namespace onnc { 26 | namespace foonvdla { 27 | 28 | class CodeEmitVisitor : public CustomVisitor, private NvDlaConstants 29 | { 30 | public: 31 | CodeEmitVisitor(const NvDlaConstants& constants, NvDlaBackendMeta& meta) noexcept 32 | : NvDlaConstants{constants} 33 | , m_pMeta{meta} 34 | {} 35 | 36 | /// ONNC defined operators @{ 37 | void visit(const Initializer& pInitializer) override; 38 | void visit(const InputOperator& pInputOperator) override; 39 | void visit(const OutputOperator& pOutputOperator) override; 40 | /// @} 41 | 42 | /// ONNX defined operators @{ 43 | void visit(const Conv& pConv) override; 44 | void visit(const NvDlaAddMulRelu& pOp); 45 | /// @} 46 | 47 | /// ONNC defined operators @{ 48 | void visit(Initializer& pInitializer) override; 49 | void visit(InputOperator& pInputOperator) override; 50 | void visit(OutputOperator& pOutputOperator) override; 51 | /// @} 52 | 53 | /// ONNX defined operators @{ 54 | void visit(Conv& pConv) override; 55 | void visit(NvDlaAddMulRelu& pOp) { visit(const_cast(pOp)); } 56 | /// @} 57 | 58 | private: 59 | MemoryListEntryId packWeight(const Tensor& weight, NvDlaDims destDims, Tensor::Dimension numFrontPaddingChannels, 60 | Tensor::Dimension outputChannelOffset); 61 | MemoryListEntryId packImageWeight(const Tensor& weight, NvDlaDims destDims, Tensor::Dimension outputChannelOffset); 62 | MemoryListEntryId packBias(const Tensor& bias, Tensor::Dimension numDestChannels, 63 | Tensor::Dimension srcChannelOffset = 0); 64 | MemoryListEntryId packSDPOperand(const Tensor* aluTensor, const Tensor* mulTensor, const NvDlaCubeInfo& cubeInfo); 65 | 66 | MemoryListEntryId packFeature(const Tensor& tensor, const NvDlaCubeInfo& cube); 67 | void issueEmuOp(NvDlaEmuOperation* op); 68 | AddressListEntryId issueEmuAddr(MemoryListEntryId mid); 69 | void issueDlaOp(NvDlaDlaOperation* op, NvDlaDlaOperation* op_fuse, NvDlaDlaOperation* op_prev); 70 | void issueDlaOp(std::unique_ptr op); 71 | AddressListEntryId issueDlaAddr(const Tensor& tensor, const NvDlaCubeInfo& cube, Tensor::Dimension channelOffset, 72 | NvDlaBackendMeta::Offset hOffset); 73 | AddressListEntryId issueDlaAddr(const Tensor& tensor, const NvDlaCubeInfo& cube); 74 | AddressListEntryId issueDlaAddr(MemoryListEntryId memoryId, const NvDlaCubeInfo& cube); 75 | AddressListEntryId issueSDPOperand(const Tensor& tensor, const NvDlaCubeInfo& cube, MemoryListEntryId& memoryId); 76 | 77 | void SetLUTParam(dla_lut_param* lut_param, float alpha, float beta, float bias, int size, float outdata_scale, float outdata_offset); 78 | 79 | // Perform SDP for 2 input tensors and an output tensor, 80 | // the possible value for parameter 'opType' is: 81 | // 82 | // 1. SDP_OP_ADD 83 | // 2. 
SDP_OP_MUL 84 | // 85 | void emitSdp(std::uint8_t opType, const Tensor& firstInput, const Tensor& secondInput, const Tensor& output); 86 | 87 | private: 88 | MemoryListEntryId packWeight(span weight, const Tensor* weightTensor, NvDlaDims srcDims, 89 | NvDlaDims destDims, Tensor::Dimension numFrontPaddingChannels, 90 | Tensor::Dimension outputChannelOffset); 91 | 92 | MemoryListEntryId packWeight(const Tensor& weight, NvDlaDims srcDims, NvDlaDims destDims, 93 | Tensor::Dimension numFrontPaddingChannels, Tensor::Dimension outputChannelOffset); 94 | 95 | template 96 | void packWeightImpl(Type* destData, NvDlaDims destDimsWithFrontPadding, const Tensor* tensor, const float* srcData, 97 | NvDlaDims srcDims, Tensor::Dimension numFrontPaddingChannels, 98 | Tensor::Dimension outputChannelOffset); 99 | 100 | template 101 | void packImageWeightImpl(Type* blob, NvDlaDims blobDims, const Tensor* tensor, const float* srcData, 102 | NvDlaDims srcDims, Tensor::Dimension outputChannelOffset); 103 | template 104 | void packBiasImpl(Type* destData, Tensor::Dimension numDestChannels, const Tensor* tensor, const float* srcData, 105 | Tensor::Dimension srcChannelOffset); 106 | void packSDPOperandImpl(NvU8* blob, const Tensor* aluTensor, const float* aluData, const Tensor* mulTensor, 107 | const float* mulData, const NvDlaCubeInfo& cubeInfo); 108 | 109 | private: 110 | NvDlaBackendMeta& m_pMeta; 111 | }; 112 | 113 | } // namespace nvdla 114 | } // namespace onnc 115 | 116 | #undef PP_DECL_VISIT 117 | #undef PP_NVDLA_OP_LIST 118 | 119 | #endif 120 | -------------------------------------------------------------------------------- /lab_8_Mul_Add_Reordering_and_Fusion/src/FooNvdlaBackend.cpp: -------------------------------------------------------------------------------- 1 | //===- FooNvdlaBackend.cpp -----------------------------------------------------===// 2 | // 3 | // The ONNC Project 4 | // 5 | // See LICENSE.TXT for details. 
6 | // 7 | //===----------------------------------------------------------------------===// 8 | #include 9 | 10 | #include "FooNvdlaBackend.h" 11 | #include "TargetInfo/FooNvdlaTargetInfo.h" 12 | #include "TargetInfo/FooNvdlaTargetMemInfo.h" 13 | #include "CodeEmitVisitor.h" 14 | #include "NvDlaMemInfoPass.h" 15 | #include "NvDlaTaskSubmitPass.h" 16 | #include "NvDlaFileGenPass.h" 17 | #include "NvDlaReorderMulAddPass.h" 18 | #include "NvDlaFuseAddMulReluPass.h" 19 | #include "PrintONNCIRPass.h" 20 | 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include 37 | #include 38 | #include 39 | #include 40 | #include 41 | #include 42 | #include 43 | #include 44 | 45 | #include 46 | 47 | using namespace onnc; 48 | 49 | //===----------------------------------------------------------------------===// 50 | // FooNvdlaBackend 51 | //===----------------------------------------------------------------------===// 52 | const Version FooNvdlaBackend::LOADABLE_VERSION = Version(1, 1, 255); 53 | const Version FooNvdlaBackend::BLOB_DLA_VERSION = Version(1, 3, 0); 54 | const Version FooNvdlaBackend::BLOB_EMU_VERSION = Version(1, 3, 0); 55 | 56 | FooNvdlaBackend::FooNvdlaBackend(const TargetOptions& pOptions) 57 | : TargetBackend(pOptions) 58 | , NvDlaConstants(getConfig(::nvdla::ConfigSet::nv_full, ::nvdla::ExecutionMode::direct, false)) 59 | , m_pMeta(*this) { 60 | m_pMemInfo = std::make_unique(); 61 | } 62 | 63 | void FooNvdlaBackend::addTensorSel(PassManager& pPM) 64 | { 65 | errs() << "FooNvdla is invoked\n"; 66 | 67 | // Do ONNX graph IR optimization here. 68 | 69 | // Translate from ONNX graph IR into ONNC IR 70 | addStandardTensorSel(pPM, *this); 71 | 72 | // Now ONNC IR is ready. 73 | // If you need to extend ONNC IR, here is the place to add your pass that 74 | // adds your ONNC IR operators. 75 | } 76 | 77 | void FooNvdlaBackend::addOnncIrOptimization(PassManager& pPM, OptimizationOptions& options) 78 | { 79 | TargetBackend::addOnncIrOptimization(pPM, options); 80 | 81 | pPM.add(); 82 | pPM.add(); 83 | pPM.add(); 84 | pPM.add(); 85 | pPM.add(); 86 | } 87 | 88 | void FooNvdlaBackend::addTensorSched(PassManager& pPM) 89 | { 90 | // After method AddTensorSel, operators have been scheduled in an 91 | // topological order, which totally respects the data dependency. 92 | // However, that might not be an optimized order for certain objective. 93 | // Add a scheduling optimization pass here. 94 | } 95 | 96 | void FooNvdlaBackend::addMemAlloc(PassManager& pPM) 97 | { 98 | // Input: Module 99 | // Output: LiveIntervals 100 | addStandardCreateLiveIntervals(pPM); 101 | 102 | // Input: LiveIntervals 103 | // Output: MemAllocs 104 | addStandardMemoryAllocation(pPM, *this); 105 | 106 | // Input: MemAllocs 107 | // Output: Virtual memory address for each memory operands. 
108 | addStandardSetMemOperands(pPM);
109 |
110 | const NvDlaConstants& constants = *this;
111 | pPM.add<NvDlaMemInfoPass>(constants, &m_pMeta);
112 | }
113 |
114 | void FooNvdlaBackend::addCodeEmit(PassManager& pPM, const Path& pOutput)
115 | {
116 | static foonvdla::CodeEmitVisitor ceVisitor(*this, m_pMeta);
117 | pPM.add<CodeEmit>(ceVisitor)
118 | .add<NvDlaTaskSubmitPass>(&m_pMeta, BLOB_DLA_VERSION, BLOB_EMU_VERSION)
119 | .add<NvDlaFileGenPass>(&m_pMeta, LOADABLE_VERSION)
120 | ;
121 | }
122 |
123 | void FooNvdlaBackend::RegisterLowers(LowerRegistry& pRegistry) const
124 | {
125 | pRegistry.emplace<AddLower>();
126 | pRegistry.emplace<MulLower>();
127 | pRegistry.emplace<ReluLower>();
128 | pRegistry.emplace<ConvLower>();
129 | }
130 |
131 |
132 | //===----------------------------------------------------------------------===//
133 | // Non member functions
134 | //===----------------------------------------------------------------------===//
135 | TargetBackend* CreateFooNvdlaBackend(const TargetOptions& pOptions)
136 | {
137 | return new FooNvdlaBackend(pOptions);
138 | }
139 |
140 | extern "C" void InitializeFooNvdlaONNCBackend()
141 | {
142 | onnc::TargetRegistry::RegisterTargetBackend(getTheFooNvdlaTarget(),
143 | CreateFooNvdlaBackend);
144 | }
145 |
146 |
--------------------------------------------------------------------------------
/lab_8_Mul_Add_Reordering_and_Fusion/src/FooNvdlaBackend.h:
--------------------------------------------------------------------------------
1 | //===- FooNvdlaBackend.h -------------------------------------------------------===//
2 | //
3 | // The ONNC Project
4 | //
5 | // See LICENSE.TXT for details.
6 | //
7 | //===----------------------------------------------------------------------===//
8 | #ifndef TARGET_FOONVDLA_FOONVDLA_BACKEND_H
9 | #define TARGET_FOONVDLA_FOONVDLA_BACKEND_H
10 | #include 
11 | #include 
12 | #include "NvDlaDefine.h"
13 | #include "NvDlaMeta.h"
14 | #include "Version.h"
15 |
16 | namespace onnc {
17 | using namespace onnc::foonvdla;
18 |
19 | class FooNvdlaBackend : public TargetBackend, private NvDlaConstants
20 | {
21 | private:
22 | static const Version LOADABLE_VERSION;
23 | static const Version BLOB_DLA_VERSION;
24 | static const Version BLOB_EMU_VERSION;
25 |
26 | public:
27 | FooNvdlaBackend(const TargetOptions& pOptions);
28 |
29 | virtual ~FooNvdlaBackend() = default;
30 |
31 | void addTensorSel(PassManager& pPM) override;
32 |
33 | void addOnncIrOptimization(PassManager& pPM, OptimizationOptions& options) override;
34 |
35 | void addTensorSched(PassManager& pPM) override;
36 |
37 | void addMemAlloc(PassManager& pPM) override;
38 |
39 | void addCodeEmit(PassManager& pPM, const Path& pOutput) override;
40 |
41 | void RegisterLowers(LowerRegistry& pRegistry) const override;
42 |
43 | private:
44 | NvDlaBackendMeta m_pMeta;
45 | };
46 |
47 | } // namespace onnc
48 |
49 | #endif
50 |
--------------------------------------------------------------------------------
/lab_8_Mul_Add_Reordering_and_Fusion/src/Makefile.am:
--------------------------------------------------------------------------------
1 | ONNC_TARGET_SOURCES += \
2 | Target/FooNvdla/CodeEmitVisitor.cpp \
3 | Target/FooNvdla/FooNvdlaBackend.cpp \
4 | Target/FooNvdla/Loadable.cpp \
5 | Target/FooNvdla/NvDlaDefine.cpp \
6 | Target/FooNvdla/NvDlaMeta.cpp \
7 | Target/FooNvdla/NvDlaUtil.cpp \
8 | Target/FooNvdla/NvDlaMemInfoPass.cpp \
9 | Target/FooNvdla/NvDlaTaskSubmitPass.cpp \
10 | Target/FooNvdla/NvDlaFileGenPass.cpp \
11 | Target/FooNvdla/NvDlaReorderMulAddPass.cpp \
12 | Target/FooNvdla/Compute/NvDlaAddMulRelu.cpp \
13 |
Target/FooNvdla/NvDlaFuseAddMulReluPass.cpp \
14 | Target/FooNvdla/PrintONNCIRPass.cpp \
15 | Target/FooNvdla/Config/NvFull.cpp \
16 | Target/FooNvdla/TargetInfo/FooNvdlaTargetInfo.cpp \
17 | Target/FooNvdla/TargetInfo/FooNvdlaTargetMemInfo.cpp
18 |
19 | ONNC_INCLUDES += \
20 | -I${srcdir}/Target/FooNvdla/include \
21 | -I${srcdir}/Target/FooNvdla/
22 |
--------------------------------------------------------------------------------
/lab_8_Mul_Add_Reordering_and_Fusion/src/NvDlaAddMulRelu.cpp:
--------------------------------------------------------------------------------
1 | //===- NvDlaAddMulRelu.cpp ------------------------------------------------===//
2 | //
3 | // The ONNC Project
4 | //
5 | // See LICENSE.TXT for details.
6 | //
7 | //===----------------------------------------------------------------------===//
8 | #include "NvDlaAddMulRelu.h"
9 |
10 | #include "../CodeEmitVisitor.h"
11 | #include "../NvDlaDefine.h"
12 |
13 | using namespace onnc;
14 | using namespace onnc::foonvdla;
15 |
16 | char NvDlaAddMulRelu::ID = 0;
17 |
18 | //===----------------------------------------------------------------------===//
19 | // NvDlaAddMulRelu
20 | //===----------------------------------------------------------------------===//
21 | void NvDlaAddMulRelu::printAttributes(std::ostream& pOS) const
22 | {
23 | pOS << "<>";
24 | }
25 |
26 | void NvDlaAddMulRelu::accept(ComputeVisitor& pV)
27 | {
28 | CodeEmitVisitor* visitor = dyn_cast<CodeEmitVisitor>(&pV);
29 | if (nullptr != visitor)
30 | visitor->visit(*this);
31 | }
32 |
33 | void NvDlaAddMulRelu::accept(ComputeVisitor& pV) const
34 | {
35 | CodeEmitVisitor* visitor = dyn_cast<CodeEmitVisitor>(&pV);
36 | if (nullptr != visitor)
37 | visitor->visit(*this);
38 | }
39 |
40 | bool NvDlaAddMulRelu::classof(const ComputeOperator* pOp)
41 | {
42 | if (nullptr == pOp)
43 | return false;
44 | return (pOp->getID() == &ID);
45 | }
46 |
--------------------------------------------------------------------------------
/lab_8_Mul_Add_Reordering_and_Fusion/src/NvDlaAddMulRelu.h:
--------------------------------------------------------------------------------
1 | //===- NvDlaAddMulRelu.h ------------------------------------------------------===//
2 | //
3 | // The ONNC Project
4 | //
5 | // See LICENSE.TXT for details.
6 | //
7 | //===--------------------------------------------------------------------------===//
8 | #ifndef TARGET_NVDLA_NVDLA_ADD_MUL_RELU_H
9 | #define TARGET_NVDLA_NVDLA_ADD_MUL_RELU_H
10 |
11 | #include 
12 | #include 
13 | #include 
14 |
15 | namespace onnc {
16 | namespace foonvdla {
17 |
18 | class NvDlaAddMulRelu : public ComputeOperator
19 | {
20 | public:
21 | static char ID;
22 |
23 | public:
24 | NvDlaAddMulRelu()
25 | : ComputeOperator("AddMulRelu", ID)
26 | {}
27 |
28 | virtual ~NvDlaAddMulRelu() {}
29 |
30 | // Parameter
31 |
32 | // Input & Output Tensor
33 | Tensor* getInput(unsigned int pIdx) override { return static_cast<Tensor*>(m_Inputs[pIdx]); }
34 |
35 | const Tensor* getInput(unsigned int pIdx) const override { return static_cast<const Tensor*>(m_Inputs[pIdx]); }
36 |
37 | Tensor* getOutput(unsigned int pIdx) override { return static_cast<Tensor*>(m_Outputs[pIdx]); }
38 |
39 | const Tensor* getOutput(unsigned int pIdx) const override { return static_cast<const Tensor*>(m_Outputs[pIdx]); }
40 |
41 | void printAttributes(std::ostream& pOS) const override;
42 |
43 | void accept(ComputeVisitor& pV) override;
44 |
45 | void accept(ComputeVisitor& pV) const override;
46 |
47 | static bool classof(const ComputeOperator* pOp);
48 |
49 | };
50 |
51 | } // namespace foonvdla
52 | } // namespace onnc
53 |
54 | #endif
55 |
--------------------------------------------------------------------------------
/lab_8_Mul_Add_Reordering_and_Fusion/src/NvDlaFuseAddMulReluPass.cpp:
--------------------------------------------------------------------------------
1 | //===- NvDlaFuseAddMulReluPass.cpp ----------------------------------------===//
2 | //
3 | // The ONNC Project
4 | //
5 | // See LICENSE.TXT for details.
6 | //
7 | //===----------------------------------------------------------------------===//
8 | #include "NvDlaFuseAddMulReluPass.h"
9 | #include "Compute/NvDlaAddMulRelu.h"
10 |
11 | #include 
12 | #include 
13 | #include 
14 | #include 
15 | #include 
16 | #include 
17 | #include 
18 | #include 
19 |
20 | namespace onnc {
21 | namespace foonvdla {
22 |
23 | //===----------------------------------------------------------------------===//
24 | // NvDlaFuseAddMulReluPass
25 | //===----------------------------------------------------------------------===//
26 |
27 | Pass::ReturnType NvDlaFuseAddMulReluPass::runOnModule(Module& pModule)
28 | {
29 | const Pass::ReturnType ret = BaseType::runOnModule(pModule);
30 |
31 | if (ret != kModuleNoChanged) {
32 | pModule.eraseUnusedValues();
33 | }
34 |
35 | return ret;
36 | }
37 |
38 | Pass::ReturnType NvDlaFuseAddMulReluPass::runOnComputeGraph(ComputeGraph& pCG)
39 | {
40 | Pass::ReturnType ret = Pass::kModuleNoChanged;
41 |
42 | // Search for the Add-Mul-Relu patterns that can be replaced by a single AddMulRelu IR.
43 | std::vector<ComputeOperator*> patternList;
44 | for (ComputeOperator& node : pCG) {
45 | if (isAddMulRelu(&node)) {
46 | patternList.emplace_back(&node);
47 | ret |= Pass::kModuleChanged;
48 | }
49 | }
50 |
51 | for (ComputeOperator* node : patternList) {
52 | // Derive original IRs.
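// isAddMulRelu() has already guaranteed that the pattern root is an Add
// whose single user is a Mul whose single user is a Relu, so the casts
// below can safely walk the use-def chain.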
53 | Add* add = dyn_cast<Add>(node);
54 | Mul* mul = dyn_cast<Mul>(add->getOutput(0)->getUses()[0].getUser());
55 | Relu* relu = dyn_cast<Relu>(mul->getOutput(0)->getUses()[0].getUser());
56 |
57 | Tensor* addA = add->getInput(0);
58 | Tensor* addB = add->getInput(1);
59 | Tensor* addC = add->getOutput(0);
60 | Tensor* mulB;
61 | if (addC == mul->getInput(0)) {
62 | mulB = mul->getInput(1);
63 | } else {
64 | mulB = mul->getInput(0);
65 | }
66 | Tensor* mulC = mul->getOutput(0);
67 | Tensor* reluY = relu->getOutput(0);
68 |
69 | // The current ONNC IR graph status
70 | // ================================
71 | //
72 | //   |      |
73 | // addA   addB
74 | //    \   /
75 | //    (add)
76 | //      |      |
77 | //    addC   mulB
78 | //       \   /
79 | //       (mul)
80 | //         |
81 | //       mulC
82 | //         |
83 | //      (relu)
84 | //         |
85 | //       reluY
86 | //         |
87 |
88 | // Create a new AddMulRelu IR.
89 | NvDlaAddMulRelu* compound = pCG.addOperator<NvDlaAddMulRelu>();
90 |
91 | // The current ONNC IR graph status
92 | // ================================
93 | //
94 | //   |      |
95 | // addA   addB
96 | //    \   /
97 | //    (add)
98 | //      |      |
99 | //    addC   mulB
100 | //       \   /
101 | //       (mul)
102 | //         |
103 | //       mulC
104 | //         |
105 | //      (relu)    (compound)
106 | //         |
107 | //       reluY
108 | //         |
109 |
110 | add->removeAllInputs();
111 | add->removeAllOutputs();
112 | mul->removeAllInputs();
113 | mul->removeAllOutputs();
114 | relu->removeAllInputs();
115 | relu->removeAllOutputs();
116 |
117 | // The current ONNC IR graph status
118 | // ================================
119 | //
120 | //   |      |
121 | // addA   addB
122 | //
123 | //    (add)
124 | //      |
125 | //    addC   mulB
126 | //
127 | //       (mul)
128 | //
129 | //       mulC
130 | //
131 | //      (relu)    (compound)
132 | //
133 | //       reluY
134 | //         |
135 |
136 | pCG.erase(*add);
137 | pCG.erase(*mul);
138 | pCG.erase(*relu);
139 | pCG.erase(*addC);
140 | pCG.erase(*mulC);
141 |
142 | // The current ONNC IR graph status
143 | // ================================
144 | //
145 | //   |      |
146 | // addA   addB
147 | //
148 | //             |
149 | //           mulB
150 | //
151 | //    (compound)
152 | //
153 | //       reluY
154 | //         |
155 |
156 | compound->addInput(*addA);
157 | compound->addInput(*addB);
158 | compound->addInput(*mulB);
159 | compound->addOutput(*reluY);
160 |
161 | // The current ONNC IR graph status
162 | // ================================
163 | //
164 | //   |      |      |
165 | // addA   addB   mulB
166 | //    \     |     /
167 | //     (compound)
168 | //         |
169 | //       reluY
170 | //         |
171 |
172 | }
173 |
174 | pCG.topologicalSort();
175 |
176 | return ret;
177 | }
178 |
179 | bool NvDlaFuseAddMulReluPass::isAddMulRelu(ComputeOperator* pNode)
180 | {
181 | // Check the first node.
182 | // It must be
183 | // 1) an Add and,
184 | // 2) have only one operator using its result.
185 | if ( ! isa<Add>(pNode)) return false;
186 | if (pNode->getOutput(0)->getUses().size() > 1) return false;
187 |
188 | // Check the second node.
189 | // It must be
190 | // 1) a Mul and,
191 | // 2) have only one operator using its result.
192 | ComputeOperator* secondNode = pNode->getOutput(0)->getUses()[0].getUser();
193 | if ( ! isa<Mul>(secondNode)) return false;
194 | if (secondNode->getOutput(0)->getUses().size() > 1) return false;
195 |
196 | // Check the third node.
197 | // It must be a Relu.
198 | // However, it is not subject to the single-user limitation, because its
199 | // result is saved in system memory, which multiple operators can load
200 | // from at any time.
201 | ComputeOperator* thirdNode = secondNode->getOutput(0)->getUses()[0].getUser();
202 | if ( !
isa<Relu>(thirdNode)) return false;
203 |
204 | return true;
205 | }
206 |
207 | } // namespace foonvdla
208 | } // namespace onnc
209 |
--------------------------------------------------------------------------------
/lab_8_Mul_Add_Reordering_and_Fusion/src/NvDlaFuseAddMulReluPass.h:
--------------------------------------------------------------------------------
1 | //===- NvDlaFuseAddMulReluPass.h ------------------------------------------===//
2 | //
3 | // The ONNC Project
4 | //
5 | // See LICENSE.TXT for details.
6 | //
7 | //===----------------------------------------------------------------------===//
8 | #ifndef ONNC_FOONVDLA_FUSE_ADD_MUL_RELU_PASS_H
9 | #define ONNC_FOONVDLA_FUSE_ADD_MUL_RELU_PASS_H
10 | #include 
11 |
12 |
13 | namespace onnc {
14 | namespace foonvdla {
15 |
16 | class NvDlaFuseAddMulReluPass : public CustomPass<NvDlaFuseAddMulReluPass>
17 | {
18 | public:
19 | NvDlaFuseAddMulReluPass() = default;
20 |
21 | ReturnType runOnModule(Module& pModule) override;
22 |
23 | ReturnType runOnComputeGraph(ComputeGraph& pCG) override;
24 |
25 | private:
26 | bool isAddMulRelu(ComputeOperator* pNode);
27 | };
28 |
29 | } // namespace foonvdla
30 | } // namespace onnc
31 |
32 | #endif
33 |
--------------------------------------------------------------------------------
/lab_8_Mul_Add_Reordering_and_Fusion/src/NvDlaReorderMulAddPass.cpp:
--------------------------------------------------------------------------------
1 | //===- NvDlaReorderMulAddPass.cpp -----------------------------------------===//
2 | //
3 | // The ONNC Project
4 | //
5 | // See LICENSE.TXT for details.
6 | //
7 | //===----------------------------------------------------------------------===//
8 | #include "NvDlaReorderMulAddPass.h"
9 |
10 | #include 
11 | #include 
12 | #include 
13 | #include 
14 | #include 
15 | #include 
16 | #include 
17 |
18 | using namespace onnc;
19 | using namespace onnc::foonvdla;
20 |
21 | //===----------------------------------------------------------------------===//
22 | // NvDlaReorderMulAddPass
23 | //===----------------------------------------------------------------------===//
24 |
25 | unsigned NvDlaReorderMulAddPass::tensorIdx = 0;
26 |
27 | Pass::ReturnType NvDlaReorderMulAddPass::runOnModule(Module& pModule)
28 | {
29 | std::cout << "NvDlaReorderMulAddPass is called...\n";
30 |
31 | const Pass::ReturnType ret = BaseType::runOnModule(pModule);
32 |
33 | if (ret != kModuleNoChanged) {
34 | pModule.eraseUnusedValues();
35 | }
36 |
37 | return ret;
38 | }
39 |
40 | Pass::ReturnType NvDlaReorderMulAddPass::runOnComputeGraph(ComputeGraph& pCG)
41 | {
42 | Pass::ReturnType ret = Pass::kModuleNoChanged;
43 |
44 | //--------------------------------------------------------
45 | // Search for the Mul-Add patterns that can be reordered.
46 | //--------------------------------------------------------
47 | std::vector<ComputeOperator*> mulList;
48 | for (ComputeOperator& node : pCG) {
49 | if (canBeReordered(&node)) {
50 | mulList.emplace_back(&node);
51 | ret |= Pass::kModuleChanged;
52 | }
53 | }
54 |
55 | //--------------------------------------------
56 | // Perform re-ordering on the found patterns.
57 | //--------------------------------------------
58 | // The original pattern is:
59 | //   outputY = (inputX * alpha) + beta
60 | //
61 | // We will re-arrange the above pattern by:
62 | //   outputY = (inputX + gamma) * alpha, where
63 | //   gamma = beta / alpha
64 |
65 | for (ComputeOperator* node : mulList) {
66 | Mul* mul = dyn_cast<Mul>(node);
67 | Add* add = dyn_cast<Add>(node->getOutput(0)->getUses()[0].getUser());
68 |
69 | Tensor* inputX;
70 | FloatTensor* alpha; // This kind of tensor contains constant values.
71 | FloatTensor* beta;
72 | Tensor* outputY;
73 | Tensor* tmp;
74 |
75 | // Find alpha and inputX. alpha must be a constant tensor.
76 | // In this example, we assume that Mul must have one constant input.
77 | if (isConstant(mul->getInput(0))) {
78 | alpha = dynamic_cast<FloatTensor*>(mul->getInput(0));
79 | inputX = mul->getInput(1);
80 | } else {
81 | inputX = mul->getInput(0);
82 | alpha = dynamic_cast<FloatTensor*>(mul->getInput(1));
83 | }
84 |
85 | // Find beta. beta must be a constant tensor.
86 | // In this example, we assume that Add must have one constant input.
87 | if (isConstant(add->getInput(0))) {
88 | beta = dynamic_cast<FloatTensor*>(add->getInput(0));
89 | tmp = add->getInput(1);
90 | } else {
91 | tmp = add->getInput(0);
92 | beta = dynamic_cast<FloatTensor*>(add->getInput(1));
93 | }
94 |
95 | // Find outputY.
96 | outputY = add->getOutput(0);
97 |
98 | std::string addOutputTensorName = add->getOutput(0)->getName();
99 | std::string mulOutputTensorName = mul->getOutput(0)->getName();
100 |
101 | // The current ONNC IR graph status
102 | // ================================
103 | //
104 | //           (alphaInitializer)
105 | //    |        |
106 | // inputX    alpha
107 | //     \     /
108 | //     (mul)     (betaInitializer)
109 | //       |         |
110 | //      tmp      beta
111 | //        \      /
112 | //        (add)
113 | //          |
114 | //       outputY
115 | //          |
116 | //
117 |
118 | // Remove the edges between Mul/Add and their input/output tensors.
119 | // We will re-build their edges later on.
120 | // Removing an edge means erasing the records about the input tensors from the operator's data structure.
121 | mul->removeAllInputs();
122 | mul->removeAllOutputs();
123 | add->removeAllInputs();
124 | add->removeAllOutputs();
125 |
126 | // The current ONNC IR graph status
127 | // ================================
128 | //
129 | //           (alphaInitializer)
130 | //    |        |
131 | // inputX    alpha
132 | //
133 | //     (mul)     (betaInitializer)
134 | //       |
135 | //      tmp      beta
136 | //
137 | //        (add)
138 | //
139 | //       outputY
140 | //          |
141 | //
142 |
143 | // Create a new tensor gamma.
144 | FloatTensor* gamma = dynamic_cast<FloatTensor*>(beta->create());
145 |
146 | // Give gamma tensor a unique name.
147 | gamma->setName(beta->getName() + "__gamma_" + std::to_string(tensorIdx++) + ")");
148 |
149 | // Initialize gamma.
150 | gamma->setDimensions(beta->getDimensions());
151 |
152 | // Add gamma into the ONNC IR graph.
153 | gamma = pCG.addValue(gamma);
154 | assert((gamma != nullptr) && "The name must be unique");
155 |
156 | // Create a new Initializer operator for the gamma tensor. This is a must in the ONNC IR graph.
157 | // Every tensor must have a "defining" operator. For a constant tensor, its defining
158 | // operator is an Initializer.
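// (A concrete instance of the identity above, with hypothetical numbers:
// for alpha = 2 and beta = 6, gamma = 6 / 2 = 3, and
// (x * 2) + 6 == (x + 3) * 2 holds for every element x.)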
159 | Initializer* gammaInitializer = pCG.addOperator<Initializer>();
160 | gammaInitializer->setTensor(*gamma);
161 |
162 | // The current ONNC IR graph status
163 | // ================================
164 | //
165 | //           (alphaInitializer)
166 | //    |        |
167 | // inputX    alpha
168 | //
169 | //     (mul)     (betaInitializer)
170 | //       |
171 | //      tmp      beta
172 | //
173 | //        (add)      (gammaInitializer)
174 | //                     |
175 | //       outputY     gamma
176 | //          |
177 | //
178 |
179 | // Get the constant data of beta.
180 | const float* betaData = reinterpret_cast<const float*>(beta->getValues().data());
181 |
182 | // Get the constant data of alpha.
183 | const float* alphaData = reinterpret_cast<const float*>(alpha->getValues().data());
184 |
185 | // Calculate the constant data of gamma.
186 | int tensorSize = beta->getValues().size();
187 | for (int i = 0; i < tensorSize; i++) {
188 | gamma->getValues().push_back( betaData[i] / alphaData[i] );
189 | }
190 |
191 | // Remove beta from the ONNC IR graph. We don't need it anymore.
192 | Initializer* betaInitializer = static_cast<Initializer*>(beta->getDefine());
193 | pCG.erase(*betaInitializer);
194 | pCG.erase(*beta);
195 |
196 | // The current ONNC IR graph status
197 | // ================================
198 | //
199 | //           (alphaInitializer)
200 | //    |        |
201 | // inputX    alpha
202 | //
203 | //     (mul)
204 | //
205 | //      tmp
206 | //
207 | //        (add)      (gammaInitializer)
208 | //                     |
209 | //       outputY     gamma
210 | //          |
211 | //
212 |
213 | // Re-connect the operators.
214 | add->addInput(*inputX);
215 | add->addInput(*gamma);
216 | add->addOutput(*tmp);
217 | mul->addInput(*tmp);
218 | mul->addInput(*alpha);
219 | mul->addOutput(*outputY);
220 |
221 | // The current ONNC IR graph status
222 | // ================================
223 | //
224 | //           (gammaInitializer)
225 | //    |        |
226 | // inputX    gamma
227 | //     \     /
228 | //     (add)     (alphaInitializer)
229 | //       |         |
230 | //      tmp      alpha
231 | //        \      /
232 | //        (mul)
233 | //          |
234 | //       outputY
235 | //          |
236 | //
237 |
238 | // Rename tensor tmp to the original output tensor name of add.
239 | add->getOutput(0)->setName(addOutputTensorName);
240 | // Rename tensor outputY to the original output tensor name of mul.
241 | mul->getOutput(0)->setName(mulOutputTensorName);
242 | }
243 |
244 | pCG.topologicalSort();
245 |
246 | return ret;
247 | }
248 |
249 | bool NvDlaReorderMulAddPass::canBeReordered(ComputeOperator* pNode)
250 | {
251 | if (!isa<Mul>(pNode)) {
252 | return false;
253 | }
254 |
255 | if (!isConstant(pNode->getInput(0)) && !isConstant(pNode->getInput(1))) {
256 | return false;
257 | }
258 |
259 | Value* outv = pNode->getOutput(0);
260 |
261 | // If Mul's result has more than one user, we can't reorder it.
262 | if (outv->getUses().size() > 1) {
263 | return false;
264 | }
265 |
266 | ComputeOperator* userNode = outv->getUses()[0].getUser();
267 | if (!isa<Add>(userNode)) {
268 | return false;
269 | }
270 |
271 | return true;
272 | }
273 |
274 | bool NvDlaReorderMulAddPass::isConstant(Value* pValue)
275 | {
276 | // Only if this value's (tensor's) "defining" operator is an Initializer
277 | // is this tensor a constant tensor.
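// Tensors computed by other operators, and graph input tensors, have a
// different kind of defining operator, so they are treated as non-constant here.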
278 | ComputeOperator* op = static_cast<ComputeOperator*>(pValue->getDefine());
279 | if (isa<Initializer>(op)) {
280 | return true;
281 | } else {
282 | return false;
283 | }
284 | }
285 |
--------------------------------------------------------------------------------
/lab_8_Mul_Add_Reordering_and_Fusion/src/NvDlaReorderMulAddPass.h:
--------------------------------------------------------------------------------
1 | //===- NvDlaReorderMulAddPass.h -------------------------------------------===//
2 | //
3 | // The ONNC Project
4 | //
5 | // See LICENSE.TXT for details.
6 | //
7 | //===----------------------------------------------------------------------===//
8 | #ifndef ONNC_FOONVDLA_REORDER_MUL_ADD_PASS_H
9 | #define ONNC_FOONVDLA_REORDER_MUL_ADD_PASS_H
10 | #include 
11 |
12 | namespace onnc {
13 | namespace foonvdla {
14 |
15 | class NvDlaReorderMulAddPass : public CustomPass<NvDlaReorderMulAddPass>
16 | {
17 | public:
18 | NvDlaReorderMulAddPass() = default;
19 |
20 | ReturnType runOnModule(Module& pModule) override;
21 |
22 | ReturnType runOnComputeGraph(ComputeGraph& pCG) override;
23 |
24 | private:
25 | bool canBeReordered(ComputeOperator* pNode);
26 | bool isConstant(Value* value);
27 |
28 | static unsigned tensorIdx;
29 | };
30 |
31 | } // namespace foonvdla
32 | } // namespace onnc
33 |
34 | #endif
35 |
--------------------------------------------------------------------------------
/lab_8_Mul_Add_Reordering_and_Fusion/src/PrintONNCIRPass.cpp:
--------------------------------------------------------------------------------
1 | //===- PrintONNCIRPass.cpp ------------------------------------------------===//
2 | //
3 | // The ONNC Project
4 | //
5 | // See LICENSE.TXT for details.
6 | //
7 | //===----------------------------------------------------------------------===//
8 | #include "PrintONNCIRPass.h"
9 |
10 | #include 
11 | #include 
12 | #include 
13 | #include 
14 | #include 
15 |
16 | namespace onnc {
17 | namespace foonvdla {
18 |
19 | //===----------------------------------------------------------------------===//
20 | // PrintONNCIRPass
21 | //===----------------------------------------------------------------------===//
22 |
23 | Pass::ReturnType PrintONNCIRPass::runOnModule(Module& pModule)
24 | {
25 | const Pass::ReturnType ret = BaseType::runOnModule(pModule);
26 |
27 | if (ret != kModuleNoChanged) {
28 | pModule.eraseUnusedValues();
29 | }
30 |
31 | return ret;
32 | }
33 |
34 | Pass::ReturnType PrintONNCIRPass::runOnComputeGraph(ComputeGraph& pCG)
35 | {
36 | Pass::ReturnType ret = Pass::kModuleNoChanged;
37 |
38 | std::cout << "=== PrintONNCIRPass ======\n";
39 | for (ComputeOperator& node : pCG) {
40 | node.print(std::cout);
41 | std::cout << "\n";
42 | }
43 | std::cout << "==========================\n";
44 |
45 | return ret;
46 | }
47 |
48 | } // namespace foonvdla
49 | } // namespace onnc
50 |
--------------------------------------------------------------------------------
/lab_8_Mul_Add_Reordering_and_Fusion/src/PrintONNCIRPass.h:
--------------------------------------------------------------------------------
1 | //===- PrintONNCIRPass.h --------------------------------------------------===//
2 | //
3 | // The ONNC Project
4 | //
5 | // See LICENSE.TXT for details.
6 | //
7 | //===----------------------------------------------------------------------===//
8 | #ifndef ONNC_FOONVDLA_PRINT_ONNC_IR_PASS_H
9 | #define ONNC_FOONVDLA_PRINT_ONNC_IR_PASS_H
10 | #include 
11 | #include 
12 |
13 | namespace onnc {
14 | namespace foonvdla {
15 |
16 | class PrintONNCIRPass : public CustomPass<PrintONNCIRPass>
17 | {
18 | public:
19 | PrintONNCIRPass() = default;
20 |
21 | ReturnType runOnModule(Module& pModule) override;
22 |
23 | ReturnType runOnComputeGraph(ComputeGraph& pCG) override;
24 | };
25 |
26 | } // namespace foonvdla
27 | } // namespace onnc
28 |
29 | #endif
30 |
--------------------------------------------------------------------------------
/models/lenet/input0.output.dimg:
--------------------------------------------------------------------------------
1 | 149.25 -49.625 13.875 11.2344 -59.8125 -2.61523 7.80078 -44.7188 30.8594 17.3594
--------------------------------------------------------------------------------
/models/lenet/input0.pgm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/models/lenet/input0.pgm
--------------------------------------------------------------------------------
/models/lenet/input1.pgm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/models/lenet/input1.pgm
--------------------------------------------------------------------------------
/models/lenet/input2.pgm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/models/lenet/input2.pgm
--------------------------------------------------------------------------------
/models/lenet/input4.pgm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/models/lenet/input4.pgm
--------------------------------------------------------------------------------
/models/lenet/input5.pgm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/models/lenet/input5.pgm
--------------------------------------------------------------------------------
/models/lenet/input6.pgm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/models/lenet/input6.pgm
--------------------------------------------------------------------------------
/models/lenet/input7.output.dimg:
--------------------------------------------------------------------------------
1 | -2.21875 -5.39062 22.4375 7.35938 -25.4688 -18.0469 -39.8125 165.875 2.22656 40.0312
--------------------------------------------------------------------------------
/models/lenet/input7.pgm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/models/lenet/input7.pgm
--------------------------------------------------------------------------------
/models/lenet/input8.pgm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/models/lenet/input8.pgm -------------------------------------------------------------------------------- /models/lenet/input9.pgm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/models/lenet/input9.pgm -------------------------------------------------------------------------------- /models/lenet/lenet.nvdla: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/models/lenet/lenet.nvdla -------------------------------------------------------------------------------- /models/lenet/lenet.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/models/lenet/lenet.onnx -------------------------------------------------------------------------------- /models/quantized_mnist/mnist_calibration.txt: -------------------------------------------------------------------------------- 1 | 9,0,12,8,0,13,7,0,15 2 | -------------------------------------------------------------------------------- /models/quantized_mnist/quantized_mnist.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/models/quantized_mnist/quantized_mnist.onnx -------------------------------------------------------------------------------- /models/test_Add/input1x5x7.pgm: -------------------------------------------------------------------------------- 1 | P5 2 | 7 5 3 | 255 4 |  5 |  !" -------------------------------------------------------------------------------- /models/test_Add/out.nvdla: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/models/test_Add/out.nvdla -------------------------------------------------------------------------------- /models/test_Add/test_Add.log: -------------------------------------------------------------------------------- 1 | # ./nvdla_runtime --loadable test_Add.nvdla --image input1x5x7.pgm --rawdump 2 | creating new runtime context... 3 | Emulator starting 4 | ppgminfo 1 5 7 5 | pgm2dimg 1 5 7 1 32 160 160 6 | submitting tasks... 
7 | [ 37.512018] Enter:dla_read_network_config 8 | [ 37.513711] Exit:dla_read_network_config status=0 9 | [ 37.513905] Enter: dla_initiate_processors 10 | [ 37.514392] Enter: dla_submit_operation 11 | [ 37.514536] Prepare SDP operation index 0 ROI 0 dep_count 0 12 | [ 37.514717] Enter: dla_prepare_operation 13 | [ 37.515103] processor:SDP group:0, rdma_group:0 available 14 | [ 37.515332] Enter: dla_read_config 15 | [ 37.518969] Exit: dla_read_config 16 | [ 37.519171] Exit: dla_prepare_operation status=0 17 | [ 37.519372] Enter: dla_program_operation 18 | [ 37.519527] Program SDP operation index 0 ROI 0 Group[0] 19 | [ 37.522174] no desc get due to index==-1 20 | [ 37.522364] no desc get due to index==-1 21 | [ 37.522492] no desc get due to index==-1 22 | [ 37.522629] no desc get due to index==-1 23 | [ 37.522766] no desc get due to index==-1 24 | [ 37.522908] no desc get due to index==-1 25 | [ 37.523053] no desc get due to index==-1 26 | [ 37.523201] Enter: dla_op_programmed 27 | [ 37.523427] Exit: dla_op_programmed 28 | [ 37.523565] Exit: dla_program_operation status=0 29 | [ 37.523756] Enter: dla_enable_operation 30 | [ 37.523938] Enable SDP operation index 0 ROI 0 31 | [ 37.524254] Enter: dla_op_enabled 32 | [ 37.524388] Exit: dla_op_enabled 33 | [ 37.524507] Exit: dla_enable_operation status=0 34 | [ 37.526518] Exit: dla_submit_operation 35 | [ 37.526796] Enter: dla_dequeue_operation 36 | [ 37.526982] exit SDP as there's no further operation 37 | [ 37.527187] Exit: dla_dequeue_operation 38 | [ 37.527355] Exit: dla_initiate_processors status=0 39 | [ 37.547196] Enter:dla_handle_events, processor:BDMA 40 | [ 37.547525] Exit:dla_handle_events, ret:0 41 | [ 37.547753] Enter:dla_handle_events, processor:Convolution 42 | [ 37.547940] Exit:dla_handle_events, ret:0 43 | [ 37.548077] Enter:dla_handle_events, processor:SDP 44 | [ 37.548251] Handle op complete event, processor SDP group 0 45 | [ 37.548465] Enter:dla_op_completion processor SDP group0 46 | [ 37.548723] Completed SDP operation index 0 ROI 0 47 | [ 37.549006] 1 HWLs done, totally 1 layers 48 | [ 37.549223] Enter: dla_free_op_desc op desc index 0 ROI 0 49 | [ 37.549678] Exit: dla_free_op_desc 50 | [ 37.549856] Exit:dla_op_completion processor SDP group0 status=0 51 | [ 37.550126] Exit:dla_handle_events, ret:0 52 | [ 37.550281] Enter:dla_handle_events, processor:PDP 53 | [ 37.550484] Exit:dla_handle_events, ret:0 54 | [ 37.550637] Enter:dla_handle_events, processor:CDP 55 | [ 37.550813] Exit:dla_handle_events, ret:0 56 | [ 37.550966] Enter:dla_handle_events, processor:RUBIK 57 | [ 37.551132] Exit:dla_handle_events, ret:0 58 | [ 37.553519] reset engine done 59 | Shutdown signal received, exiting 60 | Test pass 61 | -------------------------------------------------------------------------------- /models/test_Add/test_Add.nvdla: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/models/test_Add/test_Add.nvdla -------------------------------------------------------------------------------- /models/test_Add/test_Add.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/models/test_Add/test_Add.onnx -------------------------------------------------------------------------------- /models/test_Add/test_Add.output.dimg: 
-------------------------------------------------------------------------------- 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 -------------------------------------------------------------------------------- /models/test_Conv_Relu/test_Conv_Relu.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/models/test_Conv_Relu/test_Conv_Relu.onnx -------------------------------------------------------------------------------- /models/test_Log/input1x5x7.pgm: -------------------------------------------------------------------------------- 1 | P5 2 | 7 5 3 | 255 4 |  5 |  !" -------------------------------------------------------------------------------- /models/test_Log/test_Log.log: -------------------------------------------------------------------------------- 1 | # ./nvdla_runtime --loadable test_Log.nvdla --image input1x5x7.pgm --rawdump 2 | creating new runtime context... 3 | Emulator starting 4 | ppgminfo 1 5 7 5 | pgm2dimg 1 5 7 1 32 160 160 6 | submitting tasks... 7 | Work Found! 8 | Work Done 9 | Shutdown signal received, exiting 10 | Test pass 11 | -------------------------------------------------------------------------------- /models/test_Log/test_Log.nvdla: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/models/test_Log/test_Log.nvdla -------------------------------------------------------------------------------- /models/test_Log/test_Log.onnx: -------------------------------------------------------------------------------- 1 |  2 | onnx-model:] 3 |  4 | INPUT0Y"Log 5 | test-modelZ 6 | INPUT0 7 |  8 |  9 |  10 |  11 | b 12 | Y 13 |  14 |  15 |  16 |  17 | B -------------------------------------------------------------------------------- /models/test_Log/test_Log.output.dimg: -------------------------------------------------------------------------------- 1 | -inf 0 0.693359 1.09863 1.38672 1.60938 1.79199 1.94629 2.08008 2.19727 2.30273 2.39844 2.48438 2.56445 2.63867 2.70898 2.77344 2.83398 2.89062 2.94531 2.99609 3.04492 3.0918 3.13477 3.17773 3.21875 3.25781 3.29492 3.33203 3.36719 3.40039 3.43359 3.46484 3.49609 3.52539 -------------------------------------------------------------------------------- /models/test_Mul_Add_Relu/input1x5x5.pgm: -------------------------------------------------------------------------------- 1 | P5 2 | 5 5 3 | 255 4 |  -------------------------------------------------------------------------------- /models/test_Mul_Add_Relu/test_Mul_Add_Relu.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/models/test_Mul_Add_Relu/test_Mul_Add_Relu.onnx -------------------------------------------------------------------------------- /models/test_Relu/test_Relu.onnx: -------------------------------------------------------------------------------- 1 |  onnc-tutorial:] 2 |  3 | INPUT0Y"Relu test_ReluZ 4 | INPUT0 5 |  6 |  7 |  8 |  9 | b 10 | Y 11 |  12 |  13 |  14 |  15 | B -------------------------------------------------------------------------------- /models/test_Relu_Log_Relu/input1x5x7.pgm: -------------------------------------------------------------------------------- 1 | P5 2 | 7 5 3 | 255 4 |  5 |  !" 
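The .output.dimg golden files above can be sanity-checked on the host. The sketch below is a standalone check (not a file from this repository): it recomputes the expected test_Log and test_Relu_Log_Relu outputs for the 1x5x7 ramp input whose pixel values run from 0 to 34. The values in the .dimg files match these up to the DLA's fp16 rounding (e.g. 0.693359 instead of 0.693147).

#include <algorithm>
#include <cmath>
#include <cstdio>

int main() {
  for (int i = 0; i < 35; ++i) {
    const double x = i;                  // ramp input pixel value
    const double logOnly = std::log(x);  // test_Log: Log(0) yields -inf
    // test_Relu_Log_Relu: the trailing Relu clamps the -inf back to 0.
    const double reluLogRelu = std::max(0.0, std::log(std::max(0.0, x)));
    std::printf("%2d: %g %g\n", i, logOnly, reluLogRelu);
  }
  return 0;
}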
-------------------------------------------------------------------------------- /models/test_Relu_Log_Relu/test_Relu_Log_Relu.log: -------------------------------------------------------------------------------- 1 | creating new runtime context... 2 | Emulator starting 3 | ppgminfo 1 5 7 4 | pgm2dimg 1 5 7 1 32 160 160 5 | submitting tasks... 6 | [ 125.794087] Enter:dla_read_network_config 7 | [ 125.794863] Exit:dla_read_network_config status=0 8 | [ 125.795039] Enter: dla_initiate_processors 9 | [ 125.795486] Enter: dla_submit_operation 10 | [ 125.795647] Prepare SDP operation index 0 ROI 0 dep_count 0 11 | [ 125.795885] Enter: dla_prepare_operation 12 | [ 125.796322] processor:SDP group:0, rdma_group:0 available 13 | [ 125.796586] Enter: dla_read_config 14 | [ 125.800336] Exit: dla_read_config 15 | [ 125.800577] Exit: dla_prepare_operation status=0 16 | [ 125.800817] Enter: dla_program_operation 17 | [ 125.801749] Program SDP operation index 0 ROI 0 Group[0] 18 | [ 125.803583] no desc get due to index==-1 19 | [ 125.803794] no desc get due to index==-1 20 | [ 125.803961] no desc get due to index==-1 21 | [ 125.804138] no desc get due to index==-1 22 | [ 125.804316] no desc get due to index==-1 23 | [ 125.804485] no desc get due to index==-1 24 | [ 125.804661] no desc get due to index==-1 25 | [ 125.806009] Enter: dla_op_programmed 26 | [ 125.806304] Exit: dla_op_programmed 27 | [ 125.806464] Exit: dla_program_operation status=0 28 | [ 125.806699] Enter: dla_enable_operation 29 | [ 125.806925] Enable SDP operation index 0 ROI 0 30 | [ 125.807286] Enter: dla_op_enabled 31 | [ 125.807448] Exit: dla_op_enabled 32 | [ 125.807590] Exit: dla_enable_operation status=0 33 | [ 125.807775] Exit: dla_submit_operation 34 | [ 125.808019] Enter: dla_dequeue_operation 35 | [ 125.808232] exit SDP as there's no further operation 36 | [ 125.808443] Exit: dla_dequeue_operation 37 | [ 125.808631] Exit: dla_initiate_processors status=0 38 | [ 125.827492] Enter:dla_handle_events, processor:BDMA 39 | [ 125.827800] Exit:dla_handle_events, ret:0 40 | [ 125.827995] Enter:dla_handle_events, processor:Convolution 41 | [ 125.828208] Exit:dla_handle_events, ret:0 42 | [ 125.828380] Enter:dla_handle_events, processor:SDP 43 | [ 125.828609] Handle op complete event, processor SDP group 0 44 | [ 125.828849] Enter:dla_op_completion processor SDP group0 45 | [ 125.829088] Completed SDP operation index 0 ROI 0 46 | [ 125.829361] 1 HWLs done, totally 1 layers 47 | [ 125.829602] Enter: dla_free_op_desc op desc index 0 ROI 0 48 | [ 125.830059] Exit: dla_free_op_desc 49 | [ 125.830240] Exit:dla_op_completion processor SDP group0 status=0 50 | [ 125.830490] Exit:dla_handle_events, ret:0 51 | [ 125.830669] Enter:dla_handle_events, processor:PDP 52 | [ 125.830877] Exit:dla_handle_events, ret:0 53 | [ 125.831050] Enter:dla_handle_events, processor:CDP 54 | [ 125.831255] Exit:dla_handle_events, ret:0 55 | [ 125.831435] Enter:dla_handle_events, processor:RUBIK 56 | [ 125.831653] Exit:dla_handle_events, ret:0 57 | [ 125.833915] reset engine done 58 | Work Found! 
59 | Work Done 60 | [ 125.981455] Enter:dla_read_network_config 61 | [ 125.981911] Exit:dla_read_network_config status=0 62 | [ 125.982142] Enter: dla_initiate_processors 63 | [ 125.982454] Enter: dla_submit_operation 64 | [ 125.982626] Prepare SDP operation index 0 ROI 0 dep_count 0 65 | [ 125.982862] Enter: dla_prepare_operation 66 | [ 125.983126] processor:SDP group:1, rdma_group:1 available 67 | [ 125.983313] Enter: dla_read_config 68 | [ 125.985834] Exit: dla_read_config 69 | [ 125.987172] Exit: dla_prepare_operation status=0 70 | [ 125.987372] Enter: dla_program_operation 71 | [ 125.987545] Program SDP operation index 0 ROI 0 Group[1] 72 | [ 125.988535] no desc get due to index==-1 73 | [ 125.988714] no desc get due to index==-1 74 | [ 125.989816] no desc get due to index==-1 75 | [ 125.990005] no desc get due to index==-1 76 | [ 125.990163] no desc get due to index==-1 77 | [ 125.990351] no desc get due to index==-1 78 | [ 125.990518] no desc get due to index==-1 79 | [ 125.990691] Enter: dla_op_programmed 80 | [ 125.990845] Exit: dla_op_programmed 81 | [ 125.990993] Exit: dla_program_operation status=0 82 | [ 125.991179] Enter: dla_enable_operation 83 | [ 125.991378] Enable SDP operation index 0 ROI 0 84 | [ 125.991672] Enter: dla_op_enabled 85 | [ 125.991821] Exit: dla_op_enabled 86 | [ 125.991968] Exit: dla_enable_operation status=0 87 | [ 125.992162] Exit: dla_submit_operation 88 | [ 125.992326] Enter: dla_dequeue_operation 89 | [ 125.992494] exit SDP as there's no further operation 90 | [ 125.992688] Exit: dla_dequeue_operation 91 | [ 125.994728] Exit: dla_initiate_processors status=0 92 | [ 126.026894] Enter:dla_handle_events, processor:BDMA 93 | [ 126.027179] Exit:dla_handle_events, ret:0 94 | [ 126.027387] Enter:dla_handle_events, processor:Convolution 95 | [ 126.027582] Exit:dla_handle_events, ret:0 96 | [ 126.027727] Enter:dla_handle_events, processor:SDP 97 | [ 126.027886] Handle op complete event, processor SDP group 1 98 | [ 126.028103] Enter:dla_op_completion processor SDP group1 99 | [ 126.028291] Completed SDP operation index 0 ROI 0 100 | [ 126.028471] 1 HWLs done, totally 1 layers 101 | [ 126.028628] Enter: dla_free_op_desc op desc index 0 ROI 0 102 | [ 126.028979] Exit: dla_free_op_desc 103 | [ 126.029121] Exit:dla_op_completion processor SDP group1 status=0 104 | [ 126.029332] Exit:dla_handle_events, ret:0 105 | [ 126.029481] Enter:dla_handle_events, processor:PDP 106 | [ 126.029661] Exit:dla_handle_events, ret:0 107 | [ 126.029817] Enter:dla_handle_events, processor:CDP 108 | [ 126.029995] Exit:dla_handle_events, ret:0 109 | [ 126.030146] Enter:dla_handle_events, processor:RUBIK 110 | [ 126.030323] Exit:dla_handle_events, ret:0 111 | [ 126.032432] reset engine done 112 | Shutdown signal received, exiting 113 | Test pass 114 | -------------------------------------------------------------------------------- /models/test_Relu_Log_Relu/test_Relu_Log_Relu.nvdla: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/models/test_Relu_Log_Relu/test_Relu_Log_Relu.nvdla -------------------------------------------------------------------------------- /models/test_Relu_Log_Relu/test_Relu_Log_Relu.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/models/test_Relu_Log_Relu/test_Relu_Log_Relu.onnx 
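Note that test_Relu_Log_Relu.output.dimg below differs from test_Log.output.dimg only in its first element (0 rather than -inf): the leading Relu leaves the non-negative ramp input unchanged, Log then produces -inf for the zero pixel, and the trailing Relu clamps that back to 0.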
-------------------------------------------------------------------------------- /models/test_Relu_Log_Relu/test_Relu_Log_Relu.output.dimg: -------------------------------------------------------------------------------- 1 | 0 0 0.693359 1.09863 1.38672 1.60938 1.79199 1.94629 2.08008 2.19727 2.30273 2.39844 2.48438 2.56445 2.63867 2.70898 2.77344 2.83398 2.89062 2.94531 2.99609 3.04492 3.0918 3.13477 3.17773 3.21875 3.25781 3.29492 3.33203 3.36719 3.40039 3.43359 3.46484 3.49609 3.52539 -------------------------------------------------------------------------------- /models/test_Shuffle/input.pgm: -------------------------------------------------------------------------------- 1 | P5 2 | 5 5 3 | 255 4 |  -------------------------------------------------------------------------------- /models/test_Shuffle/test_Shuffle.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/models/test_Shuffle/test_Shuffle.onnx -------------------------------------------------------------------------------- /models/test_Shuffle/test_Shuffle.output.dimg: -------------------------------------------------------------------------------- 1 | 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 36 36 36 36 36 36 36 36 36 36 36 36 36 36 36 36 36 36 36 36 36 36 36 36 36 48 48 48 48 48 48 48 48 48 48 48 48 48 48 48 48 48 48 48 48 48 48 48 48 48 -------------------------------------------------------------------------------- /models/test_group_Conv/test_group_Conv.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/models/test_group_Conv/test_group_Conv.onnx -------------------------------------------------------------------------------- /models/test_group_Conv/test_group_Conv.py: -------------------------------------------------------------------------------- 1 | import onnx 2 | from onnx import helper 3 | from onnx import AttributeProto, TensorProto, GraphProto 4 | import numpy as np 5 | 6 | def getOnesTensor(shape, name): 7 | values = np.ones(shape).flatten().astype(float) 8 | return helper.make_tensor(name=name, data_type=TensorProto.FLOAT, dims=shape, vals=values) 9 | 10 | # create input 11 | x = helper.make_tensor_value_info('x', TensorProto.FLOAT, [1, 8, 5, 5]) 12 | W = helper.make_tensor_value_info('W', TensorProto.FLOAT, [6, 4, 2, 2]) 13 | 14 | # create output 15 | y = helper.make_tensor_value_info('y', TensorProto.FLOAT, [1, 6, 4, 4]) 16 | 17 | # Convolution without padding 18 | node_def = helper.make_node( 19 | 'Conv', 20 | inputs=['x', 'W'], 21 | outputs=['y'], 22 | groups=2, 23 | kernel_shape=[2, 2], 24 | strides=[1, 1], 25 | pads=[0, 0, 0, 0], 26 | # Default values for other attributes: dilations=[1, 1] 27 | ) 28 | 29 | # create the graph 30 | graph_def = helper.make_graph( 31 | [node_def], 32 | 'test_group_Conv', 33 | [x, W], 34 | [y], 35 | [getOnesTensor([6, 4, 2, 2], 'W')] 36 | ) 37 | 38 | # create the model 39 | model_def = helper.make_model( 40 | graph_def, 41 | producer_name = 'onnc-tutorial' 42 | ) 43 | 44 | onnx.save(model_def, 'test_group_Conv.onnx') 45 | --------------------------------------------------------------------------------
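The generator above relies on the grouped-convolution shape rule: with groups=2, the 8 input channels split into two groups of 4, the [6, 4, 2, 2] weight splits into two groups of 3 filters, and a 2x2 kernel with stride 1 and no padding maps a 5x5 image to 4x4, giving y = [1, 6, 4, 4]. A minimal host-side check of that arithmetic (an illustration, not a file from this repository):

#include <cassert>

int main() {
  const int groups = 2;
  const int inC = 8, inH = 5, inW = 5;  // x = [1, 8, 5, 5]
  const int outC = 6, kH = 2, kW = 2;   // W = [6, 4, 2, 2]

  // Each filter sees only inC / groups channels of the input.
  assert(inC % groups == 0 && outC % groups == 0);
  assert(inC / groups == 4);            // matches W's second dimension

  // Stride 1, no padding, no dilation.
  const int outH = inH - kH + 1;
  const int outW = inW - kW + 1;
  assert(outH == 4 && outW == 4);       // matches y = [1, 6, 4, 4]
  return 0;
}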