├── ISCA2020-slides ├── IISCA2020_ONNC_Software_Architecture_Overview.pdf ├── ISCA2020_Graph_Level_Optimization.pdf ├── ISCA2020_Hardware_Optimization_Pass.pdf ├── ISCA2020_Introduction_of_ONNC_C_Backend.pdf ├── ISCA2020_Lab_ONNC_Working_Environment_Setup.pdf ├── ISCA2020_Nvdla_Overview.pdf ├── ISCA2020_ONNC_CIM.pdf ├── ISCA2020_ONNC_Software_Architecture_Overview.pdf ├── ISCA2020_ONNC_WASM_Project.pdf ├── ISCA2020_Porting_ONNC_To_NVDLA.pdf └── ISCA2020_Programming_Tips.pdf ├── README.md ├── figures ├── add-data_flow.png ├── bad_mapping_Add_Mul_Relu.png ├── compute_graph.png ├── cortexm_code_snapshot.png ├── cortexm_flow.png ├── ir_graph_get_input.png ├── ir_graph_get_output.png ├── loadable-file.png ├── loadable_and_driver.png ├── loadable_code_emit.png ├── loadable_sdp.png ├── loadable_task_info.png ├── loadable_tasks.png ├── mnist_demo.gif ├── mnist_demo_setup.png ├── nvdla-architecture.png ├── onnc-software-architecture-diagram.png ├── processing_open_file.png ├── processing_run.png ├── resnet50-partial.png ├── rubik_split_and_merge.png ├── runOnModule.png ├── runtime_env.png ├── sdp_x1_condition_1.png ├── sdp_x1_datapath.png ├── shuffle_visualization.png ├── shufflenet_partial.png ├── softmax_dataflow.png ├── test_Add.png ├── test_Conv_Relu.png ├── test_Conv_Relu_onnc_ir.png ├── test_Log.png ├── test_Mul_Add_Relu.png ├── test_Mul_Add_Relu_compound_IR.png ├── test_Mul_Add_Relu_original_IR.png ├── test_Mul_Add_Relu_reordered_IR.png ├── test_Relu.png ├── test_Relu_Log_Relu.png ├── test_Shuffle.png ├── test_Shuffle_adjusted_ONNC_IR.png └── test_Shuffle_original_ONNC_IR.png ├── lab_1_Environment_Setup └── lab_1.md ├── lab_2_Digit_Recognition_with_ARM_CortexM ├── lab_2.md ├── mnist_demo_gui │ └── mnist_demo_gui.pde └── onnc-cmsis-example │ ├── .gitignore │ ├── .mbed │ ├── .mbedignore │ ├── CMSIS_5.lib │ ├── README.md │ ├── add.cpp │ ├── add.h │ ├── main.cpp │ ├── matmul.cpp │ ├── matmul.h │ ├── mbed-os.lib │ ├── mbed_settings.py │ └── run.sh ├── lab_3_Starting_New_Backend └── lab_3.md ├── lab_4_Code_Emitting ├── lab_4.md └── src │ ├── FooNvdla.tar.gz │ └── visit_Add.cpp ├── lab_5_CPU_Fallback ├── lab_5.md └── src │ ├── emu_interface.h │ ├── visit_Log.cpp │ └── visit_Relu.cpp ├── lab_6_Manipulating_ONNC_IR ├── lab_6.md └── src │ ├── FooNvdlaBackend.cpp │ ├── GraphvizONNCIRPass.cpp │ ├── GraphvizONNCIRPass.h │ └── test_Conv_Relu.dot ├── lab_7_ONNC_IR_Extension ├── lab_7.md └── src │ ├── CodeEmitVisitor.cpp │ ├── CodeEmitVisitor.h │ ├── FooNvdlaBackend.cpp │ ├── NvDlaIdentifyShufflePass.cpp │ ├── NvDlaIdentifyShufflePass.h │ ├── NvDlaShuffle.cpp │ ├── NvDlaShuffle.h │ ├── PrintONNCIRPass.cpp │ └── PrintONNCIRPass.h ├── lab_8_Mul_Add_Reordering_and_Fusion ├── lab_8.md └── src │ ├── CMakeLists.txt │ ├── CodeEmitVisitor.cpp │ ├── CodeEmitVisitor.h │ ├── FooNvdlaBackend.cpp │ ├── FooNvdlaBackend.h │ ├── Makefile.am │ ├── NvDlaAddMulRelu.cpp │ ├── NvDlaAddMulRelu.h │ ├── NvDlaFuseAddMulReluPass.cpp │ ├── NvDlaFuseAddMulReluPass.h │ ├── NvDlaReorderMulAddPass.cpp │ ├── NvDlaReorderMulAddPass.h │ ├── PrintONNCIRPass.cpp │ └── PrintONNCIRPass.h └── models ├── lenet ├── input0.output.dimg ├── input0.pgm ├── input1.pgm ├── input2.pgm ├── input4.pgm ├── input5.pgm ├── input6.pgm ├── input7.output.dimg ├── input7.pgm ├── input8.pgm ├── input9.pgm ├── lenet.nvdla └── lenet.onnx ├── quantized_mnist ├── mnist_calibration.txt └── quantized_mnist.onnx ├── test_Add ├── input1x5x7.pgm ├── out.nvdla ├── test_Add.log ├── test_Add.nvdla ├── test_Add.onnx └── test_Add.output.dimg ├── test_Conv_Relu └── 
test_Conv_Relu.onnx ├── test_Log ├── input1x5x7.pgm ├── test_Log.log ├── test_Log.nvdla ├── test_Log.onnx └── test_Log.output.dimg ├── test_Mul_Add_Relu ├── input1x5x5.pgm └── test_Mul_Add_Relu.onnx ├── test_Relu └── test_Relu.onnx ├── test_Relu_Log_Relu ├── input1x5x7.pgm ├── test_Relu_Log_Relu.log ├── test_Relu_Log_Relu.nvdla ├── test_Relu_Log_Relu.onnx └── test_Relu_Log_Relu.output.dimg ├── test_Shuffle ├── input.pgm ├── test_Shuffle.onnx └── test_Shuffle.output.dimg └── test_group_Conv ├── test_group_Conv.onnx └── test_group_Conv.py /ISCA2020-slides/IISCA2020_ONNC_Software_Architecture_Overview.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/ISCA2020-slides/IISCA2020_ONNC_Software_Architecture_Overview.pdf -------------------------------------------------------------------------------- /ISCA2020-slides/ISCA2020_Graph_Level_Optimization.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/ISCA2020-slides/ISCA2020_Graph_Level_Optimization.pdf -------------------------------------------------------------------------------- /ISCA2020-slides/ISCA2020_Hardware_Optimization_Pass.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/ISCA2020-slides/ISCA2020_Hardware_Optimization_Pass.pdf -------------------------------------------------------------------------------- /ISCA2020-slides/ISCA2020_Introduction_of_ONNC_C_Backend.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/ISCA2020-slides/ISCA2020_Introduction_of_ONNC_C_Backend.pdf -------------------------------------------------------------------------------- /ISCA2020-slides/ISCA2020_Lab_ONNC_Working_Environment_Setup.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/ISCA2020-slides/ISCA2020_Lab_ONNC_Working_Environment_Setup.pdf -------------------------------------------------------------------------------- /ISCA2020-slides/ISCA2020_Nvdla_Overview.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/ISCA2020-slides/ISCA2020_Nvdla_Overview.pdf -------------------------------------------------------------------------------- /ISCA2020-slides/ISCA2020_ONNC_CIM.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/ISCA2020-slides/ISCA2020_ONNC_CIM.pdf -------------------------------------------------------------------------------- /ISCA2020-slides/ISCA2020_ONNC_Software_Architecture_Overview.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/ISCA2020-slides/ISCA2020_ONNC_Software_Architecture_Overview.pdf -------------------------------------------------------------------------------- /ISCA2020-slides/ISCA2020_ONNC_WASM_Project.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/ISCA2020-slides/ISCA2020_ONNC_WASM_Project.pdf -------------------------------------------------------------------------------- /ISCA2020-slides/ISCA2020_Porting_ONNC_To_NVDLA.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/ISCA2020-slides/ISCA2020_Porting_ONNC_To_NVDLA.pdf -------------------------------------------------------------------------------- /ISCA2020-slides/ISCA2020_Programming_Tips.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/ISCA2020-slides/ISCA2020_Programming_Tips.pdf -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Introduction 2 | 3 | The NVIDIA Deep Learning Accelerator provides free intellectual property licensing to anyone wanting to build a chip that uses deep neural networks for inference applications. With extensive documentation and tools, many business proposals and research projects choose NVDLA as their inference engine design. However, the lack of extensible compiler support has become the major bottleneck for supporting more AI models and optimizations. This tutorial presents the first open source compiler that supports NVDLA-based designs. The ONNC compiler has broader model support than the official NVDLA compiler and relieves programmers of manually specifying the low-level details of models that are not supported by the official NVDLA compiler. It also enables opportunities for hardware customization and proprietary optimization. We will cover the overview, porting, and optimizations in three subsections. In each subsection, we will have hands-on labs to demonstrate how to run and customize the NVDLA backend in ONNC for product development and research projects. 4 | 5 | ONNC (Open Neural Network Compiler) is a retargetable compilation framework designed specifically for proprietary deep learning accelerators. Its software architecture expedites porting ONNC to any Deep Learning Accelerator (DLA) design that supports [ONNX (Open Neural Network Exchange)](https://onnx.ai/) operators. ONNC guarantees executability across every DLA by transforming ONNX models into DLA-specific binary forms and leveraging the intermediate representation (IR) design of ONNX along with effective algorithms to eliminate the overhead of data movement. **ONNC is the first open source compiler available for NVDLA-based hardware designs**. Its NVDLA backend can compile a model into an executable NVDLA Loadable file. Integrating ONNC with the NVDLA software stack opens up opportunities for developers and researchers to explore the NVDLA-based inference design at system level. 6 | 7 | This tutorial was presented at [MICRO 2019: The 52nd IEEE/ACM International Symposium on Microarchitecture (October 12th)](https://www.microarch.org/micro52/program/workshops.html#onnc), Columbus, Ohio. 8 | 9 | ## Intended Audience 10 | 11 | Researchers and practitioners in academia or industry looking for an open-source AI compiler for NVDLA-based neural network inference engines.
12 | 13 | ## Contributors 14 | 15 | * Wei-Fen Lin (weifen@skymizer.com) 16 | * Cheng-Tao Hsieh (cthsieh@skymizer.com) 17 | 18 | ## Hands-on Labs 19 | 20 | * Lab 1. [ONNC Working Environment Setup](https://github.com/ONNC/onnc-tutorial/blob/master/lab_1_Environment_Setup/lab_1.md) 21 | * Lab 2. [Digit Recognition with ARM Cortex-M](https://github.com/ONNC/onnc-tutorial/blob/master/lab_2_Digit_Recognition_with_ARM_CortexM/lab_2.md) 22 | * Lab 3. [Starting a New Backend](https://github.com/ONNC/onnc-tutorial/blob/master/lab_3_Starting_New_Backend/lab_3.md) 23 | * Lab 4. [Code Emitting](https://github.com/ONNC/onnc-tutorial/blob/master/lab_4_Code_Emitting/lab_4.md) 24 | * Lab 5. [CPU Fallback Support](https://github.com/ONNC/onnc-tutorial/blob/master/lab_5_CPU_Fallback/lab_5.md) 25 | * Lab 6. [Manipulating ONNC IR and Optimization](https://github.com/ONNC/onnc-tutorial/blob/master/lab_6_Manipulating_ONNC_IR/lab_6.md) 26 | * Lab 7. [ONNC IR Extension](https://github.com/ONNC/onnc-tutorial/blob/master/lab_7_ONNC_IR_Extension/lab_7.md) 27 | * Lab 8. [Hardware-specific Optimization](https://github.com/ONNC/onnc-tutorial/blob/master/lab_8_Mul_Add_Reordering_and_Fusion/lab_8.md) 28 | 29 | ## References 30 | 31 | ### Papers 32 | 33 | * W. F. Lin, D. Y. Tsai, L. Tang, C. T. Hsieh, C. Y. Chou, P. H. Chang, and L. Hsu, “ONNC: A compilation framework connecting ONNX to proprietary deep learning accelerators,” in IEEE International Conference on Artificial Intelligence Circuits and Systems (AICAS 2019). IEEE, 2019. 34 | Download PDF: [Link](https://skymizer.com/publications/Skymizer-AICAS2019.pdf) 35 | 36 | 37 | * W.F. Lin, C. T. Hsieh, C. Y. Chou, "ONNC-based Software Development Platform for Configurable NVDLA Designs", to appear in IEEE International Symposium on VLSI Design, Automation and Test (VLSI-DAT 2019). 
IEEE, 2019 38 | Download PDF: [Link](https://skymizer.com/publications/Skymizer-VLSIDAT2019.pdf) 39 | 40 | ### Documentation 41 | 42 | - [ONNC Utilities](https://github.com/ONNC/onnc/blob/master/docs/ONNC-Utilities.md) 43 | - [ONNC Pass Manager Getting Started Guide](https://github.com/ONNC/onnc/blob/master/docs/ONNC-Pass-Manager-Getting-Started-Guide.md) 44 | - [ONNC Backend Developer Guide](https://github.com/ONNC/onnc/blob/master/docs/ONNC-Backend-Porting-Guide.md) 45 | - [The Code Emitting Pass User Guide](https://github.com/ONNC/onnc/blob/master/docs/The-Code-Emitting-Pass-User-Guide.md) 46 | - [ONNC IR Extension Guide](https://github.com/ONNC/onnc/blob/master/docs/ONNC-IR-Extension-Guide.md) 47 | 48 | 49 | -------------------------------------------------------------------------------- /figures/add-data_flow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/figures/add-data_flow.png -------------------------------------------------------------------------------- /figures/bad_mapping_Add_Mul_Relu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/figures/bad_mapping_Add_Mul_Relu.png -------------------------------------------------------------------------------- /figures/compute_graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/figures/compute_graph.png -------------------------------------------------------------------------------- /figures/cortexm_code_snapshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/figures/cortexm_code_snapshot.png -------------------------------------------------------------------------------- /figures/cortexm_flow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/figures/cortexm_flow.png -------------------------------------------------------------------------------- /figures/ir_graph_get_input.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/figures/ir_graph_get_input.png -------------------------------------------------------------------------------- /figures/ir_graph_get_output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/figures/ir_graph_get_output.png -------------------------------------------------------------------------------- /figures/loadable-file.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/figures/loadable-file.png -------------------------------------------------------------------------------- /figures/loadable_and_driver.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/figures/loadable_and_driver.png -------------------------------------------------------------------------------- /figures/loadable_code_emit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/figures/loadable_code_emit.png -------------------------------------------------------------------------------- /figures/loadable_sdp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/figures/loadable_sdp.png -------------------------------------------------------------------------------- /figures/loadable_task_info.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/figures/loadable_task_info.png -------------------------------------------------------------------------------- /figures/loadable_tasks.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/figures/loadable_tasks.png -------------------------------------------------------------------------------- /figures/mnist_demo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/figures/mnist_demo.gif -------------------------------------------------------------------------------- /figures/mnist_demo_setup.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/figures/mnist_demo_setup.png -------------------------------------------------------------------------------- /figures/nvdla-architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/figures/nvdla-architecture.png -------------------------------------------------------------------------------- /figures/onnc-software-architecture-diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/figures/onnc-software-architecture-diagram.png -------------------------------------------------------------------------------- /figures/processing_open_file.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/figures/processing_open_file.png -------------------------------------------------------------------------------- /figures/processing_run.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/figures/processing_run.png -------------------------------------------------------------------------------- /figures/resnet50-partial.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/figures/resnet50-partial.png -------------------------------------------------------------------------------- /figures/rubik_split_and_merge.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/figures/rubik_split_and_merge.png -------------------------------------------------------------------------------- /figures/runOnModule.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/figures/runOnModule.png -------------------------------------------------------------------------------- /figures/runtime_env.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/figures/runtime_env.png -------------------------------------------------------------------------------- /figures/sdp_x1_condition_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/figures/sdp_x1_condition_1.png -------------------------------------------------------------------------------- /figures/sdp_x1_datapath.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/figures/sdp_x1_datapath.png -------------------------------------------------------------------------------- /figures/shuffle_visualization.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/figures/shuffle_visualization.png -------------------------------------------------------------------------------- /figures/shufflenet_partial.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/figures/shufflenet_partial.png -------------------------------------------------------------------------------- /figures/softmax_dataflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/figures/softmax_dataflow.png -------------------------------------------------------------------------------- /figures/test_Add.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/figures/test_Add.png -------------------------------------------------------------------------------- /figures/test_Conv_Relu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/figures/test_Conv_Relu.png -------------------------------------------------------------------------------- /figures/test_Conv_Relu_onnc_ir.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/figures/test_Conv_Relu_onnc_ir.png -------------------------------------------------------------------------------- /figures/test_Log.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/figures/test_Log.png -------------------------------------------------------------------------------- /figures/test_Mul_Add_Relu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/figures/test_Mul_Add_Relu.png -------------------------------------------------------------------------------- /figures/test_Mul_Add_Relu_compound_IR.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/figures/test_Mul_Add_Relu_compound_IR.png -------------------------------------------------------------------------------- /figures/test_Mul_Add_Relu_original_IR.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/figures/test_Mul_Add_Relu_original_IR.png -------------------------------------------------------------------------------- /figures/test_Mul_Add_Relu_reordered_IR.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/figures/test_Mul_Add_Relu_reordered_IR.png -------------------------------------------------------------------------------- /figures/test_Relu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/figures/test_Relu.png -------------------------------------------------------------------------------- /figures/test_Relu_Log_Relu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/figures/test_Relu_Log_Relu.png -------------------------------------------------------------------------------- /figures/test_Shuffle.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/figures/test_Shuffle.png -------------------------------------------------------------------------------- /figures/test_Shuffle_adjusted_ONNC_IR.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/figures/test_Shuffle_adjusted_ONNC_IR.png -------------------------------------------------------------------------------- /figures/test_Shuffle_original_ONNC_IR.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/figures/test_Shuffle_original_ONNC_IR.png -------------------------------------------------------------------------------- /lab_1_Environment_Setup/lab_1.md: 
-------------------------------------------------------------------------------- 1 | # ONNC Working Environment Setup 2 | 3 | ## Preface 4 | 5 | This tutorial focuses on using ONNC to generate Loadables that contain DNN model graph information for running inference on NVDLA-based SoCs. Most information in this tutorial is specifically tailored to NVDLA backend porting. 6 | To facilitate the software development process, an ONNC Docker image is available on the Docker Hub for fast deployment. It has pre-installed dependent libraries and is a ready-to-run working environment. Users can mount the ONNC source code into the Docker container and build the source code inside the container. In addition, the built ONNC binary can be executed to compile deep neural network (DNN) models inside the container. ONNC currently provides two backend implementations in the GitHub release v1.2. For the x86 backend, users may run model inference using the embedded interpreter, ONNI. For the NVDLA backend, a Loadable file that contains model graph information is generated after compilation. Users may simulate the model inference by running the Loadable files on an NVDLA virtual platform. The [NVIDIA Deep Learning Accelerator (NVDLA)](http://nvdla.org/index.html) release provides a full-featured virtual platform for full-system software simulation. We leverage the officially released virtual platform and make small changes for this tutorial. 7 | 8 | In the first lab, we will describe and demonstrate how to build ONNC, compile models using ONNC, and simulate the model inference on our pre-packed virtual platform. 9 | 10 | ## Prerequisite 11 | 12 | If Docker is not installed in your system, please download Docker (http://www.docker.com) and install it first. 13 | You also need to install Git (https://git-scm.com/) to retrieve the source code from Git servers. 14 | 15 | ## Preparing Source Code and Docker Images 16 | 17 | The latest ONNC source code is available on GitHub. Use the following commands to download the source code. 18 | 19 | ```sh 20 | $ git clone https://github.com/ONNC/onnc.git 21 | $ cd onnc 22 | $ git checkout tags/1.3.0 23 | $ cd .. 24 | ``` 25 | 26 | Use the following command to download the tutorial material. There are some example DNN models and code snippets you will use in the subsequent labs. 27 | 28 | ```sh 29 | $ git clone https://github.com/ONNC/onnc-tutorial.git 30 | ``` 31 | 32 | Pull the Docker images from the Docker Hub using the following commands. 33 | 34 | ```sh 35 | # We need two Docker images. 36 | 37 | $ docker pull onnc/onnc-community 38 | $ docker pull onnc/vp 39 | ``` 40 | 41 | To verify that the Docker images were downloaded successfully, use the following command to show all available Docker images. You should see both `onnc/onnc-community` and `onnc/vp` images. 42 | 43 | ```sh 44 | $ docker images 45 | REPOSITORY TAG IMAGE ID CREATED SIZE 46 | onnc/onnc-community latest fdd06c76c519 2 days ago 5.58GB 47 | onnc/vp latest 889c00396ea1 2 days ago 2.16GB 48 | ``` 49 | 50 | 51 | ## Building ONNC and Compiling DNN Models 52 | 53 | Use the following command to bring up the ONNC-community Docker. 54 | 55 | ```sh 56 | $ docker run -ti --rm -v <absolute path to onnc>:/onnc/onnc -v <absolute path to onnc-tutorial>:/tutorial onnc/onnc-community 57 | ``` 58 | 59 | * `<absolute path to onnc>` is the directory where you cloned the ONNC source code. Note that it must be an absolute path rather than a relative path. 60 | * `<absolute path to onnc-tutorial>` is the directory where you cloned the ONNC tutorial material.
61 | * The `-ti` option provides an interactive interface for the container. 62 | * The `--rm` option will automatically clean up the container when the container exits. 63 | * The `-v` option mounts the directory to the Docker container. With this option, you can make changes to the source code (`<absolute path to onnc>`) outside the Docker container with your favorite editor, and the changes can be seen inside the Docker container and get compiled. 64 | 65 | Within the Docker container, use the following commands to build ONNC. 66 | 67 | ```sh 68 | # Within onnc/onnc-community Docker container 69 | 70 | $ cd /onnc/onnc-umbrella/build-normal 71 | 72 | # Build ONNC. 73 | $ smake -j8 install 74 | ``` 75 | 76 | * The `smake` command synchronizes the build directory with `/onnc` and invokes the make command to build ONNC. 77 | * The `-j8` option is to parallelize compilation with 8 CPU cores. 78 | * This command will automatically install the compiled binary in this container environment. 79 | 80 | ```sh 81 | # Run ONNC to compile a DNN model. 82 | $ onnc -mquadruple nvdla /tutorial/models/lenet/lenet.onnx 83 | 84 | # Prepare the compiled output file for the virtual platform to run. 85 | $ sudo mv out.nvdla /tutorial/models/lenet/ 86 | ``` 87 | 88 | You may use the following command to exit the Docker prompt. 89 | 90 | ```sh 91 | # Within the onnc/onnc-community Docker container 92 | $ exit 93 | ``` 94 | 95 | ## Performing Model Inference on Virtual Platform 96 | 97 | When you finish building ONNC and compiling a DNN model, you do not need the `onnc/onnc-community` Docker anymore. Start another console/terminal on your computer to enter the other Docker image called `onnc/vp` for model inference. 98 | 99 | ```sh 100 | # Within your computer console 101 | 102 | $ docker run -ti --rm -v <absolute path to onnc-tutorial>:/tutorial onnc/vp 103 | ``` 104 | 105 | The virtual platform in this Docker is used to simulate the NVDLA runtime environment. As the following figure shows, the virtual platform contains a SystemC model for the NVDLA hardware as well as a CPU emulator, where a Linux OS and NVDLA drivers are running to drive the NVDLA hardware. 106 | 107 | <img src="../figures/runtime_env.png"> 108 | 109 | Within the VP Docker container, use the following commands to activate the virtual platform. 110 | 111 | ```sh 112 | # Within onnc/vp Docker container 113 | 114 | $ cd /usr/local/nvdla 115 | 116 | # Prepare loadable, input, and golden output for future use. 117 | $ cp /tutorial/models/lenet/* . 118 | 119 | # Run the virtual platform. 120 | $ aarch64_toplevel -c aarch64_nvdla.lua 121 | 122 | SystemC 2.3.0-ASI --- Oct 9 2017 04:21:14 123 | Copyright (c) 1996-2012 by all Contributors, 124 | ALL RIGHTS RESERVED 125 | 126 | No sc_log specified, will use the default setting 127 | verbosity_level = SC_MEDIUM 128 | bridge: tlm2c_elaborate.. 129 | [ 0.000000] Booting Linux on physical CPU 0x0 130 | # ... 131 | Initializing random number generator... done. 132 | Starting network: udhcpc: started, v1.27.2 133 | udhcpc: sending discover 134 | udhcpc: sending select for 10.0.2.15 135 | udhcpc: lease of 10.0.2.15 obtained, lease time 86400 136 | deleting routers 137 | adding dns 10.0.2.3 138 | OK 139 | Starting sshd: [ 4.590433] NET: Registered protocol family 10 140 | [ 4.606182] Segment Routing with IPv6 141 | OK 142 | 143 | Welcome to Buildroot 144 | nvdla login: 145 | ``` 146 | 147 | By starting the virtual platform, a Linux kernel is brought up and stops at the login prompt.
148 | 149 | * nvdla login: root 150 | * Password: nvdla 151 | 152 | After logging in to the Linux prompt, use the following commands to install the drivers. 153 | 154 | ```sh 155 | # Within the virtual platform 156 | 157 | $ mount -t 9p -o trans=virtio r /mnt && cd /mnt 158 | 159 | # Install KMD. 160 | $ insmod drm.ko && insmod opendla.ko 161 | [ 469.730339] opendla: loading out-of-tree module taints kernel. 162 | [ 469.734509] reset engine done 163 | [ 469.737998] [drm] Initialized nvdla 0.0.0 20171017 for 10200000.nvdla on minor 0 164 | ``` 165 | 166 | Up to this point, everything is ready for running model inference. In this lab, we demonstrate with a real-world model, LeNet, which is used for hand-written digit recognition. We have prepared some 28x28 images (`.pgm` files) to represent digit numbers 0 to 9. We begin with running model inference to recognize digit number 0 with input file `input0.pgm`. The inference simulation will take a few minutes. 167 | 168 | ```sh 169 | # Within the virtual platform 170 | 171 | # Run the NVDLA runtime (containing UMD) to do model inference. 172 | $ ./nvdla_runtime --loadable out.nvdla --image input0.pgm --rawdump 173 | creating new runtime context... 174 | Emulator starting 175 | # ... 176 | [ 126.029817] Enter:dla_handle_events, processor:CDP 177 | [ 126.029995] Exit:dla_handle_events, ret:0 178 | [ 126.030146] Enter:dla_handle_events, processor:RUBIK 179 | [ 126.030323] Exit:dla_handle_events, ret:0 180 | [ 126.032432] reset engine done 181 | Shutdown signal received, exiting 182 | Test pass 183 | ``` 184 | 185 | After the simulation is done, we obtain an output file `output.dimg` containing the model output values. 186 | In this example, the output file should look as follows: 187 | 188 | ```sh 189 | $ more output.dimg 190 | 149.25 -49.625 13.875 11.2344 -59.8125 -2.61523 7.80078 -44.7188 30.8594 17.3594 191 | ``` 192 | 193 | In the file, there are ten numbers indicating the confidence levels of the 10 digits from 0 to 9, respectively. 194 | For example, the first number 149.25 indicates the confidence level of digit 0, the next -49.625 of digit 1, and so on. Among those numbers, the largest one implies the recognition result. In this case, the first number 149.25 is the largest one, so the corresponding digit 0 is the recognition result. 195 | 196 | After the experiment, you can use the following command to exit the virtual platform. 197 | 198 | ```sh 199 | # Within the virtual platform 200 | $ poweroff 201 | ``` 202 | 203 | Use the following command to exit the `onnc/vp` Docker prompt. 204 | 205 | ```sh 206 | # Within the onnc/vp Docker container 207 | $ exit 208 | ``` 209 | -------------------------------------------------------------------------------- /lab_2_Digit_Recognition_with_ARM_CortexM/lab_2.md: -------------------------------------------------------------------------------- 1 | # Digit Recognition with ARM Cortex-M 2 | 3 | 4 | ## Preface 5 | 6 | Machine learning is moving to the edge. People want to have edge computing capability on embedded devices to provide more advanced services, like voice recognition for smart speakers and face detection for surveillance cameras. The Arm Cortex-M processor family is a range of scalable, energy-efficient and easy-to-use processors that meet the needs of smart and connected embedded applications. Cortex Microcontroller Software Interface Standard (CMSIS) is a vendor-independent hardware abstraction layer for the Cortex-M processor series.
Research (https://arxiv.org/abs/1801.06601) has shown that machine learning workloads gain a proven 4.6X performance boost on the Cortex-M platform with the new CMSIS-NN software framework. In this lab, we will introduce an ONNC backend, the `CortexM` backend, for the ARM Cortex-M microprocessor and demonstrate an end-to-end application, handwriting recognition. The CortexM backend integrates the [CMSIS-NN library](https://github.com/ARM-software/CMSIS_5) that provides a set of computing functions for several popular operators in deep neural network (DNN) models, such as convolution, maximum pooling, etc. The library is optimized for speeding up the model inference on ARM Cortex-M CPUs. The following figure shows an example of the mapping between the model operators and the CMSIS-NN function calls. 7 | 8 | <img src="../figures/cortexm_code_snapshot.png"> 9 | 10 | In this lab, we will use an end-to-end application to demonstrate how easily the ONNC framework supports AI inference for a target hardware. 11 | 12 | ## Deploying MNIST Model Inference on an Embedded System 13 | 14 | The following diagram depicts how we deploy MNIST model inference on a Cortex-M platform. 15 | 16 | <img src="../figures/cortexm_flow.png"> 17 | 18 | Typically, machine learning models are trained with floating-point data on GPU graphic cards or servers, but running inference in lower precision is preferred on embedded devices due to limited computation power. Fortunately, several research papers have shown that quantizing the data into integers can usually be done without any loss of performance (i.e., accuracy). In this lab, we have prepared a quantized MNIST model in ONNX format. The input data and the weights are all 8-bit integers. When running inference, the internal computation datapath might have a higher precision than 8 bits to avoid accuracy loss, but the activation data precision is converted back to 8 bits in the implementation. Many CMSIS-NN functions simply use "shift-right" logic to perform the bit-width conversion. The amount of shift-right is typically determined together with the weight quantization, so we leave it as one user input in the Cortex-M backend. The ONNX model format does not contain calibration information on the activation data. We have prepared a separate file, called the calibration file, to store the shift-right information. 19 | 20 | After compiling the MNIST model inference application (as a `.cpp` file) using ONNC, we use the ARM cross-compiler to compile and link the application and the CMSIS-NN library together. The application software depends on the underlying embedded system and the target application. Users may find hardware-dependent information from vendors. Once the firmware binary is ready, we upload the binary file into the target board with an ISP tool that should be provided by the board vendor. 21 | 22 | ## Prerequisite 23 | 24 | If Docker is not installed in your system, please download Docker (http://www.docker.com) and install it first. In addition, you need to install Git (https://git-scm.com/) to fetch the source code from the GitHub server. Furthermore, the demonstration uses a popular GUI programming framework, Processing; please install it (https://processing.org/) as well. Lastly, you need to prepare a development board equipped with an ARM Cortex-M CPU. We suggest using [Mbed compatible boards](https://os.mbed.com/platforms/) because we use the [Mbed framework](https://www.mbed.com/en/) for the firmware compilation. If your board is not compatible with Mbed, you might need to rewrite some demonstration code following the guidelines from the board vendor.
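Before moving on, the "shift-right" conversion described in the deployment section above can be made concrete with a small sketch. The code below is illustrative only: `requantize_add` and its parameters are hypothetical names rather than the CMSIS-NN API, and we assume a real kernel would saturate the result to the q7 range as shown.

```cpp
#include <algorithm>
#include <cstdint>

// Hypothetical sketch: add two 8-bit activations in a wider datapath,
// then shift right to convert the result back to 8 bits.
int8_t requantize_add(int8_t a, int8_t b, int right_shift)
{
  int32_t acc = static_cast<int32_t>(a) + static_cast<int32_t>(b); // higher-precision accumulate
  acc >>= right_shift;                                             // bit-width conversion by shift-right
  acc = std::min<int32_t>(127, std::max<int32_t>(-128, acc));      // saturate to the q7 range
  return static_cast<int8_t>(acc);
}
```

The `right_shift` amount here plays the same role as the per-layer shift-right values stored in the calibration file mentioned above.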
25 | 26 | ## Preparing Source Code and Docker Images 27 | 28 | The ONNC source code for Cortex-M is available online. Use the following command to download the ONNC source code. 29 | 30 | ```sh 31 | $ git clone -b CortexM https://github.com/ONNC/onnc.git 32 | ``` 33 | 34 | Next, use the following command to download the tutorial source code. There are some example DNN models you will use in this lab. 35 | 36 | ```sh 37 | $ git clone https://github.com/ONNC/onnc-tutorial.git 38 | ``` 39 | 40 | Pull the Docker images from the Docker Hub using the following commands. 41 | 42 | ```sh 43 | # Obtain the ONNC compilation environment. 44 | $ docker pull onnc/onnc-community 45 | 46 | # Obtain the ARM cross-compilation environment. 47 | $ docker pull misegr/mbed-cli 48 | ``` 49 | 50 | To verify that the Docker images were downloaded successfully, use the following command to show all available Docker images. You should see both `onnc/onnc-community` and `misegr/mbed-cli` images. 51 | 52 | 53 | ```sh 54 | $ docker images 55 | REPOSITORY TAG IMAGE ID CREATED SIZE 56 | onnc/onnc-community latest fdd06c76c519 2 days ago 5.58GB 57 | misegr/mbed-cli latest a708c25bd4d9 2 weeks ago 2.85GB 58 | ``` 59 | 60 | ## Building ONNC and Compiling Digit-Recognition Models 61 | 62 | Use this command to bring up the ONNC-community Docker. 63 | 64 | ```sh 65 | $ docker run -ti --rm -v <absolute path to onnc>:/onnc/onnc -v <absolute path to onnc-tutorial>:/tutorial onnc/onnc-community 66 | ``` 67 | Please refer to [lab 1: Environment Setup](../lab_1_Environment_Setup/lab_1.md) for the Docker command usage. Within the Docker container, use the following commands to build ONNC. 68 | 69 | ```sh 70 | ############################################## 71 | # Within onnc/onnc-community Docker container 72 | ############################################## 73 | 74 | $ cd /onnc/onnc-umbrella/build-normal 75 | 76 | # Build ONNC. 77 | $ smake -j8 install 78 | ``` 79 | 80 | Up to this point, you should have the ONNC binary ready to compile DNN models. As we have mentioned earlier, the DNN model used for this lab must have been quantized, and all of its weights are 8-bit integers. In addition, a calibration file with shift-right values on all activation data must be prepared as well. In this lab, we obtained the [mnist model](https://github.com/onnx/models/tree/master/vision/classification/mnist) from the ONNX model zoo, and performed post-training quantization to derive its quantized version. Once all files are ready (you may find a copy in the `<absolute path to onnc-tutorial>/models/quantized_mnist/` folder), use the following commands to compile the model and generate C code. 81 | 82 | ```sh 83 | ############################################## 84 | # Within onnc/onnc-community Docker container 85 | ############################################## 86 | 87 | # Run ONNC to compile a quantized model with calibration information. 88 | $ onnc -mquadruple cortexm /tutorial/models/quantized_mnist/quantized_mnist.onnx \ 89 | --load-calibration-file=/tutorial/models/quantized_mnist/mnist_calibration.txt 90 | 91 | # Check the output files of the Cortex-M backend. 92 | $ ls cortexm* 93 | cortexm_main.cpp cortexm_main.h cortexm_weight.h 94 | 95 | # Prepare the resulting files for the later cross-compilation. 96 | $ sudo mv cortexm* /tutorial/models/quantized_mnist 97 | ``` 98 | By now, you may find the generated files in the `<absolute path to onnc-tutorial>/models/quantized_mnist/` folder. If you want to exit the Docker prompt, use the following command.
99 | 100 | ```sh 101 | ############################################## 102 | # Within onnc/onnc-community Docker container 103 | ############################################## 104 | 105 | $ exit 106 | ``` 107 | 108 | ## Cross-Compilation of Cortex-M Machine Code 109 | 110 | When you finish the previous steps of building ONNC and compiling a DNN model, you do not need the `onnc/onnc-community` Docker anymore. You need to enter the other Docker image `misegr/mbed-cli` to compile the generated C code for the Cortex-M platform. 111 | 112 | ```sh 113 | ############################### 114 | # Within your computer console 115 | ############################### 116 | 117 | # Move the CortexM files to the onnc-cmsis-example folder. 118 | $ cd <path to onnc-cmsis-example> 119 | $ cp <absolute path to onnc-tutorial>/models/quantized_mnist/cortexm* . 120 | 121 | # Enter the cross-compilation Docker. 122 | $ docker run -ti --rm -v <path to onnc-cmsis-example>:/src misegr/mbed-cli bash 123 | ``` 124 | 125 | ```sh 126 | ########################################## 127 | # Within misegr/mbed-cli Docker container 128 | ########################################## 129 | 130 | $ cd /src 131 | $ mbed deploy 132 | [mbed] Working path "/src" (program) 133 | [mbed] Adding library "mbed-os" from "https://github.com/ARMmbed/mbed-os" at rev #367dbdf5145f 134 | [mbed] Adding library "CMSIS_5" from "https://github.com/ARM-software/CMSIS_5" at rev #c4c089d6333d 135 | [mbed] WARNING: File "RTX_V8MMF.lib" in "/src/CMSIS_5/CMSIS/RTOS2/RTX/Library/ARM" uses a non-standard .lib file extension, which is not compatible with the mbed build tools. 136 | ... 137 | [mbed] Auto-installing missing Python modules (fuzzywuzzy)... 138 | 139 | # Compile the firmware for a specific target by specifying the --target option. 140 | # Here we use NuMaker_PFM_NUC472 as an example. 141 | # Another example is DISCO_L475VG_IOT01A by STM. 142 | $ mbed compile --target NuMaker_PFM_NUC472 143 | [mbed] Working path "/src" (program) 144 | Building project src (NUMAKER_PFM_NUC472, GCC_ARM) 145 | Scan: . 146 | Scan: env 147 | ... 148 | Compile [ 99.7%]: serial_api.c 149 | Compile [ 99.9%]: spi_api.c 150 | Compile [100.0%]: test_env.cpp 151 | Link: src 152 | Elf2Bin: src 153 | +------------------+--------+-------+-------+ 154 | | Module | .text | .data | .bss | 155 | +------------------+--------+-------+-------+ 156 | | CMSIS_5/CMSIS | 1748 | 0 | 0 | 157 | | [fill] | 471 | 25 | 23 | 158 | | [lib]/c.a | 63801 | 2548 | 127 | 159 | | [lib]/gcc.a | 7200 | 0 | 0 | 160 | | [lib]/misc | 252 | 12 | 28 | 161 | | [lib]/nosys.a | 32 | 0 | 0 | 162 | | [lib]/stdc++.a | 171534 | 165 | 5676 | 163 | | add.o | 192 | 4 | 1 | 164 | | cortexm_main.o | 384 | 6082 | 15768 | 165 | | main.o | 344 | 4 | 4200 | 166 | | matmul.o | 118 | 0 | 0 | 167 | | mbed-os/drivers | 1219 | 0 | 0 | 168 | | mbed-os/features | 112 | 0 | 12345 | 169 | | mbed-os/hal | 1720 | 4 | 68 | 170 | | mbed-os/platform | 3934 | 256 | 105 | 171 | | mbed-os/rtos | 10917 | 168 | 6073 | 172 | | mbed-os/targets | 5656 | 212 | 142 | 173 | | Subtotals | 269634 | 9480 | 44556 | 174 | +------------------+--------+-------+-------+ 175 | Total Static RAM memory (data + bss): 54036 bytes 176 | Total Flash memory (text + data): 279114 bytes 177 | 178 | Image: ./BUILD/NUMAKER_PFM_NUC472/GCC_ARM/src.bin 179 | ``` 180 | 181 | The generated firmware binary file is located at `<path to onnc-cmsis-example>/BUILD/NUMAKER_PFM_NUC472/GCC_ARM/src.bin`. You can upload it to the board following the suggestions from the board vendor. The procedure is simple for an Mbed-compatible board.
Connect the target board to a (Mac, Linux, or Windows) computer via a USB cable. Then you should see an Mbed drive appear in the file browser window. Copy the `src.bin` file into that drive. 182 | 183 | ## Digit Recognition Demo 184 | 185 | The demo setup is shown below. 186 | 187 | <img src="../figures/mnist_demo_setup.png"> 188 | 189 | The board is connected to a PC via the UART connection. On the PC, we have prepared a GUI program on which you can draw digits. Please open the [GUI program](mnist_demo_gui/mnist_demo_gui.pde) with Processing as shown in the following diagram. The file path is `<absolute path to onnc-tutorial>/lab_2_Digit_Recognition_with_ARM_CortexM/mnist_demo_gui/mnist_demo_gui.pde`. 190 | 191 | <img src="../figures/processing_open_file.png"> 192 | 193 | Then run the program by clicking the "run" button as shown below. 194 | 195 | <img src="../figures/processing_run.png"> 196 | 197 | This demo accepts only one single-digit number at a time. Once you are done and click the "Submit" button on the GUI, the software will take a screenshot, transform it into a 28x28 image, and send the image to the board via the UART connection. The board will perform the model inference, and then send the classification answer back to the PC. 198 | A screenshot of the demo is shown below. 199 | 200 | <img src="../figures/mnist_demo.gif"> 201 | 202 | -------------------------------------------------------------------------------- /lab_2_Digit_Recognition_with_ARM_CortexM/mnist_demo_gui/mnist_demo_gui.pde: -------------------------------------------------------------------------------- 1 | import processing.serial.Serial; 2 | 3 | Serial _port; 4 | 5 | @Override 6 | void setup() 7 | { 8 | size(640, 280, P2D); 9 | background(0xFF); 10 | 11 | textSize(144); // Workaround P2D 12 | textAlign(CENTER); 13 | 14 | strokeWeight(20); 15 | noStroke(); 16 | 17 | // Print all available Serial ports. 18 | printArray(Serial.list()); 19 | // According to the Serial port list, choose the one with a connection to the board.
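// If the board's port is not the first entry printed by printArray() above, change portNo on the next line to the matching index.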
20 | int portNo = 0; 21 | _port = new Serial(this, Serial.list()[portNo], 115200); 22 | 23 | _clear(); 24 | _redraw(Button.NONE); 25 | } 26 | 27 | @Override 28 | void draw() {} 29 | 30 | @Override 31 | void mouseDragged() 32 | { 33 | if (mouseX < 280) { 34 | stroke(0xFF); 35 | line(pmouseX, pmouseY, mouseX, mouseY); 36 | noStroke(); 37 | } 38 | } 39 | 40 | @Override 41 | void mouseMoved() 42 | { 43 | _redraw(_button()); 44 | } 45 | 46 | @Override 47 | void mouseClicked() 48 | { 49 | switch (_button()) { 50 | case SUBMIT: 51 | _say(_recognize()); 52 | break; 53 | case CLEAR: 54 | _clear(); 55 | break; 56 | } 57 | } 58 | 59 | @Override 60 | void keyPressed() 61 | { 62 | switch (keyCode) { 63 | case ENTER: 64 | case RETURN: 65 | case ' ': 66 | case 'S': 67 | _say(_recognize()); 68 | break; 69 | case BACKSPACE: 70 | case DELETE: 71 | case 'C': 72 | _clear(); 73 | break; 74 | } 75 | } 76 | 77 | private void _say(int number) 78 | { 79 | fill(0xFF); 80 | rect(400, 0, 100, 160); 81 | fill(0); 82 | textSize(144); 83 | 84 | if (number >= 0) { 85 | text(number, 460, 140); 86 | } 87 | else { 88 | text('-', 460, 140); 89 | } 90 | } 91 | 92 | boolean result_returned; 93 | 94 | void serialEvent(Serial p) { 95 | result_returned = true; 96 | } 97 | 98 | private int _recognize() 99 | { 100 | final PImage img = get(0, 0, 280, 280); 101 | img.resize(28, 28); 102 | img.loadPixels(); 103 | 104 | final int size = img.pixels.length; 105 | final byte data[] = new byte[size]; 106 | 107 | for (int i = 0; i < size; ++i) 108 | data[i] = (byte)(img.pixels[i] >> 1 & 0x7F); 109 | 110 | result_returned = false; 111 | _port.write(data); 112 | while (!result_returned) { delay(1); } 113 | return _port.read(); 114 | } 115 | 116 | private void _clear() 117 | { 118 | fill(0); 119 | rect(0, 0, 280, 280); 120 | } 121 | 122 | private void _redraw(Button hover) 123 | { 124 | fill(hover == Button.SUBMIT ? #007ACC : 0); 125 | rect(310, 190, 140, 50, 5); 126 | 127 | fill(hover == Button.CLEAR ? #007ACC : 0); 128 | rect(470, 190, 140, 50, 5); 129 | 130 | fill(0xFF); 131 | textSize(32); 132 | text("Submit", 380, 225); 133 | text("Clear", 540, 225); 134 | } 135 | 136 | private Button _button() 137 | { 138 | if (mouseY >= 190 && mouseY < 240) { 139 | if (mouseX >= 310 && mouseX < 450) 140 | return Button.SUBMIT; 141 | else if (mouseX >= 470 && mouseX < 610) 142 | return Button.CLEAR; 143 | } 144 | return Button.NONE; 145 | } 146 | 147 | private enum Button 148 | { 149 | NONE, 150 | SUBMIT, 151 | CLEAR, 152 | } -------------------------------------------------------------------------------- /lab_2_Digit_Recognition_with_ARM_CortexM/onnc-cmsis-example/.gitignore: -------------------------------------------------------------------------------- 1 | BUILD 2 | CMSIS_5 3 | mbed-os 4 | mbed_settings.pyc 5 | -------------------------------------------------------------------------------- /lab_2_Digit_Recognition_with_ARM_CortexM/onnc-cmsis-example/.mbed: -------------------------------------------------------------------------------- 1 | TOOLCHAIN=GCC_ARM 2 | TARGET=NUMAKER_PFM_NUC472 3 | ROOT=. 
4 | -------------------------------------------------------------------------------- /lab_2_Digit_Recognition_with_ARM_CortexM/onnc-cmsis-example/.mbedignore: -------------------------------------------------------------------------------- 1 | CMSIS_5/CMSIS/CoreValidation/* 2 | CMSIS_5/CMSIS/Core/Template/* 3 | CMSIS_5/CMSIS/Driver/* 4 | CMSIS_5/CMSIS/DoxyGen/* 5 | CMSIS_5/CMSIS/Documentation/* 6 | CMSIS_5/CMSIS/DSP/Examples/* 7 | CMSIS_5/CMSIS/DSP/DSP_Lib_TestSuite/* 8 | CMSIS_5/CMSIS/DSP/Projects/* 9 | CMSIS_5/CMSIS/DAP/* 10 | CMSIS_5/CMSIS/Core_A/* 11 | CMSIS_5/CMSIS/RTOS/* 12 | CMSIS_5/CMSIS/Pack/* 13 | CMSIS_5/CMSIS/NN/Scripts/* 14 | CMSIS_5/CMSIS/NN/NN_Lib_Tests/* 15 | CMSIS_5/CMSIS/NN/Examples/* 16 | CMSIS_5/CMSIS/Utilities/* 17 | CMSIS_5/CMSIS/RTOS2/* 18 | CMSIS_5/CMSIS/Lib/* 19 | CMSIS_5/Device/* -------------------------------------------------------------------------------- /lab_2_Digit_Recognition_with_ARM_CortexM/onnc-cmsis-example/CMSIS_5.lib: -------------------------------------------------------------------------------- 1 | https://github.com/ARM-software/CMSIS_5/#c4c089d6333d5b4f2069b5287c26e2ccf74f373d 2 | -------------------------------------------------------------------------------- /lab_2_Digit_Recognition_with_ARM_CortexM/onnc-cmsis-example/README.md: -------------------------------------------------------------------------------- 1 | 2 | $ mbed deploy 3 | $ mbed compile 4 | 5 | -------------------------------------------------------------------------------- /lab_2_Digit_Recognition_with_ARM_CortexM/onnc-cmsis-example/add.cpp: -------------------------------------------------------------------------------- 1 | #include <stdio.h> /* assumed: the bracketed header names were lost in extraction */ 2 | #include <stdlib.h> /* assumed */ 3 | #include "add.h" 4 | 5 | using namespace std; 6 | 7 | void MatAdd(q7_t* input,int* input_dim,q7_t* add,int* add_dim,q7_t* output,int number_of_input_dim,int right_shift,int add_shift) {//q7_t (&answer)[K*C*H*W],q7_t (&add)[fm*H*W]; 4-D case: add[] is broadcast along the innermost (channel) dimension, 2-D case: element-wise; the addend is pre-shifted by add_shift and the sum is shifted right by right_shift 8 | //printf("length = %d\n",_msize(input_dim) / sizeof(input_dim[0])); 9 | if( number_of_input_dim == 4 ){ 10 | for(int loop_k = 0 ; loop_k < input_dim[0] ; loop_k++){ 11 | for(int loop_h = 0 ; loop_h < input_dim[1] ; loop_h++){ 12 | for(int loop_w = 0 ; loop_w < input_dim[2] ; loop_w++){ 13 | for(int loop_c = 0 ; loop_c < input_dim[3] ; loop_c++){ 14 | output[loop_k * input_dim[1] * input_dim[2] * input_dim[3] + loop_h * input_dim[2] * input_dim[3] + loop_w * input_dim[3] + loop_c] = 15 | (input[loop_k * input_dim[1] * input_dim[2] * input_dim[3] + loop_h * input_dim[2] * input_dim[3] + loop_w * input_dim[3] + loop_c] + (add[loop_c]>>add_shift) ) >> right_shift; 16 | } 17 | } 18 | } 19 | } 20 | }else{ 21 | for(int loop_h = 0 ; loop_h < input_dim[0] ; loop_h++){ 22 | for(int loop_w = 0 ; loop_w < input_dim[1] ; loop_w++){ 23 | output[ loop_h * input_dim[1] + loop_w ] = (input[ loop_h * input_dim[1] + loop_w ] + (add[ loop_h * input_dim[1] + loop_w ]>>add_shift)) >> right_shift ; 24 | //printf("index = %d,",loop_h * input_dim[1] + loop_w); 25 | } 26 | } 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /lab_2_Digit_Recognition_with_ARM_CortexM/onnc-cmsis-example/add.h: -------------------------------------------------------------------------------- 1 | #ifndef ADD_H 2 | #define ADD_H 3 | 4 | #include "arm_math.h" 5 | 6 | void MatAdd(q7_t* input,int* input_dim,q7_t* add,int* add_dim,q7_t* output,int number_of_input_dim,int right_shift,int add_shift);//input , add , output -> q7_t 7 | 8 | #endif 9 |
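As a quick aside before the next file, the standalone sketch below shows one way `MatAdd` above can be exercised in isolation. The tensor shapes and shift amounts are made-up example values, and in the actual demo the corresponding calls would come from the backend-generated `cortexm_main.cpp`. Link it against `add.cpp`, or read it simply as a signature illustration.

```cpp
#include <stdint.h>

typedef int8_t q7_t; // local stand-in for the CMSIS-DSP q7_t type

// Signature from add.h above; the implementation lives in add.cpp.
void MatAdd(q7_t* input, int* input_dim, q7_t* add, int* add_dim, q7_t* output,
            int number_of_input_dim, int right_shift, int add_shift);

int main()
{
  // 2-D case: element-wise add of two 4x3 tensors, then a right shift by 1.
  q7_t input[12]  = {10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120};
  q7_t addend[12] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
  q7_t output[12];
  int  dim[2] = {4, 3};
  MatAdd(input, dim, addend, dim, output,
         /*number_of_input_dim=*/2, /*right_shift=*/1, /*add_shift=*/0);
  // output[0] is now (10 + (2 >> 0)) >> 1 = 6.
  return 0;
}
```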
-------------------------------------------------------------------------------- /lab_2_Digit_Recognition_with_ARM_CortexM/onnc-cmsis-example/main.cpp: -------------------------------------------------------------------------------- 1 | #include "mbed.h" 2 | #include "arm_math.h" 3 | #include "cortexm_main.h" 4 | #include 5 | 6 | Serial port(USBTX, USBRX, 115200); 7 | 8 | static const int IMAGE_SIZE = 28 * 28; 9 | int input[IMAGE_SIZE]; 10 | unsigned char buffer[IMAGE_SIZE]; 11 | 12 | 13 | void 14 | pre_processing(int* image_data){ 15 | for(int i = 0 ; i < IMAGE_SIZE; i++) { 16 | image_data[i] = (image_data[i] >> 1) & 0x7f; 17 | } 18 | } 19 | 20 | int 21 | maximunloop(q7_t* img_buffer2) 22 | { 23 | int return_type = 0; 24 | int type_value = 0; 25 | for (int i = 0; i < 10 ; i++){ 26 | if(type_value < img_buffer2[i]){ 27 | type_value = img_buffer2[i]; 28 | return_type = i; 29 | } 30 | } 31 | return return_type; 32 | } 33 | 34 | 35 | void read(void){ 36 | int i; 37 | while(port.readable()==0){}; 38 | for(i=0;i> right_shift; 13 | index++; 14 | } 15 | } 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /lab_2_Digit_Recognition_with_ARM_CortexM/onnc-cmsis-example/matmul.h: -------------------------------------------------------------------------------- 1 | #ifndef MATMUL_H 2 | #define MATMUL_H 3 | #include "arm_math.h" 4 | void matmul(q7_t* input_1,int input_1_dim[2],q7_t* input_2,int input_2_dim[2],q7_t* output,int right_shift);//dim[0] is high , dim[1] is weight 5 | 6 | #endif 7 | -------------------------------------------------------------------------------- /lab_2_Digit_Recognition_with_ARM_CortexM/onnc-cmsis-example/mbed-os.lib: -------------------------------------------------------------------------------- 1 | https://github.com/ARMmbed/mbed-os/#367dbdf5145f4d6aa3e483c147fe7bda1ce23a36 2 | -------------------------------------------------------------------------------- /lab_2_Digit_Recognition_with_ARM_CortexM/onnc-cmsis-example/mbed_settings.py: -------------------------------------------------------------------------------- 1 | """ 2 | mbed SDK 3 | Copyright (c) 2016 ARM Limited 4 | 5 | Licensed under the Apache License, Version 2.0 (the "License"); 6 | you may not use this file except in compliance with the License. 7 | You may obtain a copy of the License at 8 | 9 | http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, 13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | See the License for the specific language governing permissions and 15 | limitations under the License. 16 | """ 17 | 18 | from os.path import join, abspath, dirname 19 | 20 | #ROOT = abspath(join(dirname(__file__), ".")) 21 | 22 | ############################################################################## 23 | # Build System Settings 24 | ############################################################################## 25 | #BUILD_DIR = abspath(join(ROOT, "build")) 26 | 27 | # ARM 28 | #ARM_PATH = "C:/Program Files/ARM" 29 | 30 | # GCC ARM 31 | #GCC_ARM_PATH = "" 32 | 33 | # IAR 34 | #IAR_PATH = "C:/Program Files (x86)/IAR Systems/Embedded Workbench 7.0/arm" 35 | 36 | # Goanna static analyser. 
Please overload it in private_settings.py 37 | #GOANNA_PATH = "c:/Program Files (x86)/RedLizards/Goanna Central 3.2.3/bin" 38 | 39 | #BUILD_OPTIONS = [] 40 | 41 | # mbed.org username 42 | #MBED_ORG_USER = "" 43 | 44 | # Print compiler warnings and errors as link format 45 | #PRINT_COMPILER_OUTPUT_AS_LINK = False 46 | -------------------------------------------------------------------------------- /lab_2_Digit_Recognition_with_ARM_CortexM/onnc-cmsis-example/run.sh: -------------------------------------------------------------------------------- 1 | docker run -ti --rm -v /Users/weifen/work/onnc_projects/onnc-tutorial/lab_2_Digit_Recognition_with_ARM_CortexM/onnc-cmsis-example:/src misegr/mbed-cli bash 2 | -------------------------------------------------------------------------------- /lab_3_Starting_New_Backend/lab_3.md: -------------------------------------------------------------------------------- 1 | # Starting a New Backend 2 | 3 | ## Preface 4 | 5 | ONNC, as an AI compiler framework, intends to be flexible and to make it easy to incorporate a variety of deep learning accelerator (DLA) hardware. 6 | The following figure shows the software architecture of ONNC. 7 | 8 | 9 | 10 | The ONNC compiler has a general frontend framework to parse AI models and lower their representation to the ONNX IR graph. For each target hardware platform, the compiler has a corresponding backend to deal with target-dependent tasks. There are two possible paths in porting a new backend. Processor-type targets follow the left path in the above diagram to emit LLVM IR, which is then compiled to target machine code using the LLVM cross-compiler. Other proprietary DLA designs follow the right path and have a customized backend. In the case of NVDLA, we take the right path to port ONNC to the NVDLA hardware. Each backend in ONNC performs target-specific conversion, and ONNC can have multiple backends for supporting different DLAs. ONNC provides a script to generate a code skeleton for a new backend. In this tutorial, we will describe how to use the script to get a jump start in backend porting. 11 | 12 | In terms of file structure, all backend code is placed inside the directory `/lib/Target`. There are two backends available in that directory, NVDLA and X86. As the above figure shows, there are a couple of default stages in each backend, including TensorSel, TensorScheduling, MemoryAllocation, and CodeEmit. The backend design in the ONNC framework gives developers significant control over the compilation process. Developers may decide whether and how to design each stage on their own. We recommend generating a new backend using the provided backend-creating script and making the necessary modifications based on your own needs. This lab will demonstrate how to generate a new backend, how to compile ONNC, and how to run ONNC to compile an AI model. 13 | 14 | 15 | ## Lab: Creating a Backend -- FooNvdla 16 | 17 | ### Step 1: Set up environment. 18 | 19 | Please finish the following labs before continuing this lab. 20 | * [lab 1: Environment Setup](../lab_1_Environment_Setup/lab_1.md) for preparing the Docker images and ONNC source code. 21 | 22 | The backend-creating script is included in the ONNC source code and can be run within the ONNC-community Docker container. 23 | 24 | ### Step 2: Run the backend-creating script. 25 | 26 | Running the script requires certain packages to be installed in your working environment. We have prepared a pre-built working environment, the ONNC-community Docker, for fast setup.
Please run the script within the Docker container. 27 | 28 | ```sh 29 | # Use the interactive mode to enter the Docker prompt. You will run the script inside. 30 | $ docker run -ti --rm -v :/onnc/onnc onnc/onnc-community 31 | ``` 32 | 33 | We have described how to set up a pre-built working environment in [Lab 1](../lab_1_Environment_Setup/lab_1.md). If you are not familiar with the ONNC-community Docker container, please go through Lab 1 first to setup your working environment. Once you enter the Docker container, type the following commands in the prompt to create a new backend called FooNvdla. 34 | 35 | ```sh 36 | # Within the onnc/onnc-community Docker container 37 | 38 | # Go to the path where the ONNC source codes are mounted to. 39 | $ cd /onnc/onnc 40 | 41 | # Run the script to create a new backend called FooNvdla. 42 | $ ./scripts/create-new-backend.sh FooNvdla 43 | ``` 44 | 45 | The new backend FooNvdla will be placed inside the folder `/lib/Target/FooNvdla`. Since we mount the ONNC source code to the Docker, any change inside the Docker container can be seen outside the container as well. You can find the generated files on your computer outside the Docker container. 46 | 47 | ### Step 3: Compile the new backend 48 | 49 | After creating the new backend with that script, you have a runnable backend that just dumps the model information by default. In this step, let's rebuild ONNC and compile a DNN model. 50 | 51 | Use the following commands to compile ONNC with the new backend. 52 | 53 | ```sh 54 | # Within the onnc/onnc-community Docker container 55 | 56 | $ cd /onnc/onnc-umbrella/build-normal/ 57 | 58 | # Use “-j8” to invoke 8 CPU cores to do the parallel compilation. 59 | $ smake -j8 install 60 | # ... 61 | -- Up-to-date: /onnc/onnc-umbrella/install-normal/include/onnc/Diagnostic/Bits/header.h 62 | -- Up-to-date: /onnc/onnc-umbrella/install-normal/include/onnc/Diagnostic/OFStreamLog.h 63 | -- Up-to-date: /onnc/onnc-umbrella/install-normal/include/onnc/Diagnostic/Diagnostic.h 64 | -- Up-to-date: /onnc/onnc-umbrella/install-normal/include/onnc/Diagnostic/MsgHandler.h 65 | -- Up-to-date: /onnc/onnc-umbrella/install-normal/include/onnc/Diagnostic/EngineFwd.h 66 | -- Up-to-date: /onnc/onnc-umbrella/install-normal/include/onnc/Diagnostic/StreamLog.h 67 | -- Up-to-date: /onnc/onnc-umbrella/install-normal/include/onnc/Diagnostic/MsgHandling.h 68 | -- Up-to-date: /onnc/onnc-umbrella/install-normal/include/onnc 69 | -- Up-to-date: /onnc/onnc-umbrella/install-normal/include/onnc/Support 70 | -- Up-to-date: /onnc/onnc-umbrella/install-normal/include/onnc/Support/DataTypes.h 71 | -- Up-to-date: /onnc/onnc-umbrella/install-normal/include/onnc/Config 72 | -- Up-to-date: /onnc/onnc-umbrella/install-normal/include/onnc/Config/ONNX.h 73 | -- Installing: /onnc/onnc-umbrella/install-normal/include/onnc/Config/Platforms.def 74 | -- Installing: /onnc/onnc-umbrella/install-normal/include/onnc/Config/Backends.def 75 | -- Installing: /onnc/onnc-umbrella/install-normal/include/onnc/Config/Config.h 76 | ``` 77 | 78 | ### Step 4: Compile an AI model 79 | 80 | The following commands demonstrate how to compile the `AlexNet` model with the ONNC binary. 81 | 82 | ```sh 83 | # Within the onnc/onnc-community Docker container 84 | 85 | $ onnc -mquadruple foonvdla /models/bvlc_alexnet/model.onnx 86 | ``` 87 | 88 | The option `-mquadruple foonvdla` is for invoking the new backend FooNvdla. Note that ONNC only accepts lowercase letters as the backend name in this option. 
When you use uppercase letters for the new backend name, the `create-new-backend.sh` script will convert them to lowercase letters automatically. 89 | 90 | The following log shows the compilation result; it dumps the model graph information of the `AlexNet` model. 91 | 92 | ```sh 93 | FooNvdla is invoked 94 | %conv1_w_0[96, 3, 11, 11] = Initializer() 95 | %conv1_b_0[96] = Initializer() 96 | %conv2_w_0[256, 48, 5, 5] = Initializer() 97 | %conv2_b_0[256] = Initializer() 98 | %conv3_w_0[384, 256, 3, 3] = Initializer() 99 | %conv3_b_0[384] = Initializer() 100 | %conv4_w_0[384, 192, 3, 3] = Initializer() 101 | %conv4_b_0[384] = Initializer() 102 | %conv5_w_0[256, 192, 3, 3] = Initializer() 103 | %conv5_b_0[256] = Initializer() 104 | %fc6_w_0[4096, 9216] = Initializer() 105 | %fc6_b_0[4096] = Initializer() 106 | %fc7_w_0[4096, 4096] = Initializer() 107 | %fc7_b_0[4096] = Initializer() 108 | %fc8_w_0[1000, 4096] = Initializer() 109 | %fc8_b_0[1000] = Initializer() 110 | %OC2_DUMMY_1[2] = Initializer() 111 | %data_0[1, 3, 224, 224] = InputOperator() 112 | %conv1_1[1, 96, 54, 54] = Conv(%data_0[1, 3, 224, 224], %conv1_w_0[96, 3, 11, 11], %conv1_b_0[96]) 113 | %conv2_1[1, 256, 26, 26] = Conv(%pool1_1[1, 96, 26, 26], %conv2_w_0[256, 48, 5, 5], %conv2_b_0[256]) 114 | %conv3_1[1, 384, 12, 12] = Conv(%pool2_1[1, 256, 12, 12], %conv3_w_0[384, 256, 3, 3], %conv3_b_0[384]) 115 | %conv4_1[1, 384, 12, 12] = Conv(%conv3_2[1, 384, 12, 12], %conv4_w_0[384, 192, 3, 3], %conv4_b_0[384]) 116 | %conv5_1[1, 256, 12, 12] = Conv(%conv4_2[1, 384, 12, 12], %conv5_w_0[256, 192, 3, 3], %conv5_b_0[256]) 117 | = OutputOperator(%prob_1[1, 1000]) 118 | ``` 119 | 120 | Congratulations! Now you have your new backend ready. In the subsequent tutorial labs, you are going to add more functionality to the new backend. 121 | 122 | ## Files within a new backend 123 | 124 | By following the commands in the previous section, we have created a new backend FooNvdla and all the files are generated in the `lib/Target/FooNvdla` directory. The following table lists the files in the created folder. 125 | 126 | | File | Purpose | 127 | | ---- | ------- | 128 | | `FooNvdlaBackend.cpp & .h` | The main file of a backend. Developers need to modify this file to add optimization passes. | 129 | | `CodeEmitVisitor.cpp & .h` | Implementation of the `CodeEmitVisitor` class. Developers need to modify this file to handle the code generation for each operator. | 130 | | `TargetInfo/FooNvdlaTargetInfo.cpp & .h` | This file contains functions for registering this backend to the ONNC framework. | 131 | | `TargetInfo/FooNvdlaTargetMemInfo.cpp & .h` | The file for configuring memory size and alignment for each data type in neural network models. Developers need to modify this file based on the target hardware attributes to optimize memory allocation. | 132 | | `CMakeLists.txt` | Configuration file for the CMake building system. | 133 | | `Makefile.am` | Configuration file for the Autotools building system.
| 134 | 135 | 136 | -------------------------------------------------------------------------------- /lab_4_Code_Emitting/src/FooNvdla.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/lab_4_Code_Emitting/src/FooNvdla.tar.gz -------------------------------------------------------------------------------- /lab_4_Code_Emitting/src/visit_Add.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | 4 | void CodeEmitVisitor::visit(const Add& pOp) 5 | { 6 | printf("visit(Add) is called\n"); 7 | 8 | // Get tensor attributes. 9 | const Tensor& first = *(pOp.getInput(0)); 10 | const Tensor& second = *(pOp.getInput(1)); 11 | const Tensor& output = *(pOp.getOutput(0)); 12 | 13 | // For this example, we only support a special case where the first tensor is activation data 14 | // stored in memory and the 2nd tensor is a constant 15 | assert( (!isConstant(first) && isConstant(second)) && 16 | "support only the case that the first tensor is activation data and the second constant"); 17 | 18 | //-------------------------------- 19 | // Configure hardware block 20 | //-------------------------------- 21 | 22 | NvDlaDlaOperation* operation = new NvDlaDlaOperation(); 23 | // Set hardware block type. 24 | operation->op_dep.op_type = DLA_OP_SDP; 25 | 26 | struct dla_sdp_op_desc& desc = (struct dla_sdp_op_desc&)(operation->op_desc); 27 | desc.src_precision = PRECISION_FP16; 28 | desc.dst_precision = PRECISION_FP16; 29 | // No look up table is required. 30 | desc.lut_index = -1; 31 | 32 | // For this example, we only support batch == 1. 33 | desc.batch_num = 1; 34 | desc.batch_stride = 0; 35 | 36 | // Enable X1 block. 37 | desc.x1_op.enable = 1; 38 | 39 | // X1 operation Options: Disable (SDP_OP_NONE) / ALU only (SDP_OP_ADD) / 40 | // Multiplier only (SDP_OP_MUL) / ALU+MUL (SDP_OP_BOTH) 41 | desc.x1_op.type = SDP_OP_ADD; 42 | 43 | // ALU type options: SUM/MIN/MAX 44 | desc.x1_op.alu_type = SDP_ALU_OP_SUM; 45 | 46 | // Disable ReLU 47 | desc.x1_op.act = ACTIVATION_NONE; 48 | 49 | // Set per_layer/per_channel/per_point mode based on the broadcasting type. 50 | // For this example we only support per_point mode. 51 | desc.x1_op.mode = SDP_OP_PER_POINT; 52 | 53 | // Set the datapath precision to be fp16. 54 | desc.x1_op.precision = PRECISION_FP16; 55 | 56 | //---------------------------------------- 57 | // Setup dataflow sources and destination 58 | //---------------------------------------- 59 | 60 | struct dla_sdp_surface_desc& surface = (struct dla_sdp_surface_desc&)(operation->op_surf); 61 | 62 | // Setup 1st tensor source. 63 | const NvDlaCubeInfo firstCubeInfo = makeCubeInfo(*this, NVDLA_CUBE_FEATURE, first); 64 | // The 1st input tensor can be read from: 65 | // external DRAM via the interface of MCIF: DLA_MEM_MC 66 | // SRAM via the interface of CVIF: DLA_MEM_CV 67 | // the output of CONV hardware block: DLA_MEM_HW 68 | // In this example, we only support the 1st input tensor is stored at external DRAM. 69 | surface.src_data.type = DLA_MEM_MC; 70 | // Setup memory allocation and DMA configuration for 1st input tensor. 
71 | surface.src_data.address = issueDlaAddr(first, firstCubeInfo); 72 | surface.src_data.size = m_pMeta.getMemoryListEntrySize(first); 73 | surface.src_data.width = firstCubeInfo.dim_w; 74 | surface.src_data.height = firstCubeInfo.dim_h; 75 | surface.src_data.channel = firstCubeInfo.dim_c; 76 | surface.src_data.line_stride = firstCubeInfo.stride_line; 77 | surface.src_data.surf_stride = firstCubeInfo.stride_surface; 78 | 79 | // Setup 2nd tensor source. 80 | MemoryListEntryId memoryId; 81 | const NvDlaCubeInfo secondCubeInfo = makeCubeInfo(*this, getSdpXSingleCubeType(second, DLA_PRECISION), second); 82 | // The 2nd input tensor is stored at DRAM and accessed through the interface of MCIF. 83 | surface.x1_data.type = DLA_MEM_MC; 84 | // Setup memory allocation and DMA configuration for 2nd input tensor. 85 | // In addition, the 2nd tensor is constant so need be packed into a blob and becomes a part of loadable. 86 | surface.x1_data.address = issueSDPOperand(second, secondCubeInfo, memoryId); 87 | surface.x1_data.size = m_pMeta.getMemoryListEntrySize(memoryId); 88 | surface.x1_data.width = secondCubeInfo.dim_w; 89 | surface.x1_data.height = secondCubeInfo.dim_h; 90 | surface.x1_data.channel = secondCubeInfo.dim_c; 91 | surface.x1_data.line_stride = secondCubeInfo.stride_line; 92 | surface.x1_data.surf_stride = secondCubeInfo.stride_surface; 93 | 94 | // Setup output tensor destination. 95 | const NvDlaCubeInfo outputCubeInfo = makeCubeInfo(*this, NVDLA_CUBE_FEATURE, output); 96 | // The output tensor is stored at DRAM. 97 | surface.dst_data.type = DLA_MEM_MC; 98 | surface.dst_data.address = issueDlaAddr(output, outputCubeInfo); 99 | surface.dst_data.size = m_pMeta.getMemoryListEntrySize(output); 100 | surface.dst_data.width = outputCubeInfo.dim_w; 101 | surface.dst_data.height = outputCubeInfo.dim_h; 102 | surface.dst_data.channel = outputCubeInfo.dim_c; 103 | surface.dst_data.line_stride = outputCubeInfo.stride_line; 104 | surface.dst_data.surf_stride = outputCubeInfo.stride_surface; 105 | 106 | //---------------------------------------- 107 | // enlist the operation 108 | //---------------------------------------- 109 | issueDlaOp(operation, NULL, m_pMeta.m_pPrevOp); 110 | } 111 | 112 | -------------------------------------------------------------------------------- /lab_5_CPU_Fallback/src/emu_interface.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions 6 | * are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of NVIDIA CORPORATION nor the names of its 13 | * contributors may be used to endorse or promote products derived 14 | * from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 17 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 19 | * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR 20 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 21 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 22 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 23 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 24 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | */ 28 | 29 | #ifndef NVDLA_PRIV_EMU_EMU1_A_EMU_INTERFACE_H 30 | #define NVDLA_PRIV_EMU_EMU1_A_EMU_INTERFACE_H 31 | 32 | #include "dlatypes.h" 33 | 34 | #define NVDLA_EMU_MAX_BUFFERS_PER_TASK (6144) 35 | 36 | /** 37 | * @name Op Type 38 | * Network is formed using a list of these operations 39 | * @{ 40 | */ 41 | #define NVDLA_EMU_OP_POWER 0 42 | #define NVDLA_EMU_OP_SOFTMAX 1 43 | #define NVDLA_EMU_OP_LOG 2 44 | /** @} */ 45 | 46 | /** 47 | * Address 48 | */ 49 | struct emu_address 50 | { 51 | void *hMem; 52 | NvU32 offset; 53 | }; 54 | 55 | /** 56 | * Task Descriptor 57 | */ 58 | struct emu_task_desc 59 | { 60 | NvU32 num_addresses; 61 | emu_address address_list[NVDLA_EMU_MAX_BUFFERS_PER_TASK]; 62 | } __attribute__ ((packed, aligned(256))); 63 | 64 | /** 65 | * Network Descriptor 66 | * 67 | * Contains all information to execute a network 68 | * 69 | * @num_operations: Number of operations in the lists 70 | */ 71 | struct emu_network_desc 72 | { 73 | NvS16 operation_desc_index; 74 | NvS16 operation_buffer_desc_index; 75 | NvU16 num_operations; 76 | } __attribute__ ((packed, aligned(256))); 77 | 78 | struct emu_common_op_desc 79 | { 80 | NvU8 op_type; 81 | }; 82 | 83 | struct emu_power_op_desc 84 | { 85 | emu_common_op_desc common; 86 | NvF32 power; 87 | NvF32 scale; 88 | NvF32 shift; 89 | } __attribute__ ((packed, aligned(4))); 90 | 91 | struct emu_softmax_op_desc 92 | { 93 | emu_common_op_desc common; 94 | NvU8 axis; 95 | } __attribute__ ((packed, aligned(4))); 96 | 97 | struct emu_log_op_desc 98 | { 99 | emu_common_op_desc common; 100 | } __attribute__ ((packed, aligned(4))); 101 | 102 | union emu_operation_container 103 | { 104 | struct emu_power_op_desc power_op; 105 | struct emu_softmax_op_desc softmax_op; 106 | struct emu_log_op_desc log_op; 107 | }; 108 | 109 | struct emu_buffer_desc 110 | { 111 | /* offset to the actual IOVA in task.address_list */ 112 | NvS16 addressIndex; 113 | NvU32 size; 114 | 115 | /* surface format */ 116 | NvU16 format; 117 | 118 | /* cube dimensions */ 119 | NvU16 width; 120 | NvU16 height; 121 | NvU16 channel; 122 | 123 | /* stride information */ 124 | NvU32 line_stride; 125 | NvU32 surf_stride; 126 | } __attribute__ ((packed, aligned(256))); 127 | 128 | struct emu_power_buffer_descs 129 | { 130 | /* Buffer Descriptors */ 131 | struct emu_buffer_desc src_data; 132 | struct emu_buffer_desc dst_data; 133 | } __attribute__ ((packed, aligned(4))); 134 | 135 | struct emu_softmax_buffer_descs 136 | { 137 | /* Buffer Descriptors */ 138 | struct emu_buffer_desc src_data; 139 | struct emu_buffer_desc dst_data; 140 | } __attribute__ ((packed, aligned(4))); 141 | 142 | struct emu_log_buffer_descs 143 | { 144 | /* Buffer Descriptors */ 145 | struct emu_buffer_desc src_data; 146 | struct emu_buffer_desc dst_data; 147 | } __attribute__ ((packed, aligned(4))); 148 | 149 | union emu_operation_buffer_container 150 | { 151 | struct emu_power_buffer_descs power_buffers; 152 | struct emu_softmax_buffer_descs 
softmax_buffers; 153 | struct emu_log_buffer_descs log_buffers; 154 | }; 155 | 156 | 157 | #endif // NVDLA_PRIV_EMU_EMU1_A_EMU_INTERFACE_H 158 | -------------------------------------------------------------------------------- /lab_5_CPU_Fallback/src/visit_Log.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | void CodeEmitVisitor::visit(const Log& pOp) 4 | { 5 | printf("visit(Log) is called\n"); 6 | 7 | // Get tensor attributes. 8 | const Tensor& input = *(pOp.getInput(0)); 9 | const Tensor& output = *(pOp.getOutput(0)); 10 | 11 | //-------------------------------- 12 | // Configure emulator engine 13 | //-------------------------------- 14 | 15 | // Use the class NvDlaEmuOperation rather than class NvDlaDlaOperation used in the DLA case. 16 | NvDlaEmuOperation* operation = new NvDlaEmuOperation(); 17 | 18 | struct emu_log_op_desc& desc = (struct emu_log_op_desc&)(operation->op_desc); 19 | desc.common.op_type = NVDLA_EMU_OP_LOG; 20 | 21 | //---------------------------------------- 22 | // Setup dataflow sources and destination 23 | //---------------------------------------- 24 | 25 | struct emu_log_buffer_descs& surface = (struct emu_log_buffer_descs&)(operation->op_buf); 26 | 27 | // Setup input tensor source. 28 | const NvDlaCubeInfo inputCubeInfo = makeCubeInfo(*this, NVDLA_CUBE_FEATURE, input); 29 | int input_mid = m_pMeta.getMemoryListEntryId(input); 30 | surface.src_data.addressIndex = issueEmuAddr(input_mid); 31 | surface.src_data.size = m_pMeta.getMemoryListEntrySize(input_mid); 32 | surface.src_data.format = PRECISION_FP16; 33 | surface.src_data.width = inputCubeInfo.dim_w; 34 | surface.src_data.height = inputCubeInfo.dim_h; 35 | surface.src_data.channel = inputCubeInfo.dim_c; 36 | surface.src_data.line_stride = inputCubeInfo.stride_line; 37 | surface.src_data.surf_stride = inputCubeInfo.stride_surface; 38 | 39 | // Setup output tensor destination. 
40 | const NvDlaCubeInfo outputCubeInfo = makeCubeInfo(*this, NVDLA_CUBE_FEATURE, output); 41 | int output_mid = m_pMeta.getMemoryListEntryId(output); 42 | surface.dst_data.addressIndex = issueEmuAddr(output_mid); 43 | surface.dst_data.size = m_pMeta.getMemoryListEntrySize(output_mid); 44 | surface.dst_data.format = PRECISION_FP16; 45 | surface.dst_data.width = outputCubeInfo.dim_w; 46 | surface.dst_data.height = outputCubeInfo.dim_h; 47 | surface.dst_data.channel = outputCubeInfo.dim_c; 48 | surface.dst_data.line_stride = outputCubeInfo.stride_line; 49 | surface.dst_data.surf_stride = outputCubeInfo.stride_surface; 50 | 51 | //---------------------------------------- 52 | // enlist the operation 53 | //---------------------------------------- 54 | issueEmuOp(operation); 55 | } 56 | -------------------------------------------------------------------------------- /lab_5_CPU_Fallback/src/visit_Relu.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | void CodeEmitVisitor::visit(const Relu& pOp) 4 | { 5 | printf("visit(Relu) is called\n"); 6 | 7 | const Tensor* input_X_t = pOp.getInput(0); 8 | int32_t input_X_ndim = input_X_t->getNumOfDimensions(); 9 | int32_t input_X_dims[4] = {1, 1, 1, 1}; 10 | for (int i = 0; i < input_X_ndim; ++i) 11 | input_X_dims[i] = input_X_t->dimension(i); 12 | NvDlaCubeInfo X_cube(*this, NVDLA_CUBE_FEATURE, input_X_dims[0], input_X_dims[1], input_X_dims[2], input_X_dims[3]); 13 | 14 | const Tensor* output_Y_t = pOp.getOutput(0); 15 | int32_t output_Y_ndim = output_Y_t->getNumOfDimensions(); 16 | int32_t output_Y_dims[4] = {1, 1, 1, 1}; 17 | for (int i = 0; i < output_Y_ndim; ++i) 18 | output_Y_dims[i] = output_Y_t->dimension(i); 19 | 20 | NvDlaCubeInfo Y_cube(*this, NVDLA_CUBE_FEATURE, output_Y_dims[0], output_Y_dims[1], output_Y_dims[2], 21 | output_Y_dims[3]); 22 | 23 | NvDlaDlaOperation* relu_op = new NvDlaDlaOperation(); 24 | relu_op->op_dep.op_type = DLA_OP_SDP; 25 | 26 | struct dla_sdp_op_desc* relu_desc = (struct dla_sdp_op_desc*)(&(relu_op->op_desc)); 27 | relu_desc->src_precision = DLA_PRECISION; 28 | relu_desc->dst_precision = DLA_PRECISION; 29 | relu_desc->lut_index = -1; 30 | relu_desc->conv_mode = 0; 31 | relu_desc->out_cvt.scale = 1; 32 | relu_desc->out_cvt.truncate = 0; 33 | relu_desc->out_cvt.enable = 1; 34 | relu_desc->out_cvt.offset = 0; 35 | relu_desc->conv_mode = CONV_MODE_DIRECT; 36 | relu_desc->batch_num = 1; 37 | relu_desc->batch_stride = 0; 38 | relu_desc->x1_op.enable = 1; 39 | relu_desc->x1_op.alu_type = SDP_ALU_OP_SUM; 40 | relu_desc->x1_op.type = SDP_OP_NONE; 41 | relu_desc->x1_op.mode = SDP_OP_PER_LAYER; 42 | relu_desc->x1_op.act = ACTIVATION_RELU; 43 | relu_desc->x1_op.shift_value = 0; 44 | relu_desc->x1_op.truncate = 0; 45 | relu_desc->x1_op.precision = DLA_PRECISION; 46 | relu_desc->x1_op.alu_operand = 0; 47 | relu_desc->x1_op.mul_operand = 1; 48 | relu_desc->x1_op.cvt.alu_cvt.scale = 0; 49 | relu_desc->x1_op.cvt.alu_cvt.truncate = 0; 50 | relu_desc->x1_op.cvt.alu_cvt.enable = 0; 51 | relu_desc->x1_op.cvt.alu_cvt.offset = 0; 52 | relu_desc->x1_op.cvt.mul_cvt.scale = 0; 53 | relu_desc->x1_op.cvt.mul_cvt.truncate = 0; 54 | relu_desc->x1_op.cvt.mul_cvt.enable = 0; 55 | relu_desc->x1_op.cvt.mul_cvt.offset = 0; 56 | 57 | struct dla_sdp_surface_desc* relu_surf = (struct dla_sdp_surface_desc*)(&(relu_op->op_surf)); 58 | relu_surf->src_data.type = DLA_MEM_MC; 59 | relu_surf->src_data.address = issueDlaAddr(*input_X_t, X_cube); 60 | relu_surf->src_data.size = 
m_pMeta.getMemoryListEntrySize(*input_X_t); 61 | relu_surf->src_data.width = X_cube.dim_w; 62 | relu_surf->src_data.height = X_cube.dim_h; 63 | relu_surf->src_data.channel = X_cube.dim_c; 64 | relu_surf->src_data.line_stride = X_cube.stride_line; 65 | relu_surf->src_data.surf_stride = X_cube.stride_surface; 66 | relu_surf->src_data.plane_stride = X_cube.stride_plane; 67 | 68 | relu_surf->dst_data.type = DLA_MEM_MC; 69 | relu_surf->dst_data.address = issueDlaAddr(*output_Y_t, Y_cube); 70 | relu_surf->dst_data.size = m_pMeta.getMemoryListEntrySize(*output_Y_t); 71 | relu_surf->dst_data.width = Y_cube.dim_w; 72 | relu_surf->dst_data.height = Y_cube.dim_h; 73 | relu_surf->dst_data.channel = Y_cube.dim_c; 74 | relu_surf->dst_data.line_stride = Y_cube.stride_line; 75 | relu_surf->dst_data.surf_stride = Y_cube.stride_surface; 76 | relu_surf->dst_data.plane_stride = Y_cube.stride_plane; 77 | 78 | issueDlaOp(relu_op, NULL, m_pMeta.m_pPrevOp); 79 | } 80 | -------------------------------------------------------------------------------- /lab_6_Manipulating_ONNC_IR/lab_6.md: -------------------------------------------------------------------------------- 1 | # Manipulating ONNC IR and Optimization 2 | 3 | ## Preface 4 | 5 | ONNC inherits the concept of pass management from the LLVM infrastructure and 6 | the pass manager is one of the most important features in ONNC as well. Any analysis or transformation on a target program can be implemented as a pass in the ONNC framework. 7 | 8 | 9 | 10 | The above figure depicts the top-level block diagram of ONNC software stacks. The software stack illustrates the functional blocks from importing an ONNX computation graph model to emitting a hardware-executable form. Each stack is implemented as a collection of passes. In addition to leveraging the LLVM backend, ONNC paves another fast track for proprietary DLAs to execute ONNX models by defining ONNC IR, an intermediate representation (IR) that has one-to-one mapping to the ONNX IR. The `TensorSel` pass translates the ONNX IR into the ONNC IR. The subsequent passes analyze and manipulate the ONNC IR for model optimization and transformation. Many deep learning accelerators (DLAs) have limited support for the ONNX operators and only a subset of ONNX/ONNC IRs can be directly mapped to their hardware operations. In that case, some optimization passes are designed to decompose an ONNC IR into a sequence of ONNC IRs that have direct hardware operation support. For example, in the case of the NVDLA backend, a BatchNormalization operator in the ONNC IR is decomposed into a CONV followed by an ADD IR in an optimization pass before the `CodeEmit` pass. For those who intend to develop a backend, it is essential to understand the data structure and APIs for ONNC IR manipulation. In this lab, we discuss and demonstrate how to write a pass that traverses the ONNC IR of a given model. 11 | 12 | 13 | ## ONNC IR Graph 14 | 15 | Take the following model as an example. 16 | 17 | 18 | 19 | This model contains a CONV followed by a Relu. Its ONNC IR graph is depicted in the following diagram. 20 | 21 | 22 | 23 | The ONNC IR graph represents the data flow in a model. There are two types of nodes in the graph: circle-shaped nodes for compute operators and rectangle-shaped nodes for all types of values. They are implemented as the C++ classes `ComputeOperator` and `Value`, respectively. In this example, there are five `ComputeOperator`s in the graph.
Besides `Conv` and `Relu`, there are three other special ComputeOperators not shown in the original model. They are InputOperator, OutputOperator, and Initializer. InputOperator and OutputOperator, as their names suggest, represent the input and output of the model. Initializer is used to represent constants in the model such as weights. Between many pairs of ComputeOperators, there are rectangle-shaped nodes that store values for computation. Take the *Conv* node in the graph as an example. It has two input Values named *W* and *INPUT0*. They are kernel weights and input data respectively. It also has one output Value named *conv_out*, which is the output data of the convolution. 24 | 25 | ## Data Structures for ONNC IR 26 | 27 | ONNC provides a group of data structures to describe the ONNC IR. The following figure shows the overview of the data structures in the UML form. 28 | 29 | 30 | 31 | There are four major classes in the ONNC IR implementation: `class Module`, `class ComputeGraph`, `class ComputeOperator`, and `class Value`. `Class ComputeOperator` and `class Value` are already described in the previous section. `Class ComputeGraph` encloses a single **connected** DAG (Directed Acyclic Graph) of ComputeOperators and Values, whereas `class Module` encloses a set of independent ComputeGraphs. Although it is rare for a DNN model to have multiple separate, disconnected data flows in reality, ONNC introduces the concept of Module as a higher and broader abstraction for extensibility. 32 | 33 | ## Lab: Visualizing the ONNC IR Graph of a Given Model 34 | 35 | In this lab, we will write a pass to traverse the ONNC IR graph of a given model and print the graph in the [Graphviz](https://www.graphviz.org/) format. Graphviz is a formal language for describing graphs and networks. There are open source software tools available to compile the textual description into image formats such as PNG. The following code snippet shows an example of the Graphviz "script" and you may find the complete source file in [test_Conv_Relu.dot](src/test_Conv_Relu.dot). 36 | 37 | ``` 38 | digraph { 39 | ... 40 | 41 | Initializer_94153827516736 -> W 42 | InputOperator_94153828180800 -> INPUT0 43 | INPUT0 -> Conv_94153828038720 44 | W -> Conv_94153828038720 45 | Conv_94153828038720 -> conv_out 46 | conv_out -> Relu_94153827458336 47 | Relu_94153827458336 -> OUTPUT0 48 | OUTPUT0 -> OutputOperator_94153828212384 49 | } 50 | ``` 51 | 52 | Graphviz is supported by many Markdown-capable readers, and the above script may be rendered as the following picture. 53 | 54 | 55 | 56 | Let's write a pass to generate the above Graphviz script in ONNC. 57 | 58 | ### Step 1: Set up environment. 59 | 60 | Please finish the following labs before continuing this lab. 61 | 62 | * [lab 1: Environment Setup](../lab_1_Environment_Setup/lab_1.md) for preparing the Docker images and ONNC source code. 63 | * [lab 3: Starting New Backend](../lab_3_Starting_New_Backend/lab_3.md) for preparing the experimental backend `FooNvdla` for the exercise in this lab. 64 | * [lab 4: Code Emitting](../lab_4_Code_Emitting/lab_4.md) for setting up the utilities needed by the example ONNX model in this lab. 65 | 66 | After the preparation, you should have the `FooNvdla` backend ready, and its source code can be found in the `/lib/Target/FooNvdla` directory.
67 | For the rest of this lab, when we talk about modifying the source code of the NVDLA backend, we are referring to the code in the `FooNvdla` directory. 68 | 69 | ```sh 70 | $ cd /lib/Target/FooNvdla 71 | ``` 72 | 73 | ### Step 2: Create a new pass 74 | 75 | A pass is an abstraction of a unit of execution in the ONNC framework. It is designed for manipulating an ONNC IR graph to achieve a specific goal. Users may define customized pass types, register a pass with the pass manager, and let the pass manager administer its execution. 76 | 77 | First, create a pass by inheriting from the `CustomPass` abstract class. 78 | 79 | ```cpp 80 | #include <onnc/Core/CustomPass.h> 81 | 82 | class GraphvizONNCIRPass : public CustomPass<GraphvizONNCIRPass> 83 | { 84 | public: 85 | GraphvizONNCIRPass() = default; 86 | 87 | ReturnType runOnModule(Module& pModule) override; 88 | }; 89 | ``` 90 | 91 | The `CustomPass` abstract class defines several virtual functions. These member functions are invoked by the pass manager on each execution. Their prototypes are listed below: 92 | 93 | | Prototype | 94 | | --------- | 95 | | `virtual ReturnType doInitialization(Module&);` | 96 | | `virtual ReturnType runOnModule(Module&);` | 97 | | `virtual ReturnType doFinalization(Module&);` | 98 | 99 | | Method | Description | 100 | | ------ | ----------- | 101 | | `doInitialization` | The first-invoked method in a pass. Acquire resources such as files, network connections, etc. | 102 | | `runOnModule` | Implement module manipulations in this method. | 103 | | `doFinalization` | The last-called method in a pass. Release resources and prepare for the next run. | 104 | 105 | The above three methods are invoked exactly once per execution. Users can assemble meaningful values and return an informative result to the pass manager. ONNC uses an enumeration type `PassResult` for execution results. 106 | `PassResult` is usually encoded like a bit mask, and the following table lists all possible values. 107 | 108 | | Value | Description | 109 | | ----- | ----------- | 110 | | `kModuleNoChanged` | No update to the module content; nothing was done. | 111 | | `kModuleChanged` | The module was modified, or the invocation succeeded. | 112 | | `kPassRetry` | The invocation could not finish for some reason; a retry is needed. | 113 | | `kPassFailure` | The action failed. | 114 | 115 | In the following code snippet, we show the typical implementation of the function `runOnModule`. 116 | 117 | ```cpp 118 | // GraphvizONNCIRPass.cpp 119 | 120 | Pass::ReturnType GraphvizONNCIRPass::runOnModule(Module& pModule) 121 | { 122 | Pass::ReturnType ret = kModuleNoChanged; 123 | 124 | // ... 125 | // Change the value of variable `ret` if necessary. 126 | 127 | if (ret != kModuleNoChanged) { 128 | pModule.eraseUnusedValues(); 129 | } 130 | 131 | return ret; 132 | } 133 | ``` 134 | 135 | ### Step 3: Implement `GraphvizONNCIRPass` to traverse the ONNC IR 136 | 137 | In this pass, we mainly override the `runOnModule` function of GraphvizONNCIRPass, and here we utilize the ONNC framework to simplify the implementation of `runOnModule`, as the following figure shows. 138 | 139 | 140 | 141 | The actual implementation of `runOnModule` is as follows. 142 | 143 | ```cpp 144 | // GraphvizONNCIRPass.cpp 145 | 146 | Pass::ReturnType GraphvizONNCIRPass::runOnModule(Module& pModule) 147 | { 148 | Pass::ReturnType ret = kModuleNoChanged; 149 | 150 | // Call the default implementation of runOnModule(). It subsequently invokes 151 | // runOnComputeGraph() to handle each of the ComputeGraphs in the module.
152 | ret = BaseType::runOnModule(pModule); 153 | 154 | if (ret != kModuleNoChanged) { 155 | pModule.eraseUnusedValues(); 156 | } 157 | 158 | return ret; 159 | } 160 | 161 | // Use the following function to handle every ComputeGraph. 162 | Pass::ReturnType GraphvizONNCIRPass::runOnComputeGraph(ComputeGraph& pCG) 163 | { 164 | std::cout << "digraph {\n"; 165 | 166 | // Traverse ComputeOperators in the topological order. 167 | for (ComputeOperator& op : pCG) { 168 | std::string opName = op.name().str() + "_" + std::to_string((long)&op); 169 | std::cout << " " << opName << " [label=" << op.name() << "]\n"; 170 | 171 | // Traverse the input of this ComputeOperator. 172 | int numInputs = op.getNumOfInputs(); 173 | for (int i = 0; i < numInputs; ++i) { 174 | Value* input = op.getInput(i); 175 | 176 | std::cout << " " << input->getName() << " -> " << opName << "\n"; 177 | } 178 | 179 | // Traverse the output of this ComputeOperator. 180 | int numOutputs = op.getNumOfOutputs(); 181 | for (int i = 0; i < numOutputs; ++i) { 182 | Value* output = op.getOutput(i); 183 | 184 | std::cout << " " << opName << " -> " << output->getName() << "\n"; 185 | std::cout << " " << output->getName() << " [shape=rect]\n"; 186 | } 187 | } 188 | 189 | std::cout << "}\n"; 190 | 191 | // This pass does not modify the graph topology. Just return kModuleNoChanged. 192 | return Pass::kModuleNoChanged; 193 | } 194 | ``` 195 | 196 | You may copy the complete source code of [GraphvizONNCIRPass.cpp](src/GraphvizONNCIRPass.cpp) and [GraphvizONNCIRPass.h](src/GraphvizONNCIRPass.h) from the `lab_6_Manipulating_ONNC_IR/src` directory to your backend directory, `/lib/Target/FooNvdla`. 197 | 198 | ### Step 4: Register GraphvizONNCIRPass in the target backend. 199 | 200 | The following code snippet shows how to register `GraphvizONNCIRPass` in the FooNvdla backend. 201 | 202 | ```diff 203 | // FooNvdlaBackend.cpp 204 | 205 | #include "NvDlaFileGenPass.h" 206 | +#include "GraphvizONNCIRPass.h" 207 | 208 | #include 209 | +#include 210 | 211 | void FooNvdlaBackend::addOnncIrOptimization(PassManager& pPM, OptimizationOptions& options) 212 | { 213 | TargetBackend::addOnncIrOptimization(pPM, options); 214 | 215 | + // Register the pass into the pass manager, so that it can get called during the backend's execution. 216 | + pPM.add<GraphvizONNCIRPass>(); 217 | } 218 | 219 | void FooNvdlaBackend::RegisterLowers(LowerRegistry& pRegistry) const 220 | { 221 | pRegistry.emplace(); 222 | + // We need to register operator Relu because the example model in this lab contains this type of operator. 223 | + // Only with this registration can Relu be present in the ONNC IR. 224 | + pRegistry.emplace<ReluLower>(); 225 | } 226 | ``` 227 | 228 | The complete source code of [`FooNvdlaBackend.cpp`](src/FooNvdlaBackend.cpp) is also available in the `lab_6_Manipulating_ONNC_IR/src` directory. 229 | 230 | Since we created a new file, `GraphvizONNCIRPass.cpp`, for the backend, we need to declare the file addition in the building system so that it can get compiled. Modify the related cmake files as below.
231 | 232 | ```diff 233 | // CMakeLists.txt 234 | 235 | add_libonnc_src( 236 | NvDlaMemInfoPass.cpp 237 | NvDlaTaskSubmitPass.cpp 238 | NvDlaFileGenPass.cpp 239 | + GraphvizONNCIRPass.cpp 240 | ``` 241 | 242 | ```diff 243 | // Makefile.am 244 | 245 | ONNC_TARGET_SOURCES += \ 246 | Target/FooNvdla/NvDlaMemInfoPass.cpp \ 247 | Target/FooNvdla/NvDlaTaskSubmitPass.cpp \ 248 | Target/FooNvdla/NvDlaFileGenPass.cpp \ 249 | + Target/FooNvdla/GraphvizONNCIRPass.cpp \ 250 | ``` 251 | 252 | ### Step 5: Re-build ONNC and check the result 253 | 254 | Follow the instructions in Lab 1 to rebuild the ONNC source code within the ONNC-community Docker. 255 | Use the following command to bring up the ONNC-community Docker. 256 | 257 | ```sh 258 | $ docker run -ti --rm -v :/onnc/onnc -v /models:/tutorial/models onnc/onnc-community 259 | ``` 260 | 261 | Within the Docker container, use the following commands to rebuild ONNC and then use the new ONNC binary to compile the target DNN model. 262 | 263 | ```sh 264 | # Within onnc/onnc-community Docker container 265 | 266 | $ cd /onnc/onnc-umbrella/build-normal 267 | 268 | # Rebuild ONNC. 269 | $ smake -j8 install 270 | 271 | # Run ONNC to compile the DNN model. 272 | $ onnc -mquadruple foonvdla /tutorial/models/test_Conv_Relu/test_Conv_Relu.onnx 273 | FooNvdla is invoked 274 | === GraphvizONNCIRPass ====== 275 | digraph { 276 | Initializer_94890678153536 [label=Initializer] 277 | Initializer_94890678153536 -> W 278 | W [shape=rect] 279 | InputOperator_94890678817600 [label=InputOperator] 280 | InputOperator_94890678817600 -> INPUT0 281 | INPUT0 [shape=rect] 282 | Conv_94890678675520 [label=Conv] 283 | INPUT0 -> Conv_94890678675520 284 | W -> Conv_94890678675520 285 | Conv_94890678675520 -> conv_out 286 | conv_out [shape=rect] 287 | Relu_94890678095136 [label=Relu] 288 | conv_out -> Relu_94890678095136 289 | Relu_94890678095136 -> OUTPUT0 290 | OUTPUT0 [shape=rect] 291 | OutputOperator_94890678849184 [label=OutputOperator] 292 | OUTPUT0 -> OutputOperator_94890678849184 293 | } 294 | ========================== 295 | ``` 296 | 297 | Note that the number appended to each ComputeOperator is a **random** unique number such as `Initializer_94890678153536`. Since it is random, it is no surprise that your log messages will contain different random numbers. However, you should get a similar image for your script, as in the following picture. 298 | 299 | 300 | 301 | ## Summary 302 | 303 | In this lab, you have learned: 304 | 305 | * How to write a pass, and 306 | * How to traverse the ONNC IR graph and operate on each `ComputeOperator` and `Value` object. 307 | -------------------------------------------------------------------------------- /lab_6_Manipulating_ONNC_IR/src/FooNvdlaBackend.cpp: -------------------------------------------------------------------------------- 1 | //===- FooNvdlaBackend.cpp -----------------------------------------------------===// 2 | // 3 | // The ONNC Project 4 | // 5 | // See LICENSE.TXT for details.
6 | // 7 | //===----------------------------------------------------------------------===// 8 | #include 9 | 10 | #include "FooNvdlaBackend.h" 11 | #include "TargetInfo/FooNvdlaTargetInfo.h" 12 | #include "TargetInfo/FooNvdlaTargetMemInfo.h" 13 | #include "CodeEmitVisitor.h" 14 | #include "NvDlaMemInfoPass.h" 15 | #include "NvDlaTaskSubmitPass.h" 16 | #include "NvDlaFileGenPass.h" 17 | #include "GraphvizONNCIRPass.h" 18 | 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include 37 | #include 38 | #include 39 | #include 40 | #include 41 | #include 42 | #include 43 | #include 44 | #include 45 | #include 46 | #include 47 | #include 48 | #include 49 | #include 50 | #include 51 | #include 52 | #include 53 | #include 54 | #include 55 | #include 56 | #include 57 | #include 58 | #include 59 | #include 60 | #include 61 | 62 | #include 63 | 64 | using namespace onnc; 65 | 66 | //===----------------------------------------------------------------------===// 67 | // FooNvdlaBackend 68 | //===----------------------------------------------------------------------===// 69 | const Version FooNvdlaBackend::LOADABLE_VERSION = Version(1, 1, 255); 70 | const Version FooNvdlaBackend::BLOB_DLA_VERSION = Version(1, 3, 0); 71 | const Version FooNvdlaBackend::BLOB_EMU_VERSION = Version(1, 3, 0); 72 | 73 | FooNvdlaBackend::FooNvdlaBackend(const TargetOptions& pOptions) 74 | : TargetBackend(pOptions) 75 | , NvDlaConstants(getConfig(::nvdla::ConfigSet::nv_full, ::nvdla::ExecutionMode::direct, false)) 76 | , m_pMeta(*this) { 77 | m_pMemInfo = std::make_unique(); 78 | } 79 | 80 | void FooNvdlaBackend::addTensorSel(PassManager& pPM) 81 | { 82 | errs() << "FooNvdla is invoked\n"; 83 | 84 | // Do ONNX graph IR optimization here. 85 | 86 | // Translate from ONNX graph IR into ONNC IR 87 | addStandardTensorSel(pPM, *this); 88 | 89 | // Now ONNC IR is ready. 90 | // If you need to extend ONNC IR, here is the place to add your pass that 91 | // adds your ONNC IR operators. 92 | } 93 | 94 | void FooNvdlaBackend::addOnncIrOptimization(PassManager& pPM, OptimizationOptions& options) 95 | { 96 | TargetBackend::addOnncIrOptimization(pPM, options); 97 | 98 | pPM.add(); 99 | } 100 | 101 | void FooNvdlaBackend::addTensorSched(PassManager& pPM) 102 | { 103 | // After method AddTensorSel, operators have been scheduled in an 104 | // topological order, which totally respects the data dependency. 105 | // However, that might not be an optimized order for certain objective. 106 | // Add a scheduling optimization pass here. 107 | } 108 | 109 | void FooNvdlaBackend::addMemAlloc(PassManager& pPM) 110 | { 111 | // Input: Module 112 | // Output: LiveIntervals 113 | addStandardCreateLiveIntervals(pPM); 114 | 115 | // Input: LiveIntervals 116 | // Output: MemAllocs 117 | addStandardMemoryAllocation(pPM, *this); 118 | 119 | // Input: MemAllocs 120 | // Output: Virtual memory address for each memory operands. 
121 | addStandardSetMemOperands(pPM); 122 | 123 | const NvDlaConstants& constants = *this; 124 | pPM.add(constants, &m_pMeta); 125 | } 126 | 127 | void FooNvdlaBackend::addCodeEmit(PassManager& pPM, const Path& pOutput) 128 | { 129 | static foonvdla::CodeEmitVisitor ceVisitor(*this, m_pMeta); 130 | pPM.add(ceVisitor) 131 | .add(&m_pMeta, BLOB_DLA_VERSION, BLOB_EMU_VERSION) 132 | .add(&m_pMeta, LOADABLE_VERSION) 133 | ; 134 | } 135 | 136 | void FooNvdlaBackend::RegisterLowers(LowerRegistry& pRegistry) const 137 | { 138 | pRegistry.emplace(); 139 | pRegistry.emplace(); 140 | pRegistry.emplace(); 141 | pRegistry.emplace(); 142 | pRegistry.emplace(); 143 | pRegistry.emplace(); 144 | pRegistry.emplace(); 145 | pRegistry.emplace(); 146 | pRegistry.emplace(); 147 | pRegistry.emplace(); 148 | pRegistry.emplace(); 149 | pRegistry.emplace(); 150 | pRegistry.emplace(); 151 | pRegistry.emplace(); 152 | pRegistry.emplace(); 153 | pRegistry.emplace(); 154 | pRegistry.emplace(); 155 | pRegistry.emplace(); 156 | } 157 | 158 | 159 | //===----------------------------------------------------------------------===// 160 | // Non member functions 161 | //===----------------------------------------------------------------------===// 162 | TargetBackend* CreateFooNvdlaBackend(const TargetOptions& pOptions) 163 | { 164 | return new FooNvdlaBackend(pOptions); 165 | } 166 | 167 | extern "C" void InitializeFooNvdlaONNCBackend() 168 | { 169 | onnc::TargetRegistry::RegisterTargetBackend(getTheFooNvdlaTarget(), 170 | CreateFooNvdlaBackend); 171 | } 172 | 173 | -------------------------------------------------------------------------------- /lab_6_Manipulating_ONNC_IR/src/GraphvizONNCIRPass.cpp: -------------------------------------------------------------------------------- 1 | //===- GraphvizONNCIRPass.cpp ---------------------------------------------===// 2 | // 3 | // The ONNC Project 4 | // 5 | // See LICENSE.TXT for details. 6 | // 7 | //===----------------------------------------------------------------------===// 8 | #include "GraphvizONNCIRPass.h" 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | namespace onnc { 17 | namespace foonvdla { 18 | 19 | //===----------------------------------------------------------------------===// 20 | // GraphvizONNCIRPass 21 | //===----------------------------------------------------------------------===// 22 | 23 | Pass::ReturnType GraphvizONNCIRPass::runOnModule(Module& pModule) 24 | { 25 | Pass::ReturnType ret = kModuleNoChanged; 26 | 27 | ret = BaseType::runOnModule(pModule); 28 | 29 | if (ret != kModuleNoChanged) { 30 | pModule.eraseUnusedValues(); 31 | } 32 | 33 | return ret; 34 | } 35 | 36 | Pass::ReturnType GraphvizONNCIRPass::runOnComputeGraph(ComputeGraph& pCG) 37 | { 38 | std::cout << "=== GraphvizONNCIRPass ======\n"; 39 | std::cout << "digraph {\n"; 40 | 41 | // Loop over every operator in this ComputeGraph. 42 | for (ComputeOperator& op : pCG) { 43 | 44 | //------------------------------------------------------------------------------------ 45 | // Print the decleration of this operator's name according to Graphviz's requirement. 46 | //------------------------------------------------------------------------------------ 47 | 48 | std::string opName = op.name().str() + "_" + std::to_string((long)&op); 49 | std::cout << " " << opName << " [label=" << op.name() << "]\n"; 50 | 51 | //----------------------------------------------------------------- 52 | // Print the edges between this operator and all its input tensors. 
53 | //----------------------------------------------------------------- 54 | int numInputs = op.getNumOfInputs(); 55 | for (int i = 0; i < numInputs; ++i) { 56 | Value* input = op.getInput(i); 57 | 58 | std::cout << " " << input->getName() << " -> " << opName << "\n"; 59 | } 60 | 61 | //------------------------------------------------------------------- 62 | // Print the edges between this operator and all its output tensors. 63 | //------------------------------------------------------------------- 64 | int numOutputs = op.getNumOfOutputs(); 65 | for (int i = 0; i < numOutputs; ++i) { 66 | Value* output = op.getOutput(i); 67 | 68 | std::cout << " " << opName << " -> " << output->getName() << "\n"; 69 | std::cout << " " << output->getName() << " [shape=rect]\n"; 70 | } 71 | } 72 | 73 | std::cout << "}\n"; 74 | std::cout << "==========================\n"; 75 | 76 | return Pass::kModuleNoChanged; 77 | } 78 | 79 | } // namespace foonvdla 80 | } // namespace onnc 81 | -------------------------------------------------------------------------------- /lab_6_Manipulating_ONNC_IR/src/GraphvizONNCIRPass.h: -------------------------------------------------------------------------------- 1 | //===- GraphvizONNCIRPass.h -----------------------------------------------===// 2 | // 3 | // The ONNC Project 4 | // 5 | // See LICENSE.TXT for details. 6 | // 7 | //===----------------------------------------------------------------------===// 8 | #ifndef ONNC_FOONVDLA_GRAPHVIZ_ONNC_IR_PASS_H 9 | #define ONNC_FOONVDLA_GRAPHVIZ_ONNC_IR_PASS_H 10 | #include 11 | //#include 12 | 13 | namespace onnc { 14 | namespace foonvdla { 15 | 16 | class GraphvizONNCIRPass : public CustomPass 17 | { 18 | public: 19 | GraphvizONNCIRPass() = default; 20 | 21 | ReturnType runOnModule(Module& pModule) override; 22 | 23 | ReturnType runOnComputeGraph(ComputeGraph& pCG) override; 24 | }; 25 | 26 | } // namespace foonvdla 27 | } // namespace onnc 28 | 29 | #endif 30 | -------------------------------------------------------------------------------- /lab_6_Manipulating_ONNC_IR/src/test_Conv_Relu.dot: -------------------------------------------------------------------------------- 1 | digraph { 2 | Initializer_94228347773248 [label=Initializer] 3 | Initializer_94228347773248 -> W 4 | W [shape=rect] 5 | InputOperator_94228348437312 [label=InputOperator] 6 | InputOperator_94228348437312 -> INPUT0 7 | INPUT0 [shape=rect] 8 | Conv_94228348295232 [label=Conv] 9 | INPUT0 -> Conv_94228348295232 10 | W -> Conv_94228348295232 11 | Conv_94228348295232 -> conv_out 12 | conv_out [shape=rect] 13 | Relu_94228347714848 [label=Relu] 14 | conv_out -> Relu_94228347714848 15 | Relu_94228347714848 -> OUTPUT0 16 | OUTPUT0 [shape=rect] 17 | OutputOperator_94228348468896 [label=OutputOperator] 18 | OUTPUT0 -> OutputOperator_94228348468896 19 | } -------------------------------------------------------------------------------- /lab_7_ONNC_IR_Extension/lab_7.md: -------------------------------------------------------------------------------- 1 | # ONNC IR Extension 2 | 3 | ## Preface 4 | 5 | ONNC has implemented a set of ONNC IR operators in the latest release such as [Conv](https://github.com/onnx/onnx/blob/rel-1.3.0/docs/Operators.md#conv), [Relu](https://github.com/onnx/onnx/blob/rel-1.3.0/docs/Operators.md#relu), [MaxPool](https://github.com/onnx/onnx/blob/rel-1.3.0/docs/Operators.md#maxpool), etc. Many of supported operators are directly-mapped to corresponding ONNX operators. 
You may find each operator's description in the ONNX official site (https://github.com/onnx/onnx/blob/rel-1.3.0/docs/Operators.md). However, on some occasions, we may need additional tailor-made ONNC IR operators to support specific target hardware features. An example from NVDLA is the "channel shuffle" operator, which is widely utilized by one famous image classification model, [ShuffleNet](https://arxiv.org/abs/1707.01083). The following figure shows a partial graph of ShuffleNet and there is a Reshape-Transpose-Reshape concatenation highlighted in a box. 6 | 7 | 8 | 9 | This three-operator concatenation performs the channel shuffle operation as visualized in the following figure. 10 | 11 | 12 | 13 | The concatenated operation is equivalent to reordering the channels in an interleaved way. When mapping the Shuffle operator to the NVDLA hardware, we prefer to map the three concatenated operators into a sequence of RUBIK operations in NVDLA. As the following figure (from the [NVDLA official site](http://nvdla.org/hw/v1/ias/unit_description.html#split-and-merge)) shows, the RUBIK engine provides split and merge modes to reorder memory layout. 14 | 15 | 16 | 17 | With a sequence of RUBIK operations, the `Shuffle` operator can be implemented in a mathematically-equivalent software pipeline. It is critical to fuse the three model-layer operators into a single ONNC IR operator so that the compiler gets a chance to map the operator onto the target hardware efficiently. This is a good example of why we need to define proprietary IRs in some cases to support hardware-specific features. 18 | 19 | In this lab, we will discuss and demonstrate the method to extend the built-in ONNC IR with the hardware-specific operator, `Shuffle`, and then collapse the Reshape-Transpose-Reshape concatenation into a single `Shuffle` operator in the model description. 20 | 21 | ## Lab: Adding a Hardware-Specific Operator, `Shuffle` 22 | 23 | The following figure shows the example model, [`test_Shuffle.onnx`](../models/test_Shuffle/test_Shuffle.onnx), used in this lab. It contains a Reshape-Transpose-Reshape concatenation that equivalently performs a `Shuffle` operation. 24 | 25 | 26 | 27 | Given the above model, ONNC initially transforms the model into an ONNC IR graph as depicted in the following diagram. The `Reshape` and `Transpose` operators in the given model are directly mapped to the `Reshape` and `Transpose` ONNC IRs respectively. 28 | 29 | 30 | 31 | The goal of this lab is to demonstrate how to define a new ONNC IR and use it for an optimization pass. The optimization pass will convert the above graph to the following graph, where the Reshape-Transpose-Reshape concatenation is replaced by a single `Shuffle`. 32 | 33 | 34 | 35 | ### Step 1: Set up environment. 36 | 37 | Please finish the following labs before continuing this lab. 38 | 39 | * [lab 1: Environment Setup](../lab_1_Environment_Setup/lab_1.md) for preparing the Docker images and ONNC source code. 40 | * [lab 3: Starting New Backend](../lab_3_Starting_New_Backend/lab_3.md) for preparing the experimental backend `FooNvdla` for the exercise in this lab. 41 | * [lab 4: Code Emitting](../lab_4_Code_Emitting/lab_4.md) for setting up the utilities needed by this lab. 42 | 43 | After the preparation, you should have the `FooNvdla` backend ready in `/lib/Target/FooNvdla`. 44 | For the rest of this lab, all code modifications are made in the `FooNvdla` directory.
45 | 46 | ```sh 47 | $ cd /lib/Target/FooNvdla 48 | ``` 49 | 50 | ### Step 2: Define a new ONNC IR operator 51 | 52 | To define a new ONNC IR, you need to create a new IR class inheriting from the `class ComputeOperator`. In this lab, a new `class NvDlaShuffle` is declared for the new `Shuffle` operator. There are two sets of methods and variables in this class. One set is associated with the operator attributes. For example, the `Shuffle` operator needs a variable for the attribute, "group", which indicates how to interleave the channels. The other set is mandatory and common to all operators in order to meet the ONNC framework requirement. For example, the `accept()` method is used by the [visitor design pattern](https://en.wikipedia.org/wiki/Visitor_pattern) in ONNC for performing optimization on every operator. 53 | 54 | ```cpp 55 | // Compute/NvDlaShuffle.h 56 | 57 | class NvDlaShuffle : public ComputeOperator 58 | { 59 | public: 60 | // This variable is mandatory for all operators. Do not omit it. 61 | static char ID; 62 | 63 | public: 64 | NvDlaShuffle(int group) 65 | : ComputeOperator("Shuffle", ID) // Set "Shuffle" as the operator's type name. 66 | , m_Group(group) // Set the "group" attribute of this operator. 67 | {} 68 | 69 | virtual ~NvDlaShuffle() {} 70 | 71 | // Operator-specific methods. 72 | const IntAttr& getGroup() const { return m_Group; } 73 | 74 | // Mandatory utility methods. Do not omit them. 75 | Tensor* getInput(unsigned int pIdx) override { return static_cast<Tensor*>(m_Inputs[pIdx]); } 76 | // ... 77 | void printAttributes(std::ostream& pOS) const override; 78 | void accept(ComputeVisitor& pV) override; 79 | void accept(ComputeVisitor& pV) const override; 80 | static bool classof(const ComputeOperator* pOp); 81 | 82 | private: 83 | IntAttr m_Group; // Operator-specific attribute 84 | }; 85 | ``` 86 | 87 | After the class declaration in the header file, its class implementation is shown in the following code snippet. 88 | 89 | ```cpp 90 | // Compute/NvDlaShuffle.cpp 91 | 92 | // Mandatory implementation. Every ONNC IR operator follows the same coding. 93 | char NvDlaShuffle::ID = 0; 94 | 95 | // Operator-specific implementation 96 | void NvDlaShuffle::printAttributes(std::ostream& pOS) const 97 | { 98 | pOS << ""; 99 | } 100 | 101 | // Mandatory implementation. Every ONNC IR operator follows the same coding. 102 | void NvDlaShuffle::accept(ComputeVisitor& pV) 103 | { 104 | CodeEmitVisitor* visitor = dyn_cast<CodeEmitVisitor>(&pV); 105 | if (nullptr != visitor) 106 | visitor->visit(*this); 107 | } 108 | 109 | // Mandatory implementation. Every ONNC IR operator follows the same coding. 110 | void NvDlaShuffle::accept(ComputeVisitor& pV) const 111 | { 112 | CodeEmitVisitor* visitor = dyn_cast<CodeEmitVisitor>(&pV); 113 | if (nullptr != visitor) 114 | visitor->visit(*this); 115 | } 116 | 117 | // Mandatory implementation. Every ONNC IR operator follows the same coding. 118 | bool NvDlaShuffle::classof(const ComputeOperator* pOp) 119 | { 120 | if (nullptr == pOp) 121 | return false; 122 | return (pOp->getID() == &ID); 123 | } 124 | ``` 125 | 126 | The complete source code of [NvDlaShuffle.cpp](src/NvDlaShuffle.cpp) and [NvDlaShuffle.h](src/NvDlaShuffle.h) can be found in the lab `src` directory. Note that files related to the extended ONNC IR are conventionally located in the `Compute/` directory in a backend. Specifically, they may be found in `/lib/Target/FooNvdla/Compute` by default.
Once the new ONNC IR class is created, we need to add its corresponding code-emitting function in `CodeEmitVisitor.h` and `CodeEmitVisitor.cpp`. The code change is shown in the following snippet. You may refer to [lab 4: Code Emitting](../lab_4_Code_Emitting/lab_4.md) for more details.

```diff
// CodeEmitVisitor.h

#include "NvDlaMeta.h"
+#include "Compute/NvDlaShuffle.h"

class CodeEmitVisitor : public CustomVisitor<CodeEmitVisitor>, private NvDlaConstants
{
  void visit(const Conv& pConv) override;
+ void visit(const NvDlaShuffle& pOp);

  void visit(Conv& pConv) override;
+ void visit(NvDlaShuffle& pOp) { visit(const_cast<const NvDlaShuffle&>(pOp)); }

};
```

The complete source code of [CodeEmitVisitor.cpp](src/CodeEmitVisitor.cpp) and [CodeEmitVisitor.h](src/CodeEmitVisitor.h) can be found in the `src` directory for your reference. You may copy them into `<path to onnc>/lib/Target/FooNvdla` directly.


### Step 3: Use the new ONNC IR to replace the matched pattern in the model

With the new ONNC IR, `Shuffle`, we will show how to write a pass that replaces the Reshape-Transpose-Reshape pattern with the new `Shuffle` IR. We have elaborated on how to develop a pass and manipulate a model graph in [lab 6: Manipulating ONNC IR](../lab_6_Manipulating_ONNC_IR/lab_6.md). In this lab, we first create a pass, `class NvDlaIdentifyShufflePass`, inherited from `class CustomPass<NvDlaIdentifyShufflePass>`, to search for the Reshape-Transpose-Reshape pattern.

```cpp
// NvDlaIdentifyShufflePass.h

class NvDlaIdentifyShufflePass : public CustomPass<NvDlaIdentifyShufflePass>
{
public:
  NvDlaIdentifyShufflePass() = default;

  ReturnType runOnModule(Module& pModule) override;

  // ...
};
```

We need to implement the `runOnModule()` function of this pass. Please refer to the file [NvDlaIdentifyShufflePass.cpp](src/NvDlaIdentifyShufflePass.cpp) for the complete source code. In the reference implementation, you may find several APIs available for traversing a model graph. In [lab 6: Manipulating ONNC IR](../lab_6_Manipulating_ONNC_IR/lab_6.md), we introduced the two classes `class ComputeOperator` and `class Value` for operators and input/output tensors, respectively. To search for a specific operator concatenation pattern, we further need to know the connectivity of operators in the model graph. However, in the data structure of `ComputeOperator`, there is no variable directly pointing to other operators.

Accessing another operator from a given operator can be done indirectly through the linked data structures of `ComputeOperator` and `Value`. The following figure shows how to access an operator's downstream operators.

To get an output tensor of an operator, we first call `getOutput()`; an index argument is required because an operator can have multiple outputs. `getOutput()` returns a pointer to a `Value` object that might have multiple usages stored in an array. You may call `getUses()` to get the usage array. By retrieving a specific entry of the usage array and then calling `getUser()`, you eventually get the pointer to the target `ComputeOperator`.
For example, given an operator `op`, we can access its first downstream operator using `op.getOutput(0)->getUses()[0].getUser()`.
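As a concrete illustration, here is a short helper that visits every downstream user of an operator. This is a sketch only: it assumes the ONNC classes and accessors introduced above (`ComputeOperator`, `Value`, `getNumOfOutputs()`, `getOutput()`, `getUses()`, `getUser()`) and is not a standalone program.

```cpp
// Sketch: enumerate all downstream operators of `op`, assuming the
// ONNC ComputeOperator/Value API described above.
void forEachDownstreamUser(ComputeOperator& op)
{
  for (unsigned int i = 0; i < op.getNumOfOutputs(); ++i) {
    Value* output = op.getOutput(i);          // the i-th output tensor
    for (auto& use : output->getUses()) {     // every consumer of this tensor
      ComputeOperator* user = use.getUser();  // one downstream operator
      if (Transpose* transpose = dyn_cast<Transpose>(user)) {
        // `user` turned out to be a Transpose; a pattern matcher would
        // continue walking the graph from here.
        (void)transpose;
      }
    }
  }
}
```

This is essentially the shape of the checks performed by `is_shuffle()` in the reference `NvDlaIdentifyShufflePass.cpp`.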

Similarly, if you need to access an upstream operator of a given operator, follow the flow shown in the following figure.

You may call `getInput()` with a specific index to get the corresponding input tensor, and then call `getDefine()` to get the upstream operator that produces this tensor. Note that there is no index argument for `getDefine()` because each tensor has exactly one producing operator. `getDefine()` returns a pointer to a `Define` object. In fact, `class Define` is one of the parent classes of `ComputeOperator`, so you can use `static_cast` to cast the returned object to a `ComputeOperator`. For example, given an operator `op`, we can access its first upstream operator using `static_cast<ComputeOperator*>(op.getInput(0)->getDefine())`.

We have prepared the complete source code of [NvDlaIdentifyShufflePass.cpp](src/NvDlaIdentifyShufflePass.cpp) and [NvDlaIdentifyShufflePass.h](src/NvDlaIdentifyShufflePass.h) for your reference. You may copy them into `<path to onnc>/lib/Target/FooNvdla` if you do not want to code from scratch. Lastly, register this new pass with the pass manager to make it effective. There is a utility pass, `PrintONNCIRPass`, available in the tutorial `src` directory that dumps the whole ONNC IR graph in text format. We can use it to validate the optimization effect.


```diff
// FooNvdlaBackend.cpp

#include "NvDlaFileGenPass.h"
+#include "NvDlaIdentifyShufflePass.h"
+#include "PrintONNCIRPass.h"

@@ -74,6 +75,10 @@ void FooNvdlaBackend::addTensorSel(PassManager& pPM)
void FooNvdlaBackend::addOnncIrOptimization(PassManager& pPM, OptimizationOptions& options)
{
  TargetBackend::addOnncIrOptimization(pPM, options);
+
+ pPM.add<PrintONNCIRPass>();
+ pPM.add<NvDlaIdentifyShufflePass>();
+ pPM.add<PrintONNCIRPass>();
}

```

You may copy [FooNvdlaBackend.cpp](src/FooNvdlaBackend.cpp), [PrintONNCIRPass.cpp](src/PrintONNCIRPass.cpp), and [PrintONNCIRPass.h](src/PrintONNCIRPass.h) from the `src` directory to `<path to onnc>/lib/Target/FooNvdla` to save time. Since we created a few new files for the backend, we need to declare the file additions in the build scripts as follows.

```diff
// CMakeLists.txt

add_libonnc_src(
  NvDlaMemInfoPass.cpp
  NvDlaTaskSubmitPass.cpp
  NvDlaFileGenPass.cpp
+ Compute/NvDlaShuffle.cpp
+ NvDlaIdentifyShufflePass.cpp
+ PrintONNCIRPass.cpp
```

```diff
// Makefile.am

ONNC_TARGET_SOURCES += \
  Target/FooNvdla/NvDlaMemInfoPass.cpp \
  Target/FooNvdla/NvDlaTaskSubmitPass.cpp \
  Target/FooNvdla/NvDlaFileGenPass.cpp \
+ Target/FooNvdla/Compute/NvDlaShuffle.cpp \
+ Target/FooNvdla/NvDlaIdentifyShufflePass.cpp \
+ Target/FooNvdla/PrintONNCIRPass.cpp \
```

### Step 4: Re-build ONNC to test.

Follow the instructions in Lab 1 to rebuild the ONNC source code within the ONNC-community Docker.
Use the following command to bring up the ONNC-community Docker.

```sh
$ docker run -ti --rm -v <path to onnc>:/onnc/onnc -v <path to onnc-tutorial>/models:/tutorial/models onnc/onnc-community
```

Within the Docker container, use the following commands to rebuild ONNC and then use the new ONNC binary to compile the target DNN model.

```sh
# Within onnc/onnc-community Docker container

$ cd /onnc/onnc-umbrella/build-normal

# Rebuild ONNC.
$ smake -j8 install

# Run ONNC to compile the DNN model.
$ onnc -mquadruple foonvdla /tutorial/models/test_Shuffle/test_Shuffle.onnx
FooNvdla is invoked
=== PrintONNCIRPass ======
%W0[12, 1, 1, 1] = Initializer()
%SHAPE1[5] = Initializer()
%SHAPE2[4] = Initializer()
%W3[12, 1, 1, 1] = Initializer()
%W4[3, 4, 1, 1] = Initializer()
%IMAGE[1, 1, 5, 5] = InputOperator()
%INPUT0[1, 12, 5, 5] = Conv(%IMAGE[1, 1, 5, 5], %W0[12, 1, 1, 1])
%RESHAPED1[1, 3, 4, 5, 5] = Reshape(%INPUT0[1, 12, 5, 5], %SHAPE1[5])
%TRANSPOSED[1, 4, 3, 5, 5] = Transpose(%RESHAPED1[1, 3, 4, 5, 5])
%RESHAPED2[1, 12, 5, 5] = Reshape(%TRANSPOSED[1, 4, 3, 5, 5], %SHAPE2[4])
%CONV2[1, 12, 5, 5] = Conv(%RESHAPED2[1, 12, 5, 5], %W3[12, 1, 1, 1])
%Y[1, 3, 5, 5] = Conv(%CONV2[1, 12, 5, 5], %W4[3, 4, 1, 1])
= OutputOperator(%Y[1, 3, 5, 5])
==========================
=== PrintONNCIRPass ======
%W0[12, 1, 1, 1] = Initializer()
%W3[12, 1, 1, 1] = Initializer()
%W4[3, 4, 1, 1] = Initializer()
%IMAGE[1, 1, 5, 5] = InputOperator()
%INPUT0[1, 12, 5, 5] = Conv(%IMAGE[1, 1, 5, 5], %W0[12, 1, 1, 1])
%RESHAPED2[1, 12, 5, 5] = Shuffle(%INPUT0[1, 12, 5, 5])
%CONV2[1, 12, 5, 5] = Conv(%RESHAPED2[1, 12, 5, 5], %W3[12, 1, 1, 1])
%Y[1, 3, 5, 5] = Conv(%CONV2[1, 12, 5, 5], %W4[3, 4, 1, 1])
= OutputOperator(%Y[1, 3, 5, 5])
==========================
```

In the above output messages, there are two "PrintONNCIRPass" blocks. The first block prints the ONNC IR before the optimization takes effect. The IR-printing format is described by the following grammar rules.

```console
IRStatement:
  OutputList = IRType( InputList )

OutputList:
  Tensor, OutputList

InputList:
  Tensor, InputList

Tensor:
  %OutputName[DataShape]
```

We can see that there is a Reshape-Transpose-Reshape concatenation in the printout. The second block prints the ONNC IR after the optimization takes effect. The Reshape-Transpose-Reshape concatenation disappears, and a single `Shuffle` operator replaces it. With this optimization in place, the code-emitting phase can easily map the operator to the NVDLA RUBIK operations.

## Summary

In this lab, you have learned:

* Extending the built-in ONNC IR to introduce a hardware-specific IR, and
* Developing a pass to translate the original model graph into one that uses the new IR.

--------------------------------------------------------------------------------
/lab_7_ONNC_IR_Extension/src/CodeEmitVisitor.h:
--------------------------------------------------------------------------------
//===- CodeEmitVisitor.h --------------------------------------------------===//
//
// The ONNC Project
//
// See LICENSE.TXT for details.
6 | // 7 | //===----------------------------------------------------------------------===// 8 | #ifndef TARGET_FOONVDLA_CODE_EMIT_VISITOR_H 9 | #define TARGET_FOONVDLA_CODE_EMIT_VISITOR_H 10 | 11 | #include "NvDlaDefine.h" 12 | #include "NvDlaMeta.h" 13 | #include "Compute/NvDlaShuffle.h" 14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | 23 | #include 24 | 25 | namespace onnc { 26 | namespace foonvdla { 27 | 28 | class CodeEmitVisitor : public CustomVisitor, private NvDlaConstants 29 | { 30 | public: 31 | CodeEmitVisitor(const NvDlaConstants& constants, NvDlaBackendMeta& meta) noexcept 32 | : NvDlaConstants{constants} 33 | , m_pMeta{meta} 34 | {} 35 | 36 | /// ONNC defined operators @{ 37 | void visit(const Initializer& pInitializer) override; 38 | void visit(const InputOperator& pInputOperator) override; 39 | void visit(const OutputOperator& pOutputOperator) override; 40 | /// @} 41 | 42 | /// ONNX defined operators @{ 43 | void visit(const Conv& pConv) override; 44 | void visit(const NvDlaShuffle& pOp); 45 | /// @} 46 | 47 | /// ONNC defined operators @{ 48 | void visit(Initializer& pInitializer) override; 49 | void visit(InputOperator& pInputOperator) override; 50 | void visit(OutputOperator& pOutputOperator) override; 51 | /// @} 52 | 53 | /// ONNX defined operators @{ 54 | void visit(Conv& pConv) override; 55 | void visit(NvDlaShuffle& pOp) { visit(const_cast(pOp)); } 56 | /// @} 57 | 58 | private: 59 | MemoryListEntryId packWeight(const Tensor& weight, NvDlaDims destDims, Tensor::Dimension numFrontPaddingChannels, 60 | Tensor::Dimension outputChannelOffset); 61 | MemoryListEntryId packImageWeight(const Tensor& weight, NvDlaDims destDims, Tensor::Dimension outputChannelOffset); 62 | MemoryListEntryId packBias(const Tensor& bias, Tensor::Dimension numDestChannels, 63 | Tensor::Dimension srcChannelOffset = 0); 64 | MemoryListEntryId packSDPOperand(const Tensor* aluTensor, const Tensor* mulTensor, const NvDlaCubeInfo& cubeInfo); 65 | 66 | MemoryListEntryId packFeature(const Tensor& tensor, const NvDlaCubeInfo& cube); 67 | void issueEmuOp(NvDlaEmuOperation* op); 68 | AddressListEntryId issueEmuAddr(MemoryListEntryId mid); 69 | void issueDlaOp(NvDlaDlaOperation* op, NvDlaDlaOperation* op_fuse, NvDlaDlaOperation* op_prev); 70 | void issueDlaOp(std::unique_ptr op); 71 | AddressListEntryId issueDlaAddr(const Tensor& tensor, const NvDlaCubeInfo& cube, Tensor::Dimension channelOffset, 72 | NvDlaBackendMeta::Offset hOffset); 73 | AddressListEntryId issueDlaAddr(const Tensor& tensor, const NvDlaCubeInfo& cube); 74 | AddressListEntryId issueDlaAddr(MemoryListEntryId memoryId, const NvDlaCubeInfo& cube); 75 | AddressListEntryId issueSDPOperand(const Tensor& tensor, const NvDlaCubeInfo& cube, MemoryListEntryId& memoryId); 76 | 77 | void SetLUTParam(dla_lut_param* lut_param, float alpha, float beta, float bias, int size, float outdata_scale, float outdata_offset); 78 | 79 | // Perform SDP for 2 input tensors and an output tensor, 80 | // the possible value for parameter 'opType' is: 81 | // 82 | // 1. SDP_OP_ADD 83 | // 2. 
SDP_OP_MUL 84 | // 85 | void emitSdp(std::uint8_t opType, const Tensor& firstInput, const Tensor& secondInput, const Tensor& output); 86 | 87 | private: 88 | MemoryListEntryId packWeight(span weight, const Tensor* weightTensor, NvDlaDims srcDims, 89 | NvDlaDims destDims, Tensor::Dimension numFrontPaddingChannels, 90 | Tensor::Dimension outputChannelOffset); 91 | 92 | MemoryListEntryId packWeight(const Tensor& weight, NvDlaDims srcDims, NvDlaDims destDims, 93 | Tensor::Dimension numFrontPaddingChannels, Tensor::Dimension outputChannelOffset); 94 | 95 | template 96 | void packWeightImpl(Type* destData, NvDlaDims destDimsWithFrontPadding, const Tensor* tensor, const float* srcData, 97 | NvDlaDims srcDims, Tensor::Dimension numFrontPaddingChannels, 98 | Tensor::Dimension outputChannelOffset); 99 | 100 | template 101 | void packImageWeightImpl(Type* blob, NvDlaDims blobDims, const Tensor* tensor, const float* srcData, 102 | NvDlaDims srcDims, Tensor::Dimension outputChannelOffset); 103 | template 104 | void packBiasImpl(Type* destData, Tensor::Dimension numDestChannels, const Tensor* tensor, const float* srcData, 105 | Tensor::Dimension srcChannelOffset); 106 | void packSDPOperandImpl(NvU8* blob, const Tensor* aluTensor, const float* aluData, const Tensor* mulTensor, 107 | const float* mulData, const NvDlaCubeInfo& cubeInfo); 108 | 109 | private: 110 | NvDlaBackendMeta& m_pMeta; 111 | }; 112 | 113 | } // namespace nvdla 114 | } // namespace onnc 115 | 116 | #undef PP_DECL_VISIT 117 | #undef PP_NVDLA_OP_LIST 118 | 119 | #endif 120 | -------------------------------------------------------------------------------- /lab_7_ONNC_IR_Extension/src/FooNvdlaBackend.cpp: -------------------------------------------------------------------------------- 1 | //===- FooNvdlaBackend.cpp -----------------------------------------------------===// 2 | // 3 | // The ONNC Project 4 | // 5 | // See LICENSE.TXT for details. 
6 | // 7 | //===----------------------------------------------------------------------===// 8 | #include 9 | 10 | #include "FooNvdlaBackend.h" 11 | #include "TargetInfo/FooNvdlaTargetInfo.h" 12 | #include "TargetInfo/FooNvdlaTargetMemInfo.h" 13 | #include "CodeEmitVisitor.h" 14 | #include "NvDlaMemInfoPass.h" 15 | #include "NvDlaTaskSubmitPass.h" 16 | #include "NvDlaFileGenPass.h" 17 | #include "NvDlaIdentifyShufflePass.h" 18 | #include "PrintONNCIRPass.h" 19 | 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include 37 | #include 38 | #include 39 | #include 40 | #include 41 | #include 42 | 43 | #include 44 | 45 | using namespace onnc; 46 | 47 | //===----------------------------------------------------------------------===// 48 | // FooNvdlaBackend 49 | //===----------------------------------------------------------------------===// 50 | const Version FooNvdlaBackend::LOADABLE_VERSION = Version(1, 1, 255); 51 | const Version FooNvdlaBackend::BLOB_DLA_VERSION = Version(1, 3, 0); 52 | const Version FooNvdlaBackend::BLOB_EMU_VERSION = Version(1, 3, 0); 53 | 54 | FooNvdlaBackend::FooNvdlaBackend(const TargetOptions& pOptions) 55 | : TargetBackend(pOptions) 56 | , NvDlaConstants(getConfig(::nvdla::ConfigSet::nv_full, ::nvdla::ExecutionMode::direct, false)) 57 | , m_pMeta(*this) { 58 | m_pMemInfo = std::make_unique(); 59 | } 60 | 61 | void FooNvdlaBackend::addTensorSel(PassManager& pPM) 62 | { 63 | errs() << "FooNvdla is invoked\n"; 64 | 65 | // Do ONNX graph IR optimization here. 66 | 67 | // Translate from ONNX graph IR into ONNC IR 68 | addStandardTensorSel(pPM, *this); 69 | 70 | // Now ONNC IR is ready. 71 | // If you need to extend ONNC IR, here is the place to add your pass that 72 | // adds your ONNC IR operators. 73 | } 74 | 75 | void FooNvdlaBackend::addOnncIrOptimization(PassManager& pPM, OptimizationOptions& options) 76 | { 77 | TargetBackend::addOnncIrOptimization(pPM, options); 78 | 79 | pPM.add(); 80 | pPM.add(); 81 | pPM.add(); 82 | } 83 | 84 | void FooNvdlaBackend::addTensorSched(PassManager& pPM) 85 | { 86 | // After method AddTensorSel, operators have been scheduled in an 87 | // topological order, which totally respects the data dependency. 88 | // However, that might not be an optimized order for certain objective. 89 | // Add a scheduling optimization pass here. 90 | } 91 | 92 | void FooNvdlaBackend::addMemAlloc(PassManager& pPM) 93 | { 94 | // Input: Module 95 | // Output: LiveIntervals 96 | addStandardCreateLiveIntervals(pPM); 97 | 98 | // Input: LiveIntervals 99 | // Output: MemAllocs 100 | addStandardMemoryAllocation(pPM, *this); 101 | 102 | // Input: MemAllocs 103 | // Output: Virtual memory address for each memory operands. 
104 | addStandardSetMemOperands(pPM); 105 | 106 | const NvDlaConstants& constants = *this; 107 | pPM.add(constants, &m_pMeta); 108 | } 109 | 110 | void FooNvdlaBackend::addCodeEmit(PassManager& pPM, const Path& pOutput) 111 | { 112 | static foonvdla::CodeEmitVisitor ceVisitor(*this, m_pMeta); 113 | pPM.add(ceVisitor) 114 | .add(&m_pMeta, BLOB_DLA_VERSION, BLOB_EMU_VERSION) 115 | .add(&m_pMeta, LOADABLE_VERSION) 116 | ; 117 | } 118 | 119 | void FooNvdlaBackend::RegisterLowers(LowerRegistry& pRegistry) const 120 | { 121 | pRegistry.emplace(); 122 | pRegistry.emplace(); 123 | pRegistry.emplace(); 124 | } 125 | 126 | 127 | //===----------------------------------------------------------------------===// 128 | // Non member functions 129 | //===----------------------------------------------------------------------===// 130 | TargetBackend* CreateFooNvdlaBackend(const TargetOptions& pOptions) 131 | { 132 | return new FooNvdlaBackend(pOptions); 133 | } 134 | 135 | extern "C" void InitializeFooNvdlaONNCBackend() 136 | { 137 | onnc::TargetRegistry::RegisterTargetBackend(getTheFooNvdlaTarget(), 138 | CreateFooNvdlaBackend); 139 | } 140 | 141 | -------------------------------------------------------------------------------- /lab_7_ONNC_IR_Extension/src/NvDlaIdentifyShufflePass.cpp: -------------------------------------------------------------------------------- 1 | //===- NvDlaIdentifyShufflePass.cpp ---------------------------------------===// 2 | // 3 | // The ONNC Project 4 | // 5 | // See LICENSE.TXT for details. 6 | // 7 | //===----------------------------------------------------------------------===// 8 | #include "NvDlaIdentifyShufflePass.h" 9 | 10 | #include "Compute/NvDlaShuffle.h" 11 | #include "NvDlaDefine.h" 12 | 13 | #include 14 | #include 15 | #include 16 | 17 | using namespace onnc; 18 | using namespace foonvdla; 19 | 20 | //===----------------------------------------------------------------------===// 21 | // NvDlaIdentifyShufflePass 22 | //===----------------------------------------------------------------------===// 23 | Pass::ReturnType NvDlaIdentifyShufflePass::runOnModule(Module& pModule) 24 | { 25 | Pass::ReturnType ret = kModuleNoChanged; 26 | 27 | ret = BaseType::runOnModule(pModule); 28 | 29 | if (ret != kModuleNoChanged) { 30 | pModule.eraseUnusedValues(); 31 | } 32 | 33 | return ret; 34 | } 35 | 36 | Pass::ReturnType NvDlaIdentifyShufflePass::runOnComputeGraph(ComputeGraph& pCG) 37 | { 38 | Pass::ReturnType ret = Pass::kModuleNoChanged; 39 | 40 | //--------------------------------------------------------------------- 41 | // Find out all Reshape-Transpose-Reshape patterns in the model graph. 42 | //--------------------------------------------------------------------- 43 | 44 | std::vector reshapes; 45 | for (auto& op : pCG) { 46 | if (Reshape* reshape1 = dyn_cast(&op)) { 47 | if (is_shuffle(reshape1)) { // A channel-shuffle pattern is detected. 48 | // Save the first node of this pattern into a queue. 49 | // We will replace this pattern by a single Shuffle IR later on. 50 | reshapes.push_back(reshape1); 51 | 52 | // Since a node replacement will happen in the model, the model graph 53 | // will be changed and thus this function should return kModuleChanged. 54 | ret |= Pass::kModuleChanged; 55 | } 56 | } 57 | } 58 | 59 | //--------------------------------------------------------------------------- 60 | // Replace every Reshape-Transpose-Reshape pattern with a single Shuffle IR. 
61 | //--------------------------------------------------------------------------- 62 | 63 | for (Reshape* reshape1 : reshapes) { 64 | 65 | // Derive the Tranpose and the second Reshape. 66 | auto* transpose = dyn_cast(reshape1->getOutput(0)->getUses()[0].getUser()); 67 | auto* reshape2 = dyn_cast(transpose->getOutput(0)->getUses()[0].getUser()); 68 | 69 | Tensor* input_tensor = reshape1->getInput(0); 70 | Tensor* shape1_tensor = reshape1->getInput(1); 71 | auto shape1_initializer = static_cast(shape1_tensor->getDefine()); 72 | Tensor* reshape1_out_tensor = reshape1->getOutput(0); 73 | Tensor* transpose_out = transpose->getOutput(0); 74 | Tensor* shape2_tensor = reshape2->getInput(1); 75 | auto shape2_initializer = static_cast(shape2_tensor->getDefine()); 76 | Tensor* output_tensor = reshape2->getOutput(0); 77 | 78 | // The current ONNC IR graph status 79 | // ================================ 80 | // 81 | // (shape1_initializer) 82 | // | | 83 | // input_tensor shape1_tensor 84 | // \ / 85 | // (reshape1) 86 | // | 87 | // reshape1_out_tensor 88 | // | 89 | // (transpose) (shape2_initializer) 90 | // | | 91 | // transpose_out shape2_tensor 92 | // \ / 93 | // (reshape2) 94 | // | 95 | // output_tensor 96 | // | 97 | 98 | // Create a new Shuffle. 99 | const auto& reshape_shape = static_cast(reshape1->getInput(1))->getValues(); 100 | auto* shuffle = pCG.addOperator(reshape_shape[1]); 101 | 102 | // The current ONNC IR graph status 103 | // ================================ 104 | // 105 | // (shape1_initializer) 106 | // | | 107 | // input_tensor shape1_tensor 108 | // \ / 109 | // (reshape1) 110 | // | 111 | // reshape1_out_tensor 112 | // | 113 | // (transpose) (shape2_initializer) 114 | // | | 115 | // transpose_out shape2_tensor 116 | // \ / 117 | // (reshape2) (shuffle) 118 | // | 119 | // output_tensor 120 | // | 121 | 122 | // Remove the edges between some operators and their input/output tensors. 123 | // Remove an edge means to erase the records within an operator's data structure about its input tensors. 124 | reshape1->removeAllInputs(); 125 | reshape1->removeAllOutputs(); 126 | transpose->removeAllInputs(); 127 | transpose->removeAllOutputs(); 128 | reshape2->removeAllInputs(); 129 | reshape2->removeAllOutputs(); 130 | shape1_initializer->removeAllOutputs(); 131 | shape2_initializer->removeAllOutputs(); 132 | 133 | // The current ONNC IR graph status 134 | // ================================ 135 | // 136 | // (shape1_initializer) 137 | // | | 138 | // input_tensor shape1_tensor 139 | // 140 | // (reshape1) 141 | // 142 | // reshape1_out_tensor 143 | // 144 | // (transpose) (shape2_initializer) 145 | // | 146 | // transpose_out shape2_tensor 147 | // 148 | // (reshape2) (shuffle) 149 | // 150 | // output_tensor 151 | // | 152 | 153 | // Remove some un-used nodes in the ONNC IR graph. 
154 | pCG.erase(*reshape1); 155 | pCG.erase(*transpose); 156 | pCG.erase(*reshape2); 157 | pCG.erase(*shape1_initializer); 158 | pCG.erase(*shape2_initializer); 159 | pCG.erase(*shape1_tensor); 160 | pCG.erase(*reshape1_out_tensor); 161 | pCG.erase(*transpose_out); 162 | pCG.erase(*shape2_tensor); 163 | 164 | // The current ONNC IR graph status 165 | // ================================ 166 | // 167 | // | 168 | // input_tensor 169 | // 170 | // (shuffle) 171 | // 172 | // output_tensor 173 | // | 174 | 175 | shuffle->addInput(*input_tensor); 176 | shuffle->addOutput(*output_tensor); 177 | 178 | // The current ONNC IR graph status 179 | // ================================ 180 | // 181 | // | 182 | // input_tensor 183 | // | 184 | // (shuffle) 185 | // | 186 | // output_tensor 187 | // | 188 | 189 | } 190 | 191 | pCG.topologicalSort(); 192 | 193 | return ret; 194 | } 195 | 196 | bool NvDlaIdentifyShufflePass::is_shuffle(Reshape* reshape1) 197 | { 198 | // We are going to detect the following pattern. 199 | // 200 | // | 201 | // input_tensor 202 | // \ 203 | // (reshape1) 204 | // | 205 | // reshape1_out_tensor 206 | // | // This tensor must have only one user. 207 | // (transpose) 208 | // | 209 | // transpose_out 210 | // \ // This tensor must have only one user. 211 | // (reshape2) 212 | // | 213 | // output_tensor 214 | // | 215 | // 216 | 217 | #define SHUFFLE_ASSERT(cond) if (! (cond)) return false; 218 | 219 | //-------------------------- 220 | // Check the first Reshape. 221 | //-------------------------- 222 | 223 | SHUFFLE_ASSERT( reshape1->getNumOfOutputs() == 1 ); 224 | 225 | // the output tensor of the Reshape has only one user. 226 | SHUFFLE_ASSERT( reshape1->getOutput(0)->getUses().size() == 1 ); 227 | 228 | // The Reshape attribute must satisfy certain constraints. 229 | // The input dimension must be 4, and this Reshape splits the second dimension into two, 230 | // thus causing the output dimension to be 5. 231 | // e.g. input: 1x12x5x6, shape: [1,3,4,5,6] 232 | // output: 1x3x4x5x6 233 | SHUFFLE_ASSERT( reshape1->getInput(0)->getNumOfDimensions() == 4 ); 234 | SHUFFLE_ASSERT( reshape1->getInput(1)->getNumOfDimensions() == 1 ); // shape tensor must be array 235 | 236 | const auto& reshape1_shape = static_cast(reshape1->getInput(1))->getValues(); 237 | SHUFFLE_ASSERT( reshape1_shape.size() == 5 ); 238 | SHUFFLE_ASSERT( reshape1->getInput(0)->dimension(1) == reshape1_shape[1] * reshape1_shape[2] ); 239 | SHUFFLE_ASSERT( reshape1->getInput(0)->dimension(2) == reshape1_shape[3] && 240 | reshape1->getInput(0)->dimension(3) == reshape1_shape[4]); 241 | 242 | //----------------------------- 243 | // Check the middle Transpose. 244 | //----------------------------- 245 | 246 | // the output tensor of the first Reshape has the user to be a Transpose. 247 | Transpose* transpose = dyn_cast(reshape1->getOutput(0)->getUses()[0].getUser()); 248 | SHUFFLE_ASSERT( transpose ); 249 | 250 | // the output tensor of the Transpose has only one user. 251 | SHUFFLE_ASSERT( transpose->getNumOfOutputs() == 1 ); 252 | SHUFFLE_ASSERT( transpose->getOutput(0)->getUses().size() == 1 ); 253 | 254 | // the attribute of Tranpose, perm, must be [0, 2, 1, 3, 4], ie. swap the 1st and 2nd dimensions. 255 | // e.g. 
input: 1x3x4x5x6 256 | // output: 1x4x3x5x6 257 | SHUFFLE_ASSERT( transpose->getInput(0)->getNumOfDimensions() == 5 ); 258 | SHUFFLE_ASSERT( transpose->getPerm().at(0) == 0 && 259 | transpose->getPerm().at(1) == 2 && 260 | transpose->getPerm().at(2) == 1 && 261 | transpose->getPerm().at(3) == 3 && 262 | transpose->getPerm().at(4) == 4); 263 | 264 | //----------------------------- 265 | // Check the last Reshape. 266 | //----------------------------- 267 | 268 | // the output tensor of the middle Transpose has the user to be a Reshape. 269 | Reshape* reshape2 = dyn_cast(transpose->getOutput(0)->getUses()[0].getUser()); 270 | SHUFFLE_ASSERT( reshape2 ); 271 | 272 | // The Reshape attribute must satisfy certain constraints. 273 | // The input dimension must be 5, and this Reshape merges the 2nd and 3rd dimension into one, 274 | // thus causing the output dimension to be 4. 275 | // e.g. input: 1x4x3x5x6, shape: [1,12,5,6] 276 | // output: 1x12x5x6 277 | SHUFFLE_ASSERT( reshape2->getInput(0)->getNumOfDimensions() == 5 ); 278 | SHUFFLE_ASSERT( reshape2->getInput(1)->getNumOfDimensions() == 1 ); // shape tensor must be array 279 | 280 | const auto& reshape2_shape = static_cast(reshape2->getInput(1))->getValues(); 281 | SHUFFLE_ASSERT( reshape2_shape.size() == 4 ); 282 | SHUFFLE_ASSERT( reshape2->getInput(0)->dimension(1) * reshape2->getInput(0)->dimension(2) == 283 | reshape2_shape[1] ); 284 | SHUFFLE_ASSERT( reshape2->getInput(0)->dimension(3) == reshape2_shape[2] && 285 | reshape2->getInput(0)->dimension(4) == reshape2_shape[3]); 286 | 287 | #undef SHUFFLE_ASSERT 288 | 289 | return true; 290 | } 291 | -------------------------------------------------------------------------------- /lab_7_ONNC_IR_Extension/src/NvDlaIdentifyShufflePass.h: -------------------------------------------------------------------------------- 1 | //===- NvDlaIdentifyShufflePass.h -------------------------------------------------===// 2 | // 3 | // The ONNC Project 4 | // 5 | // See LICENSE.TXT for details. 6 | // 7 | //===------------------------------------------------------------------------------===// 8 | #ifndef NVDLA_IDENTIFY_SHUFFLE_PASS_H 9 | #define NVDLA_IDENTIFY_SHUFFLE_PASS_H 10 | 11 | #include "NvDlaMeta.h" 12 | 13 | #include 14 | 15 | namespace onnc { 16 | namespace foonvdla { 17 | 18 | class NvDlaIdentifyShufflePass : public CustomPass 19 | { 20 | public: 21 | NvDlaIdentifyShufflePass() = default; 22 | 23 | ReturnType runOnModule(Module& pModule) override; 24 | ReturnType runOnComputeGraph(ComputeGraph& pCG) override; 25 | 26 | private: 27 | bool is_shuffle(Reshape* reshape1); 28 | }; 29 | 30 | } // namespace foonvdla 31 | } // namespace onnc 32 | 33 | #endif // MODELSIM_IDENTIFY_SHUFFLE_PASS_H 34 | -------------------------------------------------------------------------------- /lab_7_ONNC_IR_Extension/src/NvDlaShuffle.cpp: -------------------------------------------------------------------------------- 1 | //===- NvDlaShuffle.cpp ----------------------------------------------------===// 2 | // 3 | // The ONNC Project 4 | // 5 | // See LICENSE.TXT for details. 
6 | // 7 | //===----------------------------------------------------------------------===// 8 | #include "NvDlaShuffle.h" 9 | 10 | #include "../CodeEmitVisitor.h" 11 | #include "../NvDlaDefine.h" 12 | 13 | using namespace onnc; 14 | using namespace onnc::foonvdla; 15 | 16 | char NvDlaShuffle::ID = 0; 17 | 18 | //===----------------------------------------------------------------------===// 19 | // NvDlaShuffle 20 | //===----------------------------------------------------------------------===// 21 | void NvDlaShuffle::printAttributes(std::ostream& pOS) const 22 | { 23 | pOS << ""; 24 | } 25 | 26 | void NvDlaShuffle::accept(ComputeVisitor& pV) 27 | { 28 | CodeEmitVisitor* visitor = dyn_cast(&pV); 29 | if (nullptr != visitor) 30 | visitor->visit(*this); 31 | } 32 | 33 | void NvDlaShuffle::accept(ComputeVisitor& pV) const 34 | { 35 | CodeEmitVisitor* visitor = dyn_cast(&pV); 36 | if (nullptr != visitor) 37 | visitor->visit(*this); 38 | } 39 | 40 | bool NvDlaShuffle::classof(const ComputeOperator* pOp) 41 | { 42 | if (nullptr == pOp) 43 | return false; 44 | return (pOp->getID() == &ID); 45 | } 46 | -------------------------------------------------------------------------------- /lab_7_ONNC_IR_Extension/src/NvDlaShuffle.h: -------------------------------------------------------------------------------- 1 | //===- NvDlaShuffle.h ------------------------------------------------------===// 2 | // 3 | // The ONNC Project 4 | // 5 | // See LICENSE.TXT for details. 6 | // 7 | //===--------------------------------------------------------------------------===// 8 | #ifndef TARGET_NVDLA_NVDLA_SHUFFLE_H 9 | #define TARGET_NVDLA_NVDLA_SHUFFLE_H 10 | 11 | #include 12 | #include 13 | #include 14 | 15 | namespace onnc { 16 | namespace foonvdla { 17 | 18 | class NvDlaShuffle : public ComputeOperator 19 | { 20 | public: 21 | static char ID; 22 | 23 | public: 24 | NvDlaShuffle(int group) 25 | : ComputeOperator("Shuffle", ID) 26 | , m_Group(group) 27 | {} 28 | 29 | virtual ~NvDlaShuffle() {} 30 | 31 | // Paramater 32 | const IntAttr& getGroup() const { return m_Group; } 33 | 34 | // Input & Ouput Tensor 35 | Tensor* getInput(unsigned int pIdx) override { return static_cast(m_Inputs[pIdx]); } 36 | 37 | const Tensor* getInput(unsigned int pIdx) const override { return static_cast(m_Inputs[pIdx]); } 38 | 39 | Tensor* getOutput(unsigned int pIdx) override { return static_cast(m_Outputs[pIdx]); } 40 | 41 | const Tensor* getOutput(unsigned int pIdx) const override { return static_cast(m_Outputs[pIdx]); } 42 | 43 | void printAttributes(std::ostream& pOS) const override; 44 | 45 | void accept(ComputeVisitor& pV) override; 46 | 47 | void accept(ComputeVisitor& pV) const override; 48 | 49 | static bool classof(const ComputeOperator* pOp); 50 | 51 | private: 52 | IntAttr m_Group; 53 | }; 54 | 55 | } // namespace foonvdla 56 | } // namespace onnc 57 | 58 | #endif 59 | -------------------------------------------------------------------------------- /lab_7_ONNC_IR_Extension/src/PrintONNCIRPass.cpp: -------------------------------------------------------------------------------- 1 | //===- PrintONNCIRPass.cpp ------------------------------------------------===// 2 | // 3 | // The ONNC Project 4 | // 5 | // See LICENSE.TXT for details. 
//
//===----------------------------------------------------------------------===//
#include "PrintONNCIRPass.h"

#include
#include
#include
#include
#include

namespace onnc {
namespace foonvdla {

//===----------------------------------------------------------------------===//
// PrintONNCIRPass
//===----------------------------------------------------------------------===//

Pass::ReturnType PrintONNCIRPass::runOnModule(Module& pModule)
{
  const Pass::ReturnType ret = BaseType::runOnModule(pModule);

  if (ret != kModuleNoChanged) {
    pModule.eraseUnusedValues();
  }

  return ret;
}

Pass::ReturnType PrintONNCIRPass::runOnComputeGraph(ComputeGraph& pCG)
{
  Pass::ReturnType ret = Pass::kModuleNoChanged;

  std::cout << "=== PrintONNCIRPass ======\n";
  for (ComputeOperator& node : pCG) {
    node.print(std::cout);
    std::cout << "\n";
  }
  std::cout << "==========================\n";

  return ret;
}

} // namespace foonvdla
} // namespace onnc
--------------------------------------------------------------------------------
/lab_7_ONNC_IR_Extension/src/PrintONNCIRPass.h:
--------------------------------------------------------------------------------
//===- PrintONNCIRPass.h --------------------------------------------------===//
//
// The ONNC Project
//
// See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
#ifndef ONNC_FOONVDLA_PRINT_ONNC_IR_PASS_H
#define ONNC_FOONVDLA_PRINT_ONNC_IR_PASS_H
#include
#include

namespace onnc {
namespace foonvdla {

class PrintONNCIRPass : public CustomPass<PrintONNCIRPass>
{
public:
  PrintONNCIRPass() = default;

  ReturnType runOnModule(Module& pModule) override;

  ReturnType runOnComputeGraph(ComputeGraph& pCG) override;
};

} // namespace foonvdla
} // namespace onnc

#endif
--------------------------------------------------------------------------------
/lab_8_Mul_Add_Reordering_and_Fusion/lab_8.md:
--------------------------------------------------------------------------------
# Hardware-specific Optimization

## Preface

Many optimizations that operate on the ONNX/ONNC model IR graph are independent of the DLA hardware design, but the most effective optimizations are often those that best exploit special features of the target platform. In the case of NVDLA, for a specific ONNX operator, there is more than one way to map the operator into a sequence of hardware operations. We may also transform a part of the model graph into another mathematically-equivalent subgraph and then derive a hardware execution sequence that is better in terms of performance and power. In this tutorial, we will illustrate a hardware-specific optimization in the NVDLA backend.

The following diagram shows one of the computation modules in NVDLA called SDP-X1.

The Single Point Data Processor (SDP) in NVDLA performs post-processing operations at the single-data-element level. It has several function blocks, each of which targets a different purpose. The X1 block has architectural support for bias addition, BatchNorm, PReLU, ReLU, and element-wise (Eltwise) operations. Its datapath is composed of ALU, multiplier, and ReLU sub-blocks.
Each sub-block can be programmed to be enabled or bypassed. More sub-blocks in action result in better performance. For example, if a model contains an Add-Mul-Relu sequence, we prefer to map those three operators into a single SDP-X1 operation, rather than mapping them separately into three SDP-X1 operations, as the following figure shows.

Mapping to multiple SDP-X1 operations incurs additional memory accesses, degrades performance, and consumes more power compared to mapping to a single SDP-X1 operation. Intuitively, if we can identify an Add-Mul-Relu pattern in a model, we can take advantage of the SDP-X1 pipeline to optimize performance. In addition, we can transform the original model graph to create more Add-Mul-Relu patterns. For example, we can convert a Mul-Add pair into an Add-Mul pair with some adjustment of the computation constants. The basic idea is that the original `Y = (X * a) + b` is mathematically equivalent to `Y = (X + c) * a`, where `c = b / a`. Note that this translation must meet the following pre-conditions to be a valid conversion and executable on NVDLA hardware:

1. The result of the `Mul` operator has only one consumer.
2. The values of `a` and `b` can be determined at compile time.

The first requirement is due to the fact that, in the SDP-X1 pipeline, the ALU output is consumed by the multiplier without any path to write the intermediate result back to memory. Let's look at an invalid example in the following figure.

The Add in the model violates the first condition because it has two consumers. The result of the Add operation has no way to be passed to the other consumer, Conv. Although this pattern exists in the model graph, we cannot convert it for optimization.

The second requirement comes from the fact that `c = b / a` introduces a division that has no corresponding hardware to execute it. Therefore, we need the values of `a` and `b` to be determined at compile time so that the ALU input will be a single constant. This usually implies that `a` and `b` are constant values in the given model.

In this lab, we will show how to implement such an optimization pass within the ONNC framework.


## Lab: Mul and Add Re-ordering and Fusion

The following figure shows the example model, [test_Mul_Add_Relu.onnx](../models/test_Mul_Add_Relu/test_Mul_Add_Relu.onnx), used in this lab. It contains the Mul-Add-Relu pattern mentioned in the previous section.

Given the above model, ONNC initially transforms the model into an ONNC IR graph as depicted in the following diagram.

In the ONNC IR graph, you can see that the Mul, Add, and Relu operators are ordered in the same way as in the model graph. We aim at re-ordering the Mul and Add operations, as shown in the following ONNC IR graph, to take advantage of the SDP-X1 pipeline for better performance.

Lastly, we will create a new compound ONNC IR, `AddMulRelu`, and convert the Add-Mul-Relu IR sequence into this single IR so that the original three operators are considered as a whole during the code-emitting phase and issued as a single SDP hardware operation.


### Step 1: Set up environment.

We recommend finishing the following labs before continuing with this lab.

* [lab 1: Environment Setup](../lab_1_Environment_Setup/lab_1.md) for preparing the Docker images and ONNC source codes.
* [lab 3: Starting New Backend](../lab_3_Starting_New_Backend/lab_3.md) for preparing the experimental backend `FooNvdla` for the exercise in this lab.
* [lab 4: Code Emitting](../lab_4_Code_Emitting/lab_4.md) for setting up the utilities needed by this lab.

The following note enables you to jump-start on this lab if you forget the details of the above labs.
It is recommended to have two terminal consoles: one for running the ONNC-community Docker container, and the other for running commands on your host machine.

```sh
###################################################
# Within your computer console (outside Docker container)
###################################################

# Skip these commands if you already have the Docker images.
$ docker pull onnc/onnc-community
$ docker pull onnc/vp

# Prepare ONNC and tutorial source code.
$ git clone https://github.com/ONNC/onnc.git
$ cd onnc; git checkout tags/1.2.0; cd ..
$ git clone https://github.com/ONNC/onnc-tutorial.git

# Start the onnc/onnc-community Docker.
$ docker run -ti --rm -v <path to onnc>:/onnc/onnc -v <path to onnc-tutorial>:/tutorial onnc/onnc-community

###################################################
# Within the onnc/onnc-community Docker container.
###################################################

$ cd /onnc/onnc
$ ./scripts/create-new-backend.sh FooNvdla

###################################################
# Within your computer console (outside Docker container)
###################################################

# Install the pre-built FooNvdla backend.
$ tar -zxvf <path to onnc-tutorial>/lab_4_Code_Emitting/src/FooNvdla.tar.gz -C <path to onnc>/lib/Target

###################################################
# Within the onnc/onnc-community Docker container.
###################################################

$ cd /onnc/onnc-umbrella/build-normal
# Rebuild ONNC.
$ smake -j8 install

# Run ONNC to compile the DNN model. Make sure all the previous preparation is good.
$ onnc -mquadruple foonvdla /tutorial/models/test_group_Conv/test_group_Conv.onnx
FooNvdla is invoked
```
By now, you should have the `FooNvdla` backend ready in `<path to onnc>/lib/Target/FooNvdla`.
For the rest of this lab, all code modification is made in the `FooNvdla` directory.

```sh
# Within your computer console (outside Docker container)
$ cd <path to onnc>/lib/Target/FooNvdla
```

### Step 2: Search for the Mul-Add pattern and re-order the two operators.

The re-ordering optimization is done in a pass named `NvDlaReorderMulAddPass`. We have introduced how to create a pass in [lab 6: Manipulating ONNC IR](../lab_6_Manipulating_ONNC_IR/lab_6.md). You may find all the related files for this lab in the `<path to onnc-tutorial>/lab_8_Mul_Add_Reordering_and_Fusion/src/` directory. The implementation traverses the ONNC IR graph, finds the matched pattern, and converts the graph accordingly. The details are similar to what was done in the previous lab, so we skip them here.

```sh
$ cp <path to onnc-tutorial>/lab_8_Mul_Add_Reordering_and_Fusion/src/NvDlaReorderMulAddPass.* <path to onnc>/lib/Target/FooNvdla
```


### Step 3: Fuse Add-Mul-Relu into a single IR.

In this lab, we need to define a new ONNC IR, `AddMulRelu`, to represent the Add-Mul-Relu concatenation.
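Before moving on, it may help to convince yourself of the re-ordering identity from the preface. The following standalone C++ check (a minimal sketch, not ONNC code) verifies numerically that `(X * a) + b` equals `(X + c) * a` with `c = b / a`; judging from the compilation log later in this lab, this constant adjustment is presumably what produces the `B__gamma_0` tensor.

```cpp
// Standalone sanity check (not ONNC code) of the re-ordering identity:
//   (X * a) + b  ==  (X + c) * a,  where  c = b / a  and  a != 0.
#include <cassert>
#include <cmath>

int main()
{
  const float a = 0.5f;
  const float b = 3.0f;
  const float c = b / a; // the adjusted constant, computable at compile time

  for (float x = -4.0f; x <= 4.0f; x += 0.25f) {
    const float mulAdd = (x * a) + b; // original Mul-Add order
    const float addMul = (x + c) * a; // re-ordered Add-Mul order
    assert(std::fabs(mulAdd - addMul) < 1e-5f);
  }
  return 0;
}
```

The division also illustrates pre-condition 2 above: `a` and `b` must be compile-time constants (with `a` non-zero) so that `c` can be folded into a single constant tensor.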
For details, please refer to how to define a new ONNC IR operator in [lab 7: ONNC IR Extension](../lab_7_ONNC_IR_Extension/lab_7.md).

```sh
$ mkdir -p <path to onnc>/lib/Target/FooNvdla/Compute

# These files contain the new IR's definition.
$ cp <path to onnc-tutorial>/lab_8_Mul_Add_Reordering_and_Fusion/src/NvDlaAddMulRelu.* <path to onnc>/lib/Target/FooNvdla/Compute

# These files deploy the new IR into the model graph.
$ cp <path to onnc-tutorial>/lab_8_Mul_Add_Reordering_and_Fusion/src/NvDlaFuseAddMulReluPass.* <path to onnc>/lib/Target/FooNvdla

# These files contain the code-emitting functions for the new IR.
$ cp <path to onnc-tutorial>/lab_8_Mul_Add_Reordering_and_Fusion/src/CodeEmitVisitor.* <path to onnc>/lib/Target/FooNvdla
```

In addition, to visualize the optimization effect, we introduce a utility pass, `PrintONNCIRPass`, to print out the ONNC IR.

```sh
$ cp <path to onnc-tutorial>/lab_8_Mul_Add_Reordering_and_Fusion/src/PrintONNCIRPass.* <path to onnc>/lib/Target/FooNvdla
```

We have introduced a few optimization passes; remember to enable those passes in the backend.

```sh
$ cp <path to onnc-tutorial>/lab_8_Mul_Add_Reordering_and_Fusion/src/FooNvdlaBackend.cpp <path to onnc>/lib/Target/FooNvdla
```

Lastly, since we created a few new files for the backend, we need to declare the file additions in the build scripts so that they get compiled. You may find the related files in the tutorial `src` directory and simply update the build scripts with the following commands.

```sh
$ cp <path to onnc-tutorial>/lab_8_Mul_Add_Reordering_and_Fusion/src/CMakeLists.txt <path to onnc>/lib/Target/FooNvdla
$ cp <path to onnc-tutorial>/lab_8_Mul_Add_Reordering_and_Fusion/src/Makefile.am <path to onnc>/lib/Target/FooNvdla
```


### Step 4: Re-build ONNC and compile the example model.

Follow the instructions in Lab 1 to bring up the ONNC-community Docker.
Within the container, use the following commands to rebuild ONNC and compile the example model.

```sh
# Within onnc/onnc-community Docker container

$ cd /onnc/onnc-umbrella/build-normal

# Rebuild ONNC.
$ smake -j8 install

# Execute ONNC to compile the model.
$ onnc -mquadruple foonvdla /tutorial/models/test_Mul_Add_Relu/test_Mul_Add_Relu.onnx
FooNvdla is invoked
=== PrintONNCIRPass ======
%A[1, 1, 5, 5] = Initializer()
%B[1, 1, 5, 5] = Initializer()
%INPUT0[1, 1, 5, 5] = InputOperator()
%mul_out[1, 1, 5, 5] = Mul(%INPUT0[1, 1, 5, 5], %A[1, 1, 5, 5])
%add_out[1, 1, 5, 5] = Add(%mul_out[1, 1, 5, 5], %B[1, 1, 5, 5])
%OUTPUT0[1, 1, 5, 5] = Relu(%add_out[1, 1, 5, 5])
= OutputOperator(%OUTPUT0[1, 1, 5, 5])
==========================
NvDlaReorderMulAddPass is called...
=== PrintONNCIRPass ======
%A[1, 1, 5, 5] = Initializer()
%INPUT0[1, 1, 5, 5] = InputOperator()
%B__gamma_0)[1, 1, 5, 5] = Initializer()
%add_out[1, 1, 5, 5] = Add(%INPUT0[1, 1, 5, 5], %B__gamma_0)[1, 1, 5, 5])
%mul_out[1, 1, 5, 5] = Mul(%add_out[1, 1, 5, 5], %A[1, 1, 5, 5])
%OUTPUT0[1, 1, 5, 5] = Relu(%mul_out[1, 1, 5, 5])
= OutputOperator(%OUTPUT0[1, 1, 5, 5])
==========================
NvDlaFuseAddMulReluPass is called...
=== PrintONNCIRPass ======
%A[1, 1, 5, 5] = Initializer()
%INPUT0[1, 1, 5, 5] = InputOperator()
%B__gamma_0)[1, 1, 5, 5] = Initializer()
%OUTPUT0[1, 1, 5, 5] = AddMulRelu<>(%INPUT0[1, 1, 5, 5], %B__gamma_0)[1, 1, 5, 5], %A[1, 1, 5, 5])
= OutputOperator(%OUTPUT0[1, 1, 5, 5])
==========================
visit(NvDlaAddMulRelu) is called
```

In the above output log, there are three `PrintONNCIRPass` blocks. The first one prints the initial ONNC IR graph before the re-ordering optimization takes effect; there is a Mul-Add pair in the initial graph. After `NvDlaReorderMulAddPass` is applied, the Mul-Add pair is converted into an Add-Mul pair, where the Add occurs before the Mul. In addition, one of the Add's inputs is connected to a newly-created tensor called `B__gamma_0`, which contains the adjusted coefficients. After another pass, `NvDlaFuseAddMulReluPass`, is applied, the ONNC IR graph changes again: a new ONNC IR called `AddMulRelu` replaces the Add-Mul-Relu sequence in the previous ONNC IR graph. With these optimization passes on the model graph, we can easily map the three model operations into a single SDP-X1 operation in NVDLA.

## Summary

In this lab, you have learned:

* How to fully utilize the pipelined SDP-X1 datapath by searching for Add-Mul-Relu patterns in a given model and mapping each of them into a single SDP-X1 operation.
* How to create a pass to manipulate the model graph for achieving the above optimization.

--------------------------------------------------------------------------------
/lab_8_Mul_Add_Reordering_and_Fusion/src/CMakeLists.txt:
--------------------------------------------------------------------------------

include_directories(.)
include_directories(include)

add_libonnc_src(
  CodeEmitVisitor.cpp
  FooNvdlaBackend.cpp
  Loadable.cpp
  NvDlaDefine.cpp
  NvDlaMeta.cpp
  NvDlaUtil.cpp
  NvDlaMemInfoPass.cpp
  NvDlaTaskSubmitPass.cpp
  NvDlaFileGenPass.cpp
  NvDlaReorderMulAddPass.cpp
  Compute/NvDlaAddMulRelu.cpp
  NvDlaFuseAddMulReluPass.cpp
  PrintONNCIRPass.cpp
  Config/NvFull.cpp
  TargetInfo/FooNvdlaTargetInfo.cpp
  TargetInfo/FooNvdlaTargetMemInfo.cpp)
--------------------------------------------------------------------------------
/lab_8_Mul_Add_Reordering_and_Fusion/src/CodeEmitVisitor.h:
--------------------------------------------------------------------------------
//===- CodeEmitVisitor.h --------------------------------------------------===//
//
// The ONNC Project
//
// See LICENSE.TXT for details.
6 | // 7 | //===----------------------------------------------------------------------===// 8 | #ifndef TARGET_FOONVDLA_CODE_EMIT_VISITOR_H 9 | #define TARGET_FOONVDLA_CODE_EMIT_VISITOR_H 10 | 11 | #include "NvDlaDefine.h" 12 | #include "NvDlaMeta.h" 13 | #include "Compute/NvDlaAddMulRelu.h" 14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | 23 | #include 24 | 25 | namespace onnc { 26 | namespace foonvdla { 27 | 28 | class CodeEmitVisitor : public CustomVisitor, private NvDlaConstants 29 | { 30 | public: 31 | CodeEmitVisitor(const NvDlaConstants& constants, NvDlaBackendMeta& meta) noexcept 32 | : NvDlaConstants{constants} 33 | , m_pMeta{meta} 34 | {} 35 | 36 | /// ONNC defined operators @{ 37 | void visit(const Initializer& pInitializer) override; 38 | void visit(const InputOperator& pInputOperator) override; 39 | void visit(const OutputOperator& pOutputOperator) override; 40 | /// @} 41 | 42 | /// ONNX defined operators @{ 43 | void visit(const Conv& pConv) override; 44 | void visit(const NvDlaAddMulRelu& pOp); 45 | /// @} 46 | 47 | /// ONNC defined operators @{ 48 | void visit(Initializer& pInitializer) override; 49 | void visit(InputOperator& pInputOperator) override; 50 | void visit(OutputOperator& pOutputOperator) override; 51 | /// @} 52 | 53 | /// ONNX defined operators @{ 54 | void visit(Conv& pConv) override; 55 | void visit(NvDlaAddMulRelu& pOp) { visit(const_cast(pOp)); } 56 | /// @} 57 | 58 | private: 59 | MemoryListEntryId packWeight(const Tensor& weight, NvDlaDims destDims, Tensor::Dimension numFrontPaddingChannels, 60 | Tensor::Dimension outputChannelOffset); 61 | MemoryListEntryId packImageWeight(const Tensor& weight, NvDlaDims destDims, Tensor::Dimension outputChannelOffset); 62 | MemoryListEntryId packBias(const Tensor& bias, Tensor::Dimension numDestChannels, 63 | Tensor::Dimension srcChannelOffset = 0); 64 | MemoryListEntryId packSDPOperand(const Tensor* aluTensor, const Tensor* mulTensor, const NvDlaCubeInfo& cubeInfo); 65 | 66 | MemoryListEntryId packFeature(const Tensor& tensor, const NvDlaCubeInfo& cube); 67 | void issueEmuOp(NvDlaEmuOperation* op); 68 | AddressListEntryId issueEmuAddr(MemoryListEntryId mid); 69 | void issueDlaOp(NvDlaDlaOperation* op, NvDlaDlaOperation* op_fuse, NvDlaDlaOperation* op_prev); 70 | void issueDlaOp(std::unique_ptr op); 71 | AddressListEntryId issueDlaAddr(const Tensor& tensor, const NvDlaCubeInfo& cube, Tensor::Dimension channelOffset, 72 | NvDlaBackendMeta::Offset hOffset); 73 | AddressListEntryId issueDlaAddr(const Tensor& tensor, const NvDlaCubeInfo& cube); 74 | AddressListEntryId issueDlaAddr(MemoryListEntryId memoryId, const NvDlaCubeInfo& cube); 75 | AddressListEntryId issueSDPOperand(const Tensor& tensor, const NvDlaCubeInfo& cube, MemoryListEntryId& memoryId); 76 | 77 | void SetLUTParam(dla_lut_param* lut_param, float alpha, float beta, float bias, int size, float outdata_scale, float outdata_offset); 78 | 79 | // Perform SDP for 2 input tensors and an output tensor, 80 | // the possible value for parameter 'opType' is: 81 | // 82 | // 1. SDP_OP_ADD 83 | // 2. 
SDP_OP_MUL 84 | // 85 | void emitSdp(std::uint8_t opType, const Tensor& firstInput, const Tensor& secondInput, const Tensor& output); 86 | 87 | private: 88 | MemoryListEntryId packWeight(span weight, const Tensor* weightTensor, NvDlaDims srcDims, 89 | NvDlaDims destDims, Tensor::Dimension numFrontPaddingChannels, 90 | Tensor::Dimension outputChannelOffset); 91 | 92 | MemoryListEntryId packWeight(const Tensor& weight, NvDlaDims srcDims, NvDlaDims destDims, 93 | Tensor::Dimension numFrontPaddingChannels, Tensor::Dimension outputChannelOffset); 94 | 95 | template 96 | void packWeightImpl(Type* destData, NvDlaDims destDimsWithFrontPadding, const Tensor* tensor, const float* srcData, 97 | NvDlaDims srcDims, Tensor::Dimension numFrontPaddingChannels, 98 | Tensor::Dimension outputChannelOffset); 99 | 100 | template 101 | void packImageWeightImpl(Type* blob, NvDlaDims blobDims, const Tensor* tensor, const float* srcData, 102 | NvDlaDims srcDims, Tensor::Dimension outputChannelOffset); 103 | template 104 | void packBiasImpl(Type* destData, Tensor::Dimension numDestChannels, const Tensor* tensor, const float* srcData, 105 | Tensor::Dimension srcChannelOffset); 106 | void packSDPOperandImpl(NvU8* blob, const Tensor* aluTensor, const float* aluData, const Tensor* mulTensor, 107 | const float* mulData, const NvDlaCubeInfo& cubeInfo); 108 | 109 | private: 110 | NvDlaBackendMeta& m_pMeta; 111 | }; 112 | 113 | } // namespace nvdla 114 | } // namespace onnc 115 | 116 | #undef PP_DECL_VISIT 117 | #undef PP_NVDLA_OP_LIST 118 | 119 | #endif 120 | -------------------------------------------------------------------------------- /lab_8_Mul_Add_Reordering_and_Fusion/src/FooNvdlaBackend.cpp: -------------------------------------------------------------------------------- 1 | //===- FooNvdlaBackend.cpp -----------------------------------------------------===// 2 | // 3 | // The ONNC Project 4 | // 5 | // See LICENSE.TXT for details. 
6 | // 7 | //===----------------------------------------------------------------------===// 8 | #include 9 | 10 | #include "FooNvdlaBackend.h" 11 | #include "TargetInfo/FooNvdlaTargetInfo.h" 12 | #include "TargetInfo/FooNvdlaTargetMemInfo.h" 13 | #include "CodeEmitVisitor.h" 14 | #include "NvDlaMemInfoPass.h" 15 | #include "NvDlaTaskSubmitPass.h" 16 | #include "NvDlaFileGenPass.h" 17 | #include "NvDlaReorderMulAddPass.h" 18 | #include "NvDlaFuseAddMulReluPass.h" 19 | #include "PrintONNCIRPass.h" 20 | 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include 37 | #include 38 | #include 39 | #include 40 | #include 41 | #include 42 | #include 43 | #include 44 | 45 | #include 46 | 47 | using namespace onnc; 48 | 49 | //===----------------------------------------------------------------------===// 50 | // FooNvdlaBackend 51 | //===----------------------------------------------------------------------===// 52 | const Version FooNvdlaBackend::LOADABLE_VERSION = Version(1, 1, 255); 53 | const Version FooNvdlaBackend::BLOB_DLA_VERSION = Version(1, 3, 0); 54 | const Version FooNvdlaBackend::BLOB_EMU_VERSION = Version(1, 3, 0); 55 | 56 | FooNvdlaBackend::FooNvdlaBackend(const TargetOptions& pOptions) 57 | : TargetBackend(pOptions) 58 | , NvDlaConstants(getConfig(::nvdla::ConfigSet::nv_full, ::nvdla::ExecutionMode::direct, false)) 59 | , m_pMeta(*this) { 60 | m_pMemInfo = std::make_unique(); 61 | } 62 | 63 | void FooNvdlaBackend::addTensorSel(PassManager& pPM) 64 | { 65 | errs() << "FooNvdla is invoked\n"; 66 | 67 | // Do ONNX graph IR optimization here. 68 | 69 | // Translate from ONNX graph IR into ONNC IR 70 | addStandardTensorSel(pPM, *this); 71 | 72 | // Now ONNC IR is ready. 73 | // If you need to extend ONNC IR, here is the place to add your pass that 74 | // adds your ONNC IR operators. 75 | } 76 | 77 | void FooNvdlaBackend::addOnncIrOptimization(PassManager& pPM, OptimizationOptions& options) 78 | { 79 | TargetBackend::addOnncIrOptimization(pPM, options); 80 | 81 | pPM.add(); 82 | pPM.add(); 83 | pPM.add(); 84 | pPM.add(); 85 | pPM.add(); 86 | } 87 | 88 | void FooNvdlaBackend::addTensorSched(PassManager& pPM) 89 | { 90 | // After method AddTensorSel, operators have been scheduled in an 91 | // topological order, which totally respects the data dependency. 92 | // However, that might not be an optimized order for certain objective. 93 | // Add a scheduling optimization pass here. 94 | } 95 | 96 | void FooNvdlaBackend::addMemAlloc(PassManager& pPM) 97 | { 98 | // Input: Module 99 | // Output: LiveIntervals 100 | addStandardCreateLiveIntervals(pPM); 101 | 102 | // Input: LiveIntervals 103 | // Output: MemAllocs 104 | addStandardMemoryAllocation(pPM, *this); 105 | 106 | // Input: MemAllocs 107 | // Output: Virtual memory address for each memory operands. 
108 | addStandardSetMemOperands(pPM);
109 |
110 | const NvDlaConstants& constants = *this;
111 | pPM.add<NvDlaMemInfoPass>(constants, &m_pMeta);
112 | }
113 |
114 | void FooNvdlaBackend::addCodeEmit(PassManager& pPM, const Path& pOutput)
115 | {
116 | static foonvdla::CodeEmitVisitor ceVisitor(*this, m_pMeta);
117 | pPM.add<CodeEmit>(ceVisitor)
118 | .add<NvDlaTaskSubmitPass>(&m_pMeta, BLOB_DLA_VERSION, BLOB_EMU_VERSION)
119 | .add<NvDlaFileGenPass>(&m_pMeta, LOADABLE_VERSION)
120 | ;
121 | }
122 |
123 | void FooNvdlaBackend::RegisterLowers(LowerRegistry& pRegistry) const
124 | {
125 | pRegistry.emplace<AddLower>();
126 | pRegistry.emplace<MulLower>();
127 | pRegistry.emplace<ReluLower>();
128 | pRegistry.emplace<ConvLower>();
129 | }
130 |
131 |
132 | //===----------------------------------------------------------------------===//
133 | // Non member functions
134 | //===----------------------------------------------------------------------===//
135 | TargetBackend* CreateFooNvdlaBackend(const TargetOptions& pOptions)
136 | {
137 | return new FooNvdlaBackend(pOptions);
138 | }
139 |
140 | extern "C" void InitializeFooNvdlaONNCBackend()
141 | {
142 | onnc::TargetRegistry::RegisterTargetBackend(getTheFooNvdlaTarget(),
143 | CreateFooNvdlaBackend);
144 | }
145 |
146 |
--------------------------------------------------------------------------------
/lab_8_Mul_Add_Reordering_and_Fusion/src/FooNvdlaBackend.h:
--------------------------------------------------------------------------------
1 | //===- FooNvdlaBackend.h -------------------------------------------------------===//
2 | //
3 | // The ONNC Project
4 | //
5 | // See LICENSE.TXT for details.
6 | //
7 | //===----------------------------------------------------------------------===//
8 | #ifndef TARGET_FOONVDLA_FOONVDLA_BACKEND_H
9 | #define TARGET_FOONVDLA_FOONVDLA_BACKEND_H
10 | #include 
11 | #include 
12 | #include "NvDlaDefine.h"
13 | #include "NvDlaMeta.h"
14 | #include "Version.h"
15 |
16 | namespace onnc {
17 | using namespace onnc::foonvdla;
18 |
19 | class FooNvdlaBackend : public TargetBackend, private NvDlaConstants
20 | {
21 | private:
22 | static const Version LOADABLE_VERSION;
23 | static const Version BLOB_DLA_VERSION;
24 | static const Version BLOB_EMU_VERSION;
25 |
26 | public:
27 | FooNvdlaBackend(const TargetOptions& pOptions);
28 |
29 | virtual ~FooNvdlaBackend() = default;
30 |
31 | void addTensorSel(PassManager& pPM) override;
32 |
33 | void addOnncIrOptimization(PassManager& pPM, OptimizationOptions& options) override;
34 |
35 | void addTensorSched(PassManager& pPM) override;
36 |
37 | void addMemAlloc(PassManager& pPM) override;
38 |
39 | void addCodeEmit(PassManager& pPM, const Path& pOutput) override;
40 |
41 | void RegisterLowers(LowerRegistry& pRegistry) const override;
42 |
43 | private:
44 | NvDlaBackendMeta m_pMeta;
45 | };
46 |
47 | } // namespace onnc
48 |
49 | #endif
50 |
--------------------------------------------------------------------------------
/lab_8_Mul_Add_Reordering_and_Fusion/src/Makefile.am:
--------------------------------------------------------------------------------
1 | ONNC_TARGET_SOURCES += \
2 | Target/FooNvdla/CodeEmitVisitor.cpp \
3 | Target/FooNvdla/FooNvdlaBackend.cpp \
4 | Target/FooNvdla/Loadable.cpp \
5 | Target/FooNvdla/NvDlaDefine.cpp \
6 | Target/FooNvdla/NvDlaMeta.cpp \
7 | Target/FooNvdla/NvDlaUtil.cpp \
8 | Target/FooNvdla/NvDlaMemInfoPass.cpp \
9 | Target/FooNvdla/NvDlaTaskSubmitPass.cpp \
10 | Target/FooNvdla/NvDlaFileGenPass.cpp \
11 | Target/FooNvdla/NvDlaReorderMulAddPass.cpp \
12 | Target/FooNvdla/Compute/NvDlaAddMulRelu.cpp \
13 |
Target/FooNvdla/NvDlaFuseAddMulReluPass.cpp \
14 | Target/FooNvdla/PrintONNCIRPass.cpp \
15 | Target/FooNvdla/Config/NvFull.cpp \
16 | Target/FooNvdla/TargetInfo/FooNvdlaTargetInfo.cpp \
17 | Target/FooNvdla/TargetInfo/FooNvdlaTargetMemInfo.cpp
18 |
19 | ONNC_INCLUDES += \
20 | -I${srcdir}/Target/FooNvdla/include \
21 | -I${srcdir}/Target/FooNvdla/
22 |
--------------------------------------------------------------------------------
/lab_8_Mul_Add_Reordering_and_Fusion/src/NvDlaAddMulRelu.cpp:
--------------------------------------------------------------------------------
1 | //===- NvDlaAddMulRelu.cpp ------------------------------------------------===//
2 | //
3 | // The ONNC Project
4 | //
5 | // See LICENSE.TXT for details.
6 | //
7 | //===----------------------------------------------------------------------===//
8 | #include "NvDlaAddMulRelu.h"
9 |
10 | #include "../CodeEmitVisitor.h"
11 | #include "../NvDlaDefine.h"
12 |
13 | using namespace onnc;
14 | using namespace onnc::foonvdla;
15 |
16 | char NvDlaAddMulRelu::ID = 0;
17 |
18 | //===----------------------------------------------------------------------===//
19 | // NvDlaAddMulRelu
20 | //===----------------------------------------------------------------------===//
21 | void NvDlaAddMulRelu::printAttributes(std::ostream& pOS) const
22 | {
23 | pOS << "<>";
24 | }
25 |
26 | void NvDlaAddMulRelu::accept(ComputeVisitor& pV)
27 | {
28 | CodeEmitVisitor* visitor = dyn_cast<CodeEmitVisitor>(&pV);
29 | if (nullptr != visitor)
30 | visitor->visit(*this);
31 | }
32 |
33 | void NvDlaAddMulRelu::accept(ComputeVisitor& pV) const
34 | {
35 | CodeEmitVisitor* visitor = dyn_cast<CodeEmitVisitor>(&pV);
36 | if (nullptr != visitor)
37 | visitor->visit(*this);
38 | }
39 |
40 | bool NvDlaAddMulRelu::classof(const ComputeOperator* pOp)
41 | {
42 | if (nullptr == pOp)
43 | return false;
44 | return (pOp->getID() == &ID);
45 | }
46 |
--------------------------------------------------------------------------------
/lab_8_Mul_Add_Reordering_and_Fusion/src/NvDlaAddMulRelu.h:
--------------------------------------------------------------------------------
1 | //===- NvDlaAddMulRelu.h ------------------------------------------------------===//
2 | //
3 | // The ONNC Project
4 | //
5 | // See LICENSE.TXT for details.
6 | //
7 | //===--------------------------------------------------------------------------===//
8 | #ifndef TARGET_NVDLA_NVDLA_ADD_MUL_RELU_H
9 | #define TARGET_NVDLA_NVDLA_ADD_MUL_RELU_H
10 |
11 | #include 
12 | #include 
13 | #include 
14 |
15 | namespace onnc {
16 | namespace foonvdla {
17 |
18 | class NvDlaAddMulRelu : public ComputeOperator
19 | {
20 | public:
21 | static char ID;
22 |
23 | public:
24 | NvDlaAddMulRelu()
25 | : ComputeOperator("AddMulRelu", ID)
26 | {}
27 |
28 | virtual ~NvDlaAddMulRelu() {}
29 |
30 | // Parameter
31 |
32 | // Input & Output Tensor
33 | Tensor* getInput(unsigned int pIdx) override { return static_cast<Tensor*>(m_Inputs[pIdx]); }
34 |
35 | const Tensor* getInput(unsigned int pIdx) const override { return static_cast<const Tensor*>(m_Inputs[pIdx]); }
36 |
37 | Tensor* getOutput(unsigned int pIdx) override { return static_cast<Tensor*>(m_Outputs[pIdx]); }
38 |
39 | const Tensor* getOutput(unsigned int pIdx) const override { return static_cast<const Tensor*>(m_Outputs[pIdx]); }
40 |
41 | void printAttributes(std::ostream& pOS) const override;
42 |
43 | void accept(ComputeVisitor& pV) override;
44 |
45 | void accept(ComputeVisitor& pV) const override;
46 |
47 | static bool classof(const ComputeOperator* pOp);
48 |
49 | };
50 |
51 | } // namespace foonvdla
52 | } // namespace onnc
53 |
54 | #endif
55 |
--------------------------------------------------------------------------------
/lab_8_Mul_Add_Reordering_and_Fusion/src/NvDlaFuseAddMulReluPass.cpp:
--------------------------------------------------------------------------------
1 | //===- NvDlaFuseAddMulReluPass.cpp ----------------------------------------===//
2 | //
3 | // The ONNC Project
4 | //
5 | // See LICENSE.TXT for details.
6 | //
7 | //===----------------------------------------------------------------------===//
8 | #include "NvDlaFuseAddMulReluPass.h"
9 | #include "Compute/NvDlaAddMulRelu.h"
10 |
11 | #include 
12 | #include 
13 | #include 
14 | #include 
15 | #include 
16 | #include 
17 | #include 
18 | #include 
19 |
20 | namespace onnc {
21 | namespace foonvdla {
22 |
23 | //===----------------------------------------------------------------------===//
24 | // NvDlaFuseAddMulReluPass
25 | //===----------------------------------------------------------------------===//
26 |
27 | Pass::ReturnType NvDlaFuseAddMulReluPass::runOnModule(Module& pModule)
28 | {
29 | const Pass::ReturnType ret = BaseType::runOnModule(pModule);
30 |
31 | if (ret != kModuleNoChanged) {
32 | pModule.eraseUnusedValues();
33 | }
34 |
35 | return ret;
36 | }
37 |
38 | Pass::ReturnType NvDlaFuseAddMulReluPass::runOnComputeGraph(ComputeGraph& pCG)
39 | {
40 | Pass::ReturnType ret = Pass::kModuleNoChanged;
41 |
42 | // Search for the Add-Mul-Relu patterns that can be replaced by a single AddMulRelu IR.
43 | std::vector<ComputeOperator*> patternList;
44 | for (ComputeOperator& node : pCG) {
45 | if (isAddMulRelu(&node)) {
46 | patternList.emplace_back(&node);
47 | ret |= Pass::kModuleChanged;
48 | }
49 | }
50 |
51 | for (ComputeOperator* node : patternList) {
52 | // Derive original IRs.
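// isAddMulRelu() has already guaranteed that the pattern root is an Add
// whose single user is a Mul whose single user is a Relu, so the casts
// below can safely walk the use-def chain.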
53 | Add* add = dyn_cast<Add>(node);
54 | Mul* mul = dyn_cast<Mul>(add->getOutput(0)->getUses()[0].getUser());
55 | Relu* relu = dyn_cast<Relu>(mul->getOutput(0)->getUses()[0].getUser());
56 |
57 | Tensor* addA = add->getInput(0);
58 | Tensor* addB = add->getInput(1);
59 | Tensor* addC = add->getOutput(0);
60 | Tensor* mulB;
61 | if (addC == mul->getInput(0)) {
62 | mulB = mul->getInput(1);
63 | } else {
64 | mulB = mul->getInput(0);
65 | }
66 | Tensor* mulC = mul->getOutput(0);
67 | Tensor* reluY = relu->getOutput(0);
68 |
69 | // The current ONNC IR graph status
70 | // ================================
71 | //
72 | //   |      |
73 | // addA   addB
74 | //    \   /
75 | //    (add)
76 | //      |      |
77 | //    addC   mulB
78 | //       \   /
79 | //       (mul)
80 | //         |
81 | //       mulC
82 | //         |
83 | //      (relu)
84 | //         |
85 | //       reluY
86 | //         |
87 |
88 | // Create a new AddMulRelu IR.
89 | NvDlaAddMulRelu* compound = pCG.addOperator<NvDlaAddMulRelu>();
90 |
91 | // The current ONNC IR graph status
92 | // ================================
93 | //
94 | //   |      |
95 | // addA   addB
96 | //    \   /
97 | //    (add)
98 | //      |      |
99 | //    addC   mulB
100 | //       \   /
101 | //       (mul)
102 | //         |
103 | //       mulC
104 | //         |
105 | //      (relu)    (compound)
106 | //         |
107 | //       reluY
108 | //         |
109 |
110 | add->removeAllInputs();
111 | add->removeAllOutputs();
112 | mul->removeAllInputs();
113 | mul->removeAllOutputs();
114 | relu->removeAllInputs();
115 | relu->removeAllOutputs();
116 |
117 | // The current ONNC IR graph status
118 | // ================================
119 | //
120 | //   |      |
121 | // addA   addB
122 | //
123 | //    (add)
124 | //      |
125 | //    addC   mulB
126 | //
127 | //       (mul)
128 | //
129 | //       mulC
130 | //
131 | //      (relu)    (compound)
132 | //
133 | //       reluY
134 | //         |
135 |
136 | pCG.erase(*add);
137 | pCG.erase(*mul);
138 | pCG.erase(*relu);
139 | pCG.erase(*addC);
140 | pCG.erase(*mulC);
141 |
142 | // The current ONNC IR graph status
143 | // ================================
144 | //
145 | //   |      |
146 | // addA   addB
147 | //
148 | //             |
149 | //           mulB
150 | //
151 | //    (compound)
152 | //
153 | //       reluY
154 | //         |
155 |
156 | compound->addInput(*addA);
157 | compound->addInput(*addB);
158 | compound->addInput(*mulB);
159 | compound->addOutput(*reluY);
160 |
161 | // The current ONNC IR graph status
162 | // ================================
163 | //
164 | //   |      |      |
165 | // addA   addB   mulB
166 | //    \     |     /
167 | //     (compound)
168 | //         |
169 | //       reluY
170 | //         |
171 |
172 | }
173 |
174 | pCG.topologicalSort();
175 |
176 | return ret;
177 | }
178 |
179 | bool NvDlaFuseAddMulReluPass::isAddMulRelu(ComputeOperator* pNode)
180 | {
181 | // Check the first node.
182 | // It must be
183 | // 1) an Add and,
184 | // 2) have only one operator using its result.
185 | if ( ! isa<Add>(pNode)) return false;
186 | if (pNode->getOutput(0)->getUses().size() > 1) return false;
187 |
188 | // Check the second node.
189 | // It must be
190 | // 1) a Mul and,
191 | // 2) have only one operator using its result.
192 | ComputeOperator* secondNode = pNode->getOutput(0)->getUses()[0].getUser();
193 | if ( ! isa<Mul>(secondNode)) return false;
194 | if (secondNode->getOutput(0)->getUses().size() > 1) return false;
195 |
196 | // Check the third node.
197 | // It must be a Relu.
198 | // However, it is not subject to the single-user limitation, because its
199 | // result is saved in system memory, which multiple operators can load
200 | // from at any time.
201 | ComputeOperator* thirdNode = secondNode->getOutput(0)->getUses()[0].getUser();
202 | if ( !
isa<Relu>(thirdNode)) return false;
203 |
204 | return true;
205 | }
206 |
207 | } // namespace foonvdla
208 | } // namespace onnc
209 |
--------------------------------------------------------------------------------
/lab_8_Mul_Add_Reordering_and_Fusion/src/NvDlaFuseAddMulReluPass.h:
--------------------------------------------------------------------------------
1 | //===- NvDlaFuseAddMulReluPass.h ------------------------------------------===//
2 | //
3 | // The ONNC Project
4 | //
5 | // See LICENSE.TXT for details.
6 | //
7 | //===----------------------------------------------------------------------===//
8 | #ifndef ONNC_FOONVDLA_FUSE_ADD_MUL_RELU_PASS_H
9 | #define ONNC_FOONVDLA_FUSE_ADD_MUL_RELU_PASS_H
10 | #include 
11 |
12 |
13 | namespace onnc {
14 | namespace foonvdla {
15 |
16 | class NvDlaFuseAddMulReluPass : public CustomPass<NvDlaFuseAddMulReluPass>
17 | {
18 | public:
19 | NvDlaFuseAddMulReluPass() = default;
20 |
21 | ReturnType runOnModule(Module& pModule) override;
22 |
23 | ReturnType runOnComputeGraph(ComputeGraph& pCG) override;
24 |
25 | private:
26 | bool isAddMulRelu(ComputeOperator* pNode);
27 | };
28 |
29 | } // namespace foonvdla
30 | } // namespace onnc
31 |
32 | #endif
33 |
--------------------------------------------------------------------------------
/lab_8_Mul_Add_Reordering_and_Fusion/src/NvDlaReorderMulAddPass.cpp:
--------------------------------------------------------------------------------
1 | //===- NvDlaReorderMulAddPass.cpp -----------------------------------------===//
2 | //
3 | // The ONNC Project
4 | //
5 | // See LICENSE.TXT for details.
6 | //
7 | //===----------------------------------------------------------------------===//
8 | #include "NvDlaReorderMulAddPass.h"
9 |
10 | #include 
11 | #include 
12 | #include 
13 | #include 
14 | #include 
15 | #include 
16 | #include 
17 |
18 | using namespace onnc;
19 | using namespace onnc::foonvdla;
20 |
21 | //===----------------------------------------------------------------------===//
22 | // NvDlaReorderMulAddPass
23 | //===----------------------------------------------------------------------===//
24 |
25 | unsigned NvDlaReorderMulAddPass::tensorIdx = 0;
26 |
27 | Pass::ReturnType NvDlaReorderMulAddPass::runOnModule(Module& pModule)
28 | {
29 | std::cout << "NvDlaReorderMulAddPass is called...\n";
30 |
31 | const Pass::ReturnType ret = BaseType::runOnModule(pModule);
32 |
33 | if (ret != kModuleNoChanged) {
34 | pModule.eraseUnusedValues();
35 | }
36 |
37 | return ret;
38 | }
39 |
40 | Pass::ReturnType NvDlaReorderMulAddPass::runOnComputeGraph(ComputeGraph& pCG)
41 | {
42 | Pass::ReturnType ret = Pass::kModuleNoChanged;
43 |
44 | //--------------------------------------------------------
45 | // Search for the Mul-Add patterns that can be reordered.
46 | //--------------------------------------------------------
47 | std::vector<ComputeOperator*> mulList;
48 | for (ComputeOperator& node : pCG) {
49 | if (canBeReordered(&node)) {
50 | mulList.emplace_back(&node);
51 | ret |= Pass::kModuleChanged;
52 | }
53 | }
54 |
55 | //--------------------------------------------
56 | // Perform re-ordering on the found patterns.
57 | //--------------------------------------------
58 | // The original pattern is:
59 | //   outputY = (inputX * alpha) + beta
60 | //
61 | // We will re-arrange the above pattern by:
62 | //   outputY = (inputX + gamma) * alpha, where
63 | //   gamma = beta / alpha
64 |
65 | for (ComputeOperator* node : mulList) {
66 | Mul* mul = dyn_cast<Mul>(node);
67 | Add* add = dyn_cast<Add>(node->getOutput(0)->getUses()[0].getUser());
68 |
69 | Tensor* inputX;
70 | FloatTensor* alpha; // This kind of tensor contains constant values.
71 | FloatTensor* beta;
72 | Tensor* outputY;
73 | Tensor* tmp;
74 |
75 | // Find alpha and inputX. alpha must be a constant tensor.
76 | // In this example, we assume that Mul must have one constant input.
77 | if (isConstant(mul->getInput(0))) {
78 | alpha = dynamic_cast<FloatTensor*>(mul->getInput(0));
79 | inputX = mul->getInput(1);
80 | } else {
81 | inputX = mul->getInput(0);
82 | alpha = dynamic_cast<FloatTensor*>(mul->getInput(1));
83 | }
84 |
85 | // Find beta. beta must be a constant tensor.
86 | // In this example, we assume that Add must have one constant input.
87 | if (isConstant(add->getInput(0))) {
88 | beta = dynamic_cast<FloatTensor*>(add->getInput(0));
89 | tmp = add->getInput(1);
90 | } else {
91 | tmp = add->getInput(0);
92 | beta = dynamic_cast<FloatTensor*>(add->getInput(1));
93 | }
94 |
95 | // Find outputY.
96 | outputY = add->getOutput(0);
97 |
98 | std::string addOutputTensorName = add->getOutput(0)->getName();
99 | std::string mulOutputTensorName = mul->getOutput(0)->getName();
100 |
101 | // The current ONNC IR graph status
102 | // ================================
103 | //
104 | //           (alphaInitializer)
105 | //    |        |
106 | // inputX    alpha
107 | //     \     /
108 | //     (mul)     (betaInitializer)
109 | //       |         |
110 | //      tmp      beta
111 | //        \      /
112 | //        (add)
113 | //          |
114 | //       outputY
115 | //          |
116 | //
117 |
118 | // Remove the edges between Mul/Add and their input/output tensors.
119 | // We will re-build their edges later on.
120 | // Removing an edge means erasing the records about the input tensors from the operator's data structure.
121 | mul->removeAllInputs();
122 | mul->removeAllOutputs();
123 | add->removeAllInputs();
124 | add->removeAllOutputs();
125 |
126 | // The current ONNC IR graph status
127 | // ================================
128 | //
129 | //           (alphaInitializer)
130 | //    |        |
131 | // inputX    alpha
132 | //
133 | //     (mul)     (betaInitializer)
134 | //       |
135 | //      tmp      beta
136 | //
137 | //        (add)
138 | //
139 | //       outputY
140 | //          |
141 | //
142 |
143 | // Create a new tensor gamma.
144 | FloatTensor* gamma = dynamic_cast<FloatTensor*>(beta->create());
145 |
146 | // Give gamma tensor a unique name.
147 | gamma->setName(beta->getName() + "__gamma_" + std::to_string(tensorIdx++) + ")");
148 |
149 | // Initialize gamma.
150 | gamma->setDimensions(beta->getDimensions());
151 |
152 | // Add gamma into the ONNC IR graph.
153 | gamma = pCG.addValue(gamma);
154 | assert((gamma != nullptr) && "The name must be unique");
155 |
156 | // Create a new Initializer operator for the gamma tensor. This is a must in the ONNC IR graph.
157 | // Every tensor must have a "defining" operator. For a constant tensor, its defining
158 | // operator is an Initializer.
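// (A concrete instance of the identity above, with hypothetical numbers:
// for alpha = 2 and beta = 6, gamma = 6 / 2 = 3, and
// (x * 2) + 6 == (x + 3) * 2 holds for every element x.)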
159 | Initializer* gammaInitializer = pCG.addOperator<Initializer>();
160 | gammaInitializer->setTensor(*gamma);
161 |
162 | // The current ONNC IR graph status
163 | // ================================
164 | //
165 | //           (alphaInitializer)
166 | //    |        |
167 | // inputX    alpha
168 | //
169 | //     (mul)     (betaInitializer)
170 | //       |
171 | //      tmp      beta
172 | //
173 | //        (add)      (gammaInitializer)
174 | //                     |
175 | //       outputY     gamma
176 | //          |
177 | //
178 |
179 | // Get the constant data of beta.
180 | const float* betaData = reinterpret_cast<const float*>(beta->getValues().data());
181 |
182 | // Get the constant data of alpha.
183 | const float* alphaData = reinterpret_cast<const float*>(alpha->getValues().data());
184 |
185 | // Calculate the constant data of gamma.
186 | int tensorSize = beta->getValues().size();
187 | for (int i = 0; i < tensorSize; i++) {
188 | gamma->getValues().push_back( betaData[i] / alphaData[i] );
189 | }
190 |
191 | // Remove beta from the ONNC IR graph. We don't need it anymore.
192 | Initializer* betaInitializer = static_cast<Initializer*>(beta->getDefine());
193 | pCG.erase(*betaInitializer);
194 | pCG.erase(*beta);
195 |
196 | // The current ONNC IR graph status
197 | // ================================
198 | //
199 | //           (alphaInitializer)
200 | //    |        |
201 | // inputX    alpha
202 | //
203 | //     (mul)
204 | //
205 | //      tmp
206 | //
207 | //        (add)      (gammaInitializer)
208 | //                     |
209 | //       outputY     gamma
210 | //          |
211 | //
212 |
213 | // Re-connect the operators.
214 | add->addInput(*inputX);
215 | add->addInput(*gamma);
216 | add->addOutput(*tmp);
217 | mul->addInput(*tmp);
218 | mul->addInput(*alpha);
219 | mul->addOutput(*outputY);
220 |
221 | // The current ONNC IR graph status
222 | // ================================
223 | //
224 | //           (gammaInitializer)
225 | //    |        |
226 | // inputX    gamma
227 | //     \     /
228 | //     (add)     (alphaInitializer)
229 | //       |         |
230 | //      tmp      alpha
231 | //        \      /
232 | //        (mul)
233 | //          |
234 | //       outputY
235 | //          |
236 | //
237 |
238 | // Rename tensor tmp to the original output tensor name of add.
239 | add->getOutput(0)->setName(addOutputTensorName);
240 | // Rename tensor outputY to the original output tensor name of mul.
241 | mul->getOutput(0)->setName(mulOutputTensorName);
242 | }
243 |
244 | pCG.topologicalSort();
245 |
246 | return ret;
247 | }
248 |
249 | bool NvDlaReorderMulAddPass::canBeReordered(ComputeOperator* pNode)
250 | {
251 | if (!isa<Mul>(pNode)) {
252 | return false;
253 | }
254 |
255 | if (!isConstant(pNode->getInput(0)) && !isConstant(pNode->getInput(1))) {
256 | return false;
257 | }
258 |
259 | Value* outv = pNode->getOutput(0);
260 |
261 | // If Mul's result has more than one user, we can't reorder it.
262 | if (outv->getUses().size() > 1) {
263 | return false;
264 | }
265 |
266 | ComputeOperator* userNode = outv->getUses()[0].getUser();
267 | if (!isa<Add>(userNode)) {
268 | return false;
269 | }
270 |
271 | return true;
272 | }
273 |
274 | bool NvDlaReorderMulAddPass::isConstant(Value* pValue)
275 | {
276 | // Only if this value's (tensor's) "defining" operator is an Initializer
277 | // is this tensor a constant tensor.
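// Tensors computed by other operators, and graph input tensors, have a
// different kind of defining operator, so they are treated as non-constant here.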
278 | ComputeOperator* op = static_cast<ComputeOperator*>(pValue->getDefine());
279 | if (isa<Initializer>(op)) {
280 | return true;
281 | } else {
282 | return false;
283 | }
284 | }
285 |
--------------------------------------------------------------------------------
/lab_8_Mul_Add_Reordering_and_Fusion/src/NvDlaReorderMulAddPass.h:
--------------------------------------------------------------------------------
1 | //===- NvDlaReorderMulAddPass.h -------------------------------------------===//
2 | //
3 | // The ONNC Project
4 | //
5 | // See LICENSE.TXT for details.
6 | //
7 | //===----------------------------------------------------------------------===//
8 | #ifndef ONNC_FOONVDLA_REORDER_MUL_ADD_PASS_H
9 | #define ONNC_FOONVDLA_REORDER_MUL_ADD_PASS_H
10 | #include 
11 |
12 | namespace onnc {
13 | namespace foonvdla {
14 |
15 | class NvDlaReorderMulAddPass : public CustomPass<NvDlaReorderMulAddPass>
16 | {
17 | public:
18 | NvDlaReorderMulAddPass() = default;
19 |
20 | ReturnType runOnModule(Module& pModule) override;
21 |
22 | ReturnType runOnComputeGraph(ComputeGraph& pCG) override;
23 |
24 | private:
25 | bool canBeReordered(ComputeOperator* pNode);
26 | bool isConstant(Value* value);
27 |
28 | static unsigned tensorIdx;
29 | };
30 |
31 | } // namespace foonvdla
32 | } // namespace onnc
33 |
34 | #endif
35 |
--------------------------------------------------------------------------------
/lab_8_Mul_Add_Reordering_and_Fusion/src/PrintONNCIRPass.cpp:
--------------------------------------------------------------------------------
1 | //===- PrintONNCIRPass.cpp ------------------------------------------------===//
2 | //
3 | // The ONNC Project
4 | //
5 | // See LICENSE.TXT for details.
6 | //
7 | //===----------------------------------------------------------------------===//
8 | #include "PrintONNCIRPass.h"
9 |
10 | #include 
11 | #include 
12 | #include 
13 | #include 
14 | #include 
15 |
16 | namespace onnc {
17 | namespace foonvdla {
18 |
19 | //===----------------------------------------------------------------------===//
20 | // PrintONNCIRPass
21 | //===----------------------------------------------------------------------===//
22 |
23 | Pass::ReturnType PrintONNCIRPass::runOnModule(Module& pModule)
24 | {
25 | const Pass::ReturnType ret = BaseType::runOnModule(pModule);
26 |
27 | if (ret != kModuleNoChanged) {
28 | pModule.eraseUnusedValues();
29 | }
30 |
31 | return ret;
32 | }
33 |
34 | Pass::ReturnType PrintONNCIRPass::runOnComputeGraph(ComputeGraph& pCG)
35 | {
36 | Pass::ReturnType ret = Pass::kModuleNoChanged;
37 |
38 | std::cout << "=== PrintONNCIRPass ======\n";
39 | for (ComputeOperator& node : pCG) {
40 | node.print(std::cout);
41 | std::cout << "\n";
42 | }
43 | std::cout << "==========================\n";
44 |
45 | return ret;
46 | }
47 |
48 | } // namespace foonvdla
49 | } // namespace onnc
50 |
--------------------------------------------------------------------------------
/lab_8_Mul_Add_Reordering_and_Fusion/src/PrintONNCIRPass.h:
--------------------------------------------------------------------------------
1 | //===- PrintONNCIRPass.h --------------------------------------------------===//
2 | //
3 | // The ONNC Project
4 | //
5 | // See LICENSE.TXT for details.
6 | //
7 | //===----------------------------------------------------------------------===//
8 | #ifndef ONNC_FOONVDLA_PRINT_ONNC_IR_PASS_H
9 | #define ONNC_FOONVDLA_PRINT_ONNC_IR_PASS_H
10 | #include 
11 | #include 
12 |
13 | namespace onnc {
14 | namespace foonvdla {
15 |
16 | class PrintONNCIRPass : public CustomPass<PrintONNCIRPass>
17 | {
18 | public:
19 | PrintONNCIRPass() = default;
20 |
21 | ReturnType runOnModule(Module& pModule) override;
22 |
23 | ReturnType runOnComputeGraph(ComputeGraph& pCG) override;
24 | };
25 |
26 | } // namespace foonvdla
27 | } // namespace onnc
28 |
29 | #endif
30 |
--------------------------------------------------------------------------------
/models/lenet/input0.output.dimg:
--------------------------------------------------------------------------------
1 | 149.25 -49.625 13.875 11.2344 -59.8125 -2.61523 7.80078 -44.7188 30.8594 17.3594
--------------------------------------------------------------------------------
/models/lenet/input0.pgm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/models/lenet/input0.pgm
--------------------------------------------------------------------------------
/models/lenet/input1.pgm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/models/lenet/input1.pgm
--------------------------------------------------------------------------------
/models/lenet/input2.pgm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/models/lenet/input2.pgm
--------------------------------------------------------------------------------
/models/lenet/input4.pgm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/models/lenet/input4.pgm
--------------------------------------------------------------------------------
/models/lenet/input5.pgm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/models/lenet/input5.pgm
--------------------------------------------------------------------------------
/models/lenet/input6.pgm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/models/lenet/input6.pgm
--------------------------------------------------------------------------------
/models/lenet/input7.output.dimg:
--------------------------------------------------------------------------------
1 | -2.21875 -5.39062 22.4375 7.35938 -25.4688 -18.0469 -39.8125 165.875 2.22656 40.0312
--------------------------------------------------------------------------------
/models/lenet/input7.pgm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/models/lenet/input7.pgm
--------------------------------------------------------------------------------
/models/lenet/input8.pgm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/models/lenet/input8.pgm -------------------------------------------------------------------------------- /models/lenet/input9.pgm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/models/lenet/input9.pgm -------------------------------------------------------------------------------- /models/lenet/lenet.nvdla: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/models/lenet/lenet.nvdla -------------------------------------------------------------------------------- /models/lenet/lenet.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/models/lenet/lenet.onnx -------------------------------------------------------------------------------- /models/quantized_mnist/mnist_calibration.txt: -------------------------------------------------------------------------------- 1 | 9,0,12,8,0,13,7,0,15 2 | -------------------------------------------------------------------------------- /models/quantized_mnist/quantized_mnist.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/models/quantized_mnist/quantized_mnist.onnx -------------------------------------------------------------------------------- /models/test_Add/input1x5x7.pgm: -------------------------------------------------------------------------------- 1 | P5 2 | 7 5 3 | 255 4 |  5 |  !" -------------------------------------------------------------------------------- /models/test_Add/out.nvdla: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/models/test_Add/out.nvdla -------------------------------------------------------------------------------- /models/test_Add/test_Add.log: -------------------------------------------------------------------------------- 1 | # ./nvdla_runtime --loadable test_Add.nvdla --image input1x5x7.pgm --rawdump 2 | creating new runtime context... 3 | Emulator starting 4 | ppgminfo 1 5 7 5 | pgm2dimg 1 5 7 1 32 160 160 6 | submitting tasks... 
7 | [ 37.512018] Enter:dla_read_network_config 8 | [ 37.513711] Exit:dla_read_network_config status=0 9 | [ 37.513905] Enter: dla_initiate_processors 10 | [ 37.514392] Enter: dla_submit_operation 11 | [ 37.514536] Prepare SDP operation index 0 ROI 0 dep_count 0 12 | [ 37.514717] Enter: dla_prepare_operation 13 | [ 37.515103] processor:SDP group:0, rdma_group:0 available 14 | [ 37.515332] Enter: dla_read_config 15 | [ 37.518969] Exit: dla_read_config 16 | [ 37.519171] Exit: dla_prepare_operation status=0 17 | [ 37.519372] Enter: dla_program_operation 18 | [ 37.519527] Program SDP operation index 0 ROI 0 Group[0] 19 | [ 37.522174] no desc get due to index==-1 20 | [ 37.522364] no desc get due to index==-1 21 | [ 37.522492] no desc get due to index==-1 22 | [ 37.522629] no desc get due to index==-1 23 | [ 37.522766] no desc get due to index==-1 24 | [ 37.522908] no desc get due to index==-1 25 | [ 37.523053] no desc get due to index==-1 26 | [ 37.523201] Enter: dla_op_programmed 27 | [ 37.523427] Exit: dla_op_programmed 28 | [ 37.523565] Exit: dla_program_operation status=0 29 | [ 37.523756] Enter: dla_enable_operation 30 | [ 37.523938] Enable SDP operation index 0 ROI 0 31 | [ 37.524254] Enter: dla_op_enabled 32 | [ 37.524388] Exit: dla_op_enabled 33 | [ 37.524507] Exit: dla_enable_operation status=0 34 | [ 37.526518] Exit: dla_submit_operation 35 | [ 37.526796] Enter: dla_dequeue_operation 36 | [ 37.526982] exit SDP as there's no further operation 37 | [ 37.527187] Exit: dla_dequeue_operation 38 | [ 37.527355] Exit: dla_initiate_processors status=0 39 | [ 37.547196] Enter:dla_handle_events, processor:BDMA 40 | [ 37.547525] Exit:dla_handle_events, ret:0 41 | [ 37.547753] Enter:dla_handle_events, processor:Convolution 42 | [ 37.547940] Exit:dla_handle_events, ret:0 43 | [ 37.548077] Enter:dla_handle_events, processor:SDP 44 | [ 37.548251] Handle op complete event, processor SDP group 0 45 | [ 37.548465] Enter:dla_op_completion processor SDP group0 46 | [ 37.548723] Completed SDP operation index 0 ROI 0 47 | [ 37.549006] 1 HWLs done, totally 1 layers 48 | [ 37.549223] Enter: dla_free_op_desc op desc index 0 ROI 0 49 | [ 37.549678] Exit: dla_free_op_desc 50 | [ 37.549856] Exit:dla_op_completion processor SDP group0 status=0 51 | [ 37.550126] Exit:dla_handle_events, ret:0 52 | [ 37.550281] Enter:dla_handle_events, processor:PDP 53 | [ 37.550484] Exit:dla_handle_events, ret:0 54 | [ 37.550637] Enter:dla_handle_events, processor:CDP 55 | [ 37.550813] Exit:dla_handle_events, ret:0 56 | [ 37.550966] Enter:dla_handle_events, processor:RUBIK 57 | [ 37.551132] Exit:dla_handle_events, ret:0 58 | [ 37.553519] reset engine done 59 | Shutdown signal received, exiting 60 | Test pass 61 | -------------------------------------------------------------------------------- /models/test_Add/test_Add.nvdla: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/models/test_Add/test_Add.nvdla -------------------------------------------------------------------------------- /models/test_Add/test_Add.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/models/test_Add/test_Add.onnx -------------------------------------------------------------------------------- /models/test_Add/test_Add.output.dimg: 
-------------------------------------------------------------------------------- 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 -------------------------------------------------------------------------------- /models/test_Conv_Relu/test_Conv_Relu.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/models/test_Conv_Relu/test_Conv_Relu.onnx -------------------------------------------------------------------------------- /models/test_Log/input1x5x7.pgm: -------------------------------------------------------------------------------- 1 | P5 2 | 7 5 3 | 255 4 |  5 |  !" -------------------------------------------------------------------------------- /models/test_Log/test_Log.log: -------------------------------------------------------------------------------- 1 | # ./nvdla_runtime --loadable test_Log.nvdla --image input1x5x7.pgm --rawdump 2 | creating new runtime context... 3 | Emulator starting 4 | ppgminfo 1 5 7 5 | pgm2dimg 1 5 7 1 32 160 160 6 | submitting tasks... 7 | Work Found! 8 | Work Done 9 | Shutdown signal received, exiting 10 | Test pass 11 | -------------------------------------------------------------------------------- /models/test_Log/test_Log.nvdla: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/models/test_Log/test_Log.nvdla -------------------------------------------------------------------------------- /models/test_Log/test_Log.onnx: -------------------------------------------------------------------------------- 1 |  2 | onnx-model:] 3 |  4 | INPUT0Y"Log 5 | test-modelZ 6 | INPUT0 7 |  8 |  9 |  10 |  11 | b 12 | Y 13 |  14 |  15 |  16 |  17 | B -------------------------------------------------------------------------------- /models/test_Log/test_Log.output.dimg: -------------------------------------------------------------------------------- 1 | -inf 0 0.693359 1.09863 1.38672 1.60938 1.79199 1.94629 2.08008 2.19727 2.30273 2.39844 2.48438 2.56445 2.63867 2.70898 2.77344 2.83398 2.89062 2.94531 2.99609 3.04492 3.0918 3.13477 3.17773 3.21875 3.25781 3.29492 3.33203 3.36719 3.40039 3.43359 3.46484 3.49609 3.52539 -------------------------------------------------------------------------------- /models/test_Mul_Add_Relu/input1x5x5.pgm: -------------------------------------------------------------------------------- 1 | P5 2 | 5 5 3 | 255 4 |  -------------------------------------------------------------------------------- /models/test_Mul_Add_Relu/test_Mul_Add_Relu.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/models/test_Mul_Add_Relu/test_Mul_Add_Relu.onnx -------------------------------------------------------------------------------- /models/test_Relu/test_Relu.onnx: -------------------------------------------------------------------------------- 1 |  onnc-tutorial:] 2 |  3 | INPUT0Y"Relu test_ReluZ 4 | INPUT0 5 |  6 |  7 |  8 |  9 | b 10 | Y 11 |  12 |  13 |  14 |  15 | B -------------------------------------------------------------------------------- /models/test_Relu_Log_Relu/input1x5x7.pgm: -------------------------------------------------------------------------------- 1 | P5 2 | 7 5 3 | 255 4 |  5 |  !" 
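The .output.dimg golden files above can be sanity-checked on the host. The sketch below is a standalone check (not a file from this repository): it recomputes the expected test_Log and test_Relu_Log_Relu outputs for the 1x5x7 ramp input whose pixel values run from 0 to 34. The values in the .dimg files match these up to the DLA's fp16 rounding (e.g. 0.693359 instead of 0.693147).

#include <algorithm>
#include <cmath>
#include <cstdio>

int main() {
  for (int i = 0; i < 35; ++i) {
    const double x = i;                  // ramp input pixel value
    const double logOnly = std::log(x);  // test_Log: Log(0) yields -inf
    // test_Relu_Log_Relu: the trailing Relu clamps the -inf back to 0.
    const double reluLogRelu = std::max(0.0, std::log(std::max(0.0, x)));
    std::printf("%2d: %g %g\n", i, logOnly, reluLogRelu);
  }
  return 0;
}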
-------------------------------------------------------------------------------- /models/test_Relu_Log_Relu/test_Relu_Log_Relu.log: -------------------------------------------------------------------------------- 1 | creating new runtime context... 2 | Emulator starting 3 | ppgminfo 1 5 7 4 | pgm2dimg 1 5 7 1 32 160 160 5 | submitting tasks... 6 | [ 125.794087] Enter:dla_read_network_config 7 | [ 125.794863] Exit:dla_read_network_config status=0 8 | [ 125.795039] Enter: dla_initiate_processors 9 | [ 125.795486] Enter: dla_submit_operation 10 | [ 125.795647] Prepare SDP operation index 0 ROI 0 dep_count 0 11 | [ 125.795885] Enter: dla_prepare_operation 12 | [ 125.796322] processor:SDP group:0, rdma_group:0 available 13 | [ 125.796586] Enter: dla_read_config 14 | [ 125.800336] Exit: dla_read_config 15 | [ 125.800577] Exit: dla_prepare_operation status=0 16 | [ 125.800817] Enter: dla_program_operation 17 | [ 125.801749] Program SDP operation index 0 ROI 0 Group[0] 18 | [ 125.803583] no desc get due to index==-1 19 | [ 125.803794] no desc get due to index==-1 20 | [ 125.803961] no desc get due to index==-1 21 | [ 125.804138] no desc get due to index==-1 22 | [ 125.804316] no desc get due to index==-1 23 | [ 125.804485] no desc get due to index==-1 24 | [ 125.804661] no desc get due to index==-1 25 | [ 125.806009] Enter: dla_op_programmed 26 | [ 125.806304] Exit: dla_op_programmed 27 | [ 125.806464] Exit: dla_program_operation status=0 28 | [ 125.806699] Enter: dla_enable_operation 29 | [ 125.806925] Enable SDP operation index 0 ROI 0 30 | [ 125.807286] Enter: dla_op_enabled 31 | [ 125.807448] Exit: dla_op_enabled 32 | [ 125.807590] Exit: dla_enable_operation status=0 33 | [ 125.807775] Exit: dla_submit_operation 34 | [ 125.808019] Enter: dla_dequeue_operation 35 | [ 125.808232] exit SDP as there's no further operation 36 | [ 125.808443] Exit: dla_dequeue_operation 37 | [ 125.808631] Exit: dla_initiate_processors status=0 38 | [ 125.827492] Enter:dla_handle_events, processor:BDMA 39 | [ 125.827800] Exit:dla_handle_events, ret:0 40 | [ 125.827995] Enter:dla_handle_events, processor:Convolution 41 | [ 125.828208] Exit:dla_handle_events, ret:0 42 | [ 125.828380] Enter:dla_handle_events, processor:SDP 43 | [ 125.828609] Handle op complete event, processor SDP group 0 44 | [ 125.828849] Enter:dla_op_completion processor SDP group0 45 | [ 125.829088] Completed SDP operation index 0 ROI 0 46 | [ 125.829361] 1 HWLs done, totally 1 layers 47 | [ 125.829602] Enter: dla_free_op_desc op desc index 0 ROI 0 48 | [ 125.830059] Exit: dla_free_op_desc 49 | [ 125.830240] Exit:dla_op_completion processor SDP group0 status=0 50 | [ 125.830490] Exit:dla_handle_events, ret:0 51 | [ 125.830669] Enter:dla_handle_events, processor:PDP 52 | [ 125.830877] Exit:dla_handle_events, ret:0 53 | [ 125.831050] Enter:dla_handle_events, processor:CDP 54 | [ 125.831255] Exit:dla_handle_events, ret:0 55 | [ 125.831435] Enter:dla_handle_events, processor:RUBIK 56 | [ 125.831653] Exit:dla_handle_events, ret:0 57 | [ 125.833915] reset engine done 58 | Work Found! 
59 | Work Done 60 | [ 125.981455] Enter:dla_read_network_config 61 | [ 125.981911] Exit:dla_read_network_config status=0 62 | [ 125.982142] Enter: dla_initiate_processors 63 | [ 125.982454] Enter: dla_submit_operation 64 | [ 125.982626] Prepare SDP operation index 0 ROI 0 dep_count 0 65 | [ 125.982862] Enter: dla_prepare_operation 66 | [ 125.983126] processor:SDP group:1, rdma_group:1 available 67 | [ 125.983313] Enter: dla_read_config 68 | [ 125.985834] Exit: dla_read_config 69 | [ 125.987172] Exit: dla_prepare_operation status=0 70 | [ 125.987372] Enter: dla_program_operation 71 | [ 125.987545] Program SDP operation index 0 ROI 0 Group[1] 72 | [ 125.988535] no desc get due to index==-1 73 | [ 125.988714] no desc get due to index==-1 74 | [ 125.989816] no desc get due to index==-1 75 | [ 125.990005] no desc get due to index==-1 76 | [ 125.990163] no desc get due to index==-1 77 | [ 125.990351] no desc get due to index==-1 78 | [ 125.990518] no desc get due to index==-1 79 | [ 125.990691] Enter: dla_op_programmed 80 | [ 125.990845] Exit: dla_op_programmed 81 | [ 125.990993] Exit: dla_program_operation status=0 82 | [ 125.991179] Enter: dla_enable_operation 83 | [ 125.991378] Enable SDP operation index 0 ROI 0 84 | [ 125.991672] Enter: dla_op_enabled 85 | [ 125.991821] Exit: dla_op_enabled 86 | [ 125.991968] Exit: dla_enable_operation status=0 87 | [ 125.992162] Exit: dla_submit_operation 88 | [ 125.992326] Enter: dla_dequeue_operation 89 | [ 125.992494] exit SDP as there's no further operation 90 | [ 125.992688] Exit: dla_dequeue_operation 91 | [ 125.994728] Exit: dla_initiate_processors status=0 92 | [ 126.026894] Enter:dla_handle_events, processor:BDMA 93 | [ 126.027179] Exit:dla_handle_events, ret:0 94 | [ 126.027387] Enter:dla_handle_events, processor:Convolution 95 | [ 126.027582] Exit:dla_handle_events, ret:0 96 | [ 126.027727] Enter:dla_handle_events, processor:SDP 97 | [ 126.027886] Handle op complete event, processor SDP group 1 98 | [ 126.028103] Enter:dla_op_completion processor SDP group1 99 | [ 126.028291] Completed SDP operation index 0 ROI 0 100 | [ 126.028471] 1 HWLs done, totally 1 layers 101 | [ 126.028628] Enter: dla_free_op_desc op desc index 0 ROI 0 102 | [ 126.028979] Exit: dla_free_op_desc 103 | [ 126.029121] Exit:dla_op_completion processor SDP group1 status=0 104 | [ 126.029332] Exit:dla_handle_events, ret:0 105 | [ 126.029481] Enter:dla_handle_events, processor:PDP 106 | [ 126.029661] Exit:dla_handle_events, ret:0 107 | [ 126.029817] Enter:dla_handle_events, processor:CDP 108 | [ 126.029995] Exit:dla_handle_events, ret:0 109 | [ 126.030146] Enter:dla_handle_events, processor:RUBIK 110 | [ 126.030323] Exit:dla_handle_events, ret:0 111 | [ 126.032432] reset engine done 112 | Shutdown signal received, exiting 113 | Test pass 114 | -------------------------------------------------------------------------------- /models/test_Relu_Log_Relu/test_Relu_Log_Relu.nvdla: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/models/test_Relu_Log_Relu/test_Relu_Log_Relu.nvdla -------------------------------------------------------------------------------- /models/test_Relu_Log_Relu/test_Relu_Log_Relu.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/models/test_Relu_Log_Relu/test_Relu_Log_Relu.onnx 
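Note that test_Relu_Log_Relu.output.dimg below differs from test_Log.output.dimg only in its first element (0 rather than -inf): the leading Relu leaves the non-negative ramp input unchanged, Log then produces -inf for the zero pixel, and the trailing Relu clamps that back to 0.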
-------------------------------------------------------------------------------- /models/test_Relu_Log_Relu/test_Relu_Log_Relu.output.dimg: -------------------------------------------------------------------------------- 1 | 0 0 0.693359 1.09863 1.38672 1.60938 1.79199 1.94629 2.08008 2.19727 2.30273 2.39844 2.48438 2.56445 2.63867 2.70898 2.77344 2.83398 2.89062 2.94531 2.99609 3.04492 3.0918 3.13477 3.17773 3.21875 3.25781 3.29492 3.33203 3.36719 3.40039 3.43359 3.46484 3.49609 3.52539 -------------------------------------------------------------------------------- /models/test_Shuffle/input.pgm: -------------------------------------------------------------------------------- 1 | P5 2 | 5 5 3 | 255 4 |  -------------------------------------------------------------------------------- /models/test_Shuffle/test_Shuffle.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/models/test_Shuffle/test_Shuffle.onnx -------------------------------------------------------------------------------- /models/test_Shuffle/test_Shuffle.output.dimg: -------------------------------------------------------------------------------- 1 | 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 36 36 36 36 36 36 36 36 36 36 36 36 36 36 36 36 36 36 36 36 36 36 36 36 36 48 48 48 48 48 48 48 48 48 48 48 48 48 48 48 48 48 48 48 48 48 48 48 48 48 -------------------------------------------------------------------------------- /models/test_group_Conv/test_group_Conv.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ONNC/onnc-tutorial/af27b015f65339aa07c40d27ffb32fedee7ea692/models/test_group_Conv/test_group_Conv.onnx -------------------------------------------------------------------------------- /models/test_group_Conv/test_group_Conv.py: -------------------------------------------------------------------------------- 1 | import onnx 2 | from onnx import helper 3 | from onnx import AttributeProto, TensorProto, GraphProto 4 | import numpy as np 5 | 6 | def getOnesTensor(shape, name): 7 | values = np.ones(shape).flatten().astype(float) 8 | return helper.make_tensor(name=name, data_type=TensorProto.FLOAT, dims=shape, vals=values) 9 | 10 | # create input 11 | x = helper.make_tensor_value_info('x', TensorProto.FLOAT, [1, 8, 5, 5]) 12 | W = helper.make_tensor_value_info('W', TensorProto.FLOAT, [6, 4, 2, 2]) 13 | 14 | # create output 15 | y = helper.make_tensor_value_info('y', TensorProto.FLOAT, [1, 6, 4, 4]) 16 | 17 | # Convolution without padding 18 | node_def = helper.make_node( 19 | 'Conv', 20 | inputs=['x', 'W'], 21 | outputs=['y'], 22 | groups=2, 23 | kernel_shape=[2, 2], 24 | strides=[1, 1], 25 | pads=[0, 0, 0, 0], 26 | # Default values for other attributes: dilations=[1, 1] 27 | ) 28 | 29 | # create the graph 30 | graph_def = helper.make_graph( 31 | [node_def], 32 | 'test_group_Conv', 33 | [x, W], 34 | [y], 35 | [getOnesTensor([6, 4, 2, 2], 'W')] 36 | ) 37 | 38 | # create the model 39 | model_def = helper.make_model( 40 | graph_def, 41 | producer_name = 'onnc-tutorial' 42 | ) 43 | 44 | onnx.save(model_def, 'test_group_Conv.onnx') 45 | --------------------------------------------------------------------------------
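The generator above relies on the grouped-convolution shape rule: with groups=2, the 8 input channels split into two groups of 4, the [6, 4, 2, 2] weight splits into two groups of 3 filters, and a 2x2 kernel with stride 1 and no padding maps a 5x5 image to 4x4, giving y = [1, 6, 4, 4]. A minimal host-side check of that arithmetic (an illustration, not a file from this repository):

#include <cassert>

int main() {
  const int groups = 2;
  const int inC = 8, inH = 5, inW = 5;  // x = [1, 8, 5, 5]
  const int outC = 6, kH = 2, kW = 2;   // W = [6, 4, 2, 2]

  // Each filter sees only inC / groups channels of the input.
  assert(inC % groups == 0 && outC % groups == 0);
  assert(inC / groups == 4);            // matches W's second dimension

  // Stride 1, no padding, no dilation.
  const int outH = inH - kH + 1;
  const int outW = inW - kW + 1;
  assert(outH == 4 && outW == 4);       // matches y = [1, 6, 4, 4]
  return 0;
}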