├── .gitignore
├── .readthedocs.yaml
├── LICENSE
├── Makefile
├── README.md
├── requirements.txt
└── source
    ├── appendix
        ├── build_llm.rst
        ├── op_support_list_ax620e.rst
        ├── op_support_list_ax650.rst
        └── precision_debug_guides.rst
    ├── conf.py
    ├── doc_update_info
        └── update_info.md
    ├── index.rst
    ├── media
        ├── axmodel-netron.png
        ├── deploy-pipeline.png
        ├── multi_shape_compiled_axmodel.png
        ├── multy_inputs.png
        ├── nodename_vs_tensorname.png
        ├── precision_analysis.png
        ├── precision_analysis_step1.png
        ├── precision_analysis_step2.png
        ├── pulsar2-build-pipeline.png
        ├── pulsar2-run-pipeline.png
        ├── ssd_dog.jpg
        ├── tensor_name.png
        ├── vNPU-ax620e.png
        ├── vNPU-ax650.png
        └── verify-preprocess-postprocess.png
    ├── other_tools
        ├── ax_run_model.rst
        └── caffe_to_onnx.rst
    ├── pulsar2
        └── introduction.rst
    ├── user_guides_advanced
        ├── advanced_build_guides.rst
        ├── advanced_deploy_guides.rst
        └── advanced_run_guides.rst
    ├── user_guides_config
        └── config.rst
    └── user_guides_quick
        ├── quick_start_ax620e.rst
        ├── quick_start_ax650.rst
        └── quick_start_prepare.rst


/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | *.pyc
3 | __pycache__
4 | public
5 | build/
6 | .vscode/
7 | pre_process.log
8 | 


--------------------------------------------------------------------------------
/.readthedocs.yaml:
--------------------------------------------------------------------------------
 1 | # Read the Docs configuration file for Sphinx projects
 2 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
 3 | 
 4 | # Required
 5 | version: 2
 6 | 
 7 | # Set the OS, Python version and other tools you might need
 8 | build:
 9 |   os: ubuntu-22.04
10 |   tools:
11 |     python: "3.12"
12 |     # You can also specify other tool versions:
13 |     # nodejs: "20"
14 |     # rust: "1.70"
15 |     # golang: "1.20"
16 | 
17 | 
18 | python:
19 |   install:
20 |     - requirements: requirements.txt
21 | 
22 | # Build documentation in the "source/" directory with Sphinx
23 | sphinx:
24 |   configuration: source/conf.py
25 |   # You can configure Sphinx to use a different builder, for instance use the dirhtml builder for simpler URLs
26 |   # builder: "dirhtml"
27 |   # Fail on all warnings to avoid broken references
28 |   # fail_on_warning: true
29 | 
30 | # Optionally build your docs in additional formats such as PDF and ePub
31 | # formats:
32 | #   - pdf
33 | #   - epub
34 | 
35 | # Optional but recommended, declare the Python requirements required
36 | # to build your documentation
37 | # See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
38 | # python:
39 | #   install:
40 | #     - requirements: docs/requirements.txt
41 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | BSD 3-Clause License
 2 | 
 3 | Copyright (c) 2024, AXera
 4 | All rights reserved.
 5 | 
 6 | Redistribution and use in source and binary forms, with or without
 7 | modification, are permitted provided that the following conditions are met:
 8 | 
 9 | 1. Redistributions of source code must retain the above copyright notice, this
10 |    list of conditions and the following disclaimer.
11 | 
12 | 2. Redistributions in binary form must reproduce the above copyright notice,
13 |    this list of conditions and the following disclaimer in the documentation
14 |    and/or other materials provided with the distribution.
15 | 
16 | 3. Neither the name of the copyright holder nor the names of its
17 |    contributors may be used to endorse or promote products derived from
18 |    this software without specific prior written permission.
19 | 
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | # Minimal makefile for Sphinx documentation
 2 | #
 3 | 
 4 | # You can set these variables from the command line, and also
 5 | # from the environment for the first two.
 6 | SPHINXOPTS    ?=
 7 | SPHINXBUILD   ?= sphinx-build
 8 | SOURCEDIR     = source
 9 | BUILDDIR      = build
10 | 
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 | 
15 | .PHONY: help Makefile
16 | 
17 | # Explicit targets: "clean" and "html" are routed to Sphinx using the
18 | # "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | clean: Makefile
20 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 | 
22 | html: 
23 | 	@echo "sphinx build..."
24 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
25 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Pulsar2 User Manual
 2 | 
 3 | [Web 预览](https://pulsar2-docs.readthedocs.io/zh_CN/latest/)
 4 | 
 5 | ## 1. 项目背景
 6 | 
 7 | 新一代 AI 工具链 *Pulsar2* 使用手册公共维护项目
 8 | 
 9 | - 提供统一的 AI 工具链文档内部展示地址
10 | - 降低 AI 工具链 Developer 维护成本
11 | - 降低 AI 工具链 User 学习成本
12 | 
13 | ## 2. 本地运行指南
14 | 
15 | ### 2.1 git clone
16 | 
17 | ```bash
18 | git clone https://github.com/AXERA-TECH/pulsar2-docs.git
19 | ```
20 | 
21 | 目录树如下:
22 | 
23 | ```bash
24 | .
25 | ├── LICENSE
26 | ├── Makefile
27 | ├── README.md
28 | ├── build
29 | │   ├── doctrees
30 | │   └── html
31 | ├── requirements.txt
32 | └── source                      # 文档主体
33 |     ├── appendix
34 |     ├── conf.py
35 |     ├── doc_update_info
36 |     ├── examples                # 以 .zip 格式保存了一些例子, 由于git pages的限制, 在线文档不支持点击下载操作
37 |     ├── faq
38 |     ├── index.rst
39 |     ├── media
40 |     ├── pulsar2
41 |     ├── user_guides_advanced
42 |     ├── user_guides_config
43 |     ├── user_guides_quick
44 |     └── user_guides_runtime
45 | ```
46 | 
47 | ### 2.2 编译
48 | 
49 | 安装依赖
50 | 
51 | ```bash
52 | pip install -r requirements.txt
53 | ```
54 | 
55 | 在项目根目录下执行以下命令
56 | 
57 | ```bash
58 | $ make clean
59 | $ make html
60 | ```
61 | 
62 | ### 2.3 预览
63 | 
64 | 完成编译后，使用浏览器查看 `build/html/index.html` . 如果在服务器上开发, 可以通过 `ssh` 端口转发的方式访问编译后的文档, 方法如下:
65 | 
66 | 首先可以利用 `python` 在编译后的 `build/html/` 文件夹下启动一个 `http` 服务,
67 | 
68 | ```bash
69 | $ cd build/html/
70 | $ python -m SimpleHTTPServer 8005  # For python2, 端口可以自定义
71 | # or
72 | $ python3 -m http.server 8005      # For python3, 端口可以自定义
73 | ```
74 | 
75 | 然后通过 `ssh` 连接服务器,
76 | 
77 | ```bash
78 | ssh -L 8005:localhost:8005 username@server
79 | ```
80 | 
81 | 然后本地浏览器访问: `localhost:8005/index.html`
82 | 
83 | ## 3. 参考
84 | 
85 | - 本项目基于 Sphinx 搭建，关于更多 Sphinx 的信息请见 https://www.sphinx-doc.org/en/master/
86 | 
87 | ## 4. 发版
88 | 
89 | 
90 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | sphinx
2 | recommonmark
3 | sphinx_markdown_tables
4 | sphinx_rtd_theme
5 | sphinx_copybutton
6 | myst-parser
7 | sphinxcontrib.mermaid
8 | 


--------------------------------------------------------------------------------
/source/appendix/build_llm.rst:
--------------------------------------------------------------------------------
  1 | ======================
  2 | 大模型编译(实验阶段)
  3 | ======================
  4 | 
  5 | **本章节适用于平台**
  6 | 
  7 | - AX650N
  8 | - AX630C
  9 | 
 10 | **已验证模型**
 11 | 
 12 | - DeepSeek-R1-Distill
 13 | - Qwen2.5
 14 | - MiniCPM、MiniCPM-V 2.0
 15 | - InternVL2
 16 | - ChatGLM3
 17 | - OpenBuddy
 18 | - SmolLM
 19 | - Llama3.2
 20 | - Gemma2
 21 | - Phi2、Phi3
 22 | - TinyLlama
 23 | 
  24 | 本章节介绍转换 Huggingface 模型的基本操作: 使用 ``pulsar2`` 工具将从 Huggingface 下载的项目中的 ``*.safetensors`` 或 ``pytorch_model.bin`` 模型编译成 ``axmodel`` 模型. 请先参考 :ref:`《开发环境准备》 <dev_env_prepare>` 章节完成开发环境搭建.
 25 | 本节示例模型为 ``Qwen2-0.5B-Instruct``.
 26 | 
 27 | **版本约束**
 28 | 
 29 | 本文档基于 Pulsar2 3.2 版本进行编写。
 30 | 
 31 | **LLM ModelZoo**
 32 | 
 33 | - `AX650N <https://pan.baidu.com/s/1_LG-sPKnLS_LTWF3Cmcr7A?pwd=ph0e>`_
 34 | - `AX630C <https://pan.baidu.com/s/1X0aJTQM0bl8wsraspHnDUw?pwd=ifg5>`_
 35 | 
 36 | **关联项目 AX-LLM**
 37 | 
 38 | 该项目用于探索业界常用 LLM(Large Language Model) 在已有芯片平台上落地的可行性和相关能力边界，方便社区开发者进行快速评估和二次开发自己的 LLM 应用。
 39 | 
 40 | - `AX-LLM <https://github.com/AXERA-TECH/ax-llm>`_
 41 | 
 42 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 43 | 命令说明
 44 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 45 | 
 46 | ``Pulsar2`` 工具链中使用 ``pulsar2 llm_build`` 命令来完成 LLM 模型的转换. 
 47 | 
 48 | .. code-block:: shell
 49 | 
 50 |     root@xxx:/data# pulsar2 llm_build --help
 51 |     usage: pulsar2 llm_build [-h] [--input_path INPUT_PATH] [--output_path OUTPUT_PATH] [--prefill_len PREFILL_LEN]
 52 |                             [--parallel PARALLEL] [--model_config MODEL_CONFIG] [--kv_cache_len KV_CACHE_LEN]
 53 |                             [--post_topk POST_TOPK] [--post_weight_type {bf16,s8}] [-t {fp16,bf16,fp32}]
 54 |                             [-w {fp16,bf16,fp32,s8,s4}] [-c CHECK_LEVEL] [--chip {AX620E,AX650}] [--prompt PROMPT]
 55 | 
 56 |     optional arguments:
 57 |     -h, --help            show this help message and exit
 58 |     --input_path INPUT_PATH
 59 |                             path of model or npy path
 60 |     --output_path OUTPUT_PATH
 61 |                             path of dumpped ax_model
 62 |     --prefill_len PREFILL_LEN
 63 |                             token length of prefill
 64 |     --parallel PARALLEL   build parallel
 65 |     --model_config MODEL_CONFIG
 66 |                             config file
 67 |     --kv_cache_len KV_CACHE_LEN
 68 |                             length of kv_cache
 69 |     --post_topk POST_TOPK
 70 |                             post model output indices and prob
 71 |     --post_weight_type {bf16,s8}
 72 |                             post weight type
 73 |     -t {fp16,bf16,fp32}, --hidden_state_type {fp16,bf16,fp32}
 74 |                             hidden_state dtype
 75 |     -w {fp16,bf16,fp32,s8,s4}, --weight_type {fp16,bf16,fp32,s8,s4}
 76 |                             weight dtype
 77 |     -c CHECK_LEVEL, --check_level CHECK_LEVEL
 78 |                             check level 0:run 1:layer_check 2: cal 1+1
 79 |     --chip {AX620E,AX650}
 80 |                             chip
 81 |     --prompt PROMPT       prompt for check_level==2
 82 | 
 83 | 
 84 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 85 | 下载 ax-llm-build 项目
 86 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 87 | 
 88 | .. code-block:: shell
 89 | 
 90 |     git clone https://github.com/AXERA-TECH/ax-llm-build.git
 91 | 
 92 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 93 | 下载 Qwen2-0.5B-Instruct
 94 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 95 | 
 96 | .. code-block:: shell
 97 | 
 98 |     cd ax-llm-build
 99 |     pip install -U huggingface_hub
100 |     huggingface-cli download --resume-download Qwen/Qwen2-0.5B-Instruct --local-dir Qwen/Qwen2-0.5B-Instruct
101 | 
102 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
103 | 编译执行
104 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
105 | 
106 | .. code-block:: shell
107 | 
108 |     pulsar2 llm_build --input_path Qwen/Qwen2-0.5B-Instruct/ --output_path Qwen/Qwen2-0.5B-w8a16/ --kv_cache_len 1023 --hidden_state_type bf16 --prefill_len 128 --chip AX650
109 | 
110 | ^^^^^^^^^^^^^^^^^^^^^
111 | log 参考信息
112 | ^^^^^^^^^^^^^^^^^^^^^
113 | 
114 | .. code-block::
115 | 
116 |     pulsar2 llm_build --input_path Qwen/Qwen2-0.5B-Instruct/ --output_path Qwen/Qwen2-0.5B-w8a16/ --kv_cache_len 1023 --model_config config/qwen2-0.5B.json --hidden_state_type bf16 --weight_type s8 --parallel 8
117 |     Config(
118 |         model_name='Qwen2-0.5B-Instruct',
119 |         model_type='qwen2',
120 |         num_hidden_layers=24,
121 |         num_attention_heads=14,
122 |         num_key_value_heads=2,
123 |         hidden_size=896,
124 |         intermediate_size=4864,
125 |         vocab_size=151936,
126 |         rope_theta=1000000.0,
127 |         max_position_embeddings=32768,
128 |         rope_partial_factor=1.0,
129 |         rms_norm_eps=1e-06,
130 |         norm_type='rms_norm',
131 |         hidden_act='silu',
132 |         hidden_act_param=0.03,
133 |         scale_depth=1.4,
134 |         scale_emb=1
135 |     )
136 |     2024-08-22 16:16:04.364 | SUCCESS  | yamain.command.llm_build:llm_build:100 - prepare llm model done!
137 |     building llm decode layers   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 24/24 0:05:03
138 |     building llm post layer   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1/1 0:01:25
139 |     2024-08-22 16:22:33.485 | SUCCESS  | yamain.command.llm_build:llm_build:160 - build llm model done!
140 |     2024-08-22 16:22:47.861 | SUCCESS  | yamain.command.llm_build:llm_build:337 - check llm model done!
141 | 
142 | .. note::
143 | 
144 |     该示例所运行的主机配置为:
145 | 
146 |         - Intel(R) Xeon(R) Gold 6336Y CPU @ 2.40GHz
147 |         - Memory 32G
148 | 
149 |     全流程耗时大约 ``6min`` , 不同配置的主机转换时间略有差异.
150 | 
151 | 
152 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
153 | embed 提取和优化
154 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
155 | 
156 | .. code-block:: shell  
157 | 
158 |     chmod +x ./tools/fp32_to_bf16
159 |     chmod +x ./tools/embed_process.sh
160 |     ./tools/embed_process.sh Qwen/Qwen2-0.5B-Instruct/ Qwen/Qwen2-0.5B-w8a16/
161 | 
162 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
163 | 输出文件说明
164 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
165 | 
166 | .. code-block:: shell  
167 | 
168 |     root@xxx:/data/ax-llm-build# tree Qwen/Qwen2-0.5B-w8a16
169 |     Qwen/Qwen2-0.5B-w8a16
170 |     ├── model.embed_tokens.weight.bfloat16.bin
171 |     ├── model.embed_tokens.weight.float32.bin # 临时文件，可删掉
172 |     ├── model.embed_tokens.weight.npy # 临时文件，可删掉 
173 |     ├── qwen2_p128_l0_together.axmodel
174 |     ├── qwen2_p128_l10_together.axmodel
175 |     ├── qwen2_p128_l11_together.axmodel
176 |     ├── qwen2_p128_l12_together.axmodel
177 |     ├── qwen2_p128_l13_together.axmodel
178 |     ├── qwen2_p128_l14_together.axmodel
179 |     ├── qwen2_p128_l15_together.axmodel
180 |     ├── qwen2_p128_l16_together.axmodel
181 |     ├── qwen2_p128_l17_together.axmodel
182 |     ├── qwen2_p128_l18_together.axmodel
183 |     ├── qwen2_p128_l19_together.axmodel
184 |     ├── qwen2_p128_l1_together.axmodel
185 |     ├── qwen2_p128_l20_together.axmodel
186 |     ├── qwen2_p128_l21_together.axmodel
187 |     ├── qwen2_p128_l22_together.axmodel
188 |     ├── qwen2_p128_l23_together.axmodel
189 |     ├── qwen2_p128_l2_together.axmodel
190 |     ├── qwen2_p128_l3_together.axmodel
191 |     ├── qwen2_p128_l4_together.axmodel
192 |     ├── qwen2_p128_l5_together.axmodel
193 |     ├── qwen2_p128_l6_together.axmodel
194 |     ├── qwen2_p128_l7_together.axmodel
195 |     ├── qwen2_p128_l8_together.axmodel
196 |     ├── qwen2_p128_l9_together.axmodel
197 |     └── qwen2_post.axmodel
198 | 
199 | 
 200 | 其中 ``model.embed_tokens.weight.bfloat16.bin``, ``qwen2_p128_l0_together.axmodel ~ qwen2_p128_l23_together.axmodel``, ``qwen2_post.axmodel`` 是上板运行所需要的文件.
201 | 
202 | ~~~~~~~~~~~~~~~~~~~~~~~
203 | 开发板运行
204 | ~~~~~~~~~~~~~~~~~~~~~~~
205 | 
206 | 本章节介绍如何在 ``AX650`` 开发板上运行 LLM 模型. 
207 | 
208 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
209 | 使用 ax-llm 运行大模型
210 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
211 | 
 212 | 运行该示例的相关文件已上传网盘，请自行下载和参考
213 |   
214 |   - `百度网盘(AX650N) <https://pan.baidu.com/s/1_LG-sPKnLS_LTWF3Cmcr7A?pwd=ph0e>`_
215 |   - `百度网盘(AX630C) <https://pan.baidu.com/s/1X0aJTQM0bl8wsraspHnDUw?pwd=ifg5>`_
216 | 
217 | .. code-block:: shell
218 | 
219 |     root@ax650:/mnt/qtang/llama_axera_cpp# ./run_qwen2_0.5B.sh
220 |     [I][                            Init][ 128]: LLM init start
221 |     3% | ██                                |   1 /  27 [0.27s<7.29s, 3.70 count/s] tokenizer init ok
222 |     [I][                            Init][  26]: LLaMaEmbedSelector use mmap
223 |     100% | ████████████████████████████████ |  27 /  27 [6.88s<6.88s, 3.92 count/s] init post axmodel ok,remain_cmm(11317 MB)
224 |     [I][                            Init][ 244]: max_token_len : 1023
225 |     [I][                            Init][ 249]: kv_cache_size : 128, kv_cache_num: 1023
226 |     [I][                            Init][ 257]: prefill_token_num : 128
227 |     [I][                            Init][ 266]: LLM init ok
228 |     Type "q" to exit, Ctrl+c to stop current running
229 |     >> who are you?
230 |     [I][                             Run][ 464]: ttft: 129.16 ms
231 |     I am a large language model created by Alibaba Cloud. I am called Qwen.
232 |     
233 |     [N][                             Run][ 603]: hit eos,avg 27.22 token/s
234 | 
235 | 板端运行程序编译流程，请参考我们在 github 上的开源项目 `AX-LLM <https://github.com/AXERA-TECH/ax-llm>`_
236 | 
237 | 
238 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
239 | Tokenizer 解析器说明
240 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
241 | 
242 | ax-llm 项目中的 Tokenizer 解析器采用本地模块与 HTTP Server 两种方案，其中本地方案又尝试了 sentencepiece、tiktoken 两种方案。
243 | 但是我们在实际调试过程中发现 sentencepiece 对于不同 LLM 模型的 special tokens 支持不友好，需要用户自行处理 special tokens 的拆分，容易导致板端 token id 与 transformers 库中的 AutoTokenizer 获得的 token id 存在差异，最终影响 LLM 的输出结果正确性。
244 | 因此我们建议前期调试的时候使用 Tokenizer HTTP Server 的方式直接调用 transformers 库中的 AutoTokenizer 模块进行测试。 
245 | 
246 | Tokenizer HTTP Server 的特点：
247 | 
248 | * 保证 token id 正确
249 | * 方便添加 chat template
250 | * 支持本地、远端部署
251 | * 支持多用户接入
252 | 
 253 | 以网盘中已提供的基于 Qwen2.5 3B 的相关文件为例
254 | 
255 | .. code-block:: shell
256 | 
257 |     root@xxx:/data/ax-llm-build# tree qwen2.5-3b-prefill-ax650/
258 |     qwen2.5-3b-prefill-ax650/
259 |     ├── main_prefill
260 |     ├── qwen2.5-3B-prefill-ax650
261 |     │   ├── model.embed_tokens.weight.bfloat16.bin
262 |     │   ├── qwen2_p128_l0_together.axmodel
263 |         ...
264 |     │   ├── qwen2_p128_l12_together.axmodel
265 |     │   └── qwen2_post.axmodel
266 |     ├── qwen2.5_tokenizer
267 |     │   ├── merges.txt
268 |     │   ├── tokenizer_config.json
269 |     │   ├── tokenizer.json
270 |     │   └── vocab.json
271 |     ├── qwen2.5_tokenizer.py
272 |     ├── qwen.tiktoken
273 |     ├── readme.txt
274 |     └── run_qwen2.5_3B_prefill_ax650.sh
275 | 
276 | * qwen2.5_tokenizer：是 tokenizer 相关文件，从 Qwen/Qwen2.5-3B-Instruct/ 中提取
277 | * qwen2.5_tokenizer.py：是用 python 实现的 Tokenizer HTTP Server
278 | 
279 | 运行说明如下：
280 | 
 281 | * python qwen2.5_tokenizer.py --host xxx.xxx.xxx.xxx --port 12345，其中 --host xxx.xxx.xxx.xxx 设置 Tokenizer 解析服务器的 IP 地址，确保 AX650N 能正常访问该地址。可以在具备 python 环境的 AX650N 本地运行
 282 | * 修改 run_qwen2.5_3B_prefill_ax650.sh 中 --filename_tokenizer_model 的 IP 信息，使其与启动 Tokenizer HTTP Server 时 --host 设置的地址一致
283 | * 运行 run_qwen2.5_3B_prefill_ax650.sh 即可
284 | 
285 | .. code-block:: shell
286 | 
287 |     root@xxx:/data/ax-llm-build# cat qwen2.5-3b-prefill-ax650/run_qwen2.5_3B_prefill_ax650.sh
288 |     ./main_prefill \
289 |     --template_filename_axmodel "qwen2.5-3B-prefill-ax650/qwen2_p128_l%d_together.axmodel" \
290 |     --axmodel_num 36 \
291 |     --tokenizer_type 2 \
292 |     --filename_tokenizer_model http://xxx.xxx.xxx.xxx:12345 \
293 |     --bos 0 --eos 0 \
294 |     --filename_post_axmodel "qwen2.5-3B-prefill-ax650/qwen2_post.axmodel" \
295 |     --filename_tokens_embed "qwen2.5-3B-prefill-ax650/model.embed_tokens.weight.bfloat16.bin" \
296 |     --tokens_embed_num 151936 \
297 |     --tokens_embed_size 2048 \
298 |     --use_mmap_load_embed 1 \
299 |     --live_print 1 \
300 |     --continue 1 \
301 |     --prompt "$1"
302 | 
303 | ~~~~~~~~~~~~~~~~~~~~~~~
304 | 其他示例
305 | ~~~~~~~~~~~~~~~~~~~~~~~
306 | 
307 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
308 | MiniCPM-V 2.0
309 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
310 | 
311 | **下载 MiniCPM-V 2.0**
312 | 
313 | 
314 | .. code-block:: shell
315 | 
316 |     cd ax-llm-build
317 |     pip install -U huggingface_hub
318 |     huggingface-cli download --resume-download openbmb/MiniCPM-V-2 --local-dir openbmb/MiniCPM-V-2
319 | 
320 | 
321 | **获取 axmodel**
322 | 
323 | .. code-block:: shell
324 | 
325 |     pulsar2 llm_build --input_path openbmb/MiniCPM-V-2/ --output_path openbmb/MiniCPM-V-2-ax650 --kv_cache_len 1023 --hidden_state_type bf16 --prefill_len 128 --chip AX650
326 | 
327 | log 参考信息
328 | 
329 | .. code-block::
330 | 
331 |     pulsar2 llm_build --input_path openbmb/MiniCPM-V-2/ --output_path openbmb/MiniCPM-V-2-ax650 --kv_cache_len 1023 --hidden_state_type bf16 --prefill_len 128 --chip AX650 --parallel 8
332 |     Config(
333 |         model_name='openbmb/MiniCPM-V-2',
334 |         model_type='minicpmv',
335 |         num_hidden_layers=40,
336 |         num_attention_heads=36,
337 |         num_key_value_heads=36,
338 |         hidden_size=2304,
339 |         intermediate_size=5760,
340 |         vocab_size=122753,
341 |         rope_theta=10000.0,
342 |         max_position_embeddings=4096,
343 |         rope_partial_factor=1.0,
344 |         rms_norm_eps=1e-05,
345 |         norm_type='rms_norm',
346 |         hidden_act='silu',
347 |         hidden_act_param=0.03,
348 |         scale_depth=1.4,
349 |         scale_emb=12,
350 |         dim_model_base=256
351 |     )
352 |     2024-10-07 15:18:38.605 | SUCCESS  | yamain.command.llm_build:llm_build:101 - prepare llm model done!
353 |     tiling op...   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3287/3287 0:00:44
354 |     build op serially...   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 7610/7610 0:04:09
355 |     build op...   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 11485/11485 0:00:00
356 |     add ddr swap...   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 253160/253160 0:00:42
357 |     calc input dependencies...   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 289230/289230 0:00:31
358 |     calc output dependencies...   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 289230/289230 0:00:42
359 |     assign eu heuristic   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 289230/289230 0:00:51
360 |     assign eu onepass   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 289230/289230 0:00:10
361 |     assign eu greedy   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 289230/289230 0:00:12
362 |     building vision model   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1/1 0:14:51
363 |     building llm decode layers   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 40/40 0:04:24
364 |     building llm post layer   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1/1 0:02:19
365 |     2024-10-07 15:40:14.676 | SUCCESS  | yamain.command.llm_build:llm_build:170 - build llm model done!
366 |     2024-10-07 15:40:48.246 | SUCCESS  | yamain.command.llm_build:llm_build:349 - check llm model done!
367 | 
368 | 
369 | **获取 embed 文件**
370 | 
371 | .. code-block:: shell
372 | 
373 |     chmod +x ./tools/fp32_to_bf16
 374 |     chmod +x ./tools/embed_process.sh ./tools/embed_process_vl.sh
 375 |     ./tools/embed_process_vl.sh openbmb/MiniCPM-V-2/ openbmb/MiniCPM-V-2-ax650/
376 | 
377 | 最终生成文件如下
378 | 
379 | .. code-block:: shell
380 | 
381 |     root@xxx: tree openbmb/MiniCPM-V-2-ax650/
382 |     openbmb/MiniCPM-V-2-ax650/
383 |     ├── minicpmv_p128_l0_together.axmodel
384 |     ├── minicpmv_p128_l10_together.axmodel
385 |     ...
386 |     ├── minicpmv_p128_l19_together.axmodel
387 |     ├── minicpmv_p128_l1_together.axmodel
388 |     ├── minicpmv_p128_l20_together.axmodel
389 |     ...
390 |     ├── minicpmv_p128_l29_together.axmodel
391 |     ├── minicpmv_p128_l2_together.axmodel
392 |     ├── minicpmv_p128_l30_together.axmodel
393 |     ...
394 |     ├── minicpmv_p128_l39_together.axmodel
395 |     ├── minicpmv_p128_l3_together.axmodel
396 |     ...
397 |     ├── minicpmv_p128_l8_together.axmodel
398 |     ├── minicpmv_p128_l9_together.axmodel
399 |     ├── minicpmv_post.axmodel
400 |     ├── model.embed_tokens.weight.bfloat16.bin
401 |     └── vpm_resampler.axmodel
402 | 
403 | 
404 | **上板运行**
405 | 
 406 | MiniCPM-V 的上板部署项目需要使用 ax-llm 的 ``minicpm-v`` 分支
407 | 
408 | - `ax-llm/tree/minicpm-v <https://github.com/AXERA-TECH/ax-llm/tree/minicpm-v>`_
409 | 
410 | .. figure:: ../media/ssd_dog.jpg
 411 |     :alt: ssd_dog
412 |     :align: center
413 | 
414 | .. code-block:: shell
415 | 
416 |     root@ax650:/llm-test/minicpm-v-2.0# ./run_minicpmv-2.sh
417 |     [I][                            Init][ 125]: LLM init start
418 |     2% | █                                 |   1 /  44 [0.21s<9.11s, 4.83 count/s] tokenizer init ok
419 |     [I][                            Init][  26]: LLaMaEmbedSelector use mmap
420 |     100% | ████████████████████████████████ |  44 /  44 [33.54s<33.54s, 1.31 count/s] init vpm axmodel ok,remain_cmm(8086 MB)
421 |     [I][                            Init][ 284]: max_token_len : 1023
422 |     [I][                            Init][ 289]: kv_cache_size : 2304, kv_cache_num: 1023
423 |     [I][                            Init][ 297]: prefill_token_num : 128
424 |     [I][                            Init][ 306]: LLM init ok
425 |     Type "q" to exit, Ctrl+c to stop current running
426 |     prompt >> 描述下图片
427 |     image >> ssd_dog.jpg
428 |     [I][                          Encode][ 365]: image encode time : 728.507019 ms
429 |     [I][                             Run][ 589]: ttft: 520.94 ms
430 |     这幅图片展示了一只大而毛茸茸的狗，可能是拉布拉多或类似品种，坐在黄色和红色相间的门廊上。这只狗看起来在休息，它的目光朝向相机，表情平静。在狗的后面，有一辆红色自行车，车架上有黑色的装饰，停放在门廊上。自行车上挂着几个行李袋，表明它可能用于旅行或运输。背景中，可以看到一辆白色车辆，可能是汽车，停在门廊的后面。整个场景暗示了一个家庭环境，可能是在住宅区。
431 | 
432 |     [N][                             Run][ 728]: hit eos,avg 5.55 token/s
433 | 
434 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
435 | 调试说明
436 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
437 | 
 438 | ``pulsar2 llm_build`` 通过在编译命令中使用 ``--check_level`` 启动精度调试功能
439 | 
440 | * ``--check_level 1``：测试第一层的相似度
441 | * ``--check_level 2``：指定 prompt 输入的内容，用于仿真运行编译生成的模型文件。
442 | 
443 | ^^^^^^^^^^^^^^^^^^^^^
444 | --check_level 1
445 | ^^^^^^^^^^^^^^^^^^^^^
446 | 
447 | .. code-block:: shell
448 | 
449 |     pulsar2 llm_build --check_level 1 --input_path Qwen/Qwen2-0.5B-Instruct/ --output_path Qwen/Qwen2-0.5B-w8a16/ --kv_cache_len 1023 --hidden_state_type bf16 --prefill_len 128 --chip AX650 
450 | 
451 | LOG：
452 | 
453 | .. code-block:: shell
454 | 
455 |     pulsar2 llm_build --check_level 1 --input_path Qwen/Qwen2-0.5B-Instruct/ --output_path Qwen/Qwen2-0.5B-w8a16/ --kv_cache_len 1023 --hidden_state_type bf16 --prefill_len 128 --chip AX650 --parallel 8
456 |     Config(
457 |         model_name='Qwen2-0.5B-Instruct',
458 |         model_type='qwen2',
459 |         num_hidden_layers=24,
460 |         num_attention_heads=14,
461 |         num_key_value_heads=2,
462 |         hidden_size=896,
463 |         intermediate_size=4864,
464 |         vocab_size=151936,
465 |         rope_theta=1000000.0,
466 |         max_position_embeddings=32768,
467 |         rope_partial_factor=1.0,
468 |         rms_norm_eps=1e-06,
469 |         norm_type='rms_norm',
470 |         hidden_act='silu',
471 |         hidden_act_param=0.03,
472 |         scale_depth=1.4,
473 |         scale_emb=1,
474 |         dim_model_base=256
475 |     )
476 |     2024-10-07 01:23:28.414 | SUCCESS  | yamain.command.llm_build:llm_build:101 - prepare llm model done!
477 |     building llm decode layers   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 24/24 0:00:39
478 |     building llm post layer   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1/1 0:01:26
479 |     2024-10-07 01:25:34.765 | SUCCESS  | yamain.command.llm_build:llm_build:170 - build llm model done!
480 |     2024-10-07 01:25:38.740 | INFO     | yamain.command.llm_build:llm_build:294 - decode layer0_gt layer0_got cos_sim is: 0.9986067835921196
481 |     2024-10-07 01:25:45.421 | INFO     | yamain.command.llm_build:llm_build:325 - prefill layer0_gt layer0_got cos_sim is: 0.9986067835921196
482 |     2024-10-07 01:25:45.421 | SUCCESS  | yamain.command.llm_build:llm_build:349 - check llm model done!
483 | 
484 | ^^^^^^^^^^^^^^^^^^^^^
485 | --check_level 2
486 | ^^^^^^^^^^^^^^^^^^^^^
487 | 
488 | .. code-block:: shell
489 | 
490 |     pulsar2 llm_build --check_level 2 --prompt "<|im_start|>user\n1+1=?<|im_end|>\n<|im_start|>assistant\n" --input_path Qwen/Qwen2-0.5B-Instruct/ --output_path Qwen/Qwen2-0.5B-w8a16/ --kv_cache_len 1023 --hidden_state_type bf16 --prefill_len 128 --chip AX650 
491 | 
492 | 由于会打印每一层（hidden_layer）的调试信息，信息量有点大，这里就只显示比较关键的一些内容。
493 | 
494 | .. code-block:: shell
495 | 
496 |     pulsar2 llm_build --check_level 2 --prompt "<|im_start|>user\n1+1=?<|im_end|>\n<|im_start|>assistant\n" --input_path Qwen/Qwen2-0.5B-Instruct/ --output_path Qwen/Qwen2-0.5B-w8a16/ --kv_cache_len 1023 --hidden_state_type bf16 --prefill_len 128 --chip AX650
497 |     Config(
498 |         model_name='Qwen2-0.5B-Instruct',
499 |         model_type='qwen2',
500 |         num_hidden_layers=24,
501 |         num_attention_heads=14,
502 |         num_key_value_heads=2,
503 |         hidden_size=896,
504 |         intermediate_size=4864,
505 |         vocab_size=151936,
506 |         rope_theta=1000000.0,
507 |         max_position_embeddings=32768,
508 |         rope_partial_factor=1.0,
509 |         rms_norm_eps=1e-06,
510 |         norm_type='rms_norm',
511 |         hidden_act='silu',
512 |         hidden_act_param=0.03,
513 |         scale_depth=1.4,
514 |         scale_emb=1,
515 |         dim_model_base=256
516 |     )
517 |     2024-10-07 01:04:57.881 | SUCCESS  | yamain.command.llm_build:llm_build:101 - prepare llm model done!
518 |     building llm decode layers   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 24/24 0:00:39
519 |     building llm post layer   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1/1 0:01:26
520 |     2024-10-07 01:07:04.398 | SUCCESS  | yamain.command.llm_build:llm_build:170 - build llm model done!
521 |     Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
522 |     load Qwen/Qwen2-0.5B-w8a16/qwen2_p128_l0_together
523 |     load Qwen/Qwen2-0.5B-w8a16/qwen2_p128_l1_together
524 |     ...
525 |     load Qwen/Qwen2-0.5B-w8a16/qwen2_p128_l22_together
526 |     load Qwen/Qwen2-0.5B-w8a16/qwen2_p128_l23_together
527 |     2024-10-07 01:07:05.499 | INFO     | yasched.llm_utils:run:497 - simulate layer 0
528 |     2024-10-07 01:07:11.902 | INFO     | yasched.llm_utils:run:503 - end simulate
529 |     [[[-0.24707 0.0883789 -0.232422 ... -0.294922 0.0644531 -0.65625]
530 |     [0.0649414 -0.183594 -0.251953 ... -0.248047 -0.0231934 -0.138672]
531 |     [0.0766602 -0.0961914 0.152344 ... -0.0125732 0.106445 0.15625]
532 |     ...
533 |     [-0.0737305 -0.210938 -0.455078 ... -0.640625 0.0429688 -0.263672]
534 |     [-0.0737305 -0.210938 -0.455078 ... -0.640625 0.0429688 -0.263672]
535 |     [-0.0737305 -0.210938 -0.455078 ... -0.640625 0.0429688 -0.263672]]]
536 |     2024-10-07 01:07:11.903 | INFO     | yasched.llm_utils:run:497 - simulate layer 1
537 |     ...
538 |     2024-10-07 01:09:35.992 | INFO     | yasched.llm_utils:run:497 - simulate layer 23
539 |     2024-10-07 01:09:42.591 | INFO     | yasched.llm_utils:run:503 - end simulate
540 |     [[[-1.25 0.222656 2.375 ... 2.07812 -0.410156 1.84375]
541 |     [-0.289062 -1.08594 0.234375 ... 1.07812 -0.257812 -1.96094]
542 |     [-0.0839844 -0.542969 0.636719 ... 3.21875 -0.351562 -2.01562]
543 |     ...
544 |     [-3.21875 -0.478516 1.42188 ... 4.8125 1.21875 -0.294922]
545 |     [-3.21875 -0.478516 1.42188 ... 4.8125 1.21875 -0.294922]
546 |     [-3.21875 -0.478516 1.42188 ... 4.8125 1.21875 -0.294922]]]
547 |     2
548 |     posibile ('\n', 0.0),('答案', 0.0),('Result', 0.0),('0', 0.0),('3', 0.0),('2', 1.0),('1', 0.0),('Answer', 0.0),('\\', 0.0),('4', 0.0)
549 |     load Qwen/Qwen2-0.5B-w8a16/qwen2_p128_l0_together
550 |     load Qwen/Qwen2-0.5B-w8a16/qwen2_p128_l1_together
551 |     load Qwen/Qwen2-0.5B-w8a16/qwen2_p128_l2_together
552 |     ...
553 |     start_indice = 12
554 |     2024-10-07 01:10:37.005 | INFO     | yasched.llm_utils:run:556 - simulate layer 23
555 |     2024-10-07 01:10:38.859 | INFO     | yasched.llm_utils:run:562 - end simulate
556 |     [-0.310547 -2.21875 0.871094 -1.86719 -0.546875]
557 |     start_indice = 12
558 |     <|im_end|>
559 |     posibile ('\n', 0.0),('\\t', 0.0),('<|im_start|>', 0.0),(' \\', 0.0),('.', 0.0),('\n\n', 0.0),(' ', 0.0),('\\', 0.0),('<|im_end|>', 1.0),('\\n', 0.0)
560 |     ====================================================================================================
561 |     <|im_start|>user\n1+1=?<|im_end|>\n<|im_start|>assistant\n2<|im_end|>
562 |     ====================================================================================================
563 |     hit eos!
564 |     2024-10-07 01:10:51.637 | SUCCESS  | yamain.command.llm_build:llm_build:349 - check llm model done!
565 | 
566 | 


--------------------------------------------------------------------------------
/source/appendix/op_support_list_ax620e.rst:
--------------------------------------------------------------------------------
  1 | ========================
  2 | NPU 算子支持列表(AX620E)
  3 | ========================
  4 | 
  5 | 本节介绍 ``AX630C`` 和 ``AX620Q`` 中的 **NPU** 对 ``ONNX`` 算子的支持情况。
  6 | 
  7 | - 支持的 ONNX opset_version >= 11，详细算子描述可参考 `onnx Operators <https://github.com/onnx/onnx/blob/main/docs/Operators.md>`_ 。
  8 | - 部分支持的算子尚无标准的 ONNX 定义，如果模型中包含了此类算子，请咨询技术支持。
  9 | 
 10 | .. note::
 11 |     | "暂不支持": 表示当前版本的算子实现尚不支持，但 NPU 理论上可以支持，后续版本有可能支持。
 12 |     | "无限制": 表示当前算子实现可以支持；由于测试不一定能覆盖全部参数空间，如果遇到异常，请向我们反馈，我们会将其作为 BUG 尽快修复。
 13 |     | "不支持": 表示无法支持该属性的实现。
 14 | 
 15 | +-----------------------+--------------------------------------------+
 16 | | 算子名称              | Attrs 约束                                 |
 17 | +=======================+============================================+
 18 | | Abs                   | 无限制                                     |
 19 | +-----------------------+--------------------------------------------+
 20 | | Add                   | 无限制                                     |
 21 | +-----------------------+--------------------------------------------+
 22 | || ArgMax               || axis: 无限制                              |
 23 | ||                      || keepdims: 无限制                          |
 24 | ||                      || select_last_index: 只支持设为0            |
 25 | +-----------------------+--------------------------------------------+
 26 | || ArgMin               || axis: 无限制                              |
 27 | ||                      || keepdims: 无限制                          |
 28 | ||                      || select_last_index: 只支持设为0            |
 29 | +-----------------------+--------------------------------------------+
 30 | || AveragePool          || auto_pad: 只支持NOTSET                    |
 31 | ||                      || ceil_mode: 无限制                         |
 32 | ||                      || count_include_pad: 只支持设为1            |
 33 | ||                      || kernel_shape: 无限制                      |
 34 | ||                      || pads: 无限制                              |
 35 | ||                      || strides: 无限制                           |
 36 | +-----------------------+--------------------------------------------+
 37 | || BatchNormalization   || epsilon: 无限制                           |
 38 | ||                      || momentum: 不支持                          |
 39 | ||                      || training_mode: 不支持                     |
 40 | +-----------------------+--------------------------------------------+
 41 | || Cast                 || to:                                       |
 42 | ||                      ||                                           |
 43 | ||                      ||                                           |
 44 | +-----------------------+--------------------------------------------+
 45 | | Ceil                  | 无限制                                     |
 46 | +-----------------------+--------------------------------------------+
 47 | || Clip                 || min: 无限制                               |
 48 | ||                      || max: 无限制                               |
 49 | +-----------------------+--------------------------------------------+
 50 | | Concat                | axis: 无限制                               |
 51 | +-----------------------+--------------------------------------------+
 52 | | Constant              | 无限制                                     |
 53 | +-----------------------+--------------------------------------------+
 54 | | ConstantOfShape       | 无限制                                     |
 55 | +-----------------------+--------------------------------------------+
 56 | || Conv                 || auto_pad: 只支持NOTSET                    |
 57 | ||                      || dilations: 无限制                         |
 58 | ||                      || group: 无限制                             |
 59 | ||                      || kernel_shape: 无限制                      |
 60 | ||                      || pads: 无限制                              |
 61 | ||                      || strides: 无限制                           |
 62 | ||                      || note: 当使用DepthWise/Group Conv，        |
 63 | ||                      || 并且dilation不为1时效率较低。             |
 64 | +-----------------------+--------------------------------------------+
 65 | || ConvTranspose        || auto_pad: 只支持NOTSET                    |
 66 | ||                      || dilations: 暂时只能设为1                  |
 67 | ||                      || group: 无限制                             |
 68 | ||                      || kernel_shape: 无限制                      |
 69 | ||                      || output_shape: 暂不支持                    |
 70 | ||                      || pads: 无限制                              |
 71 | ||                      || strides: 无限制                           |
 72 | ||                      || note: DepthWise ConvTranspose 效率较低。  |
 73 | ||                      ||                                           |
 74 | ||                      || output_padding: output_padding_h <=       |
 75 | ||                      || pads_bottom, output_padding_w <=          |
 76 | ||                      || pads_right                                |
 77 | +-----------------------+--------------------------------------------+
 78 | || DepthToSpace         || blocksize: 无限制                         |
 79 | ||                      || mode: 暂时只支持DCR                       |
 80 | +-----------------------+--------------------------------------------+
 81 | | Div                   | 无限制                                     |
 82 | +-----------------------+--------------------------------------------+
 83 | | Elu                   | 无限制                                     |
 84 | +-----------------------+--------------------------------------------+
 85 | | Equal                 | 无限制                                     |
 86 | +-----------------------+--------------------------------------------+
 87 | | Erf                   | 无限制                                     |
 88 | +-----------------------+--------------------------------------------+
 89 | | Exp                   | 无限制                                     |
 90 | +-----------------------+--------------------------------------------+
 91 | | Expand                | 无限制                                     |
 92 | +-----------------------+--------------------------------------------+
 93 | | Flatten               | 无限制                                     |
 94 | +-----------------------+--------------------------------------------+
 95 | || Gather               || axis: 无限制                              |
 96 | ||                      || indices: 暂时只支持1维                    |
 97 | +-----------------------+--------------------------------------------+
 98 | | Gelu                  | 无限制                                     |
 99 | +-----------------------+--------------------------------------------+
100 | || Gemm                 || alpha: 暂不支持                           |
101 | ||                      || beta: 暂不支持                            |
102 | ||                      || transA: 无限制                            |
103 | ||                      || transB: 无限制                            |
104 | +-----------------------+--------------------------------------------+
105 | | GlobalAveragePool     | 无限制                                     |
106 | +-----------------------+--------------------------------------------+
107 | | GlobalMaxPool         | 无限制                                     |
108 | +-----------------------+--------------------------------------------+
109 | | Greater               | 无限制                                     |
110 | +-----------------------+--------------------------------------------+
111 | | GreaterOrEqual        | 无限制                                     |
112 | +-----------------------+--------------------------------------------+
113 | | GridSample            | 无限制                                     |
114 | +-----------------------+--------------------------------------------+
115 | | HardSigmoid           | 无限制                                     |
116 | +-----------------------+--------------------------------------------+
117 | | HardSwish             | 无限制                                     |
118 | +-----------------------+--------------------------------------------+
119 | | Identity              | 无限制                                     |
120 | +-----------------------+--------------------------------------------+
121 | | InstanceNormalization | epsilon: 无限制                            |
122 | +-----------------------+--------------------------------------------+
123 | | LayerNormalization    | axis暂时只支持为-1(即最后一维)             |
124 | +-----------------------+--------------------------------------------+
125 | | Less                  | 无限制                                     |
126 | +-----------------------+--------------------------------------------+
127 | | LessOrEqual           | 无限制                                     |
128 | +-----------------------+--------------------------------------------+
129 | || LpNormalization      || axis暂时只支持-1(即最后一维)              |
130 | ||                      || p只支持1或2                               |
131 | +-----------------------+--------------------------------------------+
132 | || LSTM                 || activation_alpha: 暂时不支持              |
133 | ||                      || activation_beta: 暂时不支持               |
134 | ||                      || activations: 暂时不支持                   |
135 | ||                      || clip: 暂时不支持                          |
136 | ||                      || hidden_size: 无限制                       |
137 | ||                      || input_forget: 暂时不支持                  |
138 | ||                      || layout: 只支持设为0                       |
139 | ||                      || B: 无限制                                 |
140 | ||                      || sequence_lens: 不支持                     |
141 | ||                      || initial_h: 无限制                         |
142 | ||                      || initial_c: 无限制                         |
143 | ||                      || P: 暂时不支持                             |
144 | ||                      ||                                           |
145 | ||                      || direction:                                |
146 | ||                      || 支持“bidirectional”、“reverse”、“forward” |
147 | +-----------------------+--------------------------------------------+
148 | | LeakyRelu             | 无限制                                     |
149 | +-----------------------+--------------------------------------------+
150 | | MatMul                | 无限制                                     |
151 | +-----------------------+--------------------------------------------+
152 | | Max                   | 无限制                                     |
153 | +-----------------------+--------------------------------------------+
154 | | Min                   | 无限制                                     |
155 | +-----------------------+--------------------------------------------+
156 | | Mish                  | 无限制                                     |
157 | +-----------------------+--------------------------------------------+
158 | || MaxPool              || auto_pad: 只支持设为NOTSET                |
159 | ||                      || ceil_mode: 无限制                         |
160 | ||                      || dilations: 只支持为1                      |
161 | ||                      || kernel_shape: 无限制                      |
162 | ||                      || pads: 无限制                              |
163 | ||                      || storage_order: 只支持设为0                |
164 | ||                      || strides: 无限制                           |
165 | +-----------------------+--------------------------------------------+
166 | | Mul                   | 无限制                                     |
167 | +-----------------------+--------------------------------------------+
168 | || PRelu                || 4D tensor输入时，channel维度在第二维，    |
169 | ||                      || 并且slope shape暂时只支持(channel,)       |
170 | ||                      || 或者(1, channel, 1, 1)                    |
171 | +-----------------------+--------------------------------------------+
172 | || Pad                  || pads: 无限制                              |
173 | ||                      || constant_value: 无限制                    |
174 | ||                      || mode: 只支持constant                      |
175 | ||                      || axes: 暂不支持                            |
176 | +-----------------------+--------------------------------------------+
177 | || Pow                  || 不支持elemwise计算，                      |
178 | ||                      || exponent只支持initializer形式且为标量。   |
179 | +-----------------------+--------------------------------------------+
180 | || ReduceL2             || axes: 无限制                              |
181 | ||                      || keepdims: 无限制                          |
182 | ||                      || noop_with_empty_axes: 该参数暂不支持      |
183 | +-----------------------+--------------------------------------------+
184 | || ReduceMax            || axes: 无限制                              |
185 | ||                      || keepdims: 无限制                          |
186 | ||                      || noop_with_empty_axes: 该参数暂不支持      |
187 | +-----------------------+--------------------------------------------+
188 | || ReduceMean           || axes: 无限制                              |
189 | ||                      || keepdims: 无限制                          |
190 | ||                      || noop_with_empty_axes: 该参数暂不支持      |
191 | +-----------------------+--------------------------------------------+
192 | || ReduceSum            || axes: 无限制                              |
193 | ||                      || keepdims: 无限制                          |
194 | ||                      || noop_with_empty_axes: 该参数暂不支持      |
195 | +-----------------------+--------------------------------------------+
196 | | Relu                  | 无限制                                     |
197 | +-----------------------+--------------------------------------------+
198 | | Reshape               | shape: 无限制                              |
199 | +-----------------------+--------------------------------------------+
200 | || Resize               || mode: 支持"nearest"、"linear"可选         |
201 | ||                      || scales: 无限制                            |
202 | ||                      ||                                           |
203 | ||                      || nearest_mode:                             |
204 | ||                      || 只支持设为round_prefer_ceil               |
205 | +-----------------------+--------------------------------------------+
206 | | Sigmoid               | 无限制                                     |
207 | +-----------------------+--------------------------------------------+
208 | || Slice                || starts: 无限制                            |
209 | ||                      || ends: 无限制                              |
210 | ||                      || axes: 无限制                              |
211 | ||                      || steps: 无限制                             |
212 | +-----------------------+--------------------------------------------+
213 | || SpatialTransformer   || 插值方式为 "bilinear",                    |
214 | ||                      || 边界处理方式为 "constant"（值为0）        |
215 | +-----------------------+--------------------------------------------+
216 | || Split                || axis: 无限制                              |
217 | ||                      || num_outputs: 无限制                       |
218 | +-----------------------+--------------------------------------------+
219 | | Sqrt                  | 无限制                                     |
220 | +-----------------------+--------------------------------------------+
221 | | Silu                  | 无限制                                     |
222 | +-----------------------+--------------------------------------------+
223 | | Sin                   | 无限制                                     |
224 | +-----------------------+--------------------------------------------+
225 | | Swish                 | 无限制                                     |
226 | +-----------------------+--------------------------------------------+
227 | | Squeeze               | axes: 无限制                               |
228 | +-----------------------+--------------------------------------------+
229 | | Softmax               | axis: 无限制                               |
230 | +-----------------------+--------------------------------------------+
231 | | Softplus              | 无限制                                     |
232 | +-----------------------+--------------------------------------------+
233 | | SpaceToDepth          | blocksize: 无限制                          |
234 | +-----------------------+--------------------------------------------+
235 | | Sub                   | 无限制                                     |
236 | +-----------------------+--------------------------------------------+
237 | | Tanh                  | 无限制                                     |
238 | +-----------------------+--------------------------------------------+
239 | | TopK                  | 无限制                                     |
240 | +-----------------------+--------------------------------------------+
241 | | Transpose             | perm: 无限制                               |
242 | +-----------------------+--------------------------------------------+
243 | | Unsqueeze             | axes: 无限制                               |
244 | +-----------------------+--------------------------------------------+
245 | | Where                 | 无限制                                     |
246 | +-----------------------+--------------------------------------------+
247 | 


--------------------------------------------------------------------------------
/source/appendix/op_support_list_ax650.rst:
--------------------------------------------------------------------------------
  1 | =======================
  2 | NPU 算子支持列表(AX650)
  3 | =======================
  4 | 
  5 | 本节介绍 ``AX650`` 和 ``M76H`` 中的 **NPU** 对 ``ONNX`` 算子的支持情况。
  6 | 
  7 | - 支持的 ONNX opset_version >= 11，详细算子描述可参考 `onnx Operators <https://github.com/onnx/onnx/blob/main/docs/Operators.md>`_ 。
  8 | - 部分支持的算子尚无标准的 ONNX 定义，如果模型中包含了此类算子，请咨询技术支持。
  9 | 
 10 | .. note::
 11 |     | "暂不支持": 表示当前版本的算子实现尚不支持，但 NPU 理论上可以支持，后续版本有可能支持。
 12 |     | "无限制": 表示当前算子实现可以支持；由于测试不一定能覆盖全部参数空间，如果遇到异常，请向我们反馈，我们会将其作为 BUG 尽快修复。
 13 |     | "不支持": 表示无法支持该属性的实现。
 14 | 
 15 | +-----------------------+---------------------------------------------+
 16 | | 算子名称              | Attrs 约束                                  |
 17 | +=======================+=============================================+
 18 | | Abs                   | 无限制                                      |
 19 | +-----------------------+---------------------------------------------+
 20 | | Add                   | 无限制                                      |
 21 | +-----------------------+---------------------------------------------+
 22 | | And                   | 无限制                                      |
 23 | +-----------------------+---------------------------------------------+
 24 | | ArgMax                | | axis: 无限制                              |
 25 | |                       | | keepdims: 无限制                          |
 26 | |                       | | select_last_index: 只支持设为0            |
 27 | +-----------------------+---------------------------------------------+
 28 | | ArgMin                | | axis: 无限制                              |
 29 | |                       | | keepdims: 无限制                          |
 30 | |                       | | select_last_index: 只支持设为0            |
 31 | +-----------------------+---------------------------------------------+
 32 | | AveragePool           | | auto_pad: 只支持NOTSET                    |
 33 | |                       | | ceil_mode: 无限制                         |
 34 | |                       | | count_include_pad: 只支持设为1            |
 35 | |                       | | kernel_shape: 无限制                      |
 36 | |                       | | pads: 无限制                              |
 37 | |                       | | strides: 无限制                           |
 38 | +-----------------------+---------------------------------------------+
 39 | | BatchNormalization    | | epsilon: 无限制                           |
 40 | |                       | | momentum: 不支持                          |
 41 | |                       | | training_mode: 不支持                     |
 42 | +-----------------------+---------------------------------------------+
 43 | | Cast                  | to:                                         |
 44 | |                       |                                             |
 45 | |                       | uint8/int8/uint16/int16/uint32/int32/float32|
 46 | +-----------------------+---------------------------------------------+
 47 | | Ceil                  | 无限制                                      |
 48 | +-----------------------+---------------------------------------------+
 49 | | Clip                  | | min: 无限制                               |
 50 | |                       | | max: 无限制                               |
 51 | +-----------------------+---------------------------------------------+
 52 | | Concat                | axis: 无限制                                |
 53 | +-----------------------+---------------------------------------------+
 54 | | Constant              | 无限制                                      |
 55 | +-----------------------+---------------------------------------------+
 56 | | ConstantOfShape       | 无限制                                      |
 57 | +-----------------------+---------------------------------------------+
 58 | | Conv                  | | auto_pad: 只支持NOTSET                    |
 59 | |                       | | dilations: 无限制                         |
 60 | |                       | | group: 无限制                             |
 61 | |                       | | kernel_shape: 无限制                      |
 62 | |                       | | pads: 无限制                              |
 63 | |                       | | strides: 无限制                           |
 64 | |                       | | note: 当使用DepthWise/Group Conv，        |
 65 | |                       |   并且dilation不为1时效率较低。             |
 66 | +-----------------------+---------------------------------------------+
 67 | | ConvTranspose         | | auto_pad: 只支持NOTSET                    |
 68 | |                       | | dilations: 暂时只能设为1                  |
 69 | |                       | | group: 无限制                             |
 70 | |                       | | kernel_shape: 无限制                      |
 71 | |                       | | output_shape: 暂不支持                    |
 72 | |                       | | pads: 无限制                              |
 73 | |                       | | strides: 无限制                           |
 74 | |                       | | note: DepthWise ConvTranspose 效率较低。  |
 75 | |                       |                                             |
 76 | |                       | output_padding: output_padding_h <=         |
 77 | |                       | pads_bottom, output_padding_w <=            |
 78 | |                       | pads_right                                  |
 79 | +-----------------------+---------------------------------------------+
 80 | | Cos                   | 无限制                                      |
 81 | +-----------------------+---------------------------------------------+
 82 | | DepthToSpace          | | blocksize: 无限制                         |
 83 | |                       | | mode: 无限制                              |
 84 | +-----------------------+---------------------------------------------+
 85 | | Div                   | 无限制                                      |
 86 | +-----------------------+---------------------------------------------+
 87 | | Elu                   | 无限制                                      |
 88 | +-----------------------+---------------------------------------------+
 89 | | Equal                 | 无限制                                      |
 90 | +-----------------------+---------------------------------------------+
 91 | | Erf                   | 无限制                                      |
 92 | +-----------------------+---------------------------------------------+
 93 | | Exp                   | 无限制                                      |
 94 | +-----------------------+---------------------------------------------+
 95 | | Expand                | 无限制                                      |
 96 | +-----------------------+---------------------------------------------+
 97 | | Flatten               | 无限制                                      |
 98 | +-----------------------+---------------------------------------------+
 99 | | Floor                 | 无限制                                      |
100 | +-----------------------+---------------------------------------------+
101 | | Gather                | | axis: 无限制                              |
102 | |                       | | indices: 暂时只支持1维                    |
103 | +-----------------------+---------------------------------------------+
104 | | GatherElements        | | axis: 无限制                              |
105 | +-----------------------+---------------------------------------------+
106 | | GatherND              |   无限制                                    |
107 | +-----------------------+---------------------------------------------+
108 | | Gelu                  | 无限制                                      |
109 | +-----------------------+---------------------------------------------+
110 | | Gemm                  | | alpha: 暂不支持                           |
111 | |                       | | beta: 暂不支持                            |
112 | |                       | | transA: 无限制                            |
113 | |                       | | transB: 无限制                            |
114 | +-----------------------+---------------------------------------------+
115 | | GlobalAveragePool     | 无限制                                      |
116 | +-----------------------+---------------------------------------------+
117 | | GlobalMaxPool         | 无限制                                      |
118 | +-----------------------+---------------------------------------------+
119 | | Greater               | 无限制                                      |
120 | +-----------------------+---------------------------------------------+
121 | | GreaterOrEqual        | 无限制                                      |
122 | +-----------------------+---------------------------------------------+
123 | | GridSample            | 无限制                                      |
124 | +-----------------------+---------------------------------------------+
125 | | GroupNormalization    | 无限制                                      |
126 | +-----------------------+---------------------------------------------+
127 | | HardSigmoid           | 无限制                                      |
128 | +-----------------------+---------------------------------------------+
129 | | HardSwish             | 无限制                                      |
130 | +-----------------------+---------------------------------------------+
131 | | Identity              | 无限制                                      |
132 | +-----------------------+---------------------------------------------+
133 | | InstanceNormalization | epsilon: 无限制                             |
134 | +-----------------------+---------------------------------------------+
135 | | InverseSigmoid        | 无限制                                      |
136 | +-----------------------+---------------------------------------------+
137 | | LayerNormalization    | axis暂时只支持为-1(即最后一维)              |
138 | +-----------------------+---------------------------------------------+
139 | | LeakyRelu             | 无限制                                      |
140 | +-----------------------+---------------------------------------------+
141 | | Less                  | 无限制                                      |
142 | +-----------------------+---------------------------------------------+
143 | | LessOrEqual           | 无限制                                      |
144 | +-----------------------+---------------------------------------------+
145 | | LpNormalization       | | axis暂时只支持-1(即最后一维)              |
146 | |                       | | p只支持1或2                               |
147 | +-----------------------+---------------------------------------------+
148 | | LSTM                  | | activation_alpha: 暂时不支持              |
149 | |                       | | activation_beta: 暂时不支持               |
150 | |                       | | activations: 暂时不支持                   |
151 | |                       | | clip: 暂时不支持                          |
152 | |                       | | hidden_size: 无限制                       |
153 | |                       | | input_forget: 暂时不支持                  |
154 | |                       | | layout: 只支持设为0                       |
155 | |                       | | B: 无限制                                 |
156 | |                       | | sequence_lens: 不支持                     |
157 | |                       | | initial_h: 无限制                         |
158 | |                       | | initial_c: 无限制                         |
159 | |                       | | P: 暂时不支持                             |
160 | |                       |                                             |
161 | |                       | direction:                                  |
162 | |                       | 支持“bidirectional”、“reverse”、“forward”   |
163 | +-----------------------+---------------------------------------------+
164 | | LogSoftmax            | 无限制                                      |
165 | +-----------------------+---------------------------------------------+
166 | | MatMul                | 无限制                                      |
167 | +-----------------------+---------------------------------------------+
168 | | Max                   | 无限制                                      |
169 | +-----------------------+---------------------------------------------+
170 | | MaxPool               | | auto_pad: 只支持设为NOTSET                |
171 | |                       | | ceil_mode: 无限制                         |
172 | |                       | | dilations: 只支持为1                      |
173 | |                       | | kernel_shape: 无限制                      |
174 | |                       | | pads: 无限制                              |
175 | |                       | | storage_order: 只支持设为0                |
176 | |                       | | strides: 无限制                           |
177 | +-----------------------+---------------------------------------------+
178 | | Min                   | 无限制                                      |
179 | +-----------------------+---------------------------------------------+
180 | | Mish                  | 无限制                                      |
181 | +-----------------------+---------------------------------------------+
182 | | Mul                   | 无限制                                      |
183 | +-----------------------+---------------------------------------------+
184 | | Not                   | 无限制                                      |
185 | +-----------------------+---------------------------------------------+
186 | | Pad                   | | pads: 无限制                              |
187 | |                       | | constant_value: 无限制                    |
188 | |                       | | mode: 只支持constant                      |
189 | |                       | | axes: 暂不支持                            |
190 | +-----------------------+---------------------------------------------+
191 | | Pow                   | 不支持elemwise计算，                        |
192 | |                       | exponent只支持initializer形式且为标量。     |
193 | +-----------------------+---------------------------------------------+
194 | | PRelu                 | 4D tensor输入时，channel维度在第二维，      |
195 | |                       | 并且slope shape暂时只支持(channel,)         |
196 | |                       | 或者(1, channel, 1, 1)                      |
197 | +-----------------------+---------------------------------------------+
198 | | ReduceL2              | | axes: 无限制                              |
199 | |                       | | keepdims: 无限制                          |
200 | |                       | | noop_with_empty_axes: 该参数暂不支持      |
201 | +-----------------------+---------------------------------------------+
202 | | ReduceMax             | | axes: 无限制                              |
203 | |                       | | keepdims: 无限制                          |
204 | |                       | | noop_with_empty_axes: 该参数暂不支持      |
205 | +-----------------------+---------------------------------------------+
206 | | ReduceMean            | | axes: 无限制                              |
207 | |                       | | keepdims: 无限制                          |
208 | |                       | | noop_with_empty_axes: 该参数暂不支持      |
209 | +-----------------------+---------------------------------------------+
210 | | ReduceMin             | | axes: 无限制                              |
211 | |                       | | keepdims: 无限制                          |
212 | |                       | | noop_with_empty_axes: 该参数暂不支持      |
213 | +-----------------------+---------------------------------------------+
214 | | ReduceSum             | | axes: 无限制                              |
215 | |                       | | keepdims: 无限制                          |
216 | |                       | | noop_with_empty_axes: 该参数暂不支持      |
217 | +-----------------------+---------------------------------------------+
218 | | Relu                  | 无限制                                      |
219 | +-----------------------+---------------------------------------------+
220 | | Reshape               | shape: 无限制                               |
221 | +-----------------------+---------------------------------------------+
222 | | Resize                | | mode: 支持"nearest"、"linear"可选         |
223 | |                       | | scales: 无限制                            |
224 | |                       |                                             |
225 | |                       | nearest_mode:                               |
226 | |                       | 只支持设为round_prefer_ceil                 |
227 | +-----------------------+---------------------------------------------+
228 | | RMSNormalization      | 无限制                                      |
229 | +-----------------------+---------------------------------------------+
230 | | RoiAlign              | sampling_ratio: 只支持不为0                 |
231 | +-----------------------+---------------------------------------------+
232 | | RotaryEmbedding       | 无限制                                      |
233 | +-----------------------+---------------------------------------------+
234 | | Round                 | 无限制                                      |
235 | +-----------------------+---------------------------------------------+
236 | | ScatterElements       | 无限制                                      |
237 | +-----------------------+---------------------------------------------+
238 | | ScatterND             | 无限制                                      |
239 | +-----------------------+---------------------------------------------+
240 | | Sigmoid               | 无限制                                      |
241 | +-----------------------+---------------------------------------------+
242 | | Silu                  | 无限制                                      |
243 | +-----------------------+---------------------------------------------+
244 | | Sin                   | 无限制                                      |
245 | +-----------------------+---------------------------------------------+
246 | | Slice                 | | starts: 无限制                            |
247 | |                       | | ends: 无限制                              |
248 | |                       | | axes: 无限制                              |
249 | |                       | | steps: 无限制                             |
250 | +-----------------------+---------------------------------------------+
251 | | Softmax               | axis: 无限制                                |
252 | +-----------------------+---------------------------------------------+
253 | | Softplus              | 无限制                                      |
254 | +-----------------------+---------------------------------------------+
255 | | SpaceToDepth          | blocksize: 无限制                           |
256 | +-----------------------+---------------------------------------------+
257 | | SpatialTransformer    | 插值方式为 "bilinear",                      |
258 | |                       | 边界处理方式为 "constant"（值为0）          |
259 | +-----------------------+---------------------------------------------+
260 | | Split                 | | axis: 无限制                              |
261 | |                       | | num_outputs: 无限制                       |
262 | +-----------------------+---------------------------------------------+
263 | | Sqrt                  | 无限制                                      |
264 | +-----------------------+---------------------------------------------+
265 | | Squeeze               | axes: 无限制                                |
266 | +-----------------------+---------------------------------------------+
267 | | Sub                   | 无限制                                      |
268 | +-----------------------+---------------------------------------------+
269 | | Swish                 | 无限制                                      |
270 | +-----------------------+---------------------------------------------+
271 | | Tanh                  | 无限制                                      |
272 | +-----------------------+---------------------------------------------+
273 | | Tile                  | 无限制                                      |
274 | +-----------------------+---------------------------------------------+
275 | | TopK                  | 无限制                                      |
276 | +-----------------------+---------------------------------------------+
277 | | Transpose             | perm: 无限制                                |
278 | +-----------------------+---------------------------------------------+
279 | | Unsqueeze             | axes: 无限制                                |
280 | +-----------------------+---------------------------------------------+
281 | | Where                 | 无限制                                      |
282 | +-----------------------+---------------------------------------------+
283 | | Xor                   | 无限制                                      |
284 | +-----------------------+---------------------------------------------+
285 | 


--------------------------------------------------------------------------------
/source/appendix/precision_debug_guides.rst:
--------------------------------------------------------------------------------
  1 | =========================================
  2 | 精度调优建议
  3 | =========================================
  4 | 
  5 | -----------------------
  6 | 基础问题排查
  7 | -----------------------
  8 | 
  9 | 浮点模型量化后，不可避免地会有一定程度的精度损失，为了衡量精度损失情况，在编译阶段提供了一套量化精度分析工具，通过余弦相似度来判断量化前后模型的精度是否符合预期。
 10 | 通常情况下，当模型最后的输出层 **余弦相似度 > 98%** 时，可以认为量化后的模型精度正常，可以进行部署的下一阶段。
 11 | 
 12 | .. note::
 13 | 
 14 |     需要注意的是，在编译阶段的量化精度分析工具的余弦相似度，并不等价于在测试数据集上的精度掉点情况(比如 ``AP`` ， ``mAP`` )。
 15 |     如果要掌握详细的数据集精度掉点情况，建议使用编译后的模型上板使用数据集测一遍模型精度。
 16 | 
 17 | 本章节将会有一些基础的名词:
 18 | 
 19 | - **量化策略**：指使用何种策略统计浮点分布范围以获得量化参数，对应到配置中 ``quant`` 字段的 ``calibration_method`` 。
 20 | 
 21 | - **量化位宽**：指量化后，算子的输入输出位宽，可以通过 ``quant`` 字段的 ``layer_configs`` 进行配置。
 22 | 
 23 | 在遇到精度问题时，先确认下列选项，再按照后面的章节进行精度调优：
 24 | 
 25 | - mean/std 与训练时一致：如果量化使用的数据集使用的格式为 ``Image`` ，请确保 ``quant`` 中的 ``input_configs`` 下的 ``calibration_mean`` 以及 ``calibration_std`` 与训练时一致。
 26 | - BGR与RGB格式：如果量化使用的数据集使用的格式为 ``Image`` ，请确保 ``input_processors`` 中的 ``tensor_layout`` 与训练时一致。
 27 | - 训练时Python的前后处理与板端运行时C++的前后处理确保对齐，对齐方式请参照 Q&A。
 28 | - 如果 ``csc_mode`` 设置成除 **YUYV422, UYVY422, YUV420SP, YVU420SP** 时，上板测试精度时预处理建议使用 **IVE TDP做 resize** ，该预处理与 OpenCV 的 ``bilinear`` 插值方式对齐。
 29 | - 量化数据集是否正确:
 30 |     - 校准图片与使用场景尽量相同
 31 |     - 校准集数量是否足够、场景是否足够丰富，应尽量覆盖所有类别
 32 | 
 33 | 
 34 | -----------------------
 35 | 常见精度问题
 36 | -----------------------
 37 | 
 38 | ~~~~~~~~~~~~~~~~~~~~~~~~
 39 | 怎么设置模型为全U16?
 40 | ~~~~~~~~~~~~~~~~~~~~~~~~
 41 | 
 42 | .. code-block:: shell
 43 | 
 44 |     {
 45 |         "layer_configs": [
 46 |             {
 47 |                 "start_tensor_names": ["DEFAULT"], # list of strings
 48 |                 "end_tensor_names": ["DEFAULT"],   # list of strings
 49 |                 "data_type": "U16"
 50 |             }
 51 |         ]
 52 |     }
 53 | 
 54 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 55 | 为什么配置了 ``Add`` 算子量化位宽是 ``U16`` 在余弦相似度表里面看类型还是 ``U8``?
 56 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 57 | 
 58 | - 工具链会先将输入的浮点模型做一次浮点图优化再进行量化，这时配置的算子名/算子类型可能没有出现在浮点图优化后的模型 ``optimized.onnx`` 里面，这时可以打开输出目录下的  ``output/frontend/optimized.onnx`` 查看该算子是否存在。
 59 | - 量化后模型的输出可能与输入类型不同，就会经常出现余弦相似度表格里的算子输出类型与配置不同，这是因为下一个算子的输入类型可能并没有配置为相同位宽，这时会将算子的输出类型设置成下个算子的输入类型，以提升推理性能。这种优化并不会影响精度。
 60 | - 如果是 ``Reshape / Transpose`` 等数据搬运类算子，设置该类算子的类型不会生效，它们的类型由下游的计算类算子类型决定。
 61 | 
 62 | ~~~~~~~~~~~~~~~~~~
 63 | 如何对齐前后处理？
 64 | ~~~~~~~~~~~~~~~~~~
 65 | 
 66 | 前后处理未对齐通常是影响上板精度的重要原因，为了排查前后处理的问题，我们建议按照如下步骤进行：
 67 | 
 68 | .. figure:: ../media/verify-preprocess-postprocess.png
 69 |     :alt: pipeline
 70 |     :align: center
 71 | 
 72 | 
 73 | - 单个数据，使用训练时的Python端将 原始输入，预处理后的数据 ，模型输出，后处理后的数据 保存成bin文件；这里可以将结果进行可视化，以确保输出的正确性
 74 | - C++端测试预处理：读取上一步保存下来的原始数据，作为输入，得到 C++ 预处理后的结果，**与上一步保存的预处理后的数据进行比较，当两者误差在 0.0001 (1e-4) 以内时认为误差符合预期，即 abs(a - b) < 0.0001** 。
 75 | - C++端测试后处理：读取第一步保存下来的模型输出，作为模型输出，并计算后处理， 得到C++ 端处理后的结果，**与第一步保存的后处理后的数据进行比较，当两者误差在 0.001 (1e-3) 以内时认为误差符合预期，即 abs(a - b) < 0.001** 。
 76 | 
 77 | 
 78 | 
 79 | ~~~~~~~~~~~~~~~~~~
 80 | outlier过大
 81 | ~~~~~~~~~~~~~~~~~~
 82 | 
 83 | 模型中出现如下的日志，说明模型中激活值存在较多的 ``outlier`` ，我们建议使用 ``smooth quant`` 功能来降低这些 ``outlier``。
 84 | 
 85 | 
 86 | .. code-block:: shell
 87 |     
 88 |                                             Ratio of outliers in tensor【 level=Log(Max_Pertensor/Max_Perchannel) 】
 89 |     ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┓
 90 |     ┃ Op outputs                        ┃ Sparse channel ratio ┃ level>=3 ratio     ┃ level>=4 ratio        ┃ level>=5 ratio        ┃ level>=6 ratio        ┃
 91 |     ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━┩
 92 |     │ /vision_model/embeddings/patch_e… │ 0.0                  │ 0.6614583134651184 │ 0.3111979067325592    │ 0.00390625            │ 0.0                   │
 93 |     ├───────────────────────────────────┼──────────────────────┼────────────────────┼───────────────────────┼───────────────────────┼───────────────────────┤
 94 |     │ op_348:onnx.LayerNormalization_0… │ 0.0                  │ 0.921875           │ 0.5169270634651184    │ 0.1080729141831398    │ 0.0403645820915699    │
 95 |     ├───────────────────────────────────┼──────────────────────┼────────────────────┼───────────────────────┼───────────────────────┼───────────────────────┤
 96 |     │ op_396:onnx.LayerNormalization_0… │ 0.0                  │ 0.4427083432674408 │ 0.2473958283662796    │ 0.12109375            │ 0.0546875             │
 97 |     ├───────────────────────────────────┼──────────────────────┼────────────────────┼───────────────────────┼───────────────────────┼───────────────────────┤
 98 |     │ op_821:onnx.AxFullyConnected_q_0… │ 0.0                  │ 0.359375           │ 0.1875                │ 0.125                 │ 0.0625                │
 99 |     ├───────────────────────────────────┼──────────────────────┼────────────────────┼───────────────────────┼───────────────────────┼───────────────────────┤
100 |     │ op_821:onnx.AxFullyConnected_k_0… │ 0.0                  │ 0.203125           │ 0.078125              │ 0.0625                │ 0.015625              │
101 |     ├───────────────────────────────────┼──────────────────────┼────────────────────┼───────────────────────┼───────────────────────┼───────────────────────┤
102 |     │ op_821:onnx.AxFullyConnected_v_0… │ 0.0                  │ 0.453125           │ 0.203125              │ 0.078125              │ 0.03125               │
103 |     ├───────────────────────────────────┼──────────────────────┼────────────────────┼───────────────────────┼───────────────────────┼───────────────────────┤
104 |     │ op_821:onnx.AxFullyConnected_q_1… │ 0.0                  │ 0.234375           │ 0.125                 │ 0.109375              │ 0.015625              │
105 |     ├───────────────────────────────────┼──────────────────────┼────────────────────┼───────────────────────┼───────────────────────┼───────────────────────┤
106 |     │ op_821:onnx.AxFullyConnected_k_1… │ 0.0                  │ 0.3125             │ 0.140625              │ 0.046875              │ 0.015625              │
107 |     ├───────────────────────────────────┼──────────────────────┼────────────────────┼───────────────────────┼───────────────────────┼───────────────────────┤
108 |     │ op_821:onnx.AxFullyConnected_v_1… │ 0.0                  │ 0.21875            │ 0.03125               │ 0.015625              │ 0.0                   │
109 |     ├───────────────────────────────────┼──────────────────────┼────────────────────┼───────────────────────┼───────────────────────┼───────────────────────┤
110 |     │ op_821:onnx.AxFullyConnected_q_2… │ 0.0                  │ 0.296875           │ 0.203125              │ 0.140625              │ 0.09375               │
111 |     ├───────────────────────────────────┼──────────────────────┼────────────────────┼───────────────────────┼───────────────────────┼───────────────────────┤
112 |     │ op_821:onnx.AxFullyConnected_k_2… │ 0.0                  │ 0.234375           │ 0.109375              │ 0.0625                │ 0.015625              │
113 |     ├───────────────────────────────────┼──────────────────────┼────────────────────┼───────────────────────┼───────────────────────┼───────────────────────┤
114 |     │ op_821:onnx.AxFullyConnected_v_2… │ 0.0                  │ 0.234375           │ 0.125                 │ 0.078125              │ 0.078125              │
115 |     ├───────────────────────────────────┼──────────────────────┼────────────────────┼───────────────────────┼───────────────────────┼───────────────────────┤
116 |     │ op_821:onnx.AxFullyConnected_q_3… │ 0.0                  │ 0.25               │ 0.09375               │ 0.078125              │ 0.03125               │
117 |     ├───────────────────────────────────┼──────────────────────┼────────────────────┼───────────────────────┼───────────────────────┼───────────────────────┤
118 |     │ op_821:onnx.AxFullyConnected_k_3… │ 0.0                  │ 0.1875             │ 0.109375              │ 0.03125               │ 0.015625              │
119 |     ├───────────────────────────────────┼──────────────────────┼────────────────────┼───────────────────────┼───────────────────────┼───────────────────────┤
120 |     │ op_821:onnx.AxFullyConnected_v_3… │ 0.0                  │ 0.296875           │ 0.15625               │ 0.0625                │ 0.0                   │
121 |     ├───────────────────────────────────┼──────────────────────┼────────────────────┼───────────────────────┼───────────────────────┼───────────────────────┤
122 |     │ op_821:onnx.AxFullyConnected_q_4… │ 0.0                  │ 0.234375           │ 0.171875              │ 0.0625                │ 0.046875              │
123 |     ├───────────────────────────────────┼──────────────────────┼────────────────────┼───────────────────────┼───────────────────────┼───────────────────────┤
124 |     │ op_821:onnx.AxFullyConnected_k_4… │ 0.0                  │ 0.359375           │ 0.203125              │ 0.09375               │ 0.046875              │
125 | 
126 | 通过配置 ``quant`` 字段中的 ``enable_smooth_quant`` 可以使能该功能。
127 | 
128 | .. hint::
129 | 
130 |     该方法来源于论文  `SmoothQuant <https://arxiv.org/abs/2211.10438>`_
131 | 
132 | -----------------------
133 | 精度调优步骤
134 | -----------------------
135 | 
136 | ~~~~~~~~~~~~~~~~~~
137 | 更改量化策略
138 | ~~~~~~~~~~~~~~~~~~
139 | 
140 | 通过更改量化策略来提升模型精度，目前可尝试的有 ``MSE`` 、 ``Percentile`` 、 ``MinMax`` ，对应 ``quant`` 字段中的 ``calibration_method`` 。
141 | 
142 | .. figure:: ../media/precision_analysis_step1.png
143 |     :alt: pipeline
144 |     :align: center
145 | 
146 | 
147 | ~~~~~~~~~~~~~~~~~~
148 | 混合精度调优设置
149 | ~~~~~~~~~~~~~~~~~~
150 | 
151 | 如果更改量化策略之后余弦相似度还是较低，可以根据 ``Quant Precision Table 【PerLayer Reference】`` 中余弦相似度来调节量化位宽，具体流程如下图所示。
152 | 
153 | .. figure:: ../media/precision_analysis_step2.png
154 |     :alt: pipeline
155 |     :align: center
156 | 
157 | 
158 | -----------------------
159 | 量化工单模板
160 | -----------------------
161 | 
162 | 请详细填写以下项，并提交给 FAE/AE。
163 | 
164 | - 其他平台经验
165 |     - 是否有在其他平台部署过
166 |     - 对应厂商、芯片型号、相应工具链版本 
167 |     - 其他平台的量化脚本或者配置文件
168 |     - 其他平台执行量化时的命令
169 |     - 相应的数据集指标：浮点精度 / 板上运行时精度 / 精度指标
170 | - 提供可复现的最小case：
171 |     - onnx 浮点模型
172 |     - onnx 浮点模型的单张图片测试用例，python 或者 C++ 都行
173 |     - config.json 配置文件
174 |     - 用于量化的最小数据集
175 |     - Pulsar2 的编译命令
176 | - 如因数据安全无法提供原始模型和数据集，则需要提供：
177 |     - 提供随机权重的浮点模型
178 |     - 完整的编译日志
179 |     - 开启精度分析功能后 ( ``"precision_analysis": true, "precision_analysis_method" : "EndToEnd"`` )，将 ``output/quant/debug`` 目录进行打包。
180 |     - config.json 配置文件
181 |     - 用于量化的最小数据集
182 |     - Pulsar2 编译命令 


--------------------------------------------------------------------------------
/source/conf.py:
--------------------------------------------------------------------------------
 1 | # Configuration file for the Sphinx documentation builder.
 2 | #
 3 | # This file only contains a selection of the most common options. For a full
 4 | # list see the documentation:
 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html
 6 | 
 7 | # -- Path setup --------------------------------------------------------------
 8 | 
 9 | # If extensions (or modules to document with autodoc) are in another directory,
10 | # add these directories to sys.path here. If the directory is relative to the
11 | # documentation root, use os.path.abspath to make it absolute, like shown here.
12 | #
13 | import os
14 | import sys
15 | # sys.path.insert(0, os.path.abspath('.'))
16 | 
17 | import sphinx_rtd_theme
18 | from recommonmark.parser import CommonMarkParser
19 | from recommonmark.transform import AutoStructify
20 | import sphinxcontrib.mermaid
21 | # -- Project information -----------------------------------------------------
22 | 
23 | project = 'Pulsar2'
24 | copyright = '2024, AXERA Semiconductor Co., Ltd. All rights reserved'
25 | author = 'AXera-Tech'
26 | 
27 | # The full version, including alpha/beta/rc tags
28 | release = 'V3.2'
29 | 
30 | # -- General configuration ---------------------------------------------------
31 | 
32 | # Add any Sphinx extension module names here, as strings. They can be
33 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
34 | # ones.
35 | # myst_parser is incompatible with recommonmark; myst_parser supports mermaid.
36 | extensions = ['sphinxcontrib.mermaid', 'recommonmark', 'sphinx_markdown_tables', 'sphinx_copybutton']
37 | 
38 | # Add any paths that contain templates here, relative to this directory.
39 | templates_path = ['_templates']
40 | 
41 | # List of patterns, relative to source directory, that match files and
42 | # directories to ignore when looking for source files.
43 | # This pattern also affects html_static_path and html_extra_path.
44 | exclude_patterns = ["examples/*[!.zip]"]
45 | 
46 | # -- Options for HTML output -------------------------------------------------
47 | 
48 | # The theme to use for HTML and HTML Help pages.  See the documentation for
49 | # a list of builtin themes.
50 | 
51 | # html_theme = 'alabaster'
52 | html_theme = 'sphinx_rtd_theme'
53 | 
54 | # html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
55 | # Add any paths that contain custom static files (such as style sheets) here,
56 | # relative to this directory. They are copied after the builtin static files,
57 | # so a file named "default.css" will overwrite the builtin "default.css".
58 | html_static_path = ['_static']
59 | 
60 | # The suffix(es) of source filenames.
61 | # You can specify multiple suffix as a list of string:
62 | 
63 | # source_parsers = {
64 | #   '.md': CommonMarkParser,
65 | # }
66 | source_suffix = ['.rst', '.md']
67 | 
68 | # mermaid
69 | mermaid_output_format = 'raw'
70 | mermaid_version = 'latest'
71 | 


--------------------------------------------------------------------------------
/source/doc_update_info/update_info.md:
--------------------------------------------------------------------------------
 1 | # 文档修改与发布说明
 2 | 
 3 | ## 版权声明
 4 | 权利声明 
 5 | 爱芯元智或其许可人保留一切权利。
 6 | 非经权利人书面许可，任何单位和个人不得擅自摘抄、复制本文档内容的部分或全部，并不得以任何形式传播。
 7 | 
 8 | 注意
 9 | 您购买的产品、服务或特性等应受商业合同和条款的约束，本文档中描述的全部或部分产品、服务或特性可能不在您的购买或使用范围之内。除非商业合同另有约定，本公司对本文档内容不做任何明示或默示的声明或保证。
10 | 由于产品版本升级或其他原因，本文档内容会不定期进行更新。除非另有约定，本文档仅作为使用指导，本文档中的所有陈述、信息和建议不构成任何明示或暗示的担保。
11 | 
12 | ## 修改记录
13 | 
14 | | 文档版本 | 发布日期 | 修改说明 |
15 | | --- | --- | --- |
16 | | V001 | 2022-11-09 | Initial |
17 | | V002 | 2022-12-02 | Pulsar2 1.1 |
18 | | V003 | 2022-12-19 | Pulsar2 1.1-patch1 |
19 | | V004 | 2022-12-22 | Pulsar2 1.1-patch2 |
20 | | V005 | 2022-12-30 | Pulsar2 1.1-patch3 |
21 | | V006 | 2023-02-12 | Pulsar2 1.2 |
22 | | V007 | 2023-02-16 | Pulsar2 1.2-patch1 |
23 | | V008 | 2023-02-27 | Pulsar2 1.2-patch2 |
24 | | V009 | 2023-03-18 | Pulsar2 1.3 |
25 | | V010 | 2023-04-02 | Pulsar2 1.4 |
26 | | V011 | 2023-04-04 | Pulsar2 1.4-patch1 |
27 | | V012 | 2023-04-10 | Pulsar2 1.4-patch2 |
28 | | V013 | 2023-04-24 | Pulsar2 1.4-patch3 |
29 | | V014 | 2023-05-07 | Pulsar2 1.5 |
30 | | V015 | 2023-05-12 | Pulsar2 1.5-patch1 |
31 | | V016 | 2023-06-04 | Pulsar2 1.6 |
32 | | V017 | 2023-07-09 | Pulsar2 1.7 |
33 | | V018 | 2023-07-16 | Pulsar2 1.7-patch1 |
34 | | V019 | 2023-08-04 | Pulsar2 1.8 |
35 | | V020 | 2023-08-24 | Pulsar2 1.9 |
36 | | V021 | 2023-09-24 | Pulsar2 2.1 |
37 | | V022 | 2023-10-29 | Pulsar2 2.2 |
38 | | V023 | 2023-11-20 | Pulsar2 2.3 |
39 | | V024 | 2024-01-05 | Pulsar2 2.4 |
40 | | V025 | 2024-02-03 | Pulsar2 2.5 |
41 | | V026 | 2024-03-26 | Pulsar2 2.6 |
42 | | V027 | 2024-04-26 | Pulsar2 2.7 |
43 | | V028 | 2024-06-16 | Pulsar2 3.0 |
44 | | V029 | 2024-07-22 | Pulsar2 3.1 |
45 | | V030 | 2024-09-25 | Pulsar2 3.2 |
46 | 
47 | 


--------------------------------------------------------------------------------
/source/index.rst:
--------------------------------------------------------------------------------
 1 | .. Pulsar2 documentation master file, created by
 2 |    sphinx-quickstart on Tue Mar  1 17:28:24 2022.
 3 |    You can adapt this file completely to your liking, but it should at least
 4 |    contain the root `toctree` directive.
 5 | 
 6 | AXera Pulsar2 工具链指导手册
 7 | =======================================
 8 | 
 9 | .. toctree::
10 |   :maxdepth: 2
11 |   :numbered:
12 |   :caption: Pulsar2 工具链
13 |   :name: Pulsar2
14 | 
15 |   doc_update_info/update_info.md
16 |   pulsar2/introduction.rst
17 |   user_guides_quick/quick_start_prepare.rst
18 |   user_guides_quick/quick_start_ax650.rst
19 |   user_guides_quick/quick_start_ax620e.rst
20 |   user_guides_advanced/advanced_build_guides.rst
21 |   user_guides_advanced/advanced_run_guides.rst
22 |   user_guides_advanced/advanced_deploy_guides.rst
23 |   user_guides_config/config.rst
24 |   other_tools/caffe_to_onnx.rst
25 |   other_tools/ax_run_model.rst
26 | 
27 | .. toctree::
28 |   :maxdepth: 1
29 |   :numbered:
30 |   :caption: 附录
31 |   :name: appendix
32 | 
33 |   appendix/op_support_list_ax650.rst
34 |   appendix/op_support_list_ax620e.rst
35 |   appendix/precision_debug_guides.rst
36 |   appendix/build_llm.rst
37 | 


--------------------------------------------------------------------------------
/source/media/axmodel-netron.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AXERA-TECH/pulsar2-docs/221df8f8ef3d6c27d6374d375e2a2a27abb55378/source/media/axmodel-netron.png


--------------------------------------------------------------------------------
/source/media/deploy-pipeline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AXERA-TECH/pulsar2-docs/221df8f8ef3d6c27d6374d375e2a2a27abb55378/source/media/deploy-pipeline.png


--------------------------------------------------------------------------------
/source/media/multi_shape_compiled_axmodel.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AXERA-TECH/pulsar2-docs/221df8f8ef3d6c27d6374d375e2a2a27abb55378/source/media/multi_shape_compiled_axmodel.png


--------------------------------------------------------------------------------
/source/media/multy_inputs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AXERA-TECH/pulsar2-docs/221df8f8ef3d6c27d6374d375e2a2a27abb55378/source/media/multy_inputs.png


--------------------------------------------------------------------------------
/source/media/nodename_vs_tensorname.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AXERA-TECH/pulsar2-docs/221df8f8ef3d6c27d6374d375e2a2a27abb55378/source/media/nodename_vs_tensorname.png


--------------------------------------------------------------------------------
/source/media/precision_analysis.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AXERA-TECH/pulsar2-docs/221df8f8ef3d6c27d6374d375e2a2a27abb55378/source/media/precision_analysis.png


--------------------------------------------------------------------------------
/source/media/precision_analysis_step1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AXERA-TECH/pulsar2-docs/221df8f8ef3d6c27d6374d375e2a2a27abb55378/source/media/precision_analysis_step1.png


--------------------------------------------------------------------------------
/source/media/precision_analysis_step2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AXERA-TECH/pulsar2-docs/221df8f8ef3d6c27d6374d375e2a2a27abb55378/source/media/precision_analysis_step2.png


--------------------------------------------------------------------------------
/source/media/pulsar2-build-pipeline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AXERA-TECH/pulsar2-docs/221df8f8ef3d6c27d6374d375e2a2a27abb55378/source/media/pulsar2-build-pipeline.png


--------------------------------------------------------------------------------
/source/media/pulsar2-run-pipeline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AXERA-TECH/pulsar2-docs/221df8f8ef3d6c27d6374d375e2a2a27abb55378/source/media/pulsar2-run-pipeline.png


--------------------------------------------------------------------------------
/source/media/ssd_dog.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AXERA-TECH/pulsar2-docs/221df8f8ef3d6c27d6374d375e2a2a27abb55378/source/media/ssd_dog.jpg


--------------------------------------------------------------------------------
/source/media/tensor_name.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AXERA-TECH/pulsar2-docs/221df8f8ef3d6c27d6374d375e2a2a27abb55378/source/media/tensor_name.png


--------------------------------------------------------------------------------
/source/media/vNPU-ax620e.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AXERA-TECH/pulsar2-docs/221df8f8ef3d6c27d6374d375e2a2a27abb55378/source/media/vNPU-ax620e.png


--------------------------------------------------------------------------------
/source/media/vNPU-ax650.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AXERA-TECH/pulsar2-docs/221df8f8ef3d6c27d6374d375e2a2a27abb55378/source/media/vNPU-ax650.png


--------------------------------------------------------------------------------
/source/media/verify-preprocess-postprocess.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AXERA-TECH/pulsar2-docs/221df8f8ef3d6c27d6374d375e2a2a27abb55378/source/media/verify-preprocess-postprocess.png


--------------------------------------------------------------------------------
/source/other_tools/ax_run_model.rst:
--------------------------------------------------------------------------------
  1 | .. _ax_run_model:
  2 | 
  3 | =======================
  4 | 模型评测工具使用说明
  5 | =======================
  6 | 
  7 | 为了方便用户测评模型，在开发板上预置了 ``ax_run_model`` 工具，此工具有若干参数，可以很方便地测试模型速度和精度。
  8 | 
  9 |    .. code:: bash
 10 | 
 11 |       root@~# ax_run_model
 12 |       usage: ax_run_model --model=string [options] ...
 13 |          options:
 14 |          -m, --model                path to a model file (string)
 15 |          -r, --repeat               repeat times running a model (int [=1])
 16 |          -w, --warmup               repeat times before running a model to warming up (int [=1])
 17 |          -a, --affinity             npu affinity when running a model (int [=1])
 18 |          -v, --vnpu                 type of Visual-NPU inited {0=Disable, 1=STD, 2=BigLittle} (int [=0])
 19 |          -b, --batch                the batch will running (int [=0])
 20 |          -i, --input-folder         the folder of each inputs (folders) located (string [=])
 21 |          -o, --output-folder        the folder of each outputs (folders) will saved in (string [=])
 22 |          -l, --list                 the list of inputs which will test (string [=])
 23 |                --inputs-is-folder     each time model running needs inputs stored in each standalone input folders
 24 |                --outputs-is-folder    each time model running saved outputs stored in each standalone output folders
 25 |                --use-tensor-name      using tensor names instead of using tensor indexes when loading & saving io files
 26 |                --verify               verify outputs after running model
 27 |                --save-benchmark       save benchmark result(min, max, avg) as a json file
 28 |          -?, --help                 print this message
 29 | 
 30 | 
 31 | -----------------------------
 32 | 参数说明
 33 | -----------------------------
 34 | 
 35 | 测评工具参数主要有两部分.
 36 | 
 37 | 第一部分是与测速有关的参数：
 38 | 
 39 | .. data:: ax_run_model 参数解释
 40 | 
 41 |   --model
 42 | 
 43 |     - 数据类型：string
 44 |     - 是否必选：是
 45 |     - 描述：指定测试模型的路径
 46 | 
 47 |   --repeat
 48 | 
 49 |     - 数据类型：int
 50 |     - 是否必选：否
 51 |     - 描述：指定要测试的循环次数，然后显示 min/max/avg 的速度
 52 | 
 53 |   --warmup 
 54 |   
 55 |     - 数据类型：int
 56 |     - 是否必选：否
 57 |     - 描述：循环测试前，预热的次数
 58 | 
 59 |   --affinity
 60 |   
 61 |     - 数据类型：int
 62 |     - 是否必选：否
 63 |     - 描述：亲和性的 mask 值，大于等于 1(0b001)，小于等于 7(0b111)
 64 | 
 65 |   --vnpu
 66 |   
 67 |     - 数据类型：int
 68 |     - 是否必选：否
 69 |     - 描述：虚拟npu模式；0 禁用虚拟 npu；1 标准切分模式；2 大小核模式
 70 | 
 71 |   --batch 
 72 |   
 73 |     - 数据类型：int
 74 |     - 是否必选：否
 75 |     - 描述：指定测试的batch
 76 | 
 77 |   --input-folder
 78 |   
 79 |     - 数据类型：string
 80 |     - 是否必选：否
 81 |     - 描述：指定用于精度测试的输入文件夹
 82 |   
 83 |   --output-folder
 84 |   
 85 |     - 数据类型：string
 86 |     - 是否必选：否
 87 |     - 描述：指定用于精度测试的输出文件夹
 88 | 
 89 |   --list
 90 |   
 91 |     - 数据类型：string
 92 |     - 是否必选：否
 93 |     - 描述：指定测试列表
 94 | 
 95 |   --inputs-is-folder
 96 |   
 97 |     - 数据类型：string
 98 |     - 是否必选：否
 99 |     - 描述：指定输入路径 --input-folder 是由文件夹组成的，参数不指定也默认生效，后续废弃
100 | 
101 |   --outputs-is-folder
102 |   
103 |     - 数据类型：string
104 |     - 是否必选：否
105 |     - 描述：指定输出路径 --output-folder 是由文件夹组成的，参数不指定也默认生效，后续废弃
106 | 
107 |   --use-tensor-name
108 |   
109 |     - 数据类型：string
110 |     - 是否必选：否
111 |     - 描述：指定按模型输入输出名字查找激励文件，不设置是按索引查找，参数不指定也默认生效，后续废弃
112 | 
113 |   --verify
114 |   
115 |     - 数据类型：string
116 |     - 是否必选：否
117 |     - 描述：指定不保存模型输出且指定的目录输出文件已存在，进行逐 byte 比较
118 | 
119 | -----------------------------
120 | 使用示例
121 | -----------------------------
122 | 
123 | 以测速需求为例，假设已经转换完成了一个单核心的 ``YOLOv5s`` 模型，现在想要知道上板子运行的速度，那么可以参考运行如下命令：
124 | 
125 |    .. code:: bash
126 | 
127 |       root@~# ax_run_model -m /opt/data/npu/models/yolov5s.axmodel -w 10 -r 100
128 |       Run AxModel:
129 |             model: /opt/data/npu/models/yolov5s.axmodel
130 |              type: NPU1
131 |              vnpu: Disable
132 |          affinity: 0b001
133 |            repeat: 100
134 |            warmup: 10
135 |             batch: 1
136 |       pulsar2 ver: 1.2-patch2 7e6b2b5f
137 |        engine ver: V1.13.0 Apr 26 2023 16:48:53 1.1.0
138 |          tool ver: 1.0.0
139 |          cmm size: 12730188 Bytes
140 |       ------------------------------------------------------
141 |       min =   7.658 ms   max =   7.672 ms   avg =   7.662 ms
142 |       ------------------------------------------------------
143 | 
144 | 
145 | 从打印的 log 可以看出，本次测速未启用虚拟 NPU（``vnpu: Disable``），亲和性为默认值 ``0b001``。如果通过 ``-v 1`` 将 VNPU 初始化成 standard 模式，此时 NPU 会被分作三份，并可通过 ``-a`` 参数设置模型运行时的亲和性。
146 | 
147 | 通过设置亲和性，可以很方便地在不编写代码的情况下，同时跑多个模型进行测速。
148 | 
149 | 比如，在一个 SSH 终端窗口里，运行模型 a 数万次，然后在另一个 SSH 终端里，设置不同的亲和性，观察模型 b 速度相较于没有运行模型a时的速度下降，就可以得知极高负载情况下，模型b受模型 a 运行的影响(这可能比真实情况更严苛)。需要注意的是，两个 SSH 里， ``-v`` 参数需要是一致的。
150 | 
151 | 另一个很常见的需求是转完了模型，想要知道板子上的精度如何，这可以通过精度的参数进行测试。
152 | 
153 | 以分类模型为例，说明目录结构和参数的使用，这里以典型的一个目录结构举例：
154 | 
155 |    .. code:: bash
156 | 
157 |       root@~# tree /opt/data/npu/temp/
158 |       /opt/data/npu/temp/
159 |       |-- input
160 |       |   `-- 0
161 |       |       `-- data.bin
162 |       |-- list.txt
163 |       |-- mobilenet_v1.axmodel
164 |       `-- output
165 |          `-- 0
166 |             `-- prob.bin
167 | 
168 |       4 directories, 4 files
169 | 
170 | 测试精度时必须的参数是 ``-m -i -o -l``，分别指定模型、输入文件夹、输出文件夹、和待测试的输入列表。
171 | 
172 | 此外，这三个模式的输出文件夹都非空，在运行命令时输出文件夹的已有文件会被覆盖；但如果是已经从 ``Pulsar2`` 仿真拿到的输出 ``golden`` 文件，
173 | 则可以通过附加 ``--verify`` 参数不覆写输出文件，而是读取输出文件夹的已有文件，和当前模型的输出在内存中进行逐位比较，这个模式在怀疑仿真和上板精度不对齐时特别有用。
174 | 
175 | 参数 ``-l`` 指定激励文件夹列表：
176 | 
177 |    .. code:: bash
178 | 
179 |       root@~# cat /opt/data/npu/temp/list.txt
180 |       0
181 |       root@~#
182 | 
183 | 
184 | 也就是在示例中，指定的是唯一一个激励文件夹。这个参数在数据集很大时非常有用，比如输入文件夹是完整的 ``ImageNet`` 数据集，文件非常多；
185 | 但这次测试时只希望测 10 个文件验证一下，如果没有异常再跑全量的测试，那么这样的需求可以通过创建两个 ``list.txt`` 完成，一个list里保存的只有 10 行激励，一个list文件里是全部的激励。
186 | 以下以 ``verify`` 的需求进行举例， ``ax_run_model`` 参数运行示例如下：
187 | 
188 |    .. code:: bash
189 | 
190 |       root@~# ax_run_model -m /opt/data/npu/temp/mobilenet_v1.axmodel -i /opt/data/npu/temp/input/ -o /opt/data/npu/temp/output/ -l /opt/data/npu/temp/list.txt --verify
191 |        total found {1} input drive folders.
192 |        infer model, total 1/1. Done.
193 |        ------------------------------------------------------
194 |        min =   3.347 ms   max =   3.347 ms   avg =   3.347 ms
195 |        ------------------------------------------------------
196 | 
197 |       root@~#
198 | 
199 | 可以看出，这个模型在这组输入输出 binary 文件下，输出是逐位对齐的。如果没有对齐，打印会报告没有对齐的 ``byte`` 偏移量。
200 | 


--------------------------------------------------------------------------------
/source/other_tools/caffe_to_onnx.rst:
--------------------------------------------------------------------------------
 1 | =======================
 2 | caffe2onnx 工具使用说明
 3 | =======================
 4 | 
 5 | 本章介绍 AX 版 caffe2onnx 转换工具，用于将 Caffe 浮点模型转换成 ONNX 浮点模型。
 6 | 
 7 | .. note::
 8 | 
 9 |    下文的模型语义皆为浮点模型。
10 | 
11 | -----------------------------
12 | 将 Caffe 模型转换成 ONNX 模型
13 | -----------------------------
14 | 
15 | 我们提供了三种方式将 Caffe 模型转换成 ONNX 模型。
16 | 
17 | 1. 您可以将 Caffe 文件传入进来以转换您指定的某一个 Caffe 模型：
18 | 
19 |    .. code:: bash
20 | 
21 |       python3 /opt/pulsar2/tools/convert_caffe_to_onnx.py
22 |             --prototxt_path /path/to/your/model.prototxt
23 |             --caffemodel_path /path/to/your/model.caffemodel
24 |             --onnx_path /path/to/your/model.onnx  # optional
25 |             --opset_version OPSET_VERSION  # default to ONNX opset 13
26 | 
27 |    一个 ".caffemodel" 跟其匹配的 ".prototxt" 文件一起组成一个 Caffe 模型，
28 |    您需要同时指定 ``--caffemodel_path`` 和 ``--prototxt_path`` 这两个参数以确定一个
29 |    Caffe 模型。 ``--onnx_path`` 跟 ``--opset_version`` 参数是可选的，
30 |    其中 ``--opset_version`` 的缺省值为 13.
31 | 
32 |    .. note::
33 | 
34 |       如果您不指定 ``--onnx_path`` 命令行参数，生成的 ONNX 模型会
35 |       使用 ".caffemodel" 模型文件（由 ``--caffemodel_path`` 指定）
36 |       的前缀，并存放到和 ".caffemodel" 文件同一级的目录下。
37 | 
38 | 2. 或者您也可以传入一个文件夹以转换其里面所有的 Caffe 模型：
39 | 
40 |    .. code:: bash
41 | 
42 |       python3 /opt/pulsar2/tools/convert_caffe_to_onnx.py
43 |             --checkpoint_path /path/to/your/model/zoo
44 |             --opset_version OPSET_VERSION  # default to ONNX opset 13
45 | 
46 |    这将递归地找到指定文件夹里面所有的以 ".caffemodel" 为后缀的文件及其对应的
47 |    ".prototxt" 文件，此为一个 Caffe 模型，将其转换为 ONNX 模型，
48 |    并使用 Caffe 模型的前缀，以 ".onnx" 为后缀进行保存。
49 | 
50 |    .. note::
51 | 
52 |       Caffe 模型对应的 ".prototxt" 和 ".caffemodel"
53 |       文件需要在同一个文件夹并共享一个前缀。
54 | 
55 | 3. caffe2onnx 命令行工具
56 | 
57 |    新版工具链提供了 caffe2onnx 的命令行工具，也可以使用如下方式来转换模型。
58 | 
59 |    .. code:: bash
60 | 
61 |       caffe2onnx --convert --checkpoint_path /path/to/your/model/zoo
62 | 
63 | ----------------------
64 | 验证转换出的 ONNX 模型
65 | ----------------------
66 | 
67 | 您可以使用如下命令对分原始的 Caffe 模型和转换出的 ONNX 模型：
68 | 
69 | .. code:: bash
70 | 
71 |    python3 /opt/pulsar2/tools/validate_caffe_onnx.py
72 |          --checkpoint_path /path/to/your/model/zoo
73 | 
74 | 首先这将递归地找到指定文件夹里所有的以 ".onnx" 为后缀的文件，然后按照其前缀匹配对应的
75 | ".prototxt" 和 ".caffemodel" 文件，生成一个随机数据集，分别使用 ONNX Runtime 和
76 | Caffe 推理工具进行推理，并计算两者的“相关系数 (Correlation)”、“标准偏差 (Standard Deviation)”、
77 | “余弦距离相似度 (Cosine Similarity)”、“归一化相对误差 (Normalized Relative Error)”、
78 | “最大差异 (Max Difference)” 和 “平均差异 (Mean Difference)”。
79 | 
80 | .. note::
81 | 
82 |    Caffe 模型对应的 ".prototxt" 和 ".caffemodel"
83 |    文件以及转换出来的 ".onnx" 文件需要在同一个文件夹并共享一个前缀。
84 | 
85 | .. note::
86 | 
87 |    此步需要安装 caffe。
88 | 
89 | .. note::
90 | 
91 |    新版工具链提供了 caffe2onnx 的命令行工具，也可以使用如下方式来验证转换后的模型。
92 | 
93 | .. code:: bash
94 | 
95 |    caffe2onnx --validate --checkpoint_path /path/to/your/model/zoo
96 | 


--------------------------------------------------------------------------------
/source/pulsar2/introduction.rst:
--------------------------------------------------------------------------------
 1 | ========================================
 2 | Pulsar2 工具链概述
 3 | ========================================
 4 | 
 5 | ----------------------------
 6 | 简介
 7 | ----------------------------
 8 | 
 9 | **Pulsar2** 是由 `爱芯元智 <https://www.axera-tech.com/>`_ **自主研发** 的 ``all-in-one`` 新一代神经网络编译器, 
10 | 即 **转换**、 **量化**、 **编译**、 **异构** 四合一, 实现深度学习神经网络模型 **快速**、 **高效** 的部署需求. 
11 | 针对新一代 `AX6、M7` 系列芯片（AX630C、AX620Q、AX650A、AX650N、M76H）特性进行了深度定制优化, 充分发挥片上异构计算单元(CPU+NPU)算力, 提升神经网络模型的产品部署效率.
12 | 
13 | **特别说明：**
14 | 
15 | - 工具链文档中的提示说明
16 |    - **Note**: 注释内容，对某些专业词做进一步解释说明
17 |    - **Hint**: 提示内容，提醒用户确认相关信息
18 |    - **Attention**: 注意内容，提醒用户对工具配置的相关注意事项
19 |    - **Warning**: 告警内容，提醒用户注意工具链的正确使用方法。如果客户没有按Warning提示内容进行使用，有可能会出现错误结果。
20 | - 工具链文档中的命令兼容车载芯片，例如 ``Pulsar2`` 支持 ``M76H``
21 | - 工具链文档中的 **示例命令**、 **示例输出** 均基于 ``AX650`` 进行展示
22 | - 具体芯片的算力配置，以芯片SPEC为准
23 | 
24 | ``Pulsar2`` 工具链核心功能是将 ``.onnx`` 模型编译成芯片能解析并运行的 ``.axmodel`` 模型.
25 | 
26 | **部署流程**
27 | 
28 | .. figure:: ../media/deploy-pipeline.png
29 |     :alt: pipeline
30 |     :align: center
31 | 
32 | .. _soc_introduction:
33 | 
34 | ----------------------------
35 | 虚拟 NPU 介绍
36 | ----------------------------
37 | 
38 | .. figure:: ../media/vNPU-ax650.png
39 |     :alt: pipeline
40 |     :align: center
41 | 
42 | **AX650 和 M76H NPU** 主要由 **3** 个 Conv 卷积计算核，3 组向量 Vector 计算核组成。这些 Conv 和 Vector 计算核按照 1:1 的比例分配，划分为 **3 组 vNPU**。
43 | 
44 | - 在运行时，通过 **AXEngine API** 可以设置 NPU 的工作模式，灵活的对 vNPU 进行分组，可以设置为 1 + 1 + 1 的三个对称 vNPU 模式，或者 2 + 1 的大小 vNPU 模式，也可以设置为 3 的大算力单 vNPU 模式。
45 | 
46 | - 在转换模型时，可以根据需求灵活的指定模型推理所需的 vNPU 数量（详细信息请查看 ``pulsar2 build`` 的 ``--npu_mode`` 参数）。当模型部署到芯片平台上加载时，AXEngine 可以根据当前设置的 NPU 工作模式将模型分配到对应算力的 vNPU 之上运行。
47 | 
48 | .. figure:: ../media/vNPU-ax620e.png
49 |     :alt: pipeline
50 |     :align: center
51 | 
52 | **AX630C、AX620Q** 采用双核 NPU 设计，根据 AI-ISP 是否启用划分成两种工况分配给用户不同算力。
53 | 
54 | - 在转换模型时，需根据实际业务中 AI-ISP 的工况显式配置用户模型的 NPU 工作模式（详细信息请查看 ``pulsar2 build`` 的 ``--npu_mode`` 参数）。
55 | - AX630C、AX620Q 中的 NPU 模块均采用爱芯元智 **通元4.0** NPU 引擎，后续章节使用 ``AX620E`` 简化目标硬件平台指定。
56 | 
57 | ----------------------------
58 | 后续章节内容引导
59 | ----------------------------
60 | 
61 | * **Section3**: 本章介绍使用NPU工具链进行模型转换和部署的软硬件环境准备和安装。如何在不同系统环境下安装 ``Docker`` 并启动容器
62 | * **Section4**：本章介绍NPU工具链在爱芯AX650（包括AX650A，AX650N，M76H）芯片平台上的基本应用流程
63 | * **Section5**：本章介绍NPU工具链在爱芯AX620E（包括AX620Q，AX630C）芯片平台上的基本应用流程
64 | * **Section6**：本章为模型转换的进阶说明，即详细介绍如何利用 ``Pulsar2 Docker`` 工具链将 ``onnx`` 模型转换为 ``axmodel`` 模型
65 | * **Section7**: 本章为模型仿真的进阶说明，即详细介绍如何使用 ``axmodel`` 模型在 ``x86`` 平台上仿真运行并衡量推理结果与 ``onnx`` 推理结果之间的差异度(内部称之为 ``对分``)
66 | * **Section8**: 本章为模型上板运行的进阶说明，即详细介绍如何上板运行 ``axmodel`` 得到模型在爱芯SOC硬件上的推理结果
67 | * **Section9**: 本章对模型转换编译过程使用的配置文件进行详细说明
68 | * **Section10**: Caffe AI训练平台导出的模型不是NPU工具链支持的 ``onnx`` 格式，需要一个工具把Caffe模型转换成 ``onnx`` 模型。本章介绍这个模型转换工具的使用方法。
69 | * **Section11**: 本章为模型板上速度和精度测试工具的使用说明
70 | * **Section12**: 本章为NPU工具链功能安全符合性的声明
71 | * **附录**：文档附录部分包括算子支持列表、精度调优建议
72 | 
73 | .. note::
74 | 
75 |     所谓 ``对分``, 即对比工具链编译前后的同一个模型不同版本 (文件类型) 推理结果之间的误差。
76 | 


--------------------------------------------------------------------------------
/source/user_guides_advanced/advanced_deploy_guides.rst:
--------------------------------------------------------------------------------
  1 | .. _model_deploy_advanced:
  2 | 
  3 | =========================
  4 | 模型部署进阶指南
  5 | =========================
  6 | 
  7 | --------------------
  8 | 概述
  9 | --------------------
 10 | 
 11 | 本章节介绍开发板上 NPU 相关示例程序的使用方式，相关示例程序源码参考 SDK 中 ``msp/sample/npu`` 目录，如何编译出 NPU 示例代码请参考 《AX SDK 使用说明》.
 12 | 
 13 | --------------------
 14 | 运行示例
 15 | --------------------
 16 | 
 17 | **运行准备**
 18 | 
 19 | 对于 ``AX650A``、 ``AX650N``、 ``M76H``、 ``AX630C`` 开发板，NPU 相关示例已预装在 ``/opt/bin/`` 路径下，分别为 ``sample_npu_classification`` 和 ``sample_npu_yolov5s``.
 20 | 
 21 | 对于 ``AX620Q`` 开发板，由于默认采用 16M NorFlash 方案，文件系统中未包含上述2个示例，可通过 NFS 网络挂载的方式将 SDK 中 ``msp/out/bin/`` 路径挂载到开发板的文件系统中获取以上示例.
 22 | 
 23 | 如果提示板子空间不足, 可以通过文件夹挂载的方式解决.
 24 | 
 25 | **MacOS 挂载 ARM 开发板示例**
 26 | 
 27 | .. hint::
 28 | 
 29 |     由于板上空间有限, 测试时通常需要进行文件夹共享操作, 这个时候就需要将 ``ARM`` 开发板与主机之间进行共享. 这里仅以 ``MacOS`` 为例.
 30 | 
 31 | 开发机挂载 ``ARM`` 开发板需要 ``NFS`` 服务, 而 ``MacOS`` 系统自带 ``NFS`` 服务, 只需要创建 ``/etc/exports`` 文件, ``nfsd`` 将自动启动并根据 ``exports`` 中的配置提供共享.
 32 | 
 33 | ``/etc/exports`` 可以配置如下:
 34 | 
 35 | .. code-block:: shell
 36 | 
 37 |     /path/your/sharing/directory -alldirs -maproot=root:wheel -rw -network xxx.xxx.xxx.xxx -mask 255.255.255.0
 38 | 
 39 | 参数释义
 40 | 
 41 | .. list-table::
 42 |     :widths: 15 40
 43 |     :header-rows: 1
 44 | 
 45 |     * - 参数名
 46 |       - 含义
 47 |     * - alldirs
 48 |       - 共享 ``/Users`` 目录下所有文件, 如果只想共享一个文件夹可以省略
 49 |     * - network
 50 |       - 挂载 ARM 开发板 IP 地址, 可以是网段地址
 51 |     * - mask
 52 |       - 子网掩码, 通常是 255.255.255.0
 53 |     * - maproot
 54 |       - 映射规则, 当 ``maproot=root:wheel`` 时表示把 ``ARM`` 板的 ``root`` 用户映射为开发机上的 ``root`` 用户, ``ARM`` 的 ``root`` 组 映射为 ``MacOS`` 上的 ``wheel`` (gid=0) 组. 
 55 |         如果缺省, 可能会出现 ``nfsroot`` 链接失败错误.
 56 |     * - rw
 57 |       - 读写操作, 默认开启
 58 | 
 59 | 修改 ``/etc/exports`` 需要重启 ``nfsd`` 服务
 60 | 
 61 | .. code-block:: bash
 62 | 
 63 |     sudo nfsd restart
 64 | 
 65 | 如果配置成功, 可以使用
 66 | 
 67 | .. code-block:: bash
 68 | 
 69 |     sudo showmount -e
 70 |  
 71 | 命令查看挂载信息, 例如输出 ``/Users/skylake/board_nfs 10.168.21.xx``, 配置好开发机后需要在 ``ARM`` 端执行 ``mount`` 指令
 72 | 
 73 | .. code-block:: bash
 74 | 
 75 |     mount -t nfs -o nolock,tcp macos_ip:/your/shared/directory /mnt/directory
 76 | 
 77 | 如果出现权限问题, 需要检查 ``maproot`` 参数是否正确.
 78 | 
 79 | .. hint::
 80 | 
 81 |     ``network`` 参数可以配置成网段的形式, 如: ``10.168.21.0``, 如果挂载单ip出现 ``Permission denied``, 可以尝试一下网段内挂载.
 82 | 
 83 | **分类模型**
 84 | 
 85 | 以下打印信息基于 AX650N 开发板运行输出，非 AX650N 开发板的打印信息以实际打印为准.
 86 | 
 87 | .. code-block:: bash
 88 | 
 89 |     /root # sample_npu_classification -m /opt/data/npu/models/mobilenetv2.axmodel -i /opt/data/npu/images/cat.jpg -r 10
 90 |     --------------------------------------
 91 |     model file : /opt/data/npu/models/mobilenetv2.axmodel
 92 |     image file : /opt/data/npu/images/cat.jpg
 93 |     img_h, img_w : 224 224
 94 |     --------------------------------------
 95 |     Engine creating handle is done.
 96 |     Engine creating context is done.
 97 |     Engine get io info is done.
 98 |     Engine alloc io is done.
 99 |     Engine push input is done.
100 |     --------------------------------------
101 |     topk cost time:0.07 ms
102 |     9.5094, 285
103 |     9.3773, 282
104 |     9.2452, 281
105 |     8.5849, 283
106 |     7.6603, 287
107 |     --------------------------------------
108 |     Repeat 10 times, avg time 0.72 ms, max_time 0.72 ms, min_time 0.72 ms
109 |     --------------------------------------
110 | 
111 | **检测模型**
112 | 
113 | .. code-block:: bash
114 | 
115 |     /root # sample_npu_yolov5s -m /opt/data/npu/models/yolov5s.axmodel -i /opt/data/npu/images/dog.jpg -r 10
116 |     --------------------------------------
117 |     model file : /opt/data/npu/models/yolov5s.axmodel
118 |     image file : /opt/data/npu/images/dog.jpg
119 |     img_h, img_w : 640 640
120 |     --------------------------------------
121 |     Engine creating handle is done.
122 |     Engine creating context is done.
123 |     Engine get io info is done.
124 |     Engine alloc io is done.
125 |     Engine push input is done.
126 |     --------------------------------------
127 |     post process cost time:2.25 ms
128 |     --------------------------------------
129 |     Repeat 10 times, avg time 7.65 ms, max_time 7.66 ms, min_time 7.65 ms
130 |     --------------------------------------
131 |     detection num: 3
132 |     16:  91%, [ 138,  218,  310,  541], dog
133 |     2:  69%, [ 470,   76,  690,  173], car
134 |     1:  56%, [ 158,  120,  569,  420], bicycle
135 |     --------------------------------------
136 | 
137 | --------------------
138 | 其他示例
139 | --------------------
140 | 
141 | 请参考我们在 github 上的开源项目：
142 | 
143 | - `AX-Samples <https://github.com/AXERA-TECH/ax-samples>`_
144 | 


--------------------------------------------------------------------------------
/source/user_guides_advanced/advanced_run_guides.rst:
--------------------------------------------------------------------------------
  1 | ===================
  2 | 仿真运行进阶指南
  3 | ===================
  4 | 
  5 | -------------------------------
  6 | 概述
  7 | -------------------------------
  8 | 
  9 | ``pulsar2 run`` 用于在 ``x86`` 平台上对 ``axmodel`` 模型进行 **x86仿真推理计算**，提前获取编译后的模型运行结果.
 10 | 
 11 | .. figure:: ../media/pulsar2-run-pipeline.png
 12 |     :alt: pipeline
 13 |     :align: center
 14 | 
 15 | .. _pulsar_run:
 16 | 
 17 | -------------------------------
 18 | 仿真运行详解
 19 | -------------------------------
 20 | 
 21 | ~~~~~~~~~~~~~~~~~~~~~
 22 | pulsar2 run 
 23 | ~~~~~~~~~~~~~~~~~~~~~
 24 | 
 25 | 本节介绍 ``pulsar2 run`` 命令完整使用方法.
 26 | 
 27 | ``pulsar2 run -h`` 可显示详细命令行参数:
 28 | 
 29 | .. code-block:: python
 30 |     :name: input_conf_items
 31 |     :linenos:
 32 | 
 33 |     usage: main.py run [-h] [--config] [--model] [--input_dir] [--output_dir]
 34 |                        [--list] [--random_input ] [--batch_size]
 35 |                        [--enable_perlayer_output ] [--dump_with_stride ]
 36 |                        [--group_index] [--mode] [--target_hardware]
 37 |     
 38 |     optional arguments:
 39 |       -h, --help            show this help message and exit
 40 |       --config              config file path, supported formats: json / yaml /
 41 |                             toml / prototxt. type: string. required: false.
 42 |                             default:.
 43 |       --model               run model path, support ONNX, QuantAxModel and
 44 |                             CompiledAxmodel. type: string. required: true.
 45 |       --input_dir           model input data in this directory. type: string.
 46 |                             required: true. default:.
 47 |       --output_dir          model output data directory. type: string. required:
 48 |                             true. default:.
 49 |       --list                list file path. type: string. required: true.
 50 |                             default:.
 51 |       --random_input []     random input data. type: bool. required: false.
 52 |                             default: false.
 53 |       --batch_size          batch size to be used in dynamic inference mode, only
 54 |                             work for CompiledAxModel. type: int. required: false.
 55 |                             defalult: 0.
 56 |       --enable_perlayer_output []
 57 |                             enable dump perlayer output. type: bool. required:
 58 |                             false. default: false.
 59 |       --dump_with_stride []
 60 |       --group_index 
 61 |       --mode                run mode, only work for QuantAxModel. type: enum.
 62 |                             required: false. default: Reference. option:
 63 |                             Reference, NPUBackend.
 64 |       --target_hardware     target hardware, only work for QuantAxModel. type:
 65 |                             enum. required: false. default: AX650. option: AX650,
 66 |                             AX620E, M76H.
 67 | 
 68 | .. data:: pulsar2 run 参数解释
 69 |   
 70 |     --model
 71 | 
 72 |         - 数据类型：string
 73 |         - 是否必选：是
 74 |         - 描述：推理仿真的模型路径，模型支持 ``ONNX``, ``QuantAXModel`` 或者 ``AXModel`` 格式
 75 | 
 76 |     --input_dir
 77 | 
 78 |         - 数据类型：string
 79 |         - 是否必选：是
 80 |         - 描述：模型仿真输入数据文件所在的目录。
 81 | 
 82 |     --output_dir
 83 |     
 84 |         - 数据类型：string
 85 |         - 是否必选：是
 86 |         - 描述：模型仿真输出数据文件所在的目录。
 87 | 
 88 |     --list
 89 |     
 90 |         - 数据类型：string
 91 |         - 是否必选：否
 92 |         - 默认值：""
 93 |         - 描述：若未指定，则直接从 ``input_dir`` 中读取仿真输入数据，仿真结果直接写到 ``output_dir`` 中。若指定了 list 文件路径，则文件中的每一行代表一次仿真，会在 ``input_dir`` / ``output_dir`` 下寻找以行内容命名的子目录，分别用于读取仿真输入和写出仿真结果。例如：当 ``list`` 指定的文件中有一行内容为 0，仿真输入数据文件在 ``input_dir/0`` 目录下，仿真结果在 ``output_dir/0`` 目录下。
 94 | 
 95 |     --random_input
 96 |     
 97 |         - 数据类型：bool
 98 |         - 是否必选：否
 99 |         - 默认值：false
100 |         - 描述：是否在 ``input_dir`` 中生成随机输入用于后续的仿真。
101 | 
102 |     .. attention::
103 |     
104 |         仿真输入输出数据文件的命名方法。
105 |     
106 |         .. code-block:: python
107 |             :linenos:
108 |         
109 |             import re
110 |         
111 |             # 假设变量 name 代表模型输入名称
112 |             escaped_name = re.sub(r"[^a-zA-Z0-9_-]", "_", name)
113 |             file_name = escaped_name + ".bin"
114 | 
115 |     --batch_size
116 |     
117 |         - 数据类型：int
118 |         - 是否必选：否
119 |         - 默认值：0
120 |         - 描述：多 batch 仿真大小，仅支持 ``CompiledAxmodel``。
121 |             - 当输入模型是非多 batch 编译出的模型时，循环运行 batch_size 次。
122 |             - 当输入模型是多 batch 编译出的模型时，会根据模型中包含的 batch 组合以及 batch_size 自动计算出仿真过程。
123 | 
124 |     --enable_perlayer_output
125 |     
126 |         - 数据类型：bool
127 |         - 是否必选：否
128 |         - 默认值：false
129 |         - 描述：仿真时，将中间层的输出保存到输出目录。
130 | 
131 |     --mode
132 |     
133 |         - 数据类型：enum
134 |         - 是否必选：否
135 |         - 默认值：Reference
136 |         - 描述：AX 算子的运行模式，仅支持 ``QuantAxModel``。可选：Reference / NPUBackend。
137 | 
138 |     --target_hardware
139 |     
140 |         - 数据类型：enum
141 |         - 是否必选：否
142 |         - 默认值：AX650
143 |         - 描述：运行 AX 算子的目标后端实现，仅支持 ``QuantAxModel``。当 ``mode`` 为 ``NPUBackend`` 时生效。
144 | 
145 | ~~~~~~~~~~~~~~~~~~~~~
146 | pulsar2-run-helper
147 | ~~~~~~~~~~~~~~~~~~~~~
148 | 
149 | 为了使用 ``pulsar2 run`` 模拟真实的上板运行结果，我们提供了 ``pulsar2-run-helper`` 工具实现网络模型运行依赖的 **输入**、 **输出** 数据处理，指导用户实现以下功能。
150 | 
151 | * 将 ``jpg``、 ``png`` 等格式的图片预处理成 ``pulsar2 run`` 命令参数 ``input_dir`` 所指定的格式；
152 | * 解析 ``pulsar2 run`` 运行完成后输出到 ``output_dir`` 中的输出数据，实现 **分类**、 **检测** 任务的后处理操作；
153 | * 所有工具内容均由 **python** 脚本实现，便于算法工程师快速上手。
154 | 
155 | ``pulsar2-run-helper`` 获取方式及环境搭建请先参考 :ref:`《仿真运行》 <model_simulator>` 章节。
156 | 
157 | **pulsar2-run-helper** 目录说明如下：
158 | 
159 | .. code-block:: shell
160 | 
161 |     root@xxx:/data/pulsar2-run-helper# tree -L 2
162 |     .
163 |     ├── cli_classification.py     # 分类任务的数据处理参考脚本 
164 |     ├── cli_detection.py          # 检测任务的数据处理参考脚本
165 |     ├── models
166 |     │   ├── mobilenetv2.axmodel   # 由 pulsar2 build 生成的 axmodel
167 |     │   └── yolov5s.axmodel
168 |     ├── pulsar2_run_helper
169 |     │   ├── __init__.py
170 |     │   ├── pipeline
171 |     │   ├── postprocessing.py
172 |     │   ├── preprocessing.py
173 |     │   ├── utils
174 |     │   └── yolort
175 |     ├── pyproject.toml
176 |     ├── README.md
177 |     ├── requirements.txt
178 |     ├── setup.cfg
179 |     ├── sim_images                # 仿真运行的图片
180 |     │   ├── cat.jpg
181 |     │   └── dog.jpg
182 |     ├── sim_inputs                # 输入数据
184 |     │   ├── 0
185 |     │   │   └── input.bin
186 |     │   └── input.bin
187 |     └── sim_outputs
188 |         ├── 0
189 |         │   └── output.bin
190 |         └── output.bin
191 | 
192 | **cli_classification** 参数说明
193 | 
194 | .. code-block:: shell
195 | 
196 |     root@xxx:/data# python3 pulsar2-run-helper/cli_classification.py -h
197 |     usage: CLI tools for pre-processing and post-processing. [-h] [--image_path IMAGE_PATH] --axmodel_path AXMODEL_PATH --intermediate_path INTERMEDIATE_PATH
198 |                                                             [--output_path OUTPUT_PATH] [--crop_size CROP_SIZE] [--pre_processing] [--post_processing]
199 | 
200 |     optional arguments:
201 |       -h, --help            show this help message and exit
202 |       --image_path IMAGE_PATH
203 |                             The path of image file.
204 |       --axmodel_path AXMODEL_PATH
205 |                             The path of compiled axmodel.
206 |       --intermediate_path INTERMEDIATE_PATH
207 |                             The path of intermediate data bin.
208 |       --output_path OUTPUT_PATH
209 |                             The path of output files.
210 |       --crop_size CROP_SIZE
211 |                             Image size for croping (default: 224).
212 |       --pre_processing      Do pre processing.
213 |       --post_processing     Do post processing.
214 | 
215 | **cli_detection** 参数说明
216 | 
217 | .. code-block:: shell
218 | 
219 |     root@xxx:/data/pulsar2-run-helper# python3 cli_detection.py --help
220 |     usage: CLI tools for pre-processing and post-processing. [-h] [--image_path IMAGE_PATH] --axmodel_path AXMODEL_PATH --intermediate_path INTERMEDIATE_PATH [--output_path OUTPUT_PATH]
221 |                                                             [--letterbox_size LETTERBOX_SIZE] [--num_classes NUM_CLASSES] [--score_thresh SCORE_THRESH] [--nms_thresh NMS_THRESH]
222 |                                                             [--pre_processing] [--post_processing]
223 | 
224 |     optional arguments:
225 |       -h, --help            show this help message and exit
226 |       --image_path IMAGE_PATH
227 |                             The path of image file.
228 |       --axmodel_path AXMODEL_PATH
229 |                             The path of compiled axmodel.
230 |       --intermediate_path INTERMEDIATE_PATH
231 |                             The path of intermediate data bin.
232 |       --output_path OUTPUT_PATH
233 |                             The path of output files.
234 |       --letterbox_size LETTERBOX_SIZE
235 |                             Image size for croping (default: 640).
236 |       --num_classes NUM_CLASSES
237 |                             Number of classes (default: 80).
238 |       --score_thresh SCORE_THRESH
239 |                             Threshold of score (default: 0.45).
240 |       --nms_thresh NMS_THRESH
241 |                             Threshold of NMS (default: 0.45).
242 |       --pre_processing      Do pre processing.
243 |       --post_processing     Do post processing.
244 | 
245 | --------------------
246 | 仿真运行示例
247 | --------------------
248 | 
249 | 以下示例中使用到的 ``mobilenetv2.axmodel`` 和 ``yolov5s.axmodel`` 获取方式：
250 | 
251 | * 参考 :ref:`《模型编译》 <model_simulator>` 章节自行编译生成；
252 | * 从 :ref:`《开发板运行》 <onboard_running>` 章节中提及到的 ``demo_onboard.zip`` 中获取预编译好的版本。
253 | 
254 | ~~~~~~~~~~~~~~~~~~~~~
255 | MobileNetv2
256 | ~~~~~~~~~~~~~~~~~~~~~
257 | 
258 | ^^^^^^^^^^^^^^^^^^^^^
259 | 输入数据准备
260 | ^^^^^^^^^^^^^^^^^^^^^
261 | 
262 | .. code-block:: shell
263 | 
264 |     root@xxx:/data/pulsar2-run-helper# python3 cli_classification.py --pre_processing --image_path sim_images/cat.jpg --axmodel_path models/mobilenetv2.axmodel --intermediate_path sim_inputs/0
265 |     [I] Write [input] to 'sim_inputs/0/input.bin' successfully.
266 | 
267 | ^^^^^^^^^^^^^^^^^^^^^
268 | 仿真模型推理
269 | ^^^^^^^^^^^^^^^^^^^^^
270 | 
271 | .. code-block:: shell
272 | 
273 |     root@xxx:/data/pulsar2-run-helper# pulsar2 run --model models/mobilenetv2.axmodel --input_dir sim_inputs --output_dir sim_outputs --list list.txt
274 |     Building native ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% 0:00:00
275 |     >>> [0] start
276 |     write [output] to [sim_outputs/0/output.bin] successfully
277 |     >>> [0] finish
278 | 
279 | ^^^^^^^^^^^^^^^^^^^^^
280 | 输出数据处理
281 | ^^^^^^^^^^^^^^^^^^^^^
282 | 
283 | .. code-block:: shell
284 | 
285 |     root@xxx:/data/pulsar2-run-helper# python3 cli_classification.py --post_processing --axmodel_path models/mobilenetv2.axmodel --intermediate_path sim_outputs/0
286 |     [I] The following are the predicted score index pair.
287 |     [I] 9.5094, 285
288 |     [I] 9.3773, 283
289 |     [I] 9.2452, 281
290 |     [I] 8.5849, 282
291 |     [I] 7.6603, 463
292 | 
293 | ~~~~~~~~~~~~~~~~~~~~~
294 | YOLOv5s
295 | ~~~~~~~~~~~~~~~~~~~~~
296 | 
297 | ^^^^^^^^^^^^^^^^^^^^^
298 | 输入数据准备
299 | ^^^^^^^^^^^^^^^^^^^^^
300 | 
301 | .. code-block:: shell
302 | 
303 |     root@xxx:/data/pulsar2-run-helper# python3 cli_detection.py --pre_processing --image_path sim_images/dog.jpg --axmodel_path models/yolov5s.axmodel --intermediate_path sim_inputs/0
304 |     [I] Write [images] to 'sim_inputs/0/images.bin' successfully.
305 | 
306 | ^^^^^^^^^^^^^^^^^^^^^
307 | 仿真模型推理
308 | ^^^^^^^^^^^^^^^^^^^^^
309 | 
310 | .. code-block:: shell
311 | 
312 |     root@xxx:/data/pulsar2-run-helper# pulsar2 run --model models/yolov5s.axmodel --input_dir sim_inputs/ --output_dir sim_outputs/ --list list.txt
313 |     Building native ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% 0:00:00
314 |     >>> [0] start
315 |     write [326] to [sim_outputs/0/326.bin] successfully
316 |     write [370] to [sim_outputs/0/370.bin] successfully
317 |     write [414] to [sim_outputs/0/414.bin] successfully
318 |     >>> [0] finish
319 | 
320 | ^^^^^^^^^^^^^^^^^^^^^
321 | 输出数据处理
322 | ^^^^^^^^^^^^^^^^^^^^^
323 | 
324 | .. code-block:: shell
325 | 
326 |     root@xxx:/data/pulsar2-run-helper# python3 cli_detection.py --post_processing --image_path sim_images/dog.jpg --axmodel_path models/yolov5s.axmodel --intermediate_path sim_outputs/0
327 |     [I] Number of detected objects: 4
328 |     [I] 16: 92.62%, [182, 291, 411, 721]
329 |     [I]  2: 72.18%, [626, 101, 919, 231]
330 |     [I]  1: 59.62%, [212, 158, 760, 558]
331 |     [I]  7: 46.22%, [628, 101, 916, 232]
332 | 


--------------------------------------------------------------------------------
/source/user_guides_config/config.rst:
--------------------------------------------------------------------------------
  1 | .. _config_details:
  2 | 
  3 | ============================
  4 | 配置文件详细说明
  5 | ============================
  6 | 
  7 | 本节将对 ``pulsar2 build`` 中的 **config** 文件进行详细介绍.
  8 | 
  9 | ------------------------------------
 10 | 配置文件概述
 11 | ------------------------------------
 12 | 
 13 | - 工具链支持的全部编译参数定义请参考 :ref:`《proto 配置定义》 <config_define>` ，基础数据结构为 ``BuildConfig``；
 14 | 
 15 | - 用户可以根据参数规格编写 ``prototxt / 宽松 json / yaml / toml`` 格式的配置文件，通过命令行参数 ``--config`` 指向配置文件；
 16 |   
 17 |     - 宽松的 ``json`` 格式：支持包含 ``js-style`` 或者 ``python-style`` 注释的 ``json`` 文件；
 18 | 
 19 | - 部分编译参数支持命令行传入，且优先级高于配置文件，通过 ``pulsar2 build -h`` 查看支持的命令行编译参数，比如命令行参数 ``--quant.calibration_method`` 相当于配置了 ``QuantConfig`` 结构体的 ``calibration_method`` 字段.
 20 | 
 21 | ------------------------------------
 22 | 完整的 json 配置参考
 23 | ------------------------------------
 24 | 
 25 | .. code-block:: json
 26 | 
 27 |     {
 28 |       // input model file path. type: string. required: true.
 29 |       "input": "/path/to/lenet5.onnx",
 30 |       // axmodel output directory. type: string. required: true.
 31 |       "output_dir": "/path/to/output_dir",
 32 |       // rename output axmodel. type: string. required: false. default: compiled.axmodel.
 33 |       "output_name": "compiled.axmodel",
 34 |       // temporary data output directory. type: string. required: false. default: same with ${output_dir}.
 35 |       "work_dir": "",
 36 |       // input model type. type: enum. required: false. default: ONNX. option: ONNX, QuantAxModel, QuantONNX.
 37 |       "model_type": "ONNX",
 38 |       // target hardware. type: enum. required: false. default: AX650. option: AX650, AX620E, M76H.
 39 |       "target_hardware": "AX650",
 40 |       // npu mode. while ${target_hardware} is AX650, npu mode can be NPU1 / NPU2 / NPU3. while ${target_hardware} is AX620E, npu mode can be NPU1 / NPU2. type: enum. required: false. default: NPU1.
 41 |       "npu_mode": "NPU1",
 42 |       // modify model input shape of input model, this feature will take effect before the `input_processors` configuration. format: input1:1x3x224x224;input2:1x1x112x112. type: string. required: false. default: .
 43 |       "input_shapes": "input:1x1x28x28",
 44 |       "onnx_opt": {
 45 |         // disable onnx optimization. type: bool. required: false. default: false.
 46 |         "disable_onnx_optimization": false,
 47 |         // enable onnx simplify by https://github.com/daquexian/onnx-simplifier. type: bool. required: false. default: false.
 48 |         "enable_onnxsim": false,
 49 |         // enable model check. type: bool. required: false. default: false.
 50 |         "model_check": false,
 51 |         // disable transformation check. type: bool. required: false. default: false.
 52 |         "disable_transformation_check": false,
 53 |         // save tensors data to optimize memory footprint. type: bool. required: false. default: false.
 54 |         "save_tensors_data": false
 55 |       },
 56 |       "quant": {
 57 |         "input_configs": [
 58 |           {
 59 |             // input tensor name in origin model. "DEFAULT" means input config for all input tensors. type: string. required: true.
 60 |             "tensor_name": "input",
 61 |             // quantize calibration dataset archive file path. type: string. required: true. limitation: tar, tar.gz, zip.
 62 |             "calibration_dataset": "/path/to/dataset",
 63 |             // quantize calibration data format. type: enum. required: false. default: Image. option: Image, Numpy, Binary, NumpyObject.
 64 |             "calibration_format": "Image",
 65 |             // quantize calibration data size is min(${calibration_size}, size of ${calibration_dataset}), "-1" means load all dataset. type: int. required: false. default: 32.
 66 |             "calibration_size": 32,
 67 |             // quantize mean parameter of normalization. type: float array. required: false. default: [].
 68 |             "calibration_mean": [127],
 69 |             // quantize std parameter of normalization. type: float array. required: false. default: [].
 70 |             "calibration_std": [1]
 71 |           }
 72 |         ],
 73 |         "layer_configs": [
 74 |           {
 75 |             // set layer quantize precision. type: string. required: must choose between `layer_name` and `op_type` and `layer_names` and `op_types`. default: .
 76 |             "layer_name": "Conv_0",
 77 |             // quantize data type. type: enum. required: false. default: U8. option: U8, S8, U16, S16, FP32.
 78 |             "data_type": "U8",
 79 |             // quantize data type for Conv. type: enum. required: false. default: U8. option: U8, S8, U16, S16, FP32.
 80 |             "output_data_type": "U8",
 81 |             // quantize weight type for Conv. type: enum. required: false. default: S8. option: S8, FP32.
 82 |             "weight_data_type": "S8"
 83 |           },
 84 |           {
 85 |             // set quantize precision by operator type. type: string. required: must choose between `layer_name` and `op_type` and `layer_names` and `op_types`. default: .
 86 |             "op_type": "MaxPool",
 87 |             // quantize data type. type: enum. required: false. default: U8. option: U8, S8, U16, S16, FP32.
 88 |             "data_type": "U8"
 89 |           },
 90 |           {
 91 |             // set layer quantize precision by layers name. type: string array. required: must choose between `layer_name` and `op_type` and `layer_names` and `op_types`. default: [].
 92 |             "layer_names": ["Conv_2"],
 93 |             // quantize data type. type: enum. required: false. default: U8. option: U8, S8, U16, S16, FP32.
 94 |             "data_type": "U8",
 95 |             // quantize data type for Conv. type: enum. required: false. default: U8. option: U8, S8, U16, S16, FP32.
 96 |             "output_data_type": "U8",
 97 |             // quantize weight type for Conv. type: enum. required: false. default: S8. option: S8, FP32.
 98 |             "weight_data_type": "S8"
 99 |           },
100 |           {
101 |             // set quantize precision by operator types. type: string array. required: must choose between `layer_name` and `op_type` and `layer_names` and `op_types`. default: [].
102 |             "op_types": ["Gemm"],
103 |             // quantize data type. type: enum. required: false. default: U8. option: U8, S8, U16, S16, FP32.
104 |             "data_type": "U8"
105 |           },
106 |           {
107 |             // start tensor names of subgraph quantization config. type: string array. required: false. default: [].
108 |             "start_tensor_names": ["13"],
109 |             // end tensor names of subgraph quantization config. type: string array. required: false. default: [].
110 |             "end_tensor_names": ["15"],
111 |             // quantize data type. type: enum. required: false. default: U8. option: U8, S8, U16, S16, FP32.
112 |             "data_type": "U16"
113 |           }
114 |         ],
115 |         // quantize calibration method. type: enum. required: false. default: MinMax. option: MinMax, Percentile, MSE.
116 |         "calibration_method": "MinMax",
117 |         // enable quantization precision analysis. type: bool. required: false. default: false.
118 |         "precision_analysis": true,
119 |         // precision analysis method. type: enum. required: false. default: PerLayer. option: PerLayer, EndToEnd.
120 |         "precision_analysis_method": "PerLayer",
121 |         // precision analysis mode. type: enum. required: false. default: Reference. option: Reference, NPUBackend.
122 |         "precision_analysis_mode": "Reference",
123 |         // input sample data dir for precision analysis. type: string. required: false. default: .
124 |         "input_sample_dir": "",
125 |         // enable highest mix precision quantization. type: bool. required: false. default: false.
126 |         "highest_mix_precision": false,
127 |         // conv bias data type. type: enum. required: false. default: S32. option: S32, FP32.
128 |         "conv_bias_data_type": "S32",
129 |         // LayerNormalization scale data type. type: enum. required: false. default: FP32. option: FP32, S32, U32.
130 |         "ln_scale_data_type": "FP32",
131 |         // refine weight threshold, should be a legal float number, like 1e-6. -1 means disable this feature. type: float. required: false. default: 1e-6. limitation: 0 or less than 0.0001.
132 |         "refine_weight_threshold": 1e-6,
133 |         // enable smooth quant strategy for conv 1x1. type: bool. required: false. default: false.
134 |         "enable_smooth_quant": false,
135 |         // transformer opt level. type: int. required: false. default: 0. limitation: 0~2.
136 |         "transformer_opt_level": 0,
137 |         // quant check level, 0: no check; 1: check node dtype. type: int. required: false. default: 0.
138 |         "check": 0,
139 |         // refine weight scale and input scale, type: bool. required: false. default: false.
140 |         "disable_auto_refine_scale": false
141 |       },
142 |       "input_processors": [
143 |         {
144 |           // input tensor name in origin model. "DEFAULT" means processor for all input tensors. type: string. required: true.
145 |           "tensor_name": "input",
146 |           // input tensor format in origin model. type: enum. required: false. default: AutoColorSpace. option: AutoColorSpace, BGR, RGB, GRAY.
147 |           "tensor_format": "AutoColorSpace",
148 |           // input tensor layout in origin model. type: enum. required: false. default: NCHW. option: NHWC, NCHW.
149 |           "tensor_layout": "NCHW",
150 |           // input format in runtime. type: enum. required: false. default: AutoColorSpace. option: AutoColorSpace, GRAY, BGR, RGB, YUYV422, UYVY422, YUV420SP, YVU420SP, RAW.
151 |           "src_format": "AutoColorSpace",
152 |           // input layout in runtime; if `src_format` is YUV/YVU, `src_layout` will be changed to NHWC. type: enum. required: false. default: NCHW. option: NHWC, NCHW.
153 |           "src_layout": "NHWC",
154 |           // input data type in runtime. type: enum. required: false. default: FP32. option: U8, S8, U16, S16, U32, S32, FP16, FP32.
155 |           "src_dtype": "U8",
156 |     
157 |           // extra compiler shapes for this input. src_extra_shapes size of every input should be the same. shape at the same index of every input will be treated as an input group which can run inference independently at runtime. type: list of Shape. required: false. default: [].
158 |           "src_extra_shapes": [],
159 |     
160 |           // color space mode. type: enum. required: false. default: NoCSC. option: NoCSC, Matrix, FullRange, LimitedRange.
161 |           "csc_mode": "NoCSC",
162 |           // color space conversion matrix, 12 elements array that represents a 3x4 matrix. type: float array. required: false. default: [].
163 |           "csc_mat": [1.1, 1.2, 1.3, 1.4, 2.1, 2.2, 2.3, 2.4, 3.1, 3.2, 3.3, 3.4],
164 |           // mean parameter of normalization in runtime. type: float array. required: false. default: same with ${quant.input_configs.calibration_mean}.
165 |           "mean": [],
166 |           // std parameter of normalization in runtime. type: float array. required: false. default: same with ${quant.input_configs.calibration_std}.
167 |           "std": [],
168 |           // list containing the number of start and end pad values for axis when padding. type: int32 array. required: false. default: [].
169 |           "padding": [],
170 |           // padding mode. type: string. required: false. default: constant.
171 |           "padding_mode": "constant",
172 |           // padding constant value. type: int32. required: false. default: 0.
173 |           "padding_constant_value": 0,
174 |           // list containing the number of start and end pad values for axis when slicing. type: int32 array. required: false. default: [].
175 |           "slicing": []
176 |         }
177 |       ],
178 |       "output_processors": [
179 |         {
180 |           // output tensor name in origin model. "DEFAULT" means processor for all output tensors. type: string. required: true.
181 |           "tensor_name": "output",
182 |           // permute the output tensor. type: int32 array. required: false. default: [].
183 |           "dst_perm": [0, 1],
184 |           // output data type. type: enum. required: false. default: FP32. option: FP32, U8.
185 |           "output_dtype": "FP32"
186 |         }
187 |       ],
188 |       "const_processors": [
189 |         {
190 |           // const tensor name in origin model. type: string. required: true.
191 |           "name": "fc2.bias",
192 |           // const tensor data array. type: list of double. required: false.
193 |           "data": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
194 |           // const tensor data file path, support .bin / .npy / .txt. type: string. required: false.
195 |           "data_path": "replaced_data_file_path"
196 |         }
197 |       ],
198 |       "quant_op_processors": [
199 |         {
200 |           // operator name in origin model. type: string. required: true.
201 |           "op_name": "MaxPool_3",
202 |           // operator attributes to be patched. type: dict. default: {}. required: true.
203 |           "attrs": {
204 |             "ceil_mode": 0
205 |           }
206 |         },
207 |         {
208 |           "op_name": "Flatten_4", // AxReshape
209 |           "attrs": {
210 |             "shape": [0, 800]
211 |           }
212 |         }
213 |       ],
214 |       "compiler": {
215 |         // static batch sizes. type: int array. required: false. default: [].
216 |         "static_batch_sizes": [],
217 |         // max dynamic batch. type: int, required: false. default: 0.
218 |         "max_dynamic_batch_size": 0,
219 |         // disable ir fix, only work in multi-batch compilation. type: bool. required: false. default: false.
220 |         "disable_ir_fix": false,
221 |         // compiler check level, 0: no check; 1: assert all close; 2: assert all equal; 3: check cosine similarity. type: int. required: false. default: 0.
222 |         "check": 0,
223 |         // dump npu perf information for profiling. type: bool. required: false. default: false.
224 |         "npu_perf": false,
225 |         // compiler check mode, CheckOutput: only check model output; CheckPerLayer: check model intermediate tensor and output. type: enum. required: false. default: CheckOutput. option: CheckOutput, CheckPerLayer.
226 |         "check_mode": "CheckOutput",
227 |         // relative tolerance when check level is 1. type: float. required: false. default: 1e-5.
228 |         "check_rtol": 1e-5,
229 |         // absolute tolerance when check level is 1. type: float. required: false. default: 0.
230 |         "check_atol": 0,
231 |         // cosine similarity threshold when check level is 3. type: float. required: false. default: 0.999.
232 |         "check_cosine_simularity": 0.999,
233 |         // tensor black list for per layer check, support regex. type: list of string. required: false. default: [].
234 |         "check_tensor_black_list": [],
235 |         // input sample data dir for compiler check. type: string. required: false. default: .
236 |         "input_sample_dir": ""
237 |       }
238 |     }
239 | 
240 | .. _config_define:
241 | 
242 | ------------------------------------
243 | 量化参数说明
244 | ------------------------------------
245 | 
246 | - ``input_configs`` 中的 ``tensor_name`` 需要根据模型的实际输入/输出节点名称进行设置。
247 | - ``input_configs`` 中的 ``tensor_name`` 可以设置为 ``DEFAULT`` 代表量化配置应用于全部输入。
248 | - 模型输入的色彩空间由预处理 ``input_processors`` 配置中的 ``tensor_format`` 参数来表达。
249 | - 工具链读取量化校准集时，会根据 ``input_processors`` 中的 ``tensor_format`` 参数自动转换校准集数据的色彩空间。
250 | - ``layer_configs`` 中的 ``layer_name`` 及 ``op_type`` 选项不可以同时配置。
251 | - ``transformer_opt_level`` 设置 ``Transformer`` 模型的优化选项。
252 | 
253 | .. _quant_precision_analysis_config_define:
254 | 
255 | ------------------------------------
256 | 量化精度分析参数说明
257 | ------------------------------------
258 | 
259 | - 精度分析计算方法，``precision_analysis_mode`` 字段。
260 | 
261 |     - ``Reference`` 可以运行编译器支持的全部模型（支持包含 CPU 及 NPU 子图的模型），但是计算结果相比于最终上板结果会有少量误差（基本上差距在正负 1 内，且无系统性误差）。
262 |     - ``NPUBackend`` 可以运行仅包含 NPU 子图的模型，但是计算结果与上板结果比特对齐。
263 | 
264 | - 精度分析方法，``precision_analysis_method`` 字段。
265 | 
266 |     - ``PerLayer`` 意味着每一层都采用浮点模型对应的层输入，计算每一层的输出与浮点模型输出的相似度。
267 |     - ``EndToEnd`` 代表首层采用浮点模型输入，然后进行完整模型的仿真，计算最终输出结果与浮点模型输出的相似度。
268 | 
269 | 
270 | .. _processing_arg_details:
271 | 
272 | ------------------------------------
273 | 预处理、后处理参数说明
274 | ------------------------------------
275 | 
276 | - ``input_processors`` / ``output_processors`` 配置说明
277 | 
278 |     - ``tensor_name`` 需要根据模型的实际输入/输出节点名称进行设置。
279 |     - ``tensor_name`` 可以设置为 ``DEFAULT`` 代表配置应用于全部输入或者输出。
280 |     - 前缀为 ``tensor_`` 的参数代表原始模型中的输入输出属性。
281 |     - 前缀为 ``src_`` 的参数代表着运行时实际的输入输出属性。
282 |     - 工具链会根据用户的配置自动添加算子，以完成运行时输入输出与原始模型输入输出之间的转换。
283 | 
284 |         - 例如：当 ``tensor_layout`` 为 ``NCHW``，且 ``src_layout`` 为 ``NHWC`` 时，工具链会在原始模型输入之前自动添加一个 ``perm`` 属性为 [0, 3, 1, 2] 的 ``Transpose`` 算子。
285 | 
286 | - 色彩空间转换预处理
287 | 
288 |     - 当 ``csc_mode`` 为 ``LimitedRange`` 或者 ``FullRange`` 且 ``src_format`` 为 ``YUV 色彩空间`` 时，工具链会根据内置的模板参数，在原始的输入前添加一个色彩空间转换算子，此时 ``csc_mat`` 配置无效；
289 |     - 当 ``csc_mode`` 为 ``Matrix`` 且 ``src_format`` 为 ``YUV 色彩空间`` 时，工具链会根据用户配置的 ``csc_mat`` 矩阵，在原始的输入前添加一个色彩空间转换算子，以实现在运行时将输入的 ``YUV`` 数据转换为模型计算所需的 ``BGR`` 或者 ``RGB`` 数据；
290 |     - 当 ``csc_mode`` 为 ``Matrix`` 时，计算流程为，先将 ``YUV / YVU 色彩空间`` 输入统一转换为 ``YUV444`` 格式，然后再乘以 ``csc_mat`` 系数矩阵。
291 |     - 当 ``csc_mode`` 为 ``Matrix`` 时，``bias`` (csc_mat[3] / csc_mat[7] / csc_mat[11]) 数值范围为 (-9, 8)。其余参数 (csc_mat[0-2] / csc_mat[4-6] / csc_mat[8-10]) 数值范围为 (-524289, 524288)。
292 | 
293 | - 归一化预处理
294 | 
295 |     - ``input_processors`` 中的 ``mean`` / ``std`` 参数，默认为用户在量化配置中 ``calibration_mean`` / ``calibration_std`` 参数所配置的值。
296 |     - 如果用户希望在运行时采用不同的归一化参数，那么可以显式地配置 ``input_processors`` 中的 ``mean`` / ``std`` 参数以覆盖默认值。
297 | 
298 | - 数据预处理中的填充 (Pad) 和切片 (Slice) 操作
299 | 
300 |     配置示例:
301 | 
302 |     .. code-block:: shell
303 | 
304 |         {
305 |           ...
306 |           "input_processors": [
307 |             {
308 |               "slicing": [0, 0, 0, 0, 0, 1, 0, 1]
309 |             }
310 |           ],
311 |           ...
312 |         }
313 | 
314 |     - ``padding`` 此字段表示在数据预处理对特定轴进行填充时，每个轴的开始和结束部分应填充的长度。以 32 位整型数组的形式表示，如果未设置，则使用默认值，即空列表，表示不进行填充。
315 |     - ``padding_mode`` 这个字段指定了填充的模式。它是一个字符串类型，可能的值决定了填充值的生成方式。默认值为 "constant"，表示使用常数值进行填充。目前仅支持 "constant" 模式填充。
316 |     - ``padding_constant_value`` 此字段指定了在填充模式为 "constant" 时使用的常数值。它是一个 32 位整型，表示用于填充的固定值。默认值为 0。
317 |     - ``slicing`` 此字段表示在数据预处理对特定轴进行切片时，每个轴的开始和结束部分应切片的长度。以 32 位整型数组的形式表示，如果未设置，则使用默认值，即空列表，表示不进行切片。
318 | 
319 | ------------------------------------
320 | proto 配置定义
321 | ------------------------------------
322 | 
323 | .. code-block:: shell
324 | 
325 |     syntax = "proto3";
326 |     
327 |     package common;
328 |     
329 |     enum ColorSpace {
330 |       AutoColorSpace = 0;
331 |       GRAY = 1;
332 |       BGR = 2;
333 |       RGB = 3;
334 |       RGBA = 4;
335 |       YUV420SP = 6;   // Semi-Planar, NV12
336 |       YVU420SP = 7;   // Semi-Planar, NV21
337 |       YUYV422 = 8;     // Packed, YUYV
338 |       UYVY422 = 9;     // Packed, UYVY
339 |     }
340 |     
341 |     enum Layout {
342 |       DefaultLayout = 0;
343 |       NHWC = 1;
344 |       NCHW = 2;
345 |     }
346 |     
347 |     enum DataType {
348 |       DefaultDataType = 0;
349 |       U8 = 1;
350 |       S8 = 2;
351 |       U16 = 3;
352 |       S16 = 4;
353 |       U32 = 5;
354 |       S32 = 6;
355 |       U64 = 7;
356 |       S64 = 8;
357 |       FP16 = 9;
358 |       FP32 = 10;
359 |     }
360 |     
361 |     enum NPUMode {
362 |       NPU1 = 0;
363 |       NPU2 = 1;
364 |       NPU3 = 2;
365 |     }
366 |     
367 |     enum HardwareType {
368 |       AX650 = 0;
369 |       AX620E = 1;
370 |       M76H = 2;
371 |     }
372 | 
373 | .. code-block:: shell
374 | 
375 |     syntax = "proto3";
376 |     
377 |     import "path/to/common.proto";
378 |     import "google/protobuf/struct.proto";
379 |     
380 |     package pulsar2.build;
381 |     
382 |     enum ModelType {
383 |       ONNX = 0;
384 |       QuantAxModel = 1;
385 |       QuantONNX = 3;
386 |     }
387 |     
388 |     enum QuantMethod {
389 |       MinMax = 0;
390 |       Percentile = 1;
391 |       MSE = 2;
392 |     }
393 |     
394 |     enum PrecisionAnalysisMethod {
395 |       PerLayer = 0;
396 |       EndToEnd = 1;
397 |     }
398 |     
399 |     enum PrecisionAnalysisMode {
400 |       Reference = 0;
401 |       NPUBackend = 1;
402 |     }
403 |     
404 |     enum CheckMode {
405 |       CheckOutput = 0;
406 |       CheckPerLayer = 1;
407 |     }
408 |     
409 |     enum DataFormat {
410 |       Image = 0;
411 |       Numpy = 1;
412 |       Binary = 2;
413 |       NumpyObject = 3;
414 |     }
415 |     
416 |     enum CSCMode {
417 |       NoCSC = 0;
418 |       Matrix = 1;
419 |       FullRange = 2;
420 |       LimitedRange = 3;
421 |     }
422 |     
423 |     message InputQuantConfig {
424 |       // input tensor name in origin model. "DEFAULT" means input config for all input tensors. type: string. required: true.
425 |       string tensor_name = 1;
426 |       // quantize calibration dataset archive file path. type: string. required: true. limitation: tar, tar.gz, zip.
427 |       string calibration_dataset = 2;
428 |       // quantize calibration data format. type: enum. required: false. default: Image. option: Image, Numpy, Binary, NumpyObject.
429 |       DataFormat calibration_format = 3;
430 |       // quantize calibration data size is min(${calibration_size}, size of ${calibration_dataset}), "-1" means load all dataset. type: int. required: false. default: 32.
431 |       int32 calibration_size = 4;
432 |       // quantize mean parameter of normalization. type: float array. required: false. default: [].
433 |       repeated float calibration_mean = 5;
434 |       // quantize std parameter of normalization. type: float array. required: false. default: [].
435 |       repeated float calibration_std = 6;
436 |     }
437 |     
438 |     message LayerConfig {
439 |       // set layer quantize precision. type: string. required: must choose between `layer_name` and `op_type` and `layer_names` and `op_types`. default: .
440 |       string layer_name = 1;
441 |     
442 |       // set quantize precision by operator type. type: string. required: must choose between `layer_name` and `op_type` and `layer_names` and `op_types`. default: .
443 |       string op_type = 2;
444 |     
445 |       // start tensor names of subgraph quantization config. type: string array. required: false. default: [].
446 |       repeated string start_tensor_names = 3;
447 |       // end tensor names of subgraph quantization config. type: string array. required: false. default: [].
448 |       repeated string end_tensor_names = 4;
449 |     
450 |       // quantize data type. type: enum. required: false. default: U8. option: U8, S8, U16, S16, FP32.
451 |       common.DataType data_type = 5;
452 |     
453 |       // quantize weight type for Conv. type: enum. required: false. default: S8. option: S8, FP32.
454 |       common.DataType weight_data_type = 6;
455 |     
456 |       // set layer quantize precision by layers name. type: string array. required: must choose between `layer_name` and `op_type` and `layer_names` and `op_types`. default: [].
457 |       repeated string layer_names = 7;
458 |     
459 |       // set quantize precision by operator types. type: string array. required: must choose between `layer_name` and `op_type` and `layer_names` and `op_types`. default: [].
460 |       repeated string op_types = 8;
461 |     
462 |       // quantize data type for Conv. type: enum. required: false. default: U8. option: U8, S8, U16, S16, FP32.
463 |       common.DataType output_data_type = 10;
464 |     }
465 |     
466 |     message OnnxOptimizeOption {
467 |       // disable onnx optimization. type: bool. required: false. default: false.
468 |       bool disable_onnx_optimization = 1;
469 |       // enable onnx simplify by https://github.com/daquexian/onnx-simplifier. type: bool. required: false. default: false.
470 |       bool enable_onnxsim = 2;
471 |       // enable model check. type: bool. required: false. default: false.
472 |       bool model_check = 3;
473 |       // disable transformation check. type: bool. required: false. default: false.
474 |       bool disable_transformation_check = 4;
475 |       // save tensors data to optimize memory footprint. type: bool. required: false. default: false.
476 |       bool save_tensors_data = 5;
477 |     }
478 |     
479 |     message QuantConfig {
480 |       repeated InputQuantConfig input_configs = 1;
481 |       repeated LayerConfig layer_configs = 2;
482 |     
483 |       // quantize calibration method. type: enum. required: false. default: MinMax. option: MinMax, Percentile, MSE.
484 |       QuantMethod calibration_method = 3;
485 |       // enable quantization precision analysis. type: bool. required: false. default: false.
486 |       bool precision_analysis = 4;
487 |       // precision analysis method. type: enum. required: false. default: PerLayer. option: PerLayer, EndToEnd.
488 |       PrecisionAnalysisMethod precision_analysis_method = 5;
489 |       // precision analysis mode. type: enum. required: false. default: Reference. option: Reference, NPUBackend.
490 |       PrecisionAnalysisMode precision_analysis_mode = 6;
491 |       // enable highest mix precision quantization. type: bool. required: false. default: false.
492 |       bool highest_mix_precision = 7;
493 |       // conv bias data type. type: enum. required: false. default: S32. option: S32, FP32.
494 |       common.DataType conv_bias_data_type = 8;
495 |       // refine weight threshold, should be a legal float number, like 1e-6. -1 means disable this feature. type: float. required: false. default: 1e-6. limitation: 0 or less than 0.0001.
496 |       float refine_weight_threshold = 9;
497 |       // enable smooth quant strategy for conv 1x1. type: bool. required: false. default: false.
498 |       bool enable_smooth_quant = 10;
499 |       // transformer opt level. type: int. required: false. default: 0. limitation: 0~2.
500 |       int32 transformer_opt_level = 20;
501 |       // input sample data dir for precision analysis. type: string. required: false. default: .
502 |       string input_sample_dir = 30;
503 |       // LayerNormalization scale data type. type: enum. required: false. default: FP32. option: FP32, S32, U32.
504 |       common.DataType ln_scale_data_type = 40;
505 |       // quant check level, 0: no check; 1: check node dtype. type: int. required: false. default: 0.
506 |       int32 check = 50;
507 |       // refine weight scale and input scale, type: bool. required: false. default: false.
508 |       bool disable_auto_refine_scale = 60;
509 |     }
510 |     
511 |     message InputProcessor {
512 |       // input tensor name in origin model. "DEFAULT" means processor for all input tensors. type: string. required: true.
513 |       string tensor_name = 1;
514 |     
515 |       // input tensor format in origin model. type: enum. required: false. default: AutoColorSpace. option: AutoColorSpace, BGR, RGB, GRAY.
516 |       common.ColorSpace tensor_format = 2;
517 |       // input tensor layout in origin model. type: enum. required: false. default: NCHW. option: NHWC, NCHW.
518 |       common.Layout tensor_layout = 3;
519 |     
520 |       // input format in runtime. type: enum. required: false. default: AutoColorSpace. option: AutoColorSpace, GRAY, BGR, RGB, YUYV422, UYVY422, YUV420SP, YVU420SP, RAW.
521 |       common.ColorSpace src_format = 4;
522 |       // input layout in runtime; if `src_format` is YUV/YVU, `src_layout` will be changed to NHWC. type: enum. required: false. default: NCHW. option: NHWC, NCHW.
523 |       common.Layout src_layout = 5;
524 |       // input data type in runtime. type: enum. required: false. default: FP32. option: U8, S8, U16, S16, U32, S32, FP16, FP32.
525 |       common.DataType src_dtype = 6;
526 |     
527 |       // extra compiler shapes for this input. src_extra_shapes size of every input should be the same. shape at the same index of every input will be treated as an input group which can run inference independently at runtime. type: list of Shape. required: false. default: [].
528 |       repeated common.Shape src_extra_shapes = 11;
529 |     
530 |       // color space mode. type: enum. required: false. default: NoCSC. option: NoCSC, Matrix, FullRange, LimitedRange.
531 |       CSCMode csc_mode = 7;
532 |       // color space conversion matrix, 12 elements array that represents a 3x4 matrix. type: float array. required: false. default: [].
533 |       repeated float csc_mat = 8;
534 |       // mean parameter of normalization in runtime. type: float array. required: false. default: same with ${quant.input_configs.calibration_mean}.
535 |       repeated float mean = 9;
536 |       // std parameter of normalization in runtime. type: float array. required: false. default: same with ${quant.input_configs.calibration_std}.
537 |       repeated float std = 10;
538 |       // list containing the number of start and end pad values for axis when padding. type: int32 array. required: false. default: [].
539 |       repeated int32 padding = 20;
540 |       // padding mode. type: string. required: false. default: constant.
541 |       string padding_mode = 21;
542 |       // padding constant value. type: int32. required: false. default: 0.
543 |       int32 padding_constant_value = 22;
544 |       // list containing the number of start and end pad values for axis when slicing. type: int32 array. required: false. default: [].
545 |       repeated int32 slicing = 30;
546 |     }
547 |     
548 |     message OutputProcessor {
549 |       // output tensor name in origin model. "DEFAULT" means processor for all output tensors. type: string. required: true.
550 |       string tensor_name = 1;
551 |     
552 |       common.Layout tensor_layout = 2;
553 |     
554 |       // permute the output tensor. type: int32 array. required: false. default: [].
555 |       repeated int32 dst_perm = 3;
556 |     
557 |       // output data type. type: enum. required: false. default: FP32. option: FP32, U8.
558 |       common.DataType output_dtype = 4;
559 |     }
560 |     
561 |     message OpProcessor {
562 |       // operator name in origin model. type: string. required: true.
563 |       string op_name = 1;
564 |     
565 |       // operator attributes to be patched. type: dict. default: {}. required: true.
566 |       .google.protobuf.Struct attrs = 2;
567 |     }
568 |     
569 |     message ConstProcessor {
570 |       // const tensor name in origin model. type: string. required: true.
571 |       string name = 1;
572 |     
573 |       // const tensor data array. type: list of double. required: false.
574 |       repeated double data = 2;
575 |     
576 |       // const tensor data file path, support .bin / .npy / .txt. type: string. required: false.
577 |       string data_path = 3;
578 |     }
579 |     
580 |     message CompilerConfig {
581 |       // static batch sizes. type: int array. required: false. default: [].
582 |       repeated int32 static_batch_sizes = 1;
583 |       // max dynamic batch. type: int, required: false. default: 0.
584 |       int32 max_dynamic_batch_size = 2;
585 |       // disable ir fix, only work in multi-batch compilation. type: bool. required: false. default: false.
586 |       bool disable_ir_fix = 3;
587 |       // compiler check level, 0: no check; 1: assert all close; 2: assert all equal; 3: check cosine similarity. type: int. required: false. default: 0.
588 |       int32 check = 5;
589 |       // dump npu perf information for profiling. type: bool. required: false. default: false.
590 |       bool npu_perf = 6;
591 |       // compiler check mode, CheckOutput: only check model output; CheckPerLayer: check model intermediate tensor and output. type: enum. required: false. default: CheckOutput. option: CheckOutput, CheckPerLayer.
592 |       CheckMode check_mode = 7;
593 |       // relative tolerance when check level is 1. type: float. required: false. default: 1e-5.
594 |       float check_rtol = 8;
595 |       // absolute tolerance when check level is 1. type: float. required: false. default: 0.
596 |       float check_atol = 9;
597 |       // cosine similarity threshold when check level is 3. type: float. required: false. default: 0.999.
598 |       float check_cosine_simularity = 10;
599 |       // tensor black list for per layer check, support regex. type: list of string. required: false. default: [].
600 |       repeated string check_tensor_black_list = 11;
601 |       // input sample data dir for compiler check. type: string. required: false. default: .
602 |       string input_sample_dir = 30;
603 |     }
604 |     
605 |     message BuildConfig {
606 |       // input model file path. type: string. required: true.
607 |       string input = 1;
608 |       // axmodel output directory. type: string. required: true.
609 |       string output_dir = 2;
610 |       // rename output axmodel. type: string. required: false. default: compiled.axmodel.
611 |       string output_name = 3;
612 |       // temporary data output directory. type: string. required: false. default: same as ${output_dir}.
613 |       string work_dir = 4;
614 |     
615 |       // input model type. type: enum. required: false. default: ONNX. option: ONNX, QuantAxModel, QuantONNX.
616 |       ModelType model_type = 5;
617 |     
618 |       // target hardware. type: enum. required: false. default: AX650. option: AX650, AX620E, M76H.
619 |       common.HardwareType target_hardware = 6;
620 |       // npu mode. when ${target_hardware} is AX650, npu mode can be NPU1 / NPU2 / NPU3. when ${target_hardware} is AX620E, npu mode can be NPU1 / NPU2. type: enum. required: false. default: NPU1.
621 |       common.NPUMode npu_mode = 7;
622 |     
623 |       // modify model input shape of input model, this feature will take effect before the `input_processors` configuration. format: input1:1x3x224x224;input2:1x1x112x112. type: string. required: false. default: .
624 |       string input_shapes = 8;
625 |     
626 |       OnnxOptimizeOption onnx_opt = 10;
627 |     
628 |       QuantConfig quant = 20;
629 |     
630 |       repeated InputProcessor input_processors = 31;
631 |       repeated OutputProcessor output_processors = 32;
632 |       repeated ConstProcessor const_processors = 33;
633 |       repeated OpProcessor op_processors = 34;
634 |       repeated OpProcessor quant_op_processors = 35;
635 |     
636 |       CompilerConfig compiler = 40;
637 |     }
638 | 


--------------------------------------------------------------------------------
/source/user_guides_quick/quick_start_ax620e.rst:
--------------------------------------------------------------------------------
  1 | ======================
  2 | Quick Start(AX620E)
  3 | ======================
  4 | 
  5 | **本章节适用于以下平台：**
  6 | 
  7 | - AX630C
  8 | - AX620Q
  9 | 
 10 | 本章节介绍 ``ONNX`` 模型转换的基本操作, 使用 ``pulsar2`` 工具将 ``ONNX``  模型编译成 ``axmodel`` 模型. 请先参考 :ref:`《开发环境准备》 <dev_env_prepare>` 章节完成开发环境搭建. 
 11 | 本节示例模型为开源模型 ``MobileNetv2``.
 12 | 
 13 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 14 | Pulsar2 工具链命令说明
 15 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 16 | 
 17 | ``Pulsar2`` 工具链中的功能指令以 ``pulsar2`` 开头, 与用户强相关的命令为 ``pulsar2 build`` , ``pulsar2 run`` 以及 ``pulsar2 version``. 
 18 | 
 19 | * ``pulsar2 build`` 用于将 ``onnx`` 模型转换为 ``axmodel`` 格式模型
 20 | * ``pulsar2 run`` 用于模型转换后的仿真运行
 21 | * ``pulsar2 version`` 可以用于查看当前工具链的版本信息, 通常在反馈问题时需要提供此信息
 22 | 
 23 | .. code-block:: shell
 24 | 
 25 |     root@xxx:/data# pulsar2 --help
 26 |     usage: pulsar2 [-h] {version,build,run} ...
 27 |     
 28 |     positional arguments:
 29 |       {version,build,run}
 30 |     
 31 |     optional arguments:
 32 |       -h, --help           show this help message and exit
 33 | 
 34 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 35 | 模型编译配置文件说明
 36 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 37 | 
 38 | ``/data/config/`` 路径下的 ``mobilenet_v2_build_config.json`` 展示:
 39 | 
 40 | .. code-block:: shell
 41 | 
 42 |     {
 43 |       "model_type": "ONNX",
 44 |       "npu_mode": "NPU1",
 45 |       "quant": {
 46 |         "input_configs": [
 47 |           {
 48 |             "tensor_name": "input",
 49 |             "calibration_dataset": "./dataset/imagenet-32-images.tar",
 50 |             "calibration_size": 32,
 51 |             "calibration_mean": [103.939, 116.779, 123.68],
 52 |             "calibration_std": [58.0, 58.0, 58.0]
 53 |           }
 54 |         ],
 55 |         "calibration_method": "MinMax",
 56 |         "precision_analysis": false
 57 |       },
 58 |       "input_processors": [
 59 |         {
 60 |           "tensor_name": "input",
 61 |           "tensor_format": "BGR",
 62 |           "src_format": "BGR",
 63 |           "src_dtype": "U8",
 64 |           "src_layout": "NHWC",
 65 |           "csc_mode": "NoCSC"
 66 |         }
 67 |       ],
 68 |       "compiler": {
 69 |         "check": 0
 70 |       }
 71 |     }
 72 | 
 73 | .. attention::
 74 | 
 75 |     ``input_processors``, ``output_processors`` 及 ``quant`` 节点下 ``input_configs`` 中的 ``tensor_name`` 字段需要根据模型的实际输入/输出节点名称进行设置，也可以设置为 ``DEFAULT`` 代表当前配置应用于全部输入或者输出。
 76 | 
 77 |     .. figure:: ../media/tensor_name.png
 78 |         :alt: pipeline
 79 |         :align: center
 80 | 
 81 | 更加详细的内容，请参考 :ref:`《配置文件详细说明》 <config_details>`.
 82 | 
 83 | .. _model_compile_20e:
 84 | 
 85 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 86 | 编译执行
 87 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 88 | 
 89 | 以 ``mobilenetv2-sim.onnx`` 为例, 执行如下 ``pulsar2 build`` 命令编译生成 ``compiled.axmodel``:
 90 | 
 91 | .. code-block:: shell
 92 | 
 93 |     pulsar2 build --target_hardware AX620E --input model/mobilenetv2-sim.onnx --output_dir output --config config/mobilenet_v2_build_config.json
 94 | 
 95 | .. warning::
 96 | 
 97 |     在编译模型前，需要确保已经对原始模型使用过 ``onnxsim`` 工具优化，主要目的是将模型转变成更利于 ``Pulsar2`` 编译的静态图及获得更好的推理性能。有以下两种方法：
 98 | 
 99 |     1. 在 ``Pulsar2`` docker 内部直接执行命令：``onnxsim in.onnx out.onnx``。
100 |     2. 使用 ``pulsar2 build`` 进行模型转换时，增加参数：``--onnx_opt.enable_onnxsim true`` （默认值为 false）。
101 | 
102 |     如果想要进一步了解 ``onnxsim`` ，可访问 `官方网站 <https://github.com/daquexian/onnx-simplifier>`_ 。
103 | 
104 | ^^^^^^^^^^^^^^^^^^^^^
105 | log 参考信息
106 | ^^^^^^^^^^^^^^^^^^^^^
107 | 
108 | .. code-block::
109 | 
110 |     $ pulsar2 build --target_hardware AX620E --input model/mobilenetv2-sim.onnx --output_dir output --config config/mobilenet_v2_build_config.json
111 |     2023-07-29 14:23:01.757 | WARNING  | yamain.command.build:fill_default:313 - ignore input csc config because of src_format is AutoColorSpace or src_format and tensor_format are the same
112 |     Building onnx ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% 0:00:00
113 |     2023-07-29 14:23:07.806 | INFO     | yamain.command.build:build:424 - save optimized onnx to [output/frontend/optimized.onnx]
114 |     patool: Extracting ./dataset/imagenet-32-images.tar ...
115 |     patool: running /usr/bin/tar --extract --file ./dataset/imagenet-32-images.tar --directory output/quant/dataset/input
116 |     patool: ... ./dataset/imagenet-32-images.tar extracted to `output/quant/dataset/input'.
117 |                                                                             Quant Config Table
118 |     ┏━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┓
119 |     ┃ Input ┃ Shape            ┃ Dataset Directory ┃ Data Format ┃ Tensor Format ┃ Mean                                                         ┃ Std                ┃
120 |     ┡━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━┩
121 |     │ input │ [1, 3, 224, 224] │ input             │ Image       │ BGR           │ [103.93900299072266, 116.77899932861328, 123.68000030517578] │ [58.0, 58.0, 58.0] │
122 |     └───────┴──────────────────┴───────────────────┴─────────────┴───────────────┴──────────────────────────────────────────────────────────────┴────────────────────┘
123 |     Transformer optimize level: 0
124 |     32 File(s) Loaded.
125 |     [14:23:09] AX LSTM Operation Format Pass Running ...      Finished.
126 |     [14:23:09] AX Set MixPrecision Pass Running ...           Finished.
127 |     [14:23:09] AX Refine Operation Config Pass Running ...    Finished.
128 |     [14:23:09] AX Reset Mul Config Pass Running ...           Finished.
129 |     [14:23:09] AX Tanh Operation Format Pass Running ...      Finished.
130 |     [14:23:09] AX Confused Op Refine Pass Running ...         Finished.
131 |     [14:23:09] AX Quantization Fusion Pass Running ...        Finished.
132 |     [14:23:09] AX Quantization Simplify Pass Running ...      Finished.
133 |     [14:23:09] AX Parameter Quantization Pass Running ...     Finished.
134 |     Calibration Progress(Phase 1): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:01<00:00, 18.07it/s]
135 |     Finished.
136 |     [14:23:11] AX Passive Parameter Quantization Running ...  Finished.
137 |     [14:23:11] AX Parameter Baking Pass Running ...           Finished.
138 |     [14:23:11] AX Refine Int Parameter Pass Running ...       Finished.
139 |     [14:23:11] AX Refine Weight Parameter Pass Running ...    Finished.
140 |     --------- Network Snapshot ---------
141 |     Num of Op:                    [100]
142 |     Num of Quantized Op:          [100]
143 |     Num of Variable:              [278]
144 |     Num of Quantized Var:         [278]
145 |     ------- Quantization Snapshot ------
146 |     Num of Quant Config:          [387]
147 |     BAKED:                        [53]
148 |     OVERLAPPED:                   [145]
149 |     ACTIVATED:                    [65]
150 |     SOI:                          [1]
151 |     PASSIVE_BAKED:                [53]
152 |     FP32:                         [70]
153 |     Network Quantization Finished.
154 |     [Warning]File output/quant/quant_axmodel.onnx has already exist, quant exporter will overwrite it.
155 |     [Warning]File output/quant/quant_axmodel.json has already exist, quant exporter will overwrite it.
156 |     quant.axmodel export success: output/quant/quant_axmodel.onnx
157 |     Building native ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% 0:00:00
158 |     2023-07-29 14:23:18.332 | WARNING  | yamain.command.load_model:pre_process:454 - preprocess tensor [input]
159 |     2023-07-29 14:23:18.332 | INFO     | yamain.command.load_model:pre_process:456 - tensor: input, (1, 224, 224, 3), U8
160 |     2023-07-29 14:23:18.332 | INFO     | yamain.command.load_model:pre_process:459 - op: op:pre_dequant_1, AxDequantizeLinear, {'const_inputs': {'x_zeropoint': 0, 'x_scale': 1}, 'output_dtype': <class 'numpy.float32'>, 'quant_method': 0}
161 |     2023-07-29 14:23:18.332 | INFO     | yamain.command.load_model:pre_process:456 - tensor: tensor:pre_norm_1, (1, 224, 224, 3), FP32
162 |     2023-07-29 14:23:18.332 | INFO     | yamain.command.load_model:pre_process:459 - op: op:pre_norm_1, AxNormalize, {'dim': 3, 'mean': [103.93900299072266, 116.77899932861328, 123.68000030517578], 'std': [58.0, 58.0, 58.0]}
163 |     2023-07-29 14:23:18.332 | INFO     | yamain.command.load_model:pre_process:456 - tensor: tensor:pre_transpose_1, (1, 224, 224, 3), FP32
164 |     2023-07-29 14:23:18.332 | INFO     | yamain.command.load_model:pre_process:459 - op: op:pre_transpose_1, AxTranspose, {'perm': [0, 3, 1, 2]}
165 |     tiling op...   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 174/174 0:00:00
166 |     new_ddr_tensor = []
167 |     build op...   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 440/440 0:00:00
168 |     add ddr swap...   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1606/1606 0:00:00
169 |     calc input dependencies...   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2279/2279 0:00:00
170 |     calc output dependencies...   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2279/2279 0:00:00
171 |     assign eu heuristic   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2279/2279 0:00:00
172 |     assign eu onepass   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2279/2279 0:00:00
173 |     assign eu greedy   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2279/2279 0:00:00
174 |     2023-07-29 14:23:21.762 | INFO     | yasched.test_onepass:results2model:1882 - max_cycle = 782,940
175 |     2023-07-29 14:23:22.159 | INFO     | yamain.command.build:compile_npu_subgraph:1004 - QuantAxModel macs: 280,262,480
176 |     2023-07-29 14:23:25.209 | INFO     | backend.ax620e.linker:link_with_dispatcher:1586 - DispatcherQueueType.IO: Generate 69 EU chunks, 7 Dispatcher Chunk
177 |     2023-07-29 14:23:25.209 | INFO     | backend.ax620e.linker:link_with_dispatcher:1586 - DispatcherQueueType.Compute: Generate 161 EU chunks, 23 Dispatcher Chunk
178 |     2023-07-29 14:23:25.209 | INFO     | backend.ax620e.linker:link_with_dispatcher:1587 - EU mcode size: 147 KiB
179 |     2023-07-29 14:23:25.209 | INFO     | backend.ax620e.linker:link_with_dispatcher:1588 - Dispatcher mcode size: 21 KiB
180 |     2023-07-29 14:23:25.209 | INFO     | backend.ax620e.linker:link_with_dispatcher:1589 - Total mcode size: 168 KiB
181 |     2023-07-29 14:23:26.928 | INFO     | yamain.command.build:compile_ptq_model:940 - fuse 1 subgraph(s)
182 | 
183 | .. note::
184 | 
185 |     该示例所运行的主机配置为:
186 | 
187 |         - Intel(R) Xeon(R) Gold 6336Y CPU @ 2.40GHz
188 |         - Memory 32G
189 | 
190 |     全流程耗时大约 ``11s`` , 不同配置的主机转换时间略有差异.
191 | 
192 | 
193 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
194 | 模型编译输出文件说明
195 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
196 | 
197 | .. code-block:: shell  
198 | 
199 |     root@xxx:/data# tree output/
200 |     output/
201 |     ├── build_context.json
202 |     ├── compiled.axmodel            # 最终板上运行模型，AxModel
203 |     ├── compiler                    # 编译器后端中间结果及 debug 信息
204 |     ├── frontend                    # 前端图优化中间结果及 debug 信息
205 |     │   └── optimized.onnx          # 输入模型经过图优化以后的浮点 ONNX 模型
206 |     └── quant                       # 量化工具输出及 debug 信息目录
207 |         ├── dataset                 # 解压后的校准集数据目录
208 |         │   └── input
209 |         │       ├── ILSVRC2012_val_00000001.JPEG
210 |         │       ├── ......
211 |         │       └── ILSVRC2012_val_00000032.JPEG
212 |         ├── debug
213 |         ├── quant_axmodel.json      # 量化配置信息
214 |         └── quant_axmodel.onnx      # 量化后的模型，QuantAxModel
215 | 
216 | 其中 ``compiled.axmodel`` 为最终编译生成的板上可运行的 ``.axmodel`` 模型文件
217 | 
218 | .. note::
219 | 
220 |     因为 ``.axmodel`` 基于 **ONNX** 模型存储格式开发，所以将 ``.axmodel`` 文件后缀修改为 ``.axmodel.onnx`` 后可支持被网络模型图形化工具 **Netron** 直接打开。
221 | 
222 |     .. figure:: ../media/axmodel-netron.png
223 |         :alt: pipeline
224 |         :align: center
225 | 
226 | ----------------------
227 | 模型信息查询
228 | ----------------------
229 | 
230 | 可以通过 ``onnx inspect --io ${axmodel/onnx_path}`` 来查看编译后 ``axmodel`` 模型的输入输出信息，还有其他 ``-m -n -t`` 参数可以查看模型里的 ``meta / node / tensor`` 信息。
231 | 
232 | .. code-block:: shell
233 | 
234 |     root@xxx:/data# onnx inspect -m -n -t output/compiled.axmodel
235 |     Failed to check model output/compiled.axmodel, statistic could be inaccurate!
236 |     Inpect of model output/compiled.axmodel
237 |     ================================================================================
238 |       Graph name: 8
239 |       Graph inputs: 1
240 |       Graph outputs: 1
241 |       Nodes in total: 1
242 |       ValueInfo in total: 2
243 |       Initializers in total: 2
244 |       Sparse Initializers in total: 0
245 |       Quantization in total: 0
246 | 
247 |     Meta information:
248 |     --------------------------------------------------------------------------------
249 |       IR Version: 7
250 |       Opset Import: [version: 13
251 |     ]
252 |       Producer name: Pulsar2
253 |       Producer version:
254 |       Domain:
255 |       Doc string: Pulsar2 Version:  1.8-beta1
256 |     Pulsar2 Commit: 6a7e59de
257 |       meta.{} = {} extra_data CgsKBWlucHV0EAEYAgoICgZvdXRwdXQSATEaMgoFbnB1XzBSKQoNbnB1XzBfYjFfZGF0YRABGhYKBnBhcmFtcxoMbnB1XzBfcGFyYW1zIgAoAQ==
258 | 
259 |     Node information:
260 |     --------------------------------------------------------------------------------
261 |       Node type "neu mode" has: 1
262 |     --------------------------------------------------------------------------------
263 |       Node "npu_0": type "neu mode", inputs "['input']", outputs "['output']"
264 | 
265 |     Tensor information:
266 |     --------------------------------------------------------------------------------
267 |       ValueInfo "input": type UINT8, shape [1, 224, 224, 3],
268 |       ValueInfo "output": type FLOAT, shape [1, 1000],
269 |       Initializer "npu_0_params": type UINT8, shape [3740416],
270 |       Initializer "npu_0_b1_data": type UINT8, shape [173256],
271 | 
272 | .. _model_simulator_20e:
273 | 
274 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
275 | 仿真运行
276 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
277 | 
278 | 本章节介绍 ``axmodel`` 仿真运行的基本操作, 使用 ``pulsar2 run`` 命令可以在 ``PC`` 上直接运行由 ``pulsar2 build`` 生成的 ``axmodel`` 模型，无需上板运行即可快速得到网络模型的运行结果。
279 | 
280 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
281 | 仿真运行准备
282 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
283 | 
284 | 某些模型只能支持特定的输入数据格式，模型的输出数据也是以模组特定的格式输出的。在模型仿真运行前，需要把输入数据转换成模型支持的数据格式，这部分数据操作称为 ``前处理`` 。在模型仿真运行后，需要把输出数据转换成工具可以分析查看的数据格式，这部分数据操作称为 ``后处理`` 。仿真运行时需要的 ``前处理`` 和 ``后处理`` 工具已包含在 ``pulsar2-run-helper`` 文件夹中。
285 | 
286 | ``pulsar2-run-helper`` 文件夹内容如下所示：
287 | 
288 | .. code-block:: shell
289 | 
290 |     root@xxx:/data# ll pulsar2-run-helper/
291 |     drwxr-xr-x 2 root root 4.0K Dec  2 12:23 models/
292 |     drwxr-xr-x 5 root root 4.0K Dec  2 12:23 pulsar2_run_helper/
293 |     drwxr-xr-x 2 root root 4.0K Dec  2 12:23 sim_images/
294 |     drwxr-xr-x 2 root root 4.0K Dec  2 12:23 sim_inputs/
295 |     drwxr-xr-x 2 root root 4.0K Dec  2 12:23 sim_outputs/
296 |     -rw-r--r-- 1 root root 3.0K Dec  2 12:23 cli_classification.py
297 |     -rw-r--r-- 1 root root 4.6K Dec  2 12:23 cli_detection.py
298 |     -rw-r--r-- 1 root root    2 Dec  2 12:23 list.txt
299 |     -rw-r--r-- 1 root root   29 Dec  2 12:23 requirements.txt
300 |     -rw-r--r-- 1 root root  308 Dec  2 12:23 setup.cfg
301 | 
302 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
303 | 仿真运行示例 ``mobilenetv2``
304 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
305 | 
306 | 将 :ref:`《编译执行》 <model_compile_20e>` 章节生成的 ``compiled.axmodel`` 拷贝到 ``pulsar2-run-helper/models`` 路径下，并更名为 ``mobilenetv2.axmodel``
307 | 
308 | .. code-block:: shell
309 | 
310 |     root@xxx:/data# cp output/compiled.axmodel pulsar2-run-helper/models/mobilenetv2.axmodel
311 | 
312 | ----------------------
313 | 输入数据准备
314 | ----------------------
315 | 
316 | 进入 ``pulsar2-run-helper`` 目录，使用 ``cli_classification.py`` 脚本将 ``cat.jpg`` 处理成 ``mobilenetv2.axmodel`` 所需要的输入数据格式。
317 | 
318 | .. code-block:: shell
319 | 
320 |     root@xxx:/data# cd pulsar2-run-helper
321 |     root@xxx:/data/pulsar2-run-helper# python3 cli_classification.py --pre_processing --image_path sim_images/cat.jpg --axmodel_path models/mobilenetv2.axmodel --intermediate_path sim_inputs/0
322 |     [I] Write [input] to 'sim_inputs/0/input.bin' successfully.
323 | 
324 | ----------------------
325 | 仿真模型推理
326 | ----------------------
327 | 
328 | 运行 ``pulsar2 run`` 命令，将 ``input.bin`` 作为 ``mobilenetv2.axmodel`` 的输入数据并执行推理计算，输出 ``output.bin`` 推理结果。
329 | 
330 | .. code-block:: shell
331 | 
332 |     root@xxx:/data/pulsar2-run-helper# pulsar2 run --model models/mobilenetv2.axmodel --input_dir sim_inputs --output_dir sim_outputs --list list.txt
333 |     Building native ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% 0:00:00
334 |     >>> [0] start
335 |     write [output] to [sim_outputs/0/output.bin] successfully
336 |     >>> [0] finish
337 | 
338 | ----------------------
339 | 输出数据处理
340 | ----------------------
341 | 
342 | 使用 ``cli_classification.py`` 脚本对仿真模型推理输出的 ``output.bin`` 数据进行后处理，得到最终计算结果。
343 | 
344 | .. code-block:: shell
345 | 
346 |     root@xxx:/data/pulsar2-run-helper# python3 cli_classification.py --post_processing --axmodel_path models/mobilenetv2.axmodel --intermediate_path sim_outputs/0
347 |     [I] The following are the predicted score index pair.
348 |     [I] 9.1132, 285
349 |     [I] 8.8490, 281
350 |     [I] 8.7169, 282
351 |     [I] 8.0566, 283
352 |     [I] 6.8679, 463
353 | 
354 | .. _onboard_running_20e:
355 | 
356 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
357 | 开发板运行
358 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
359 | 
360 | 本章节介绍如何在 ``AX630C`` / ``AX620Q`` 开发板上运行通过 :ref:`《编译执行》 <model_compile_20e>` 章节获取的 ``compiled.axmodel`` 模型.
361 | 
362 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
363 | 开发板获取
364 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
365 | 
366 | - 通过企业途径向 AXera 签署 NDA 后获取 **AX630C DEMO Board**.
367 | 
368 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
369 | 使用 ax_run_model 工具快速测试模型推理速度
370 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
371 | 
372 | 为了方便用户测评模型，在开发板上预置了 :ref:`ax_run_model <ax_run_model>` 工具，此工具有若干参数，可以很方便地测试模型速度和精度。
373 | 
374 | 将 ``mobilenetv2.axmodel`` 拷贝到开发板上，执行以下命令即可快速测试模型推理性能（首先推理 3 次进行预热，以排除资源初始化导致的统计误差，然后推理 10 次，统计平均推理速度）。
375 | 
376 | .. code-block:: shell
377 | 
378 |     /root # ax_run_model -m /opt/data/npu/models/mobilenetv2.axmodel -w 3 -r 10
379 |       Run AxModel:
380 |             model: /opt/data/npu/models/mobilenetv2.axmodel
381 |              type: Half
382 |              vnpu: Disable
383 |          affinity: 0b01
384 |            warmup: 3
385 |            repeat: 10
386 |             batch: { auto: 0 }
387 |       pulsar2 ver: 1.8-beta1 6a7e59de
388 |        engine ver: 2.6.3sp
389 |          tool ver: 2.3.3sp
390 |          cmm size: 4414192 Bytes
391 |       ------------------------------------------------------
392 |       min =   1.093 ms   max =   1.098 ms   avg =   1.096 ms
393 |       ------------------------------------------------------
394 | 
395 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
396 | 使用 sample_npu_classification 示例测试单张图片推理结果
397 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
398 | 
399 | .. hint::
400 | 
401 |     该运行示例已经预装在开发板的文件系统中，其源文件位于 SDK 的 ``msp/sample/npu`` 路径下。将 ``mobilenetv2.axmodel`` 拷贝到开发板上，使用 ``sample_npu_classification`` 进行测试。
402 | 
403 | ``sample_npu_classification`` 输入参数说明: 
404 | 
405 | .. code-block:: shell
406 | 
407 |     /root # sample_npu_classification --help
408 |     usage: sample_npu_classification --model=string --image=string [options] ...
409 |     options:
410 |       -m, --model     joint file(a.k.a. joint model) (string)
411 |       -i, --image     image file (string)
412 |       -g, --size      input_h, input_w (string [=224,224])
413 |       -r, --repeat    repeat count (int [=1])
414 |       -?, --help      print this message
415 | 
416 | 通过执行 ``sample_npu_classification`` 程序实现分类模型板上运行, 运行结果如下:
417 | 
418 | .. code-block:: shell
419 | 
420 |     /root # sample_npu_classification -m mobilenetv2.axmodel -i /opt/data/npu/images/cat.jpg -r 100
421 |     --------------------------------------
422 |     model file : mobilenetv2.axmodel
423 |     image file : /opt/data/npu/images/cat.jpg
424 |     img_h, img_w : 224 224
425 |     --------------------------------------
426 |     Engine creating handle is done.
427 |     Engine creating context is done.
428 |     Engine get io info is done.
429 |     Engine alloc io is done.
430 |     Engine push input is done.
431 |     --------------------------------------
432 |     topk cost time:0.10 ms
433 |     9.1132, 285
434 |     8.8490, 281
435 |     8.7169, 282
436 |     8.0566, 283
437 |     6.8679, 463
438 |     --------------------------------------
439 |     Repeat 100 times, avg time 1.09 ms, max_time 1.10 ms, min_time 1.09 ms
440 |     --------------------------------------
441 | 
442 | - 从这里可知，同一个 ``mobilenetv2.axmodel`` 模型在开发板上运行的结果与 :ref:`《仿真运行》 <model_simulator_20e>` 的结果一致；
443 | - 板上可执行程序 ``ax_classification`` 相关源码及编译生成详情请参考 :ref:`《模型部署进阶指南》 <model_deploy_advanced>`。 
444 | 


--------------------------------------------------------------------------------
/source/user_guides_quick/quick_start_ax650.rst:
--------------------------------------------------------------------------------
  1 | ======================
  2 | Quick Start(AX650)
  3 | ======================
  4 | 
  5 | **本章节适用于以下平台：**
  6 | 
  7 | - AX650A
  8 | - AX650N
  9 | - M76H
 10 | 
 11 | 本章节介绍 ``ONNX`` 模型转换的基本操作, 使用 ``pulsar2`` 工具将 ``ONNX``  模型编译成 ``axmodel`` 模型. 请先参考 :ref:`《开发环境准备》 <dev_env_prepare>` 章节完成开发环境搭建. 
 12 | 本节示例模型为开源模型 ``MobileNetv2``.
 13 | 
 14 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 15 | Pulsar2 工具链命令说明
 16 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 17 | 
 18 | ``Pulsar2`` 工具链中的功能指令以 ``pulsar2`` 开头, 与用户强相关的命令为 ``pulsar2 build`` , ``pulsar2 run`` 以及 ``pulsar2 version``. 
 19 | 
 20 | * ``pulsar2 build`` 用于将 ``onnx`` 模型转换为 ``axmodel`` 格式模型
 21 | * ``pulsar2 run`` 用于模型转换后的仿真运行
 22 | * ``pulsar2 version`` 可以用于查看当前工具链的版本信息, 通常在反馈问题时需要提供此信息
 23 | 
 24 | .. code-block:: shell
 25 | 
 26 |     root@xxx:/data# pulsar2 --help
 27 |     usage: pulsar2 [-h] {version,build,run} ...
 28 |     
 29 |     positional arguments:
 30 |       {version,build,run}
 31 |     
 32 |     optional arguments:
 33 |       -h, --help           show this help message and exit
 34 | 
 35 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 36 | 模型编译配置文件说明
 37 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 38 | 
 39 | ``/data/config/`` 路径下的 ``mobilenet_v2_build_config.json`` 展示:
 40 | 
 41 | .. code-block:: shell
 42 | 
 43 |     {
 44 |       "model_type": "ONNX",
 45 |       "npu_mode": "NPU1",
 46 |       "quant": {
 47 |         "input_configs": [
 48 |           {
 49 |             "tensor_name": "input",
 50 |             "calibration_dataset": "./dataset/imagenet-32-images.tar",
 51 |             "calibration_size": 32,
 52 |             "calibration_mean": [103.939, 116.779, 123.68],
 53 |             "calibration_std": [58.0, 58.0, 58.0]
 54 |           }
 55 |         ],
 56 |         "calibration_method": "MinMax",
 57 |         "precision_analysis": false
 58 |       },
 59 |       "input_processors": [
 60 |         {
 61 |           "tensor_name": "input",
 62 |           "tensor_format": "BGR",
 63 |           "src_format": "BGR",
 64 |           "src_dtype": "U8",
 65 |           "src_layout": "NHWC",
 66 |           "csc_mode": "NoCSC"
 67 |         }
 68 |       ],
 69 |       "compiler": {
 70 |         "check": 0
 71 |       }
 72 |     }
 73 | 
 74 | .. attention::
 75 | 
 76 |     ``input_processors``, ``output_processors`` 及 ``quant`` 节点下 ``input_configs`` 中的 ``tensor_name`` 字段需要根据模型的实际输入/输出节点名称进行设置，也可以设置为 ``DEFAULT`` 代表当前配置应用于全部输入或者输出。
 77 | 
 78 |     .. figure:: ../media/tensor_name.png
 79 |         :alt: pipeline
 80 |         :align: center
 81 | 
 82 | 更加详细的内容，请参考 :ref:`《配置文件详细说明》 <config_details>`.
 83 | 
 84 | .. _model_compile:
 85 | 
 86 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 87 | 编译执行
 88 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 89 | 
 90 | 以 ``mobilenetv2-sim.onnx`` 为例, 执行如下 ``pulsar2 build`` 命令编译生成 ``compiled.axmodel``:
 91 | 
 92 | .. code-block:: shell
 93 | 
 94 |     pulsar2 build  --target_hardware AX650 --input model/mobilenetv2-sim.onnx --output_dir output --config config/mobilenet_v2_build_config.json
 95 | 
 96 | .. warning::
 97 | 
 98 |     在编译模型前，需要确保已经对原始模型使用过 ``onnxsim`` 工具优化，主要目的是将模型转变成更利于 ``Pulsar2`` 编译的静态图及获得更好的推理性能。有以下两种方法：
 99 | 
100 |     1. 在 ``Pulsar2`` docker 内部直接执行命令：``onnxsim in.onnx out.onnx``。
101 |     2. 使用 ``pulsar2 build`` 进行模型转换时，增加参数：``--onnx_opt.enable_onnxsim true`` （默认值为 false）。
102 | 
103 |     如果想要进一步了解 ``onnxsim`` ，可访问 `官方网站 <https://github.com/daquexian/onnx-simplifier>`_ 。
104 | 
105 | ^^^^^^^^^^^^^^^^^^^^^
106 | log 参考信息
107 | ^^^^^^^^^^^^^^^^^^^^^
108 | 
109 | .. code-block::
110 | 
111 |     2024-09-25 11:45:26.533 | WARNING  | yamain.command.build:fill_default:300 - apply default output processor configuration to ['output']
112 |     2024-09-25 11:45:26.533 | WARNING  | yamain.command.build:fill_default:364 - ignore input csc config because of src_format is AutoColorSpace or src_format and tensor_format are the same
113 |     2024-09-25 11:45:26.534 | INFO     | yamain.common.util:extract_archive:181 - extract [dataset/imagenet-32-images.tar] to [output/quant/dataset/input]...
114 |     32 File(s) Loaded.
115 |     Building onnx ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% 0:00:00
116 |     2024-09-25 11:45:27.422 | INFO     | yamain.command.build:quant:797 - save optimized onnx to [output/frontend/optimized.onnx]
117 |                                    Quant Config Table                               
118 |     ┏━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┓
119 |     ┃       ┃           ┃ Dataset   ┃ Data      ┃ Tensor    ┃           ┃          ┃
120 |     ┃ Input ┃ Shape     ┃ Directory ┃ Format    ┃ Format    ┃ Mean      ┃ Std      ┃
121 |     ┡━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━┩
122 |     │ input │ [1, 3,    │ output/q… │ Image     │ BGR       │ [103.939… │ [58.0,   │
123 |     │       │ 224, 224] │           │           │           │ 116.7789… │ 58.0,    │
124 |     │       │           │           │           │           │ 123.6800… │ 58.0]    │
125 |     └───────┴───────────┴───────────┴───────────┴───────────┴───────────┴──────────┘
126 |     Transformer optimize level: 0
127 |     32 File(s) Loaded.
128 |     
129 |     Stastic Inf tensor:   0%|          | 0/1 [00:00<?, ?it/s]
130 |     Stastic Inf tensor: 100%|██████████| 1/1 [00:00<00:00,  9.09it/s]
131 |     Stastic Inf tensor: 100%|██████████| 1/1 [00:00<00:00,  9.06it/s]
132 |     [11:45:28] AX Set Float Op Table Pass Running ...         
133 |     [11:45:28] AX Set MixPrecision Pass Running ...           
134 |     [11:45:28] AX Set LN Quant dtype Quant Pass Running ...   
135 |     [11:45:28] AX Reset Mul Config Pass Running ...           
136 |     [11:45:28] AX Refine Operation Config Pass Running ...    
137 |     [11:45:28] AX Tanh Operation Format Pass Running ...      
138 |     [11:45:28] AX Confused Op Refine Pass Running ...         
139 |     [11:45:28] AX Quantization Fusion Pass Running ...        
140 |     [11:45:28] AX Quantization Simplify Pass Running ...      
141 |     [11:45:28] AX Parameter Quantization Pass Running ...     
142 |     [11:45:29] AX Runtime Calibration Pass Running ...        
143 |     
144 |     Calibration Progress(Phase 1):   0%|          | 0/32 [00:00<?, ?it/s]
145 |     Calibration Progress(Phase 1):   3%|▎         | 1/32 [00:00<00:03,  9.10it/s]
146 |     Calibration Progress(Phase 1):   6%|▋         | 2/32 [00:00<00:03,  9.09it/s]
147 |     Calibration Progress(Phase 1):   9%|▉         | 3/32 [00:00<00:03,  9.05it/s]
148 |     Calibration Progress(Phase 1):  12%|█▎        | 4/32 [00:00<00:03,  9.02it/s]
149 |     Calibration Progress(Phase 1):  16%|█▌        | 5/32 [00:00<00:02,  9.00it/s]
150 |     Calibration Progress(Phase 1):  19%|█▉        | 6/32 [00:00<00:02,  8.96it/s]
151 |     Calibration Progress(Phase 1):  22%|██▏       | 7/32 [00:00<00:02,  9.03it/s]
152 |     Calibration Progress(Phase 1):  25%|██▌       | 8/32 [00:00<00:02,  9.03it/s]
153 |     Calibration Progress(Phase 1):  28%|██▊       | 9/32 [00:00<00:02,  9.03it/s]
154 |     Calibration Progress(Phase 1):  31%|███▏      | 10/32 [00:01<00:02,  9.02it/s]
155 |     Calibration Progress(Phase 1):  34%|███▍      | 11/32 [00:01<00:02,  9.00it/s]
156 |     Calibration Progress(Phase 1):  38%|███▊      | 12/32 [00:01<00:02,  8.94it/s]
157 |     Calibration Progress(Phase 1):  41%|████      | 13/32 [00:01<00:02,  8.95it/s]
158 |     Calibration Progress(Phase 1):  44%|████▍     | 14/32 [00:01<00:02,  8.96it/s]
159 |     Calibration Progress(Phase 1):  47%|████▋     | 15/32 [00:01<00:01,  8.92it/s]
160 |     Calibration Progress(Phase 1):  50%|█████     | 16/32 [00:01<00:01,  8.89it/s]
161 |     Calibration Progress(Phase 1):  53%|█████▎    | 17/32 [00:01<00:01,  8.90it/s]
162 |     Calibration Progress(Phase 1):  56%|█████▋    | 18/32 [00:02<00:01,  8.89it/s]
163 |     Calibration Progress(Phase 1):  59%|█████▉    | 19/32 [00:02<00:01,  8.86it/s]
164 |     Calibration Progress(Phase 1):  62%|██████▎   | 20/32 [00:02<00:01,  8.93it/s]
165 |     Calibration Progress(Phase 1):  66%|██████▌   | 21/32 [00:02<00:01,  8.90it/s]
166 |     Calibration Progress(Phase 1):  69%|██████▉   | 22/32 [00:02<00:01,  8.93it/s]
167 |     Calibration Progress(Phase 1):  72%|███████▏  | 23/32 [00:02<00:01,  8.91it/s]
168 |     Calibration Progress(Phase 1):  75%|███████▌  | 24/32 [00:02<00:00,  8.89it/s]
169 |     Calibration Progress(Phase 1):  78%|███████▊  | 25/32 [00:02<00:00,  8.91it/s]
170 |     Calibration Progress(Phase 1):  81%|████████▏ | 26/32 [00:02<00:00,  8.87it/s]
171 |     Calibration Progress(Phase 1):  84%|████████▍ | 27/32 [00:03<00:00,  8.89it/s]
172 |     Calibration Progress(Phase 1):  88%|████████▊ | 28/32 [00:03<00:00,  8.91it/s]
173 |     Calibration Progress(Phase 1):  91%|█████████ | 29/32 [00:03<00:00,  8.86it/s]
174 |     Calibration Progress(Phase 1):  94%|█████████▍| 30/32 [00:03<00:00,  8.85it/s]
175 |     Calibration Progress(Phase 1):  97%|█████████▋| 31/32 [00:03<00:00,  8.77it/s]
176 |     Calibration Progress(Phase 1): 100%|██████████| 32/32 [00:03<00:00,  8.74it/s]
177 |     Calibration Progress(Phase 1): 100%|██████████| 32/32 [00:03<00:00,  8.91it/s]
178 |     [11:45:32] AX Quantization Alignment Pass Running ...     
179 |     [11:45:32] AX Refine Int Parameter Pass Running ...       
180 |     [11:45:33] AX Refine Scale Pass Running ...               
181 |     [11:45:33] AX Passive Parameter Quantization Running ...  
182 |     [11:45:33] AX Parameter Baking Pass Running ...           
183 |     --------- Network Snapshot ---------
184 |     Num of Op:                    [100]
185 |     Num of Quantized Op:          [100]
186 |     Num of Variable:              [278]
187 |     Num of Quantized Var:         [278]
188 |     ------- Quantization Snapshot ------
189 |     Num of Quant Config:          [387]
190 |     BAKED:                        [53]
191 |     OVERLAPPED:                   [145]
192 |     ACTIVATED:                    [65]
193 |     SOI:                          [1]
194 |     PASSIVE_BAKED:                [53]
195 |     FP32:                         [70]
196 |     Network Quantization Finished.
197 |     quant.axmodel export success: 
198 |     	/data/deploy/data/quick_start_example/output/quant/quant_axmodel.onnx
199 |     	/data/deploy/data/quick_start_example/output/quant/quant_axmodel.data
200 |     ===>export pb data to folder: output/quant/debug/test_data_set_0
201 |     ===>export io data to folder: output/quant/debug/io
202 |     Building native ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% 0:00:00
203 |     2024-09-25 11:45:33.944 | INFO     | yamain.command.build:compile_ptq_model:1035 - group 0 compiler transformation
204 |     2024-09-25 11:45:33.946 | WARNING  | yamain.command.load_model:pre_process:608 - preprocess tensor [input]
205 |     2024-09-25 11:45:33.946 | INFO     | yamain.command.load_model:pre_process:609 - tensor: input, (1, 224, 224, 3), U8
206 |     2024-09-25 11:45:33.947 | INFO     | yamain.command.load_model:pre_process:609 - op: op:pre_dequant_1, AxDequantizeLinear, {'const_inputs': {'x_zeropoint': array(0, dtype=int32), 'x_scale': array(1., dtype=float32)}, 'output_dtype': <class 'numpy.float32'>, 'quant_method': 0}
207 |     2024-09-25 11:45:33.947 | INFO     | yamain.command.load_model:pre_process:609 - tensor: tensor:pre_norm_1, (1, 224, 224, 3), FP32
208 |     2024-09-25 11:45:33.947 | INFO     | yamain.command.load_model:pre_process:609 - op: op:pre_norm_1, AxNormalize, {'dim': 3, 'mean': [103.93900299072266, 116.77899932861328, 123.68000030517578], 'std': [58.0, 58.0, 58.0], 'output_dtype': FP32}
209 |     2024-09-25 11:45:33.947 | INFO     | yamain.command.load_model:pre_process:609 - tensor: tensor:pre_transpose_1, (1, 224, 224, 3), FP32
210 |     2024-09-25 11:45:33.947 | INFO     | yamain.command.load_model:pre_process:609 - op: op:pre_transpose_1, AxTranspose, {'perm': [0, 3, 1, 2]}
211 |     2024-09-25 11:45:33.947 | WARNING  | yamain.command.load_model:post_process:630 - postprocess tensor [output]
212 |     2024-09-25 11:45:34.159 | INFO     | yamain.command.build:compile_ptq_model:1060 - QuantAxModel macs: 280,262,480
213 |     2024-09-25 11:45:34.169 | INFO     | yamain.command.build:compile_ptq_model:1132 - subgraph [0], group: 0, type: GraphType.NPU
214 |     2024-09-25 11:45:34.187 | INFO     | yasched.test_onepass:test_onepass_ir:3221 - schedule npu subgraph [0]
215 |     tiling op...   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 68/68 0:00:00
216 |     <frozen backend.ax650npu.oprimpl.normalize>:186: RuntimeWarning: divide by zero encountered in divide
217 |     <frozen backend.ax650npu.oprimpl.normalize>:187: RuntimeWarning: invalid value encountered in divide
218 |     new_ddr_tensor = []
219 |     build op serially...   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 103/103 0:00:00
220 |     build op...   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 188/188 0:00:00
221 |     add ddr swap...   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 497/497 0:00:00
222 |     calc input dependencies...   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 921/921 0:00:00
223 |     calc output dependencies...   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 921/921 0:00:00
224 |     assign eu heuristic   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 921/921 0:00:00
225 |     assign eu onepass   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 921/921 0:00:00
226 |     assign eu greedy   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 921/921 0:00:00
227 |     2024-09-25 11:45:36.467 | INFO     | yasched.test_onepass:results2model:2541 - clear job deps
228 |     2024-09-25 11:45:36.467 | INFO     | yasched.test_onepass:results2model:2542 - max_cycle = 450,154
229 |     build jobs   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 921/921 0:00:00
230 |     2024-09-25 11:45:36.796 | INFO     | yamain.command.build:compile_npu_subgraph:1332 - assembel model [subgraph_npu_0]
231 |     2024-09-25 11:45:38.075 | INFO     | yamain.command.build:compile_ptq_model:1142 - fuse 1 subgraph(s)
232 | 
233 | .. note::
234 | 
235 |     该示例所运行的主机配置为:
236 | 
237 |         - Intel(R) Xeon(R) Gold 6336Y CPU @ 2.40GHz
238 |         - Memory 32G
239 | 
240 |     全流程耗时大约 ``11s`` , 不同配置的主机转换时间略有差异.
241 | 
242 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
243 | 模型编译输出文件说明
244 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
245 | 
246 | .. code-block:: shell  
247 | 
248 |     root@xxx:/data# tree output/
249 |     output/
250 |     ├── build_context.json
251 |     ├── compiled.axmodel            # 最终板上运行模型，AxModel
252 |     ├── compiler                    # 编译器后端中间结果及 debug 信息
253 |     ├── frontend                    # 前端图优化中间结果及 debug 信息
254 |     │   └── optimized.onnx          # 输入模型经过图优化以后的浮点 ONNX 模型
255 |     └── quant                       # 量化工具输出及 debug 信息目录
256 |         ├── dataset                 # 解压后的校准集数据目录
257 |         │   └── input
258 |         │       ├── ILSVRC2012_val_00000001.JPEG
259 |         │       ├── ......
260 |         │       └── ILSVRC2012_val_00000032.JPEG
261 |         ├── debug
262 |         ├── quant_axmodel.json      # 量化配置信息
263 |         └── quant_axmodel.onnx      # 量化后的模型，QuantAxModel
264 | 
265 | 其中 ``compiled.axmodel`` 为最终编译生成的板上可运行的 ``.axmodel`` 模型文件。
266 | 
267 | .. note::
268 | 
269 |     因为 ``.axmodel`` 基于 **ONNX** 模型存储格式开发，所以将 ``.axmodel`` 文件后缀修改为 ``.axmodel.onnx`` 后可支持被网络模型图形化工具 **Netron** 直接打开。
270 | 
271 |     .. figure:: ../media/axmodel-netron.png
272 |         :alt: pipeline
273 |         :align: center
274 | 
275 | ----------------------
276 | 模型信息查询
277 | ----------------------
278 | 
279 | 可以通过 ``onnx inspect --io ${axmodel/onnx_path}`` 来查看编译后 ``axmodel`` 模型的输入输出信息，还有其他 ``-m -n -t`` 参数可以查看模型里的 ``meta / node / tensor`` 信息。
280 | 
281 | .. code-block:: shell
282 | 
283 |     root@xxx:/data# onnx inspect -m -n -t output/compiled.axmodel
284 |     Failed to check model output/compiled.axmodel, statistic could be inaccurate!
285 |     Inpect of model output/compiled.axmodel
286 |     ================================================================================
287 |       Graph name: 8
288 |       Graph inputs: 1
289 |       Graph outputs: 1
290 |       Nodes in total: 1
291 |       ValueInfo in total: 4
292 |       Initializers in total: 2
293 |       Sparse Initializers in total: 0
294 |       Quantization in total: 0
295 |     
296 |     Meta information:
297 |     --------------------------------------------------------------------------------
298 |       IR Version: 8
299 |       Opset Import: [domain: ""
300 |     version: 16
301 |     ]
302 |       Producer name: Pulsar2
303 |       Producer version: 
304 |       Domain: 
305 |       Doc string: Pulsar2 Version:  2.4
306 |     Pulsar2 Commit: 2064a8ee
307 |       meta.{} = {} extra_data CgsKBWlucHV0EAEYAgoICgZvdXRwdXQSATEaQQoOc3ViZ3JhcGhfbnB1XzBSLwoVc3ViZ3JhcGhfbnB1XzBfYjFfbmV1EAEaFAoGcGFyYW1zGgpucHVfcGFyYW1zIgA=
308 |     
309 |     Node information:
310 |     --------------------------------------------------------------------------------
311 |       Node type "neu mode" has: 1
312 |     --------------------------------------------------------------------------------
313 |       Node "subgraph_npu_0": type "neu mode", inputs "['input']", outputs "['output']"
314 |     
315 |     Tensor information:
316 |     --------------------------------------------------------------------------------
317 |       ValueInfo "input": type UINT8, shape [1, 224, 224, 3],
318 |       ValueInfo "npu_params": type UINT8, shape [4085516],
319 |       ValueInfo "subgraph_npu_0_b1_neu": type UINT8, shape [56592],
320 |       ValueInfo "output": type FLOAT, shape [1, 1000],
321 |       Initializer "npu_params": type UINT8, shape [4085516],
322 |       Initializer "subgraph_npu_0_b1_neu": type UINT8, shape [56592],
323 | 
324 | .. _model_simulator:
325 | 
326 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
327 | 仿真运行
328 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
329 | 
330 | 本章节介绍 ``axmodel`` 仿真运行的基本操作, 使用 ``pulsar2 run`` 命令可以在 ``PC`` 上直接运行由 ``pulsar2 build`` 生成的 ``axmodel`` 模型，无需上板运行即可快速得到网络模型的运行结果。
331 | 
332 | ^^^^^^^^^^^^^^^^^^^^^
333 | 仿真运行准备
334 | ^^^^^^^^^^^^^^^^^^^^^
335 | 
336 | 某些模型只能支持特定的输入数据格式，模型的输出数据也是以模组特定的格式输出的。在模型仿真运行前，需要把输入数据转换成模型支持的数据格式，这部分数据操作称为 ``前处理`` 。在模型仿真运行后，需要把输出数据转换成工具可以分析查看的数据格式，这部分数据操作称为 ``后处理`` 。仿真运行时需要的 ``前处理`` 和 ``后处理`` 工具已包含在 ``pulsar2-run-helper`` 文件夹中。
337 | 
338 | ``pulsar2-run-helper`` 文件夹内容如下所示：
339 | 
340 | .. code-block:: shell
341 | 
342 |     root@xxx:/data# ll pulsar2-run-helper/
343 |     drwxr-xr-x 2 root root 4.0K Dec  2 12:23 models/
344 |     drwxr-xr-x 5 root root 4.0K Dec  2 12:23 pulsar2_run_helper/
345 |     drwxr-xr-x 2 root root 4.0K Dec  2 12:23 sim_images/
346 |     drwxr-xr-x 2 root root 4.0K Dec  2 12:23 sim_inputs/
347 |     drwxr-xr-x 2 root root 4.0K Dec  2 12:23 sim_outputs/
348 |     -rw-r--r-- 1 root root 3.0K Dec  2 12:23 cli_classification.py
349 |     -rw-r--r-- 1 root root 4.6K Dec  2 12:23 cli_detection.py
350 |     -rw-r--r-- 1 root root    2 Dec  2 12:23 list.txt
351 |     -rw-r--r-- 1 root root   29 Dec  2 12:23 requirements.txt
352 |     -rw-r--r-- 1 root root  308 Dec  2 12:23 setup.cfg
353 | 
354 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
355 | 仿真运行示例 ``mobilenetv2``
356 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
357 | 
358 | 将 :ref:`《编译执行》 <model_compile>` 章节生成的 ``compiled.axmodel`` 拷贝到 ``pulsar2-run-helper/models`` 路径下，并更名为 ``mobilenetv2.axmodel``。
359 | 
360 | .. code-block:: shell
361 | 
362 |     root@xxx:/data# cp output/compiled.axmodel pulsar2-run-helper/models/mobilenetv2.axmodel
363 | 
364 | ----------------------
365 | 输入数据准备
366 | ----------------------
367 | 
368 | 进入 ``pulsar2-run-helper`` 目录，使用 ``cli_classification.py`` 脚本将 ``cat.jpg`` 处理成 ``mobilenetv2.axmodel`` 所需要的输入数据格式。
369 | 
370 | .. code-block:: shell
371 | 
372 |     root@xxx:/data# cd pulsar2-run-helper
373 |     root@xxx:/data/pulsar2-run-helper# python3 cli_classification.py --pre_processing --image_path sim_images/cat.jpg --axmodel_path models/mobilenetv2.axmodel --intermediate_path sim_inputs/0
374 |     [I] Write [input] to 'sim_inputs/0/input.bin' successfully.
375 | 
376 | ----------------------
377 | 仿真模型推理
378 | ----------------------
379 | 
380 | 运行 ``pulsar2 run`` 命令，将 ``input.bin`` 作为 ``mobilenetv2.axmodel`` 的输入数据并执行推理计算，输出 ``output.bin`` 推理结果。
381 | 
382 | .. code-block:: shell
383 | 
384 |     root@xxx:/data/pulsar2-run-helper# pulsar2 run --model models/mobilenetv2.axmodel --input_dir sim_inputs --output_dir sim_outputs --list list.txt
385 |     Building native ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% 0:00:00
386 |     >>> [0] start
387 |     write [output] to [sim_outputs/0/output.bin] successfully
388 |     >>> [0] finish
389 | 
390 | ----------------------
391 | 输出数据处理
392 | ----------------------
393 | 
394 | 使用 ``cli_classification.py`` 脚本对仿真模型推理输出的 ``output.bin`` 数据进行后处理，得到最终计算结果。
395 | 
396 | .. code-block:: shell
397 | 
398 |     root@xxx:/data/pulsar2-run-helper# python3 cli_classification.py --post_processing --axmodel_path models/mobilenetv2.axmodel --intermediate_path sim_outputs/0
399 |     [I] The following are the predicted score index pair.
400 |     [I] 9.5094, 285
401 |     [I] 9.3773, 282
402 |     [I] 9.2452, 281
403 |     [I] 8.5849, 283
404 |     [I] 7.6603, 287
405 | 
406 | .. _onboard_running:
407 | 
408 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
409 | 开发板运行
410 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
411 | 
412 | 本章节介绍如何在 ``AX650`` 或 ``M76H`` 开发板上运行通过 :ref:`《编译执行》 <model_compile>` 章节获取的 ``compiled.axmodel`` 模型.
413 | 
414 | ^^^^^^^^^^^^^^^^^^^^^
415 | 开发板获取
416 | ^^^^^^^^^^^^^^^^^^^^^
417 | 
418 | - 通过企业途径与 AXera 签署 NDA 后获取 **AX650 或 M76H EVB**.
419 | 
420 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
421 | 使用 ax_run_model 工具快速测试模型推理速度
422 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
423 | 
424 | 为了方便用户测评模型，在开发板上预置了 :ref:`ax_run_model <ax_run_model>` 工具，此工具有若干参数，可以很方便地测试模型速度和精度。
425 | 
426 | 将 ``mobilenetv2.axmodel`` 拷贝到开发板上，执行以下命令即可快速测试模型推理性能（首先推理 3 次进行预热，以排除资源初始化导致的统计误差，然后推理 10 次，统计平均推理速度）。
427 | 
428 | .. code-block:: shell
429 | 
430 |     /root # ax_run_model -m mobilenetv2.axmodel -w 3 -r 10
431 |       Run AxModel:
432 |             model: mobilenetv2.axmodel
433 |              type: 1 Core
434 |              vnpu: Disable
435 |          affinity: 0b001
436 |            warmup: 3
437 |            repeat: 10
438 |             batch: { auto: 1 }
439 |          parallel: false
440 |       pulsar2 ver: 1.2-patch2 7e6b2b5f
441 |        engine ver: 2.3.0a
442 |          tool ver: 2.1.2c
443 |          cmm size: 4428624 Bytes
444 |       ------------------------------------------------------
445 |       min =   0.719 ms   max =   0.726 ms   avg =   0.721 ms
446 |       ------------------------------------------------------
447 | 
448 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
449 | 使用 sample_npu_classification 示例测试单张图片推理结果
450 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
451 | 
452 | .. hint::
453 | 
454 |     该运行示例已经预装在开发板的文件系统中，其源文件位于 SDK 的 ``msp/sample/npu`` 路径下。将 ``mobilenetv2.axmodel`` 拷贝到开发板上，使用 ``sample_npu_classification`` 进行测试。
455 | 
456 | ``sample_npu_classification`` 输入参数说明: 
457 | 
458 | .. code-block:: shell
459 | 
460 |     /root # sample_npu_classification --help
461 |     usage: sample_npu_classification --model=string --image=string [options] ...
462 |     options:
463 |       -m, --model     joint file(a.k.a. joint model) (string)
464 |       -i, --image     image file (string)
465 |       -g, --size      input_h, input_w (string [=224,224])
466 |       -r, --repeat    repeat count (int [=1])
467 |       -?, --help      print this message
468 | 
469 | 通过执行 ``sample_npu_classification`` 程序实现分类模型板上运行, 运行结果如下:
470 | 
471 | .. code-block:: shell
472 | 
473 |     /root # sample_npu_classification -m mobilenetv2.axmodel -i /opt/data/npu/images/cat.jpg -r 10
474 |     --------------------------------------
475 |     model file : mobilenetv2.axmodel
476 |     image file : /opt/data/npu/images/cat.jpg
477 |     img_h, img_w : 224 224
478 |     --------------------------------------
479 |     Engine creating handle is done.
480 |     Engine creating context is done.
481 |     Engine get io info is done.
482 |     Engine alloc io is done.
483 |     Engine push input is done.
484 |     --------------------------------------
485 |     topk cost time:0.07 ms
486 |     9.5094, 285
487 |     9.3773, 282
488 |     9.2452, 281
489 |     8.5849, 283
490 |     7.6603, 287
491 |     --------------------------------------
492 |     Repeat 10 times, avg time 0.72 ms, max_time 0.72 ms, min_time 0.72 ms
493 |     --------------------------------------
494 | 
495 | - 从这里可知，同一个 ``mobilenetv2.axmodel`` 模型在开发板上运行的结果与 :ref:`《仿真运行》 <model_simulator>` 的结果一致；
496 | - 板上可执行程序 ``sample_npu_classification`` 相关源码及编译生成详情请参考 :ref:`《模型部署进阶指南》 <model_deploy_advanced>`。
497 | 


--------------------------------------------------------------------------------
/source/user_guides_quick/quick_start_prepare.rst:
--------------------------------------------------------------------------------
  1 | ======================
  2 | 开发环境准备
  3 | ======================
  4 | 
  5 | 本节介绍使用 ``Pulsar2`` 工具链前的开发环境准备工作.
  6 | 
  7 | ``Pulsar2`` 使用 ``Docker`` 容器进行工具链集成, 用户可以通过 ``Docker`` 加载 ``Pulsar2`` 镜像文件, 然后进行模型转换、编译、仿真等工作, 因此开发环境准备阶段只需要正确安装 ``Docker`` 环境即可. 支持的系统包括 ``MacOS``, ``Linux``, ``Windows``.
  8 | 
  9 | .. _dev_env_prepare:
 10 | 
 11 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 12 | 安装 Docker 开发环境
 13 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 14 | 
 15 | - `MacOS 安装 Docker 环境 <https://docs.docker.com/desktop/mac/install/>`_
 16 | 
 17 | - `Linux 安装 Docker 环境 <https://docs.docker.com/engine/install/#server>`_
 18 | 
 19 | - `Windows 安装 Docker 环境 <https://docs.docker.com/desktop/windows/install/>`_
 20 | 
 21 | ``Docker`` 安装成功后, 输入 ``sudo docker -v``
 22 | 
 23 | .. code-block:: shell
 24 | 
 25 |     $ sudo docker -v
 26 |     Docker version 20.10.7, build f0df350
 27 | 
 28 | 显示以上内容, 说明 ``Docker`` 已经安装成功. 下面将介绍 ``Pulsar2`` 工具链 ``Image`` 的安装和启动.
 29 | 
 30 | ~~~~~~~~~~~~~~~~~~~~~~~
 31 | 安装 Pulsar2 工具链
 32 | ~~~~~~~~~~~~~~~~~~~~~~~
 33 | 
 34 | 以系统版本为 ``Ubuntu 20.04``、工具链 ``ax_pulsar2_${version}.tar.gz`` 为例说明 ``Pulsar2`` 工具链的安装方法.
 35 | 
 36 | .. hint::
 37 | 
 38 |     实际操作时，请务必将 ${version} 替换为对应的工具链版本号。
 39 | 
 40 | 工具链获取途径：
 41 | 
 42 | - `百度网盘 <https://pan.baidu.com/s/1FazlPdW79wQWVY-Qn--qVQ?pwd=sbru>`_
 43 | - `Google Drive <https://drive.google.com/drive/folders/10rfQIAm5ktjJ1bRMsHbUanbAplIn3ium?usp=sharing>`_
 44 | 
 45 | ^^^^^^^^^^^^^^^^^^^^^^^
 46 | 载入 Docker Image
 47 | ^^^^^^^^^^^^^^^^^^^^^^^
 48 | 
 49 | 执行 ``sudo docker load -i ax_pulsar2_${version}.tar.gz`` 导入 docker 镜像文件. 正确导入镜像文件会打印以下日志:
 50 | 
 51 | .. code-block:: shell
 52 | 
 53 |     $ sudo docker load -i ax_pulsar2_${version}.tar.gz
 54 |     Loaded image: pulsar2:${version}
 55 | 
 56 | 完成后, 执行 ``sudo docker image ls``
 57 | 
 58 | .. code-block:: shell
 59 | 
 60 |     $ sudo docker image ls
 61 |     REPOSITORY   TAG          IMAGE ID       CREATED         SIZE
 62 |     pulsar2      ${version}   xxxxxxxxxxxx   9 seconds ago   3.27GB
 63 | 
 64 | 可以看到工具链镜像已经成功载入, 之后便可以基于此镜像启动容器.
 65 | 
 66 | ^^^^^^^^^^^^^^^^^^^^^^^
 67 | 启动工具链镜像
 68 | ^^^^^^^^^^^^^^^^^^^^^^^
 69 | 
 70 | 执行以下命令启动 ``Docker`` 容器, 运行成功后进入 ``bash`` 环境
 71 | 
 72 | .. code-block:: shell
 73 | 
 74 |     $ sudo docker run -it --net host --rm -v $PWD:/data pulsar2:${version}
 75 | 
 76 | ----------------------
 77 | 版本查询
 78 | ----------------------
 79 | 
 80 | ``pulsar2 version`` 用于获取工具的版本信息.
 81 | 
 82 | 示例结果
 83 | 
 84 | .. code-block:: bash
 85 | 
 86 |     root@xxx:/data# pulsar2 version
 87 |     version: ${version}
 88 |     commit: xxxxxxxx
 89 | 
 90 | .. _prepare_data:
 91 | 
 92 | ----------------------
 93 | 数据准备
 94 | ----------------------
 95 | 
 96 | .. hint::
 97 | 
 98 |     后续内容 **模型编译**、 **仿真运行** 所需要的 **原始模型** 、 **数据** 、 **图片** 、 **仿真工具** 已在 ``quick_start_example`` 文件夹中提供, 请 :download:`点击下载示例文件 <https://github.com/xiguadong/assets/releases/download/v0.1/quick_start_example.zip>`, 然后将下载的文件解压后拷贝到 ``docker`` 的 ``/data`` 路径下.
 99 | 
100 | .. code-block:: shell
101 | 
102 |     root@xxx:/data# ls
103 |     config  dataset  model  output  pulsar2-run-helper
104 | 
105 | * ``model``: 存放原始的 ``ONNX`` 模型 ``mobilenetv2-sim.onnx`` (预先已使用 ``onnxsim`` 将 ``mobilenetv2.onnx`` 进行计算图优化)
106 | * ``dataset``: 存放离线量化校准 (PTQ Calibration) 所需的数据集压缩包 (支持 tar、tar.gz、gz 等常见压缩格式)
107 | * ``config``: 存放运行依赖的配置文件, 如 ``mobilenet_v2_build_config.json``
108 | * ``output``: 存放结果输出
109 | * ``pulsar2-run-helper``: 支持 ``axmodel`` 在 X86 环境进行仿真运行的工具 
110 | 
111 | 数据准备工作完毕后, 目录树结构如下:
112 | 
113 | .. code-block:: shell
114 | 
115 |     root@xxx:/data# tree -L 2
116 |     .
117 |     ├── config
118 |     │   ├── mobilenet_v2_build_config.json
119 |     │   └── yolov5s_config.json
120 |     ├── dataset
121 |     │   ├── coco_4.tar
122 |     │   └── imagenet-32-images.tar
123 |     ├── model
124 |     │   ├── mobilenetv2-sim.onnx
125 |     │   └── yolov5s.onnx
126 |     ├── output
127 |     └── pulsar2-run-helper
128 |         ├── cli_classification.py
129 |         ├── cli_detection.py
130 |         ├── models
131 |         ├── pulsar2_run_helper
132 |         ├── requirements.txt
133 |         ├── setup.cfg
134 |         ├── sim_images
135 |         ├── sim_inputs
136 |         └── sim_outputs
137 | 


--------------------------------------------------------------------------------