├── quantization ├── __init__.py ├── ptq │ ├── channel_wise_correction.py │ ├── onnx2trt_step04_ppq_training.py │ ├── onnx2trt_step01_merge_mean_scale.py │ ├── onnx2trt_step05_pdq_optimize.py │ ├── onnx2trt_step03_weight_equalization.py │ ├── onnx2trt_step02_remove_output_sigmoid.py │ ├── onnx2trt_step06_extra_pdq_init.py │ ├── onnx2trt_step07_extra_pdq_training.py │ ├── remove_inititalizers_from_inputs.py │ ├── filter_cache.py │ ├── trt_utils.py │ ├── remove_edge_logits_QDQ.py │ ├── data_loader.py │ ├── create_json_inputs.py │ ├── quantization_filter.py │ ├── P01_MT_onnx2tensorRT_int8.py │ └── ppq_optimize.py ├── README.md ├── ppq_optimize.py ├── calib_cache2json.py ├── calib_json2calib.py ├── onnx_optimize.py ├── fiter_scales.py ├── onnx_change_resize_mode.py ├── calib_merge_json2calib.py ├── calib_filter_Sigmoid.py ├── find_nodes_onnx.py ├── onnx_remove_split_qdq.py ├── json_filter.py ├── trt_calibrator.py ├── onnx_export_v2.py ├── onnx_export.py ├── onnx_move_qdq_relu_forward.py ├── onnx_remove_dup_qdqs.py ├── compare_trt_trt.py ├── onnx2trt_lsq.py ├── C03_compare_trt_fp32_int8.py ├── C02_compare_trt_fp32_int8.py ├── P02_MT_onnx2tensorRT_int8.py ├── onnx2tensorRT_adaround.py ├── P01_MT_onnx2tensorRT_int8.py ├── P01_MT_onnx2tensorRT_int8_sample.py ├── compare_onnx_trt_v3.py ├── onnx_calibrator.py ├── compare_onnx_onnx_v2.py ├── onnx2trt.py ├── P03_MT_onnx2tensorRT_int8.py ├── compare_onnx_onnx_v3.py ├── compare_onnx_trt_v1.py └── qat │ └── ppq_optimize.py ├── onnx_optimize ├── README.md ├── __init__.py ├── step05_fuse_repconvs.py ├── step04_extract_sub_graph.py ├── step01_export_torch_to_onnx.py ├── step02_onnx_simplify.py └── step03_fuse_normalize_to_conv.py ├── .vscode └── settings.json ├── prune ├── README_dev.md └── README.md ├── 01.install.md ├── README.md ├── trt_calibrator.py ├── onnx2trt.py └── onnx_calibrator.py /quantization/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /quantization/ptq/channel_wise_correction.py: -------------------------------------------------------------------------------- 1 | # 调整 -------------------------------------------------------------------------------- /quantization/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # 量化流程 4 | 5 | 6 | 7 | 1. 
简单量化 -------------------------------------------------------------------------------- /quantization/ppq_optimize.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Step 01- 如果当前节点有多个后续节点,那么将QDQ移动到前面 -------------------------------------------------------------------------------- /quantization/ptq/onnx2trt_step04_ppq_training.py: -------------------------------------------------------------------------------- 1 | # 进行PPQ训练,获取好的结果 2 | 3 | -------------------------------------------------------------------------------- /quantization/ptq/onnx2trt_step01_merge_mean_scale.py: -------------------------------------------------------------------------------- 1 | # 将Step01的结果合入第一层Conv/或者强制添加一层1x1卷积 -------------------------------------------------------------------------------- /quantization/ptq/onnx2trt_step05_pdq_optimize.py: -------------------------------------------------------------------------------- 1 | # 对PPQ训练的模型,进行QDQ节点位置的调整优化 2 | 3 | -------------------------------------------------------------------------------- /quantization/ptq/onnx2trt_step03_weight_equalization.py: -------------------------------------------------------------------------------- 1 | # 连续的两个卷积层,进行输出部分的channel平衡(需要有输入数据) -------------------------------------------------------------------------------- /quantization/ptq/onnx2trt_step02_remove_output_sigmoid.py: -------------------------------------------------------------------------------- 1 | # 移除输出部分的sigmoid, reshape, transpose等结构 -------------------------------------------------------------------------------- /quantization/ptq/onnx2trt_step06_extra_pdq_init.py: -------------------------------------------------------------------------------- 1 | # 对PPQ训练的模型,添加部分QDQ节点,进一步优化QDQ模型的效果 2 | 3 | -------------------------------------------------------------------------------- /quantization/ptq/onnx2trt_step07_extra_pdq_training.py: -------------------------------------------------------------------------------- 1 | # 对PPQ训练的模型,添加部分QDQ节点,进一步优化QDQ模型的效果 2 | 3 | -------------------------------------------------------------------------------- /onnx_optimize/README.md: -------------------------------------------------------------------------------- 1 | ## onnx模型结构优化 2 | 3 | 4 | # 1 通用优化 5 | 6 | 7 | 8 | 9 | # 2. conv-bn merge 10 | 11 | 12 | 13 | 14 | # 3. 
normalize-conv merge -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "workbench.colorCustomizations": { 3 | "editor.lineHighlightBackground": "#1073cf2d", 4 | "editor.lineHighlightBorder": "#9fced11f", 5 | "activityBar.background": "#58024F", 6 | "titleBar.activeBackground": "#7B036E", 7 | "titleBar.activeForeground": "#FFF8FE" 8 | }, 9 | "python.formatting.provider": "black" 10 | } -------------------------------------------------------------------------------- /onnx_optimize/__init__.py: -------------------------------------------------------------------------------- 1 | from step01_export_torch_to_onnx import export_torch_to_onnx 2 | from step02_onnx_simplify import optimize_onnx_model 3 | from step03_fuse_normalize_to_conv import fuse_normalize_to_conv 4 | from step04_extract_sub_graph import extract_sub_graph 5 | 6 | __all__ = ["export_torch_to_onnx", 7 | "optimize_onnx_model", 8 | "fuse_normalize_to_conv", 9 | "extract_sub_graph", 10 | ] 11 | -------------------------------------------------------------------------------- /onnx_optimize/step05_fuse_repconvs.py: -------------------------------------------------------------------------------- 1 | import onnx 2 | 3 | 4 | def find_common_input(onnx_model, node): 5 | pass 6 | 7 | def fuse_repconvs(onnx_model): 8 | # Step 01: find rep convs 9 | for node in onnx_model.graph.node: 10 | if node.op_type == "Add": 11 | find_common_input(onnx_model, node) 12 | 13 | 14 | # step 02: merge rep conv weights 15 | 16 | 17 | # step 03: remove extra conv and bn nodes -------------------------------------------------------------------------------- /onnx_optimize/step04_extract_sub_graph.py: -------------------------------------------------------------------------------- 1 | import onnx 2 | 3 | 4 | def extract_sub_graph(input_path, output_path, input_names=None, output_names=None): 5 | onnx.utils.extract_model(input_path, output_path, input_names, output_names) 6 | 7 | 8 | if __name__ == "__main__": 9 | import sys 10 | 11 | input_path = sys.argv[1] 12 | output_path = sys.argv[2] 13 | input_names = ["input.1"] 14 | output_names = ["1080"] 15 | extract_sub_graph(input_path, output_path, input_names, output_names) 16 | -------------------------------------------------------------------------------- /quantization/calib_cache2json.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import struct 3 | import json 4 | 5 | 6 | scale_map = {} 7 | with open(sys.argv[1]) as fr: 8 | for line in fr.readlines()[1:]: 9 | name, value = line.strip().split(":") 10 | if value.strip() == "0": 11 | val = 0.0 12 | else: 13 | val = struct.unpack("!f", bytes.fromhex(value.strip()))[0] 14 | 15 | scale_map[name] = val 16 | 17 | scale_map = {k: scale_map[k] for k in sorted(scale_map)} 18 | 19 | with open(".".join(sys.argv[1].split(".")[:-1]) + ".json", "w") as fw: 20 | json.dump(scale_map, fw, indent=4) 21 | -------------------------------------------------------------------------------- /quantization/calib_json2calib.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import struct 3 | import json 4 | 5 | 6 | with open(sys.argv[1]) as fr: 7 | scale_map = json.load(fr) 8 | 9 | scale_map = {k: scale_map[k] for k in sorted(scale_map)} 10 | with open(".".join(sys.argv[1].split(".")[:-1]) + ".cache", "w") as file: 11 | 
file.write("TRT-8400-EntropyCalibration2\n") 12 | for key in sorted(scale_map.keys()): 13 | value = scale_map[key] 14 | scale = float(value) 15 | scale_hex = hex(struct.unpack(" 0.5: 18 | print(name, val) 19 | else: 20 | lines.append(line) 21 | 22 | with open(calib_cache + "_filter_scale05.calib_cache", "w") as fw: 23 | fw.writelines(lines) -------------------------------------------------------------------------------- /quantization/onnx_change_resize_mode.py: -------------------------------------------------------------------------------- 1 | import onnx 2 | import sys 3 | import onnxoptimizer 4 | from onnx import helper, shape_inference 5 | from onnxsim import simplify 6 | from onnx import numpy_helper 7 | import numpy as np 8 | 9 | onnx_model = onnx.load(sys.argv[1]) 10 | 11 | for node in onnx_model.graph.node: 12 | if node.op_type == "Resize": 13 | for attr in node.attribute: 14 | if (attr.name == "coordinate_transformation_mode"): 15 | attr.s = "half_pixel".encode("UTF-8") 16 | elif attr.name == "mode": 17 | attr.s = "linear".encode("UTF-8") 18 | elif attr.name == "nearest_mode": 19 | attr.s = "round_prefer_floor".encode("UTF-8") 20 | 21 | model_opt = onnxoptimizer.optimize(onnx_model) 22 | # model_simp, check = simplify(model_opt) 23 | model_simp = shape_inference.infer_shapes(model_opt) 24 | onnx.save(model_simp, sys.argv[2]) 25 | -------------------------------------------------------------------------------- /prune/README_dev.md: -------------------------------------------------------------------------------- 1 | 1. 剪枝概述 2 | 3 | 4 | 5 | 6 | 2. 常用工具 7 | 8 | - nni: 基于torch.fx,在某些复杂网络如nanodet上失效; 9 | - pytorch原生工具: 难以获取网络的拓扑结构,对于包含Concat等结构的网络剪枝困难; 10 | - torch.fx工具虽然提供了相关的功能,但是对于其中的shape计算、特殊算子等支持并不友好(其实是我太懒,不想新学一种表示) 11 | - 更简单的办法是,通过onnx作为中间层,使用onnx simpliier工具优化后,读取模型的拓扑结构; 12 | - 根据onnx模型中的拓扑结构,来进行结构化的剪枝;保存模型权重,输出模型结构; 13 | - 最后进行模型的finetune,输出最终的模型; 14 | - 问题: onnx会修改权重的名称,无法根据名称找到对应关系; 15 | - 解决方案: 16 | - 通过计算相似度等方式找到权重对应关系 ----> 设置training为True后,权重名称也得以与state_dict中一致 17 | - 通过设置export的training=True来避免Conv-BN融合 18 | - 问题: 对于Add、Concat、Channel Shuffle结构,需要进行针对性的识别和处理 ----> 复杂度较高 19 | - 解决方案: 20 | - 使用torch的hook机制,对特定类型的节点进行hook,从而获得每个op的mask 21 | - 然后根据mask的结果,对权重进行prune(整个流程可能跟nni中的torch.fx是一致的) 22 | - Add、Concat模块没办法直接进行Hook,需要使用自定义的module实现后才能完成hook功能 23 | - 为了便于后续的研究,将采用复杂的方案来进行剪枝;而不是每个模块设置一个prune函数 24 | 25 | 3. 剪枝工具开发 26 | - shufflenet剪枝: channel shuffle 27 | - FPN剪枝: Add层的处理 28 | 29 | 30 | 4. 
实验记录 31 | 32 | - 方案一: 直接根据BN层的scale参数的l1-norm大小来剪枝 33 | - 方案二: 根据当前层的输出的l1-norm大小来剪枝 34 | - -------------------------------------------------------------------------------- /quantization/ptq/remove_inititalizers_from_inputs.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import onnx 4 | 5 | 6 | def get_args(): 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument("--input", required=True, help="input model") 9 | parser.add_argument("--output", required=True, help="output model") 10 | args = parser.parse_args() 11 | return args 12 | 13 | 14 | def remove_initializer_from_input(): 15 | args = get_args() 16 | 17 | model = onnx.load(args.input) 18 | if model.ir_version < 4: 19 | print("Model with ir_version below 4 requires to include initilizer in graph input") 20 | return 21 | 22 | inputs = model.graph.input 23 | name_to_input = {} 24 | for input in inputs: 25 | name_to_input[input.name] = input 26 | 27 | for initializer in model.graph.initializer: 28 | if initializer.name in name_to_input: 29 | inputs.remove(name_to_input[initializer.name]) 30 | 31 | onnx.save(model, args.output) 32 | 33 | 34 | if __name__ == "__main__": 35 | remove_initializer_from_input() -------------------------------------------------------------------------------- /quantization/ptq/filter_cache.py: -------------------------------------------------------------------------------- 1 | from trt_utils import read_calib_cache 2 | from trt_utils import write_cache_to_json 3 | import onnx 4 | 5 | 6 | # 过滤掉不输入onnx节点的量化值,以及数字过大的量化值 7 | onnx_model = onnx.load("/mapai/howellyang/code/onnx2trt/RMTNet_release20220609_mm2conv.optimized.rm_inits.onnx") 8 | onnx_output_names = [] 9 | for node in onnx_model.graph.node: 10 | for o in node.output: 11 | onnx_output_names.append(o) 12 | 13 | scale_map = read_calib_cache("/mapai/howellyang/code/onnx2trt/RMTNet_release20220609_mm2conv.optimized.trt_int8_with_531pics_calib_percentile595.calib_cache") 14 | calib_cache = "/mapai/howellyang/code/onnx2trt/RMTNet_release20220609_mm2conv.optimized.trt_int8_with_531pics_calib_percentile595.filtered.calib_cache" 15 | 16 | scale_map = {k: scale_map[k] for k in sorted(scale_map)} 17 | with open(calib_cache, "w") as file: 18 | file.write("TRT-8400-EntropyCalibration2\n") 19 | for key in sorted(scale_map.keys()): 20 | value = scale_map[key] 21 | scale = float(value) 22 | scale_hex = hex(struct.unpack(" 转换容易失败 23 | same_name_map = {} 24 | for value, names in same_value_map.items(): 25 | if len(names) > 1: 26 | print(names) 27 | 28 | exit(0) 29 | 30 | scale_map_etp.update(scale_map_etp) 31 | 32 | 33 | with open(sys.argv[3], "w") as file: 34 | file.write("TRT-8400-EntropyCalibration2\n") 35 | for key in sorted(scale_map_etp.keys()): 36 | scale = scale_map_etp[key] 37 | # if scale > 0.5: 38 | # print("scale过大, 建议不量化:", key, scale, 128.0 * scale) 39 | # continue 40 | if len(key) > 5: 41 | print(key, scale) 42 | continue 43 | scale_hex = hex(struct.unpack("
6 | 7 | ### 1. 问题描述 8 | 9 | 常见的剪枝工具没有处理特殊结构的能力: 10 | - depthwise convolution: 普通卷积的权重的shape是[Co, Ci, Kw, Kh], 深度分离卷积的权重shape是[Co, 1, Kw, Kh]; 进行channel-wise剪枝时,需要根据前置节点的channel,决定当前节点的channel选择,并且决定了后置节点的输入channel的选择; 11 | 12 | - channel shuffle: 通道shuffle后的卷积的输入权重,剪枝通道要与通道shuffle前的通道对应上;需要识别不同卷积层之间的通道对应关系; 13 | 14 | - Add结构: 输入的多个卷积层,其剪枝的channel需要保持一致,否则将会出现Add的channel之间的不对应的问题; 15 | 16 | - Slice结构: slice前整个feature map的有效channel数是原模型的1/2,但是slice之后的两个feature map中有效channel数不是原来的1/2了;导致模型结构不符合预期; 17 | 18 |

19 | ### 2. 解决方案 20 | 21 | 放弃的解决方案: 22 | - 使用已有的nni等工具: 23 | - 基于torch.fx来识别不同模块之间的关联关系;然后开发对应的剪枝功能; 24 | - torch.fx会转换所有的表达式,粒度很细;学习成本有点高; 25 | - 使用onnx来获取拓扑结构: 26 | - onnx能够很简单的识别模块间的关联关系 27 | - onnx对于channel间的转换的识别能力不够(onnx无法单独设置每一层的输入) 28 | 29 | 30 | 最终选择的解决方案: 31 | - 使用torch的hook机制,自行构建mask和剪枝流程: 32 | - 首先,使用hook获取模块的id,用于后续构架关联关系 33 | - 其次,识别需要联合剪枝的模块(在这里主要是指Add) 34 | - 然后,构建剪枝用到的mask 35 | - 最后,对权重进行修改 36 | 37 | 38 | 特殊要素的处理: 39 | - depthwise convolution: 根据输入来决定输出channel的选择; 40 | - channel shuffle: 通过输出置零来实现需要剪枝的channel之间的传递; 41 | - add: 记录add相关的模块id,计算权重的均值来剪枝 42 | - slice: 针对shufflenet的结构,将channel划分为4组,分别进行channel选择; 43 | 44 |
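A minimal sketch of the hook/mask flow described above, assuming an L1 (|gamma|) criterion on BatchNorm scales; the function names and the pruning ratio are illustrative placeholders, not this repo's actual API:

```
import torch
import torch.nn as nn


def collect_bn_masks(model: nn.Module, prune_ratio: float = 0.5):
    """For every BatchNorm2d, keep the channels with the largest |gamma|.

    Masks are keyed by id(module) so that modules feeding the same Add node
    can later be forced to share a single mask.
    """
    masks = {}
    for m in model.modules():
        if isinstance(m, nn.BatchNorm2d):
            gamma = m.weight.detach().abs()
            n_keep = max(1, int(gamma.numel() * (1.0 - prune_ratio)))
            keep_idx = torch.topk(gamma, n_keep).indices
            mask = torch.zeros_like(gamma, dtype=torch.bool)
            mask[keep_idx] = True
            masks[id(m)] = mask
    return masks


def unify_masks_for_add(masks, add_input_module_ids):
    """Modules whose outputs are summed by an Add must be pruned consistently;
    here the individual masks are simply OR-ed into one shared mask."""
    merged = None
    for mid in add_input_module_ids:
        merged = masks[mid] if merged is None else merged | masks[mid]
    for mid in add_input_module_ids:
        masks[mid] = merged
    return masks
```

In the actual flow, the ids of the Conv/BN modules feeding each Add (or Concat/Slice) would first be collected with forward hooks, the masks unified as above, and the pruned model fine-tuned afterwards.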

45 | 46 | ### 3. 实际效果 47 | 48 | 剪枝工具开发 49 | - shufflenet剪枝: channel shuffle 50 | - FPN剪枝: Add层的处理 -------------------------------------------------------------------------------- /quantization/calib_filter_Sigmoid.py: -------------------------------------------------------------------------------- 1 | # 过滤掉calib_cache中的sigmoid层 2 | import sys 3 | import onnx 4 | 5 | onnx_model = onnx.load(sys.argv[1]) 6 | 7 | 8 | # Sigmoid\HardSigmoid 9 | sigmoid_inputs = [] 10 | sigmoid_outputs = [] 11 | 12 | add_inputs = [] 13 | add_outputs = [] 14 | 15 | for node in onnx_model.graph.node: 16 | if node.op_type in ["HardSigmoid", "Sigmoid"]: 17 | input_name = node.input[0] 18 | sigmoid_inputs.append(input_name) 19 | output_name = node.output[0] 20 | sigmoid_outputs.append(output_name) 21 | elif node.op_type in ["Mul", "Add", "Concat"]: 22 | input_name = node.input[0] 23 | add_inputs.append(input_name) 24 | input_name = node.input[1] 25 | add_inputs.append(input_name) 26 | 27 | output_name = node.output[0] 28 | add_outputs.append(output_name) 29 | 30 | # print(sigmoid_outputs) 31 | 32 | # 过滤Sigmoid的输出 33 | lines = [] 34 | total_sigmoids = 0 35 | total_nodes = 0 36 | with open(sys.argv[2]) as fr: 37 | for i, line in enumerate(fr.readlines()): 38 | if i == 0: 39 | lines.append(line) 40 | else: 41 | total_nodes += 1 42 | 43 | name, value = line.strip().split(":") 44 | name = name.strip(" ") 45 | if name in sigmoid_outputs or name in add_outputs or name in sigmoid_inputs or name in add_inputs: 46 | total_sigmoids += 1 47 | continue 48 | else: 49 | lines.append(line) 50 | 51 | 52 | print("total nodes", total_nodes) 53 | print("sigmoids ", total_sigmoids) 54 | print("final nodes", len(lines) - 1) 55 | 56 | with open(sys.argv[3], "w") as fw: 57 | fw.writelines(lines) -------------------------------------------------------------------------------- /01.install.md: -------------------------------------------------------------------------------- 1 | # onnx2trt 2 | 3 | 4 | 【DEPRECATED】 开发过程中,发现了一个整体思路与我这个repo类似,但功能更完善,且实现了一些高级算法的repo,建议使用这个repo来进行模型量化和部署; https://github.com/openppl-public/ppq 5 | 6 | 7 | 8 | onnx2trt是用于进行tensorRT的int8模型量化的工具; 在进行int8模型量化时,某些int8 tensorRT模型的精度会出现一定程度的下降。而当前tensorRT默认使用的校准算法是Entropy, 为此特意开发onnx2trt工具来优化量化模型的精度。 9 | 10 | 11 | 12 | ## 安装 13 | python36 (py37会遇到pycuda安装的问题) 14 | 15 | pip install nvidia-pyindex 16 | pip install nvidia-tensorrt 17 | pip install pycuda 18 | pip install sympy 19 | 20 | 21 | 22 | ## tensorRT量化存在的问题 23 | 24 | 1. 大模型的量化误差累积 25 | 在进行模型的量化校准时,通常的做法是先用fp32模型进行一遍infer,然后统计每个节点的动态范围。这样的做法简单快捷,做一遍infer即可得到整个模型所有节点的动态范围。 26 | 但是,当层数较多时,量化的误差会不断累积;距离模型输入越远,这种量化误差越大。 27 | 28 | 29 | 2. 量化后阈值偏移 30 | 当某个节点的输出数量比较小时,节点输出的cosine相似度已经很高,但是却出现了阈值偏移; 31 | 例如:fp32_out = [-6.223839, 3.5978181, -2.4270086], int8_out = [-2.37992859, 1.80094731, -1.93005347] 32 | 如果这个输出后接的是softmax结构的话,这种阈值偏移对最终精度的影响会比较小; 33 | 但是如果这里输出的是分数score的话,就会带来一些不利于实际部署的结果:例如recall降低,precision升高的变化;这时需要重新调整阈值来维持precision或者recall不变,以保证模型部署的效果。 34 | bias correction相关: https://zhuanlan.zhihu.com/p/450227567 35 | 36 | 37 | TODO: 38 | - [ ] QDQ量化工具: 使用QDQ方式进行tensorRT的模型量化. 39 | - image 40 | - 需要直到tensorRT做了哪些网络节点的优化,才能方便地插入QDQ节点; 41 | - [ ] 量化精度损失分析工具: 42 | - tensorRT自带量化分析工具polygraph: https://zhuanlan.zhihu.com/p/535021438 43 | - 分析流程示例:https://blog.csdn.net/TracelessLe/article/details/120656484 44 | - 给定每个节点的量化scale值,计算每一层的量化前后的cosine值. 45 | - 给定每个节点的量化scale值,计算这一层量化对最终输出的莲花cosine值. 46 | - [ ] 自定义scale计算工具/自定义calibrator: 47 | - 用于trt exec生成trt engine(隐式设置精度). 48 | - 用于QDQ生成trt engine(显式设置精度). 
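As a starting point for the accuracy-analysis item in the list above, a self-contained sketch of the per-layer check (simulated int8 quantization plus cosine similarity); the max-based scale here is only for illustration and is not the calibrator the TODO refers to:

```
import numpy as np


def fake_quant(x: np.ndarray, scale: float) -> np.ndarray:
    # symmetric int8 quant/dequant: q = clip(round(x / scale), -128, 127), x' = q * scale
    q = np.clip(np.round(x / scale), -128, 127)
    return q * scale


def cosine(a: np.ndarray, b: np.ndarray) -> float:
    a, b = a.ravel(), b.ravel()
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-12))


# given one layer's fp32 activation and its calibration scale,
# measure how much quantizing this single layer distorts it
activation = np.random.randn(1, 64, 48, 96).astype(np.float32)
scale = float(np.abs(activation).max() / 127.0)
print("per-layer cosine:", cosine(activation, fake_quant(activation, scale)))
```

Applying the same comparison to the network outputs of the fp32 and int8 engines gives the end-to-end cosine mentioned in the TODO list.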
49 | 50 | -------------------------------------------------------------------------------- /quantization/find_nodes_onnx.py: -------------------------------------------------------------------------------- 1 | # 找出所有A-->B之间的节点 2 | from numpy import linspace 3 | import onnx 4 | 5 | # A = "8305" 6 | # B = "8527" 7 | # E = ["8302", "8305", "8530"] 8 | 9 | # A = "8835" 10 | # B = "8897" 11 | 12 | A = "8772" 13 | B = "8834" 14 | 15 | Final_Nodes = [A, B] 16 | 17 | input_path = "/apdcephfs/private_howellyang/onnx2trt/model.onnx" 18 | output_path = "/apdcephfs/private_howellyang/onnx2trt/model_{}_to_{}.onnx".format(A, B) 19 | calib_cache = "/apdcephfs/private_howellyang/onnx2trt/model.trt_int8_with_1578pics_calib_entropy.calib_cache" 20 | input_names = [A] 21 | output_names = [B] 22 | 23 | onnx.utils.extract_model(input_path, output_path, input_names, output_names) 24 | 25 | onnx_model = onnx.load(output_path) 26 | 27 | for node in onnx_model.graph.node: 28 | Final_Nodes.extend(node.output) 29 | # if node.op_type in ["QuantizeLinear"]: 30 | # act_name = node.input[0] 31 | # scale_name = node.input[1] 32 | # scale_value = scales_map[scale_name] 33 | # if act_name in weights_map: # 权重量化 34 | # if act_name not in weights_scale_map: 35 | # weights_scale_map[act_name] = [] 36 | 37 | # weights_scale_map[act_name].append(scale_value) 38 | # else: # act 量化 39 | # if act_name not in acts_scale_map: 40 | # acts_scale_map[act_name] = [] 41 | 42 | # acts_scale_map[act_name].append(scale_value) 43 | print(Final_Nodes) 44 | 45 | lines = [] 46 | with open(calib_cache) as fr: 47 | for i, line in enumerate(fr.readlines()): 48 | if i == 0: 49 | lines.append(line) 50 | else: 51 | node_name, hex_value = line.strip().split(":") 52 | if node_name in Final_Nodes: 53 | continue 54 | else: 55 | lines.append(line) 56 | 57 | with open(calib_cache + "_remove_{}_to_{}.calib_cache".format(A, B), "w") as fw: 58 | fw.writelines(lines) 59 | -------------------------------------------------------------------------------- /quantization/ptq/trt_utils.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import struct 4 | from matplotlib import scale 5 | import numpy as np 6 | 7 | def read_calib_cache(calib_cache): 8 | scale_map = {} 9 | with open(calib_cache) as fr: 10 | for line in fr.readlines()[1:]: 11 | name, value = line.strip().split(": ") 12 | name = name.strip(":") 13 | value = value.strip(":") 14 | if value.strip() == "0": 15 | val = 0.0 16 | else: 17 | val = struct.unpack("!f", bytes.fromhex(value.strip()))[0] 18 | 19 | scale_map[name] = val 20 | 21 | scale_map = {k: scale_map[k] for k in sorted(scale_map)} 22 | return scale_map 23 | 24 | 25 | def write_cache_to_json(scale_map, calib_cache): 26 | scale_map = {k: scale_map[k] for k in sorted(scale_map)} 27 | with open(calib_cache, "w") as file: 28 | file.write("TRT-8400-EntropyCalibration2\n") 29 | for key in sorted(scale_map.keys()): 30 | value = scale_map[key] 31 | scale = float(value) 32 | scale_hex = hex(struct.unpack(" np.ndarray: 47 | if x is None and not accepet_none: 48 | raise ValueError("Trying to convert an empty value.") 49 | if isinstance(x, np.ndarray): 50 | return x 51 | elif isinstance(x, int) or isinstance(x, float): 52 | return np.array( 53 | [ 54 | x, 55 | ] 56 | ) 57 | elif isinstance(x, torch.Tensor): 58 | if x.numel() == 0 and accepet_none: 59 | return None 60 | if x.numel() == 0 and not accepet_none: 61 | raise ValueError("Trying to convert an empty value.") 62 | if x.numel() == 1: 63 | return 
DataLoader.convert_any_to_numpy(x.detach().cpu().item()) 64 | if x.numel() > 1: 65 | return x.detach().cpu().numpy() 66 | elif isinstance(x, list) or isinstance(x, tuple): 67 | return np.array(x) 68 | else: 69 | raise TypeError( 70 | f"input value {x}({type(x)}) can not be converted as numpy type." 71 | ) 72 | 73 | 74 | def read_image(path): 75 | # 多任务模型 76 | _img_transforms = transforms.Compose( 77 | [ 78 | transforms.Resize((384, 768)), 79 | transforms.ToTensor(), 80 | transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), 81 | ] 82 | ) 83 | img = Image.open(path).convert("RGB") 84 | img_w, img_h = img.size[0], img.size[1] 85 | img = _img_transforms(img) 86 | img = img.unsqueeze(0) 87 | return img 88 | -------------------------------------------------------------------------------- /quantization/json_filter.py: -------------------------------------------------------------------------------- 1 | 2 | failed_tasks = { 3 | "code": 0, 4 | "msg": "", 5 | "detail": [ 6 | { 7 | "pkg_task_id": "999180333210000000161202285_null_210000000161202285", 8 | "status": -1, 9 | "status_landmark": 2, 10 | "status_land_line": 2, 11 | "status_scene": 2, 12 | "status_camera": 2, 13 | "status_traffic_light": -1, 14 | "status_img": -1 15 | }, 16 | { 17 | "pkg_task_id": "999180100210000000177231923_null_210000000177231923", 18 | "status": -1, 19 | "status_landmark": 2, 20 | "status_land_line": 2, 21 | "status_scene": 2, 22 | "status_camera": 2, 23 | "status_traffic_light": -1, 24 | "status_img": -1 25 | }, 26 | { 27 | "pkg_task_id": "999180203210000000161217375_null_210000000161217375", 28 | "status": -1, 29 | "status_landmark": 2, 30 | "status_land_line": 2, 31 | "status_scene": 2, 32 | "status_camera": 2, 33 | "status_traffic_light": -1, 34 | "status_img": -1 35 | }, 36 | { 37 | "pkg_task_id": "999180148210000000161242266_null_210000000161242266", 38 | "status": -1, 39 | "status_landmark": 2, 40 | "status_land_line": 2, 41 | "status_scene": 2, 42 | "status_camera": 2, 43 | "status_traffic_light": -1, 44 | "status_img": -1 45 | }, 46 | { 47 | "pkg_task_id": "999204618210000000161212957_null_210000000161212957", 48 | "status": -1, 49 | "status_landmark": 2, 50 | "status_land_line": 2, 51 | "status_scene": 2, 52 | "status_camera": 2, 53 | "status_traffic_light": -1, 54 | "status_img": -1 55 | }, 56 | { 57 | "pkg_task_id": "999195621210000000177226766_null_210000000177226766", 58 | "status": -1, 59 | "status_landmark": 2, 60 | "status_land_line": 2, 61 | "status_scene": 2, 62 | "status_camera": 2, 63 | "status_traffic_light": -1, 64 | "status_img": -1 65 | }, 66 | { 67 | "pkg_task_id": "999202427210000000161237174_null_210000000161237174", 68 | "status": -1, 69 | "status_landmark": 2, 70 | "status_land_line": 2, 71 | "status_scene": 2, 72 | "status_camera": 2, 73 | "status_traffic_light": -1, 74 | "status_img": -1 75 | }, 76 | { 77 | "pkg_task_id": "999202404210000000177220569_null_210000000177220569", 78 | "status": -1, 79 | "status_landmark": 2, 80 | "status_land_line": 2, 81 | "status_scene": 2, 82 | "status_camera": 2, 83 | "status_traffic_light": -1, 84 | "status_img": -1 85 | }, 86 | { 87 | "pkg_task_id": "999192126210000000177226943_null_210000000177226943", 88 | "status": -1, 89 | "status_landmark": 2, 90 | "status_land_line": 2, 91 | "status_scene": 2, 92 | "status_camera": 2, 93 | "status_traffic_light": -1, 94 | "status_img": -1 95 | }, 96 | { 97 | "pkg_task_id": "999194214210000000177357511_null_210000000177357511", 98 | "status": -1, 99 | "status_landmark": 2, 100 | "status_land_line": 2, 
101 | "status_scene": 2, 102 | "status_camera": 2, 103 | "status_traffic_light": -1, 104 | "status_img": -1 105 | }, 106 | { 107 | "pkg_task_id": "999194403210000000177230387_null_210000000177230387", 108 | "status": -1, 109 | "status_landmark": 2, 110 | "status_land_line": 2, 111 | "status_scene": 2, 112 | "status_camera": 2, 113 | "status_traffic_light": -1, 114 | "status_img": -1 115 | } 116 | ], 117 | "detail_explain": "pkg_task_id,status,status_landmark,status_land_line,status_scene,status_camera,status_traffic_light,status_img" 118 | } 119 | 120 | for info in failed_tasks["detail"]: 121 | print('"{}"'.format(info["pkg_task_id"]), end = ",") 122 | 123 | print("\n"*4) -------------------------------------------------------------------------------- /onnx_optimize/step03_fuse_normalize_to_conv.py: -------------------------------------------------------------------------------- 1 | import onnx 2 | import numpy as np 3 | from onnx import numpy_helper 4 | from step02_onnx_simplify import get_post_nodes 5 | 6 | 7 | def fuse_normalize_to_conv(onnx_model, means, scales, input_tensor_name=None): 8 | # Y = (x - means)/scales 9 | initializer_names = [ 10 | initializer.name for initializer in onnx_model.graph.initializer 11 | ] 12 | inputs = [inp for inp in onnx_model.graph.input if inp not in initializer_names] 13 | if input_tensor_name is None: 14 | assert ( 15 | len(inputs) == 1 16 | ), "if multiple input exists, please specify input_tensor_name" 17 | input_tensor_name = inputs[0] 18 | 19 | weight_name2tensor = {} 20 | for weight in onnx_model.graph.initializer: 21 | weight_name2tensor[weight.name] = weight 22 | 23 | # find post nodes 24 | post_nodes = get_post_nodes(onnx_model, input_tensor_name) 25 | for post_node in post_nodes: 26 | if post_node.op_type != "Conv": 27 | raise NameError( 28 | "the input tensor is used by non-Conv node, normalize process can't be fused" 29 | ) 30 | 31 | paddings = [0] 32 | for attr in post_node.attribute: 33 | if attr.name == "pads": 34 | paddings = attr.ints 35 | break 36 | 37 | for pad in paddings: 38 | if pad != 0: 39 | raise NameError( 40 | "the conv after input has padding, normalize process can't be fused" 41 | ) 42 | 43 | group_num = 1 44 | for attr in post_node.attribute: 45 | if attr.name == "group": 46 | group_num = attr.i 47 | if group_num > 1: 48 | raise NameError( 49 | "the conv after input has group > 1, normalize process can't be fused" 50 | ) 51 | 52 | # fuse normalize-conv 53 | assert ( 54 | len(post_node.input) == 3 55 | ), " conv node must has bias for normalize fusion" 56 | weight_name = post_node.input[1] 57 | weight_tensor = weight_name2tensor[weight_name] 58 | weight_value = numpy_helper.to_array(weight_tensor) # out_ch, in_ch, ker, ker 59 | 60 | bias_name = post_node.input[2] 61 | bias_tensor = weight_name2tensor[bias_name] 62 | bias_value = numpy_helper.to_array(bias_tensor) # out_ch, in_ch, ker, ker 63 | 64 | assert ( 65 | len(means) == len(scales) == np.shape(weight_value)[1] 66 | ), "mean and scale value mismatch the input channel num" 67 | 68 | means = np.reshape(np.array(means), (1, -1, 1, 1)) 69 | scales = np.reshape(np.array(means), (1, -1, 1, 1)) 70 | 71 | new_weight_value = np.array(weight_value / scales).astype(np.float32) 72 | raw_shape = tuple([i for i in weight_tensor.dims]) 73 | new_shape = np.shape(new_weight_value) 74 | assert new_shape == raw_shape 75 | weight_tensor.ClearField("float_data") 76 | weight_tensor.ClearField("int32_data") 77 | weight_tensor.ClearField("int64_data") 78 | weight_tensor.raw_data = 
79 | 
80 |         new_bias_value = np.array(
81 |             bias_value - np.sum(weight_value * means / scales, axis=(1, 2, 3))
82 |         ).astype(np.float32)
83 |         raw_shape = tuple([i for i in bias_tensor.dims])
84 |         new_shape = np.shape(new_bias_value)
85 |         assert new_shape == raw_shape
86 |         bias_tensor.ClearField("float_data")
87 |         bias_tensor.ClearField("int32_data")
88 |         bias_tensor.ClearField("int64_data")
89 |         bias_tensor.raw_data = new_bias_value.tobytes()
90 | 
91 |     return onnx_model
92 | 
93 | 
94 | if __name__ == "__main__":
95 |     import sys
96 | 
97 |     onnx_model = onnx.load(sys.argv[1])
98 |     means = eval(sys.argv[2])
99 |     scales = eval(sys.argv[3])
100 |     onnx_model = fuse_normalize_to_conv(onnx_model, means, scales)
101 | 
--------------------------------------------------------------------------------
/quantization/ptq/create_json_inputs.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python3 #指定解释器
2 | # encoding:utf-8
3 | 
4 | import sys
5 | from polygraphy.json import save_json
6 | 
7 | print(sys.getdefaultencoding())
8 | s = "中文乱码问题解决"
9 | print(s)
10 | 
11 | # ---------------------------------------------------------------
12 | # 这个脚本向你展示了如何使用 tensorRT 对 PPQ 导出的模型进行推理
13 | 
14 | # This script shows you how to export ppq internal graph to tensorRT
15 | # ---------------------------------------------------------------
16 | 
17 | # For this inference test, all test data is randomly picked.
18 | # If you want to use real data, just rewrite the definition of SAMPLES
19 | print("开始import")
20 | import onnxruntime
21 | import torch
22 | from tqdm import tqdm
23 | import glob
24 | import cv2
25 | import numpy as np
26 | from torchvision import transforms
27 | from PIL import Image
28 | import os
29 | from sklearn.metrics.pairwise import cosine_similarity
30 | import onnx
31 | from copy import deepcopy
32 | 
33 | def convert_any_to_numpy(x, accepet_none: bool = True) -> np.ndarray:
34 |     if x is None and not accepet_none:
35 |         raise ValueError("Trying to convert an empty value.")
36 |     if isinstance(x, np.ndarray):
37 |         return x
38 |     elif isinstance(x, int) or isinstance(x, float):
39 |         return np.array(
40 |             [
41 |                 x,
42 |             ]
43 |         )
44 |     elif isinstance(x, torch.Tensor):
45 |         if x.numel() == 0 and accepet_none:
46 |             return None
47 |         if x.numel() == 0 and not accepet_none:
48 |             raise ValueError("Trying to convert an empty value.")
49 |         if x.numel() == 1:
50 |             return convert_any_to_numpy(x.detach().cpu().item())
51 |         if x.numel() > 1:
52 |             return x.detach().cpu().numpy()
53 |     elif isinstance(x, list) or isinstance(x, tuple):
54 |         return np.array(x)
55 |     else:
56 |         raise TypeError(
57 |             f"input value {x}({type(x)}) can not be converted as numpy type."
58 | ) 59 | 60 | def read_image(path): 61 | # 多任务模型 62 | _img_transforms = transforms.Compose( 63 | [ 64 | transforms.Resize((384, 768)), 65 | transforms.ToTensor(), 66 | transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), 67 | ] 68 | ) 69 | img = Image.open(path).convert("RGB") 70 | img_w, img_h = img.size[0], img.size[1] 71 | img = _img_transforms(img) 72 | img = img.unsqueeze(0) 73 | return img 74 | 75 | 76 | def read_image_v2(path): 77 | mean = [123.675, 116.28, 103.53] 78 | std = [58.395, 57.12, 57.375] 79 | input_w = 960 80 | input_h = 480 81 | mean = np.array(mean) 82 | std = np.array(std) 83 | img = cv2.imread(path) 84 | img = cv2.resize(img, (input_w, input_h)) 85 | img = img.astype(np.float32) 86 | img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) 87 | 88 | # Norm 89 | for i in range(3): 90 | img[..., i] = (img[..., i] - mean[i]) / std[i] 91 | 92 | # hwc -> nchw ----> 这里输入方式不对 93 | # h, w, c = img.shape 94 | # img = img.reshape((1, c, h ,w)) 95 | img = np.transpose(img, (2, 0, 1)).astype(np.float32) 96 | img = np.expand_dims(img, axis=0) 97 | return np.ascontiguousarray(img, dtype=np.float32) 98 | 99 | calibration_files = glob.glob( 100 | os.path.join("/mapai/howellyang/code/road-service/road_service/calib_images/", "*.jpg") 101 | )[:1] 102 | 103 | 104 | SAMPLES = [ 105 | read_image_v2(path) for path in calibration_files 106 | ] # rewirte this to use real data. 107 | 108 | # List[Dict[str, numpy.ndarray]] 109 | import json 110 | from json import JSONEncoder 111 | import numpy 112 | class NumpyArrayEncoder(JSONEncoder): 113 | def default(self, obj): 114 | if isinstance(obj, numpy.ndarray): 115 | return obj.tolist() 116 | return JSONEncoder.default(self, obj) 117 | 118 | feed_dict_list = [{"input.1": np.array(read_image_v2(path))} for path in calibration_files] 119 | 120 | 121 | save_json(feed_dict_list, "calibration_data_1k5.json") 122 | 123 | # with open( ,"w") as fw: 124 | # json.dump(feed_dict_list, fw, cls=NumpyArrayEncoder, indent=4) 125 | # # encodedNumpyData = json.dumps(feed_dict_list, cls=NumpyArrayEncoder) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ONNX2TRT: 端上模型部署[整理中] 2 | 3 | 4 | ## 1. 概述 5 | 6 | 模型的压缩(蒸馏、剪枝、量化)和部署,是模型在自动驾驶和物联网产业落地中的重要步骤。端上的设备 7 | 8 | 在实际工作过程中,我们遇到了很多的困难: 文档缺失、依赖库冲突、算子不支持、精度差、速度慢等。 9 | 10 | 因此,我将我在实际工作过程中的一些经验,整理成文档记录在这里,供其它开发者参考。同时,我会将过程中用到的一些脚本,整理成一些独立的工具脚本,方便大家使用。 11 | 12 |
13 | 14 | ## 2. 模型部署流程 15 | 16 | 模型部署的一般步骤为: 17 | - 模型导出onnx 18 | - onnx模型结构优化 19 | - 模型量化,构建tensorRT的engine 20 | - tensorRT模型部署 21 | - 精度和速度测试 22 | - 问题排查与分析 23 | 24 | 接下来,我将给出相关的工具,并对其中的关键步骤进行详细说明; 25 | 26 |
27 | 
28 | ### 2.1 模型导出
29 | onnx是一种模型表示方式,能够将不同框架下的模型,统一表示为同一种形式;因此,常常被用来作为模型转换的中间节点;目前,tensorRT已经支持了直接用torch转成tensorRT的engine;但是其它的SDK框架,如MNN、TNN、Paddle-Lite、OpenVino等仍然只支持onnx格式的模型转换;并且,onnx本身也是一种很好用的模型表示格式,可以很方便地在上面做开发;
30 | 
31 | ```
32 | import torch
33 | # 加载你的模型
34 | model = build_model(config.model)
35 | checkpoint = torch.load(model_path, map_location=lambda storage, loc: storage)
36 | load_model_weight(model, checkpoint)
37 | 
38 | # 设置输入的大小
39 | input_shape = (320, 192)  # W H
40 | dummy_input = torch.randn(1, 3, input_shape[1], input_shape[0])  # N, C, H, W
41 | 
42 | # 设置输出节点名称,便于后续部署
43 | output_names = ["s8_cls", "s8_reg", "s16_cls", "s16_reg"]
44 | model.eval()
45 | torch.onnx.export(
46 |     model,
47 |     dummy_input,
48 |     output_path,
49 |     verbose=True,
50 |     keep_initializers_as_inputs=False,
51 |     do_constant_folding=True,
52 |     training=False,
53 |     opset_version=11,
54 |     output_names=output_names,
55 | )
56 | 
57 | ```
58 | 
59 | 其它常用框架基本都有导出为onnx模型的代码,可以通过搜索引擎很容易得到相关结果,在此不作介绍。
60 | 
61 | 导出模型为onnx以后,如果不需要做模型量化,可以直接将onnx模型转换为所需的格式后进行模型部署;如果想快速完成部署,可以使用在线模型转换的工具来完成模型转换 https://convertmodel.com/;
62 | 
63 | 
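After exporting, it is worth checking that the ONNX graph still matches the PyTorch model numerically. A minimal sketch, assuming `model`, `dummy_input` and `output_path` from the snippet above and a model that returns a list/tuple of tensors:

```
import numpy as np
import onnxruntime as ort
import torch

# run the original torch model once
with torch.no_grad():
    torch_outs = model(dummy_input)

# run the exported onnx model with onnxruntime
sess = ort.InferenceSession(output_path, providers=["CPUExecutionProvider"])
ort_outs = sess.run(None, {sess.get_inputs()[0].name: dummy_input.numpy()})

# outputs should agree up to numerical noise
for torch_out, ort_out in zip(torch_outs, ort_outs):
    np.testing.assert_allclose(torch_out.numpy(), ort_out, rtol=1e-3, atol=1e-5)
print("torch and onnxruntime outputs match")
```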
64 | 65 | ### 2.2 onnx模型结构优化 ### 66 | 67 | onnx模型结构优化,一方面是为后续的模型量化做准备;另一方面是减少了输入和输出部分的计算,这部分计算对云端的算力而言可能是无关紧要的,但是对端上的微弱算力而言,这部分计算能省则省。 68 | 69 |
70 | 71 | *2.2.1 onnx-simplify和optimize* 72 | 73 | optimize的目的是进行算子的融合, 从而减少计算量;例如fuse_bn_into_conv, fuse_concat_into_reshape; 详见[onnx-optimizer](https://github.com/onnx/optimizer); 74 | 75 | ![fuse_bn](https://pic1.zhimg.com/v2-98dbfa847caf6d9c9d411348592c8815_1440w.jpg) 76 | 77 |
78 | simplify的目的是消除onnx模型中的多余算子。从torch得到的onnx模型中,会存在一些从tensor计算出常量的操作,例如Reshape算子会从tensor中获取形状后给Resize算子;这就导致onnx模型中存在某些不必要的节点(最常见的是Gather节点);因此,[onnx-simplifier](https://github.com/daquexian/onnx-simplifier)会对整个网络进行一次推理,然后将这类多余的算子替换成常量. 79 | 80 | ![simplify](https://github.com/daquexian/onnx-simplifier/raw/master/imgs/complicated_reshape.png) 81 | 82 |
83 | 84 | 使用在线网站,可以便捷地进行以上操作:https://www.convertmodel.com/#input=onnx&output=onnx; 85 | 86 |
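If the online converter is not an option, the same two passes can also be run locally with the onnxoptimizer and onnx-simplifier packages that other scripts in this repo already rely on; a minimal sketch (file paths are placeholders):

```
import onnx
import onnxoptimizer
from onnxsim import simplify

model = onnx.load("model.onnx")
model = onnxoptimizer.optimize(model)   # graph-level fusions, e.g. fuse_bn_into_conv
model_simp, ok = simplify(model)        # fold shape logic into constants, drop redundant Gather/Shape nodes
assert ok, "simplified model failed the checker"
onnx.save(model_simp, "model.opt.onnx")
```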
87 | 88 | *2.2.2 预处理融合* 89 | 90 | 在onnx-optimizer中,有一个操作是将Conv-BN结构中的BN层融合进Conv中,其原理可以简单理解为: 91 | - Conv: Y = k * x + b 92 | - BN: Z = (Y - m)/s 93 | - Conv-BN: Z = (k * x + b - m)/s = k/s * x + (b - m)/s 94 | - new Conv: k1 = k/s, b1 = (b-m)/s, Z = k1 * x + b1 95 | 96 | 那么,在某些模型中BN是放在Conv的,这种BN-Conv是否可以进行融合呢?答案是当Conv层没有padding(padding=0)时,也是可以融合的;但是当Conv层有padding时,BN-Conv的融合会导致输出的feature map与原始输出相比,在边界上存在diff;具体原理可以通过分析BN-Conv的计算过程得到,在此不作推导; 97 | 98 | 在将图片输入到模型前,常常会进行减均值除方差(normalize)的操作;基于BN-Conv层融合的原理,这个normalize过程也同样可以融合到Conv层中(需要Conv层不带padding);在端上硬件算力很小的情况下,这一融合也是十分有必要的; 99 | 100 |
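The same algebra applied to the normalize step, written out as a NumPy sketch (shapes and the quick numerical check are illustrative; the repo's step03_fuse_normalize_to_conv.py performs this directly on the ONNX initializers):

```
import numpy as np


def fold_normalize_into_conv(weight, bias, mean, std):
    """Fold y = Conv((x - mean) / std) into new Conv weights/bias.

    weight: (Co, Ci, Kh, Kw), bias: (Co,), mean/std: (Ci,)
    k1 = k / std (per input channel)
    b1 = b - sum_{ci,kh,kw} k[co, ci, kh, kw] * mean[ci] / std[ci]
    Only valid when the conv has no padding, as noted above.
    """
    mean = np.asarray(mean, dtype=np.float32).reshape(1, -1, 1, 1)
    std = np.asarray(std, dtype=np.float32).reshape(1, -1, 1, 1)
    new_weight = weight / std
    new_bias = bias - np.sum(weight * mean / std, axis=(1, 2, 3))
    return new_weight.astype(np.float32), new_bias.astype(np.float32)


# quick check on a kernel-sized input (no padding), where the conv reduces to a dot product
Co, Ci, K = 8, 3, 3
w = np.random.randn(Co, Ci, K, K).astype(np.float32)
b = np.random.randn(Co).astype(np.float32)
mean, std = np.array([0.485, 0.456, 0.406]), np.array([0.229, 0.224, 0.225])
x = np.random.randn(1, Ci, K, K).astype(np.float32)

x_norm = (x - mean.reshape(1, -1, 1, 1)) / std.reshape(1, -1, 1, 1)
ref = w.reshape(Co, -1) @ x_norm.reshape(-1) + b
w1, b1 = fold_normalize_into_conv(w, b, mean, std)
fused = w1.reshape(Co, -1) @ x.reshape(-1) + b1
assert np.allclose(ref, fused, atol=1e-4)
```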
101 | 102 | *2.2.3 sigmoid移除* 103 | 104 | sigmoid函数中的exp计算以及除法运算,是比较耗时的;当模型最后输出的feature map比较大时,这个过程的耗时就会更加明显;当这个feature map是输出一个置信度时,可以通过计算sigmoid的反函数,提前计算好置信度阈值,从而省掉这个sigmoid的计算;为此,在实际部署时,常常会去掉模型输出前的sigmoid节点;同时,一些transpose、resize等操作,也可以在后处理流程中通过直接访问相应位置的元素来实现,不需要在模型中进行这一步额外的计算; 105 | 106 |
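A small illustration of the inverse-sigmoid trick (the threshold value and output shape below are made up for the example):

```
import numpy as np


def logit(p: float) -> float:
    # inverse of sigmoid: sigmoid(logit(p)) == p
    return float(np.log(p / (1.0 - p)))


score_threshold = 0.3
logit_threshold = logit(score_threshold)        # ~ -0.847

raw_scores = np.random.randn(1, 80, 48, 96)     # head output with the Sigmoid node removed
keep = raw_scores > logit_threshold             # same mask as sigmoid(raw_scores) > 0.3, without exp/div
```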
107 | 108 | *2.2.4 RepConv融合* 109 | 110 | RepConv是一种有效增加模型容量的技术。在训练时添加额外的卷积层,在部署时通过权重融合去掉这部分计算。通常情况下,RepConv的权重融合是在pytorch层面做的,但是当训练代码比较复杂或者重复代码较多时,在onnx层面进行权重的融合,可能是一个更好的选择;相关原理参见论文: [RepVGG: Making VGG-style ConvNets Great Again](https://arxiv.org/abs/2101.03697). 111 | 112 | ![RepConv](https://miro.medium.com/max/1400/1*87dCul2yHq0_dRfV3nEubg.png) 113 | 114 | 115 | ### 2.3 量化 116 | 117 |
118 | 119 | *2.3.1 量化的理论基础* 120 | 121 | 122 | *2.3.2 量化的计算过程* 123 | 124 | 125 | *2.3.3 常用的量化工具箱* 126 | 127 | *2.3.4 PTQ量化* 128 | 129 | - 简单量化 130 | - balance vector(weight equalization) 131 | - bias correction 132 | 133 | 134 | *2.3.5 QAT量化* 135 | 136 | - QDQ模式介绍 137 | - QDQ流程优化 138 | 139 | 140 | ### 2.4 剪枝 141 | 142 |
143 | 144 | 145 | ### 2.5 蒸馏 146 | 147 |
148 | 149 | ## 3. 参考 150 | 1. tiny-tensorRT: https://github.com/zerollzeng/tiny-tensorrt 151 | 2. micronet: https://github.com/666DZY666/micronet 152 | 3. ppq: https://github.com/openppl-public/ppq 153 | 4. onnx-runtime quantization: https://onnxruntime.ai/docs/performance/quantization.html 154 | 5. polygraphy: https://github.com/NVIDIA/TensorRT/tree/main/tools/Polygraphy 155 | -------------------------------------------------------------------------------- /trt_calibrator.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import os 3 | import tensorrt as trt 4 | import pycuda.driver as cuda 5 | import pycuda.autoinit # fix init error of cuda 6 | 7 | # __all__ = [ 8 | # "TRTPercentileCalibrator", 9 | # "TRTEntropyCalibrator", 10 | # "TRTMinMaxCalibrator", 11 | # ] 12 | 13 | 14 | class TRTEntropyCalibrator(trt.IInt8EntropyCalibrator2): 15 | def __init__(self, input_layers, stream, cache_file): 16 | super(TRTEntropyCalibrator, self).__init__() 17 | self.input_layers = input_layers 18 | 19 | # 数据读取的类, 等同于图片处理的回调 20 | self.stream = stream 21 | 22 | # 分配GPU 23 | self.d_input = cuda.mem_alloc(self.stream.calibration_data.nbytes) 24 | 25 | # cache路径 26 | self.cache_file = cache_file 27 | 28 | # 重置校准集 29 | self.stream.reset() 30 | 31 | def get_batch_size(self): 32 | return self.stream.batch_size 33 | 34 | def get_batch(self, names): 35 | try: 36 | batch = self.stream.next_batch() 37 | if not batch.size: 38 | return None 39 | cuda.memcpy_htod(self.d_input, batch) 40 | return [int(self.d_input)] 41 | except StopIteration: 42 | return None 43 | 44 | def read_calibration_cache(self): 45 | # If there is a cache, use it instead of calibrating again. Otherwise, implicitly return None. 46 | if os.path.exists(self.cache_file): 47 | with open(self.cache_file, "rb") as f: 48 | return f.read() 49 | else: 50 | return None 51 | 52 | def write_calibration_cache(self, cache): 53 | # cache = ctypes.c_char_p(int(ptr)) 54 | with open(self.cache_file, "wb") as f: 55 | f.write(cache) 56 | 57 | 58 | class TRTMinMaxCalibrator(trt.IInt8MinMaxCalibrator): 59 | def __init__(self, input_layers, stream, cache_file): 60 | super(TRTMinMaxCalibrator, self).__init__() 61 | self.input_layers = input_layers 62 | 63 | # 数据读取的类, 等同于图片处理的回调 64 | self.stream = stream 65 | 66 | # 分配GP 67 | self.d_input = cuda.mem_alloc(self.stream.calibration_data.nbytes) 68 | 69 | # cache路径 70 | self.cache_file = cache_file 71 | 72 | # 重置校准集 73 | self.stream.reset() 74 | 75 | def get_batch_size(self): 76 | return self.stream.batch_size 77 | 78 | def get_batch(self, names): 79 | try: 80 | batch = self.stream.next_batch() 81 | if not batch.size: 82 | return None 83 | cuda.memcpy_htod(self.d_input, batch) 84 | return [int(self.d_input)] 85 | except StopIteration: 86 | return None 87 | 88 | def read_calibration_cache(self): 89 | # If there is a cache, use it instead of calibrating again. Otherwise, implicitly return None. 
90 | if os.path.exists(self.cache_file): 91 | with open(self.cache_file, "rb") as f: 92 | return f.read() 93 | else: 94 | return None 95 | 96 | def write_calibration_cache(self, cache): 97 | # cache = ctypes.c_char_p(int(ptr)) 98 | with open(self.cache_file, "wb") as f: 99 | f.write(cache) 100 | 101 | 102 | class TRTPercentileCalibrator(trt.IInt8LegacyCalibrator): 103 | def __init__( 104 | self, input_layers, stream, cache_file, quantile=0.9995, regression_cutoff=1.0 105 | ): 106 | super(TRTPercentileCalibrator, self).__init__() 107 | self.input_layers = input_layers 108 | self.stream = stream 109 | self.d_input = cuda.mem_alloc(self.stream.calibration_data.nbytes) 110 | self.cache_file = cache_file 111 | self.stream.reset() 112 | self.quantile = quantile 113 | self.regression_cutoff = regression_cutoff 114 | 115 | def get_batch_size(self): 116 | return self.stream.batch_size 117 | 118 | def get_batch(self, names): 119 | try: 120 | batch = self.stream.next_batch() 121 | if not batch.size: 122 | return None 123 | cuda.memcpy_htod(self.d_input, batch) 124 | return [int(self.d_input)] 125 | except StopIteration: 126 | return None 127 | 128 | def read_calibration_cache(self): 129 | # If there is a cache, use it instead of calibrating again. Otherwise, implicitly return None. 130 | if os.path.exists(self.cache_file): 131 | with open(self.cache_file, "rb") as f: 132 | return f.read() 133 | else: 134 | return None 135 | 136 | def write_calibration_cache(self, cache): 137 | # cache = ctypes.c_char_p(int(ptr)) 138 | with open(self.cache_file, "wb") as f: 139 | f.write(cache) 140 | 141 | def get_quantile(self): 142 | return self.quantile 143 | 144 | def get_regression_cutoff(self): 145 | return self.regression_cutoff 146 | 147 | def read_histogram_cache(self, length): 148 | return None 149 | 150 | def write_histogram_cache(self, ptr, length): 151 | return None 152 | -------------------------------------------------------------------------------- /quantization/trt_calibrator.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import os 3 | import tensorrt as trt 4 | import pycuda.driver as cuda 5 | import pycuda.autoinit # fix init error of cuda 6 | 7 | # __all__ = [ 8 | # "TRTPercentileCalibrator", 9 | # "TRTEntropyCalibrator", 10 | # "TRTMinMaxCalibrator", 11 | # ] 12 | 13 | 14 | class TRTEntropyCalibrator(trt.IInt8EntropyCalibrator2): 15 | def __init__(self, input_layers, stream, cache_file): 16 | super(TRTEntropyCalibrator, self).__init__() 17 | self.input_layers = input_layers 18 | 19 | # 数据读取的类, 等同于图片处理的回调 20 | self.stream = stream 21 | 22 | # 分配GPU 23 | self.d_input = cuda.mem_alloc(self.stream.calibration_data.nbytes) 24 | 25 | # cache路径 26 | self.cache_file = cache_file 27 | 28 | # 重置校准集 29 | self.stream.reset() 30 | 31 | def get_batch_size(self): 32 | return self.stream.batch_size 33 | 34 | def get_batch(self, names): 35 | try: 36 | batch = self.stream.next_batch() 37 | if not batch.size: 38 | return None 39 | cuda.memcpy_htod(self.d_input, batch) 40 | return [int(self.d_input)] 41 | except StopIteration: 42 | return None 43 | 44 | def read_calibration_cache(self): 45 | # If there is a cache, use it instead of calibrating again. Otherwise, implicitly return None. 
46 | if os.path.exists(self.cache_file): 47 | with open(self.cache_file, "rb") as f: 48 | return f.read() 49 | else: 50 | return None 51 | 52 | def write_calibration_cache(self, cache): 53 | # cache = ctypes.c_char_p(int(ptr)) 54 | with open(self.cache_file, "wb") as f: 55 | f.write(cache) 56 | 57 | 58 | class TRTMinMaxCalibrator(trt.IInt8MinMaxCalibrator): 59 | def __init__(self, input_layers, stream, cache_file): 60 | super(TRTMinMaxCalibrator, self).__init__() 61 | self.input_layers = input_layers 62 | 63 | # 数据读取的类, 等同于图片处理的回调 64 | self.stream = stream 65 | 66 | # 分配GP 67 | self.d_input = cuda.mem_alloc(self.stream.calibration_data.nbytes) 68 | 69 | # cache路径 70 | self.cache_file = cache_file 71 | 72 | # 重置校准集 73 | self.stream.reset() 74 | 75 | def get_batch_size(self): 76 | return self.stream.batch_size 77 | 78 | def get_batch(self, names): 79 | try: 80 | batch = self.stream.next_batch() 81 | if not batch.size: 82 | return None 83 | cuda.memcpy_htod(self.d_input, batch) 84 | return [int(self.d_input)] 85 | except StopIteration: 86 | return None 87 | 88 | def read_calibration_cache(self): 89 | # If there is a cache, use it instead of calibrating again. Otherwise, implicitly return None. 90 | if os.path.exists(self.cache_file): 91 | with open(self.cache_file, "rb") as f: 92 | return f.read() 93 | else: 94 | return None 95 | 96 | def write_calibration_cache(self, cache): 97 | # cache = ctypes.c_char_p(int(ptr)) 98 | with open(self.cache_file, "wb") as f: 99 | f.write(cache) 100 | 101 | 102 | class TRTPercentileCalibrator(trt.IInt8LegacyCalibrator): 103 | def __init__( 104 | self, input_layers, stream, cache_file, quantile=0.9995, regression_cutoff=1.0 105 | ): 106 | super(TRTPercentileCalibrator, self).__init__() 107 | self.input_layers = input_layers 108 | self.stream = stream 109 | self.d_input = cuda.mem_alloc(self.stream.calibration_data.nbytes) 110 | self.cache_file = cache_file 111 | self.stream.reset() 112 | self.quantile = quantile 113 | self.regression_cutoff = regression_cutoff 114 | 115 | def get_batch_size(self): 116 | return self.stream.batch_size 117 | 118 | def get_batch(self, names): 119 | try: 120 | batch = self.stream.next_batch() 121 | if not batch.size: 122 | return None 123 | cuda.memcpy_htod(self.d_input, batch) 124 | return [int(self.d_input)] 125 | except StopIteration: 126 | return None 127 | 128 | def read_calibration_cache(self): 129 | # If there is a cache, use it instead of calibrating again. Otherwise, implicitly return None. 
130 | if os.path.exists(self.cache_file): 131 | with open(self.cache_file, "rb") as f: 132 | return f.read() 133 | else: 134 | return None 135 | 136 | def write_calibration_cache(self, cache): 137 | # cache = ctypes.c_char_p(int(ptr)) 138 | with open(self.cache_file, "wb") as f: 139 | f.write(cache) 140 | 141 | def get_quantile(self): 142 | return self.quantile 143 | 144 | def get_regression_cutoff(self): 145 | return self.regression_cutoff 146 | 147 | def read_histogram_cache(self, length): 148 | return None 149 | 150 | def write_histogram_cache(self, ptr, length): 151 | return None 152 | -------------------------------------------------------------------------------- /quantization/onnx_export_v2.py: -------------------------------------------------------------------------------- 1 | from unicodedata import name 2 | import onnx 3 | from onnx import numpy_helper 4 | import numpy as np 5 | from torch import init_num_threads 6 | import json 7 | import struct 8 | 9 | from sklearn.metrics.pairwise import cosine_similarity 10 | 11 | fp32_model = "/apdcephfs/private_howellyang/road_service_app/LaneModel/onnx_infer/model/epoch_390_mm2conv.opt.onnx" 12 | onnx_updated_model_path = fp32_model.replace(".onnx", ".weight_quantized_v4.onnx") 13 | int8_qat_model = "/apdcephfs/private_howellyang/road_service_app/LaneModel/onnx_infer/model/epoch_390_mm2conv.opt_int8.onnx" 14 | 15 | 16 | # 第一步, 从int8-qat模型中取出所有的zero points和scales 17 | print("[Step1] read scales values from model ") 18 | onnx_model = onnx.load(int8_qat_model) 19 | inits = onnx_model.graph.initializer 20 | scales_map = {} 21 | weights_map = {} 22 | for init in inits: 23 | if "PPQ_Variable" in init.name: 24 | W = numpy_helper.to_array(init) 25 | scales_map[init.name] = W 26 | else: 27 | W = numpy_helper.to_array(init) 28 | weights_map[init.name] = W 29 | 30 | 31 | # {'Relu', 'Mul', 'MaxPool', 'GlobalAveragePool', 'Conv', \ 32 | # 'QuantizeLinear', 'Resize', 'Add', 'Concat', 'HardSigmoid', 'DequantizeLinear', 'Sigmoid'} 33 | 34 | # 第二步, 统计权重和输出的scale值 35 | print("[Step2] Collect scales and average ") 36 | acts_scale_map = {} 37 | weights_scale_map = {} 38 | for node in onnx_model.graph.node: 39 | if node.op_type in ["QuantizeLinear"]: 40 | act_name = node.input[0] 41 | scale_name = node.input[1] 42 | scale_value = scales_map[scale_name] 43 | if act_name in weights_map: # 权重量化 44 | if act_name not in weights_scale_map: 45 | weights_scale_map[act_name] = [] 46 | 47 | weights_scale_map[act_name].append(scale_value) 48 | else: # act 量化 49 | if act_name not in acts_scale_map: 50 | acts_scale_map[act_name] = [] 51 | 52 | acts_scale_map[act_name].append(scale_value) 53 | 54 | 55 | for key, value in acts_scale_map.items(): 56 | assert isinstance(value, list), " {} {}".format(key, value) 57 | assert isinstance(value[0], float) or np.size(value[0]) == 1, " {} {}".format( 58 | key, value 59 | ) 60 | 61 | acts_scale_map[key] = float(np.median(value)) 62 | # act_min_q = -128 63 | # act_max_q = 127 64 | # act_min = act_min_q * float(np.median(value)) 65 | # act_max = act_max_q * float(np.median(value)) 66 | # acts_scale_map[key] = max(abs(act_min), abs(act_max)) 67 | # 这里是scale值 q = x/scale ---> -128, 127 68 | # 转换为min max值需要乘以128.0 69 | 70 | 71 | for key, value in weights_scale_map.items(): 72 | assert isinstance(value, list), " {} {}".format(key, value) 73 | weights_scale_map[key] = np.median(value, axis=0, keepdims=False) 74 | 75 | 76 | 77 | # 第三步, 对权重部分,进行fakequant后,放回onnx模型中; 78 | print("[Step3] Fake quant weights ") 79 | 80 | 81 | def 
fake_quant(weight, scales): 82 | weight = np.array(weight) 83 | scales = np.array(scales) 84 | assert np.shape(weight)[0] == len(scales) 85 | # 权重量化在QAT中是-128, 127; 但是在直接转换中是-127,127 86 | quantized_weight = np.clip(np.round(weight / scales.reshape(-1, 1, 1, 1) + 0.0), -128, 127) 87 | # output = clamp(round(input / scale) + zeroPt) 88 | 89 | # 反量化 90 | weight_r = (quantized_weight.astype(np.float32) - 0.0) * scales.reshape(-1, 1, 1, 1) 91 | 92 | quant_output = np.reshape(weight, (1, -1)) 93 | origin_output = np.reshape(weight_r, (1, -1)) 94 | cos_sim = cosine_similarity(quant_output, origin_output) 95 | assert cos_sim > 0.99, " {} {} {}".format( 96 | cos_sim, scales.reshape((-1,))[:5], weight_r.reshape((-1,))[:5] 97 | ) 98 | return weight_r 99 | 100 | 101 | onnx_model = onnx.load(fp32_model) # 主要目的是获取模型结构 102 | inits = onnx_model.graph.initializer 103 | for idx, init in enumerate(inits): 104 | if init.name in weights_scale_map: 105 | # 需要使用LSQ更新后的权重和scale 106 | W_new = fake_quant(weights_map[init.name], weights_scale_map[init.name]) 107 | print(init.name, np.shape(W_new)) 108 | tensor = numpy_helper.from_array(W_new, init.name) 109 | onnx_model.graph.initializer[idx].CopyFrom(tensor) 110 | # else: 111 | # print(init.name) 112 | 113 | onnx.save(onnx_model, onnx_updated_model_path) 114 | 115 | # 第三步,对act部分,记录scale值, 生成calib.cache文件 116 | print("[Step4] Dump act scales") 117 | with open(onnx_updated_model_path + "_calib_cache.json", "w") as file: 118 | file.write(json.dumps(acts_scale_map, indent=4)) # use `json.loads` to do the reverse 119 | 120 | # write plain text: tensorRT需要对结果做转换 121 | # TRT-8400-EntropyCalibration2 122 | # input.1: 3ca94044 123 | # 9131: 3cf4f8d5 124 | # 加密 hex(struct.unpack(' 0.5: 131 | # print("scale过大, 建议不量化:", key, scale, 128.0 * scale) 132 | # continue 133 | scale_hex = hex(struct.unpack(" -128, 127 70 | # 转换为min max值需要乘以128.0 71 | 72 | 73 | for key, value in weights_scale_map.items(): 74 | assert isinstance(value, list), " {} {}".format(key, value) 75 | weights_scale_map[key] = np.median(value, axis=0, keepdims=False) 76 | 77 | 78 | 79 | # 第三步, 对权重部分,进行fakequant后,放回onnx模型中; 80 | print("[Step3] Fake quant weights ") 81 | 82 | 83 | def fake_quant(weight, scales): 84 | weight = np.array(weight) 85 | scales = np.array(scales) 86 | assert np.shape(weight)[0] == len(scales) 87 | # 权重量化在QAT中是-128, 127; 但是在直接转换中是-127,127 88 | quantized_weight = np.clip(np.round(weight / scales.reshape(-1, 1, 1, 1) + 0.0), -128, 127) 89 | # output = clamp(round(input / scale) + zeroPt) 90 | 91 | # 反量化 92 | weight_r = (quantized_weight.astype(np.float32) - 0.0) * scales.reshape(-1, 1, 1, 1) 93 | 94 | quant_output = np.reshape(weight, (1, -1)) 95 | origin_output = np.reshape(weight_r, (1, -1)) 96 | cos_sim = cosine_similarity(quant_output, origin_output) 97 | assert cos_sim > 0.99, " {} {} {}".format( 98 | cos_sim, scales.reshape((-1,))[:5], weight_r.reshape((-1,))[:5] 99 | ) 100 | return weight_r 101 | 102 | 103 | onnx_model = onnx.load(fp32_model) # 主要目的是获取模型结构 104 | inits = onnx_model.graph.initializer 105 | for idx, init in enumerate(inits): 106 | if init.name in weights_scale_map: 107 | # 需要使用LSQ更新后的权重和scale 108 | W_new = fake_quant(weights_map[init.name], weights_scale_map[init.name]) 109 | print(init.name, np.shape(W_new)) 110 | tensor = numpy_helper.from_array(W_new, init.name) 111 | onnx_model.graph.initializer[idx].CopyFrom(tensor) 112 | # else: 113 | # print(init.name) 114 | 115 | onnx.save(onnx_model, onnx_updated_model_path) 116 | 117 | 118 | acts_scale_map = {k: 
acts_scale_map[k] for k in sorted(acts_scale_map)} 119 | 120 | 121 | # 第三步,对act部分,记录scale值, 生成calib.cache文件 122 | print("[Step4] Dump act scales") 123 | with open(onnx_updated_model_path + "_calib_cache.json", "w") as file: 124 | file.write(json.dumps(acts_scale_map, indent=4)) # use `json.loads` to do the reverse 125 | 126 | # write plain text: tensorRT需要对结果做转换 127 | # TRT-8400-EntropyCalibration2 128 | # input.1: 3ca94044 129 | # 9131: 3cf4f8d5 130 | # 加密 hex(struct.unpack(' 0.5: 137 | # print("scale过大, 建议不量化:", key, scale, 128.0 * scale) 138 | # continue 139 | scale_hex = hex(struct.unpack(" 1: 77 | print(onnx_model.graph.node[i].name) 78 | qdq_indexes = relu2qde[onnx_model.graph.node[i].name] 79 | 80 | # 取多个scale的均值 81 | q_vals = [] 82 | dq_vals = [] 83 | q_init_names = [] 84 | dq_init_names = [] 85 | for idx in qdq_indexes: 86 | q_node_o = onnx_model.graph.node[idx].output[0] 87 | dq_o_index, dq_o_index_i, dq_index = find_dq_node_output_node(onnx_model, q_node_o) 88 | q_val = scales_map[onnx_model.graph.node[idx].input[1]] 89 | dq_val = scales_map[onnx_model.graph.node[dq_index].input[1]] 90 | q_init_names.append(onnx_model.graph.node[idx].input[1]) 91 | dq_init_names.append(onnx_model.graph.node[dq_index].input[1]) 92 | q_vals.append(q_val) 93 | dq_vals.append(dq_val) 94 | 95 | # 给权重重新赋值 96 | for idx, init in enumerate(inits): 97 | if init.name in q_init_names: 98 | W_new = np.mean(q_vals, axis=0) 99 | tensor = numpy_helper.from_array(W_new, init.name) 100 | onnx_model.graph.initializer[idx].CopyFrom(tensor) 101 | elif init.name in dq_init_names: 102 | W_new = np.mean(dq_vals, axis=0) 103 | tensor = numpy_helper.from_array(W_new, init.name) 104 | onnx_model.graph.initializer[idx].CopyFrom(tensor) 105 | 106 | 107 | # 修改移除后的输入输出,并记录需要移除的点 108 | for idx in qdq_indexes[1:]: 109 | remove_nodes.append(onnx_model.graph.node[idx].name) 110 | # 找到后续的dq节点 111 | q_node_o = onnx_model.graph.node[idx].output[0] 112 | dq_o_index, dq_o_index_i, dq_index = find_dq_node_output_node(onnx_model, q_node_o) 113 | remove_nodes.append(onnx_model.graph.node[dq_index].name) 114 | onnx_model.graph.node[dq_o_index].input[dq_o_index_i] = onnx_model.graph.node[idx].input[0] 115 | else: 116 | print("Relu wo QDQ", onnx_model.graph.node[i].name) 117 | 118 | # 删除多余的节点 119 | for rm_name in remove_nodes: 120 | for i in range(len(onnx_model.graph.node)): 121 | if onnx_model.graph.node[i].name == rm_name: 122 | old_node = onnx_model.graph.node[i] 123 | print("remove", old_node.name) 124 | onnx_model.graph.node.remove(old_node) # 删除旧节点 125 | break 126 | 127 | model_opt = onnxoptimizer.optimize(onnx_model) 128 | # model_simp, check = simplify(model_opt) 129 | model_simp = shape_inference.infer_shapes(model_opt) 130 | onnx.save(model_simp, sys.argv[2]) 131 | -------------------------------------------------------------------------------- /quantization/compare_trt_trt.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 #指定解释器 2 | # encoding:utf-8 3 | 4 | import sys 5 | 6 | print(sys.getdefaultencoding()) 7 | s = "中文乱码问题解决" 8 | print(s) 9 | 10 | # --------------------------------------------------------------- 11 | # 这个脚本向你展示了如何使用 tensorRT 对 PPQ 导出的模型进行推理 12 | 13 | # This script shows you how to export ppq internal graph to tensorRT 14 | # --------------------------------------------------------------- 15 | 16 | # For this inference test, all test data is randomly picked. 
17 | # If you want to use real data, just rewrite the defination of SAMPLES 18 | print("开始import") 19 | import onnxruntime 20 | import torch 21 | from tqdm import tqdm 22 | import glob 23 | import cv2 24 | import numpy as np 25 | from torchvision import transforms 26 | from PIL import Image 27 | import os 28 | from sklearn.metrics.pairwise import cosine_similarity 29 | import onnx 30 | from copy import deepcopy 31 | 32 | def convert_any_to_numpy(x, accepet_none: bool = True) -> np.ndarray: 33 | if x is None and not accepet_none: 34 | raise ValueError("Trying to convert an empty value.") 35 | if isinstance(x, np.ndarray): 36 | return x 37 | elif isinstance(x, int) or isinstance(x, float): 38 | return np.array( 39 | [ 40 | x, 41 | ] 42 | ) 43 | elif isinstance(x, torch.Tensor): 44 | if x.numel() == 0 and accepet_none: 45 | return None 46 | if x.numel() == 0 and not accepet_none: 47 | raise ValueError("Trying to convert an empty value.") 48 | if x.numel() == 1: 49 | return convert_any_to_numpy(x.detach().cpu().item()) 50 | if x.numel() > 1: 51 | return x.detach().cpu().numpy() 52 | elif isinstance(x, list) or isinstance(x, tuple): 53 | return np.array(x) 54 | else: 55 | raise TypeError( 56 | f"input value {x}({type(x)}) can not be converted as numpy type." 57 | ) 58 | 59 | def read_image(path): 60 | # 多任务模型 61 | _img_transforms = transforms.Compose( 62 | [ 63 | transforms.Resize((384, 768)), 64 | transforms.ToTensor(), 65 | transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), 66 | ] 67 | ) 68 | img = Image.open(path).convert("RGB") 69 | img_w, img_h = img.size[0], img.size[1] 70 | img = _img_transforms(img) 71 | img = img.unsqueeze(0) 72 | return img 73 | 74 | calibration_files = glob.glob( 75 | os.path.join("/apdcephfs/private_howellyang/data/Calib1k5/", "*.jpg") 76 | )[-100:] 77 | 78 | SAMPLES = [ 79 | read_image(path) for path in calibration_files 80 | ] # rewirte this to use real data. 
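# The PPQ scripts in this repo describe quantization error as noise energy over
# signal energy (an inverse SNR). A minimal numpy sketch of that metric, usable
# alongside the cosine-similarity check at the bottom of this script; both
# arguments are assumed to be the same output tensor from two engines:
def relative_noise_energy(reference, candidate):
    reference = np.reshape(reference, (-1,)).astype(np.float64)
    candidate = np.reshape(candidate, (-1,)).astype(np.float64)
    noise = np.sum((reference - candidate) ** 2)    # noise energy
    signal = np.sum(reference ** 2) + 1e-12         # signal energy (guard against /0)
    return float(noise / signal)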
81 | 82 | 83 | DEVICE = "cuda" 84 | FINETUNE = True 85 | EXECUTING_DEVICE = "cuda" 86 | REQUIRE_ANALYSE = True 87 | 88 | # ------------------------------------------------------------------- 89 | # 启动 tensorRT 进行推理,你先装一下 trt 90 | # ------------------------------------------------------------------- 91 | 92 | 93 | def infer_with_trt(trt_int8_path = ""): 94 | import tensorrt as trt 95 | import trt_infer 96 | 97 | samples = [convert_any_to_numpy(sample) for sample in SAMPLES] 98 | logger = trt.Logger(trt.Logger.INFO) 99 | with open(trt_int8_path, "rb") as f, trt.Runtime( 100 | logger 101 | ) as runtime: 102 | engine = runtime.deserialize_cuda_engine(f.read()) 103 | 104 | trt_outpus_all = [] 105 | with engine.create_execution_context() as context: 106 | inputs, outputs, bindings, stream, trt_output_names = trt_infer.allocate_buffers( 107 | context.engine 108 | ) 109 | for sample in tqdm(samples, desc="TensorRT is running..."): 110 | # trt infer 111 | inputs[0].host = convert_any_to_numpy(sample) 112 | trt_outputs_list = trt_infer.do_inference( 113 | context, 114 | bindings=bindings, 115 | inputs=inputs, 116 | outputs=outputs, 117 | stream=stream, 118 | batch_size=1, 119 | ) 120 | trt_outputs_dict = { 121 | trt_output_names[i]: trt_outputs_list[i] for i in range(len(trt_output_names)) 122 | } 123 | trt_outpus_all.append(deepcopy(trt_outputs_dict)) 124 | return trt_outpus_all 125 | 126 | 127 | # trt_outpus_all = infer_with_trt("/apdcephfs/private_howellyang/road_service_app/road-service/road_service/engine/mod_road_multi_tasks/model/RMTNet_release20220609.trtmodel") 128 | # trt_outpus_all = infer_with_trt("/apdcephfs/private_howellyang/onnx2trt/model.trt_int8_with_1578pics_calib_entropy_less_int8_v1.trtmodel") # 原始QAT转换的模型 129 | trt_outpus_all = infer_with_trt("/apdcephfs/private_howellyang/onnx2trt/model.trt_int8_with_1578pics_calib_entropy.trtmodel") # 进行虚拟量化转换后的模型 130 | # trt_outpus_all = infer_with_trt("/apdcephfs/private_howellyang/onnx2trt/model.no_weight_quant.int8.trtmodel") # 不虚拟量化,仅使用min max值 131 | # trt_outpus_all = infer_with_trt("/apdcephfs/private_howellyang/onnx2trt/model.weight_quantized_v2.trtmodel") # 进行虚拟量化 132 | # trt_outpus_all = infer_with_trt("/apdcephfs/private_howellyang/onnx2trt/model.weight_quantized_v2.trtmodel") # 进行虚拟量化 133 | trt_outpus_all_fp32 = infer_with_trt("/apdcephfs/private_howellyang/road_service_app/road-service/road_service/engine/mod_road_multi_tasks/model/RMTNet_release20220609.fp16.trtmodel") 134 | 135 | 136 | sims = {} 137 | for i in range(len(trt_outpus_all)): 138 | for output_name, _ in trt_outpus_all[i].items(): 139 | trt_output = np.reshape(trt_outpus_all[i][output_name], (1, -1)) 140 | trt_fp32_output = np.reshape(trt_outpus_all_fp32[i][output_name], (1, -1)) 141 | cos_sim = cosine_similarity(trt_output, trt_fp32_output) 142 | if output_name not in sims: 143 | sims[output_name] = [] 144 | sims[output_name].append(cos_sim.ravel()) 145 | # if cos_sim < 0.985: 146 | # print(output_name, cos_sim) 147 | # print(trt_fp32_output[0, :5]) 148 | # print(trt_output[0, :5]) 149 | 150 | print("===================") 151 | mean_sims = [] 152 | for key, value in sims.items(): 153 | print(key, np.mean(value), np.min(value)) 154 | mean_sims.append(np.mean(value)) 155 | print("average cosine sim = ", np.mean(mean_sims)) -------------------------------------------------------------------------------- /quantization/onnx2trt_lsq.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 #指定解释器 2 | # encoding:utf-8 3 | 4 
| import sys 5 | print(sys.getdefaultencoding()) 6 | s = "中文乱码问题解决" 7 | print(s) 8 | 9 | # --------------------------------------------------------------- 10 | # 这个脚本向你展示了如何使用 tensorRT 对 PPQ 导出的模型进行推理 11 | 12 | # This script shows you how to export ppq internal graph to tensorRT 13 | # --------------------------------------------------------------- 14 | 15 | # For this inference test, all test data is randomly picked. 16 | # If you want to use real data, just rewrite the defination of SAMPLES 17 | print("开始import") 18 | import onnxruntime 19 | import torch 20 | from ppq import * 21 | from ppq.api import * 22 | from tqdm import tqdm 23 | import glob 24 | import cv2 25 | import numpy as np 26 | from torchvision import transforms 27 | from PIL import Image 28 | import os 29 | 30 | def read_image(path): 31 | # 多任务模型 32 | _img_transforms = transforms.Compose([ 33 | transforms.Resize((384, 768)), 34 | transforms.ToTensor(), 35 | transforms.Normalize((.485, .456, .406), (.229, .224, .225)) 36 | ]) 37 | img = Image.open(path).convert('RGB') 38 | img_w, img_h = img.size[0], img.size[1] 39 | img = _img_transforms(img) 40 | img = img.unsqueeze(0) 41 | return img 42 | 43 | QUANT_PLATFROM = TargetPlatform.TRT_INT8 44 | MODEL = 'model_lsq.onnx' 45 | INPUT_SHAPE = [1, 3, 384, 768] 46 | 47 | calibration_files = glob.glob(os.path.join("/apdcephfs/private_howellyang/data/Calib1k5/", '*.jpg'))[:256] 48 | SAMPLES = [read_image(path) for path in calibration_files] # rewirte this to use real data. 49 | 50 | 51 | DEVICE = 'cuda' 52 | FINETUNE = True 53 | QS = QuantizationSettingFactory.default_setting() 54 | EXECUTING_DEVICE = 'cuda' 55 | REQUIRE_ANALYSE = True 56 | 57 | # ------------------------------------------------------------------- 58 | # 下面向你展示了常用参数调节选项: 59 | # ------------------------------------------------------------------- 60 | if PPQ_CONFIG.USING_CUDA_KERNEL: 61 | print("====== using advanced_optimization =====") 62 | QS.advanced_optimization = FINETUNE # 启动网络再训练过程,降低量化误差 63 | QS.advanced_optimization_setting.steps = 2500 # 再训练步数,影响训练时间,2500步大概几分钟 64 | QS.advanced_optimization_setting.collecting_device = 'executor' # 缓存数据放在那,executor 就是放在gpu,如果显存超了你就换成 'cpu' 65 | QS.advanced_optimization_setting.auto_check = False # 打开这个选项则训练过程中会防止过拟合,以及意外情况,通常不需要开。 66 | else: 67 | print("====== using lsq_optimization =====") 68 | QS.lsq_optimization = FINETUNE # 启动网络再训练过程,降低量化误差 69 | QS.lsq_optimization_setting.epochs = 64 # 再训练轮数,影响训练时间,30轮大概几分钟 70 | QS.lsq_optimization_setting.collecting_device = 'cuda' # 缓存数据放在那,cuda 就是放在gpu,如果显存超了你就换成 'cpu' 71 | 72 | QS.dispatching_table.append(operation='Sigmoid', platform=TargetPlatform.FP32) # 把量化的不太好的算子送回 FP32 73 | 74 | print('正准备量化你的网络,检查下列设置:') 75 | print(f'TARGET PLATFORM : {QUANT_PLATFROM.name}') 76 | print(f'NETWORK INPUTSHAPE : {INPUT_SHAPE}') 77 | 78 | # ENABLE CUDA KERNEL 会加速量化效率 3x ~ 10x,但是你如果没有装相应编译环境的话是编译不了的 79 | # 你可以尝试安装编译环境,或者在不启动 CUDA KERNEL 的情况下完成量化:移除 with ENABLE_CUDA_KERNEL(): 即可 80 | with ENABLE_CUDA_KERNEL(): 81 | qir = quantize_onnx_model( 82 | onnx_import_file=MODEL, calib_dataloader=SAMPLES, calib_steps=128, setting=QS, 83 | input_shape=INPUT_SHAPE, collate_fn=lambda x: x.to(EXECUTING_DEVICE), 84 | platform=QUANT_PLATFROM, do_quantize=True) 85 | 86 | # ------------------------------------------------------------------- 87 | # PPQ 计算量化误差时,使用信噪比的倒数作为指标,即噪声能量 / 信号能量 88 | # 量化误差 0.1 表示在整体信号中,量化噪声的能量约为 10% 89 | # 你应当注意,在 graphwise_error_analyse 分析中,我们衡量的是累计误差 90 | # 网络的最后一层往往都具有较大的累计误差,这些误差是其前面的所有层所共同造成的 91 | # 你需要使用 layerwise_error_analyse 逐层分析误差的来源 92 
| # ------------------------------------------------------------------- 93 | print('正计算网络量化误差(SNR),最后一层的误差应小于 0.1 以保证量化精度:') 94 | reports = graphwise_error_analyse( 95 | graph=qir, running_device=EXECUTING_DEVICE, steps=32, 96 | dataloader=SAMPLES, collate_fn=lambda x: x.to(EXECUTING_DEVICE)) 97 | for op, snr in reports.items(): 98 | if snr > 0.1: ppq_warning(f'层 {op} 的累计量化误差显著,请考虑进行优化') 99 | 100 | if REQUIRE_ANALYSE: 101 | print('正计算逐层量化误差(SNR),每一层的独立量化误差应小于 0.1 以保证量化精度:') 102 | layerwise_error_analyse(graph=qir, running_device=EXECUTING_DEVICE, 103 | interested_outputs=None, 104 | dataloader=SAMPLES, collate_fn=lambda x: x.to(EXECUTING_DEVICE)) 105 | 106 | print('网络量化结束,正在生成目标文件:') 107 | export_ppq_graph( 108 | graph=qir, platform=QUANT_PLATFROM, 109 | graph_save_to = 'model_copy_int8.onnx') 110 | 111 | # ------------------------------------------------------------------- 112 | # 记录一下输入输出的名字,onnxruntime 跑的时候需要提供这些名字 113 | # 我写的只是单输出单输入的版本,多输出多输入你得自己改改 114 | # ------------------------------------------------------------------- 115 | int8_input_names = [name for name, _ in qir.inputs.items()] 116 | int8_output_names = [name for name, _ in qir.outputs.items()] 117 | 118 | # ------------------------------------------------------------------- 119 | # 启动 tensorRT 进行推理,你先装一下 trt 120 | # ------------------------------------------------------------------- 121 | import tensorrt as trt 122 | import trt_infer 123 | 124 | samples = [convert_any_to_numpy(sample) for sample in SAMPLES] 125 | logger = trt.Logger(trt.Logger.INFO) 126 | with open('model_copy_int8.engine', 'rb') as f, trt.Runtime(logger) as runtime: 127 | engine = runtime.deserialize_cuda_engine(f.read()) 128 | 129 | results = [] 130 | with engine.create_execution_context() as context: 131 | inputs, outputs, bindings, stream = trt_infer.allocate_buffers(context.engine) 132 | for sample in tqdm(samples, desc='TensorRT is running...'): 133 | inputs[0].host = convert_any_to_numpy(sample) 134 | [output] = trt_infer.do_inference( 135 | context, bindings=bindings, inputs=inputs, 136 | outputs=outputs, stream=stream, batch_size=1) 137 | results.append(convert_any_to_torch_tensor(output).reshape([-1, 1000])) -------------------------------------------------------------------------------- /quantization/C03_compare_trt_fp32_int8.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 #指定解释器 2 | # encoding:utf-8 3 | 4 | import sys 5 | 6 | print(sys.getdefaultencoding()) 7 | s = "中文乱码问题解决" 8 | print(s) 9 | 10 | # --------------------------------------------------------------- 11 | # 这个脚本向你展示了如何使用 tensorRT 对 PPQ 导出的模型进行推理 12 | 13 | # This script shows you how to export ppq internal graph to tensorRT 14 | # --------------------------------------------------------------- 15 | 16 | # For this inference test, all test data is randomly picked. 
17 | # If you want to use real data, just rewrite the defination of SAMPLES 18 | print("开始import") 19 | import onnxruntime 20 | import torch 21 | from tqdm import tqdm 22 | import glob 23 | import cv2 24 | import numpy as np 25 | from torchvision import transforms 26 | from PIL import Image 27 | import os 28 | from sklearn.metrics.pairwise import cosine_similarity 29 | import onnx 30 | from copy import deepcopy 31 | 32 | def convert_any_to_numpy(x, accepet_none: bool = True) -> np.ndarray: 33 | if x is None and not accepet_none: 34 | raise ValueError("Trying to convert an empty value.") 35 | if isinstance(x, np.ndarray): 36 | return x 37 | elif isinstance(x, int) or isinstance(x, float): 38 | return np.array( 39 | [ 40 | x, 41 | ] 42 | ) 43 | elif isinstance(x, torch.Tensor): 44 | if x.numel() == 0 and accepet_none: 45 | return None 46 | if x.numel() == 0 and not accepet_none: 47 | raise ValueError("Trying to convert an empty value.") 48 | if x.numel() == 1: 49 | return convert_any_to_numpy(x.detach().cpu().item()) 50 | if x.numel() > 1: 51 | return x.detach().cpu().numpy() 52 | elif isinstance(x, list) or isinstance(x, tuple): 53 | return np.array(x) 54 | else: 55 | raise TypeError( 56 | f"input value {x}({type(x)}) can not be converted as numpy type." 57 | ) 58 | 59 | def read_image(path): 60 | mean_val = [103.53, 116.28, 123.675] 61 | std_val = [57.375, 57.12, 58.395] 62 | input_size = [768, 448] 63 | 64 | # img = np.random.randint(255, size=input_size + [3]).astype(np.uint8) 65 | img_raw = cv2.imread(path) 66 | img = cv2.resize(img_raw, (input_size[0],input_size[1])).astype(np.float32) 67 | img -= mean_val 68 | img /= std_val 69 | img = np.transpose(img, (2, 0, 1)).astype(np.float32) 70 | img = np.expand_dims(img, axis=0) 71 | 72 | img = np.ascontiguousarray(img, dtype=np.float32) 73 | # img_tensor = torch.from_numpy(img) 74 | # dummy_input = torch.autograd.Variable(img_tensor) 75 | return img 76 | 77 | calibration_files = glob.glob( 78 | os.path.join("/apdcephfs/private_howellyang/data/Calib1k5/", "*.jpg") 79 | )[-100:] 80 | 81 | SAMPLES = [ 82 | read_image(path) for path in calibration_files 83 | ] # rewirte this to use real data. 
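# Sanity notes on the preprocessing above: mean_val/std_val are the ImageNet
# statistics in BGR order scaled to the 0-255 range used by cv2.imread (the
# torchvision-based readers in this repo use the same statistics in 0-1 scale),
# and cv2.resize takes (width, height), so input_size = [768, 448] yields a
# 448x768 image and each sample has shape (1, 3, 448, 768). Quick checks:
assert np.allclose(np.array([0.406, 0.456, 0.485]) * 255.0, [103.53, 116.28, 123.675])
assert np.allclose(np.array([0.225, 0.224, 0.229]) * 255.0, [57.375, 57.12, 58.395])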
84 | 85 | 86 | DEVICE = "cuda" 87 | FINETUNE = True 88 | EXECUTING_DEVICE = "cuda" 89 | REQUIRE_ANALYSE = True 90 | 91 | # ------------------------------------------------------------------- 92 | # 启动 tensorRT 进行推理,你先装一下 trt 93 | # ------------------------------------------------------------------- 94 | 95 | 96 | def infer_with_trt(trt_int8_path = ""): 97 | import tensorrt as trt 98 | import trt_infer 99 | trt.init_libnvinfer_plugins(None, "") 100 | 101 | samples = [convert_any_to_numpy(sample) for sample in SAMPLES] 102 | logger = trt.Logger(trt.Logger.INFO) 103 | with open(trt_int8_path, "rb") as f, trt.Runtime( 104 | logger 105 | ) as runtime: 106 | engine = runtime.deserialize_cuda_engine(f.read()) 107 | 108 | trt_outpus_all = [] 109 | with engine.create_execution_context() as context: 110 | inputs, outputs, bindings, stream, trt_output_names = trt_infer.allocate_buffers( 111 | context.engine 112 | ) 113 | for sample in tqdm(samples, desc="TensorRT is running..."): 114 | # trt infer 115 | inputs[0].host = convert_any_to_numpy(sample) 116 | trt_outputs_list = trt_infer.do_inference( 117 | context, 118 | bindings=bindings, 119 | inputs=inputs, 120 | outputs=outputs, 121 | stream=stream, 122 | batch_size=1, 123 | ) 124 | trt_outputs_dict = { 125 | trt_output_names[i]: trt_outputs_list[i] for i in range(len(trt_output_names)) 126 | } 127 | trt_outpus_all.append(deepcopy(trt_outputs_dict)) 128 | return trt_outpus_all 129 | 130 | 131 | # trt_outpus_all = infer_with_trt("/apdcephfs/private_howellyang/road_service_app/road-service/road_service/engine/mod_road_multi_tasks/model/RMTNet_release20220609.trtmodel") 132 | trt_outpus_all = infer_with_trt("/apdcephfs/private_howellyang/road_service_app/LaneModel/onnx_infer/model/epoch_390_mm2conv.opt.trt_int8_with_1578pics_calib_entropy.trtmodel") # 原始QAT转换的模型 133 | # trt_outpus_all = infer_with_trt("/apdcephfs/private_howellyang/onnx2trt/model.weight_quantized.int8.trtmodel") # 进行虚拟量化转换后的模型 134 | # trt_outpus_all = infer_with_trt("/apdcephfs/private_howellyang/onnx2trt/model.no_weight_quant.int8.trtmodel") # 不虚拟量化,仅使用min max值 135 | # trt_outpus_all = infer_with_trt("/apdcephfs/private_howellyang/onnx2trt/model.weight_quantized_v2.trtmodel") # 进行虚拟量化 136 | # trt_outpus_all = infer_with_trt("/apdcephfs/private_howellyang/onnx2trt/model.weight_quantized_v2.trtmodel") # 进行虚拟量化 137 | trt_outpus_all_fp32 = infer_with_trt("/apdcephfs/private_howellyang/road_service_app/LaneModel/onnx_infer/model/epoch_390_mm2conv.opt.fp16.trtmodel") 138 | 139 | 140 | sims = {} 141 | for i in range(len(trt_outpus_all)): 142 | for output_name, _ in trt_outpus_all[i].items(): 143 | trt_output = np.reshape(trt_outpus_all[i][output_name], (1, -1)) 144 | trt_fp32_output = np.reshape(trt_outpus_all_fp32[i][output_name], (1, -1)) 145 | cos_sim = cosine_similarity(trt_output, trt_fp32_output) 146 | if output_name not in sims: 147 | sims[output_name] = [] 148 | sims[output_name].append(cos_sim.ravel()) 149 | # if cos_sim < 0.985: 150 | # print(output_name, cos_sim) 151 | # print(trt_fp32_output[0, :5]) 152 | # print(trt_output[0, :5]) 153 | 154 | print("===================") 155 | mean_sims = [] 156 | for key, value in sims.items(): 157 | print(key, np.mean(value), np.min(value)) 158 | mean_sims.append(np.mean(value)) 159 | print("average cosine sim = ", np.mean(mean_sims)) -------------------------------------------------------------------------------- /quantization/C02_compare_trt_fp32_int8.py: -------------------------------------------------------------------------------- 1 | 
#!/usr/bin/python3 #指定解释器 2 | # encoding:utf-8 3 | 4 | import sys 5 | 6 | print(sys.getdefaultencoding()) 7 | s = "中文乱码问题解决" 8 | print(s) 9 | 10 | # --------------------------------------------------------------- 11 | # 这个脚本向你展示了如何使用 tensorRT 对 PPQ 导出的模型进行推理 12 | 13 | # This script shows you how to export ppq internal graph to tensorRT 14 | # --------------------------------------------------------------- 15 | 16 | # For this inference test, all test data is randomly picked. 17 | # If you want to use real data, just rewrite the defination of SAMPLES 18 | print("开始import") 19 | import onnxruntime 20 | import torch 21 | from tqdm import tqdm 22 | import glob 23 | import cv2 24 | import numpy as np 25 | from torchvision import transforms 26 | from PIL import Image 27 | import os 28 | from sklearn.metrics.pairwise import cosine_similarity 29 | import onnx 30 | from copy import deepcopy 31 | 32 | def convert_any_to_numpy(x, accepet_none: bool = True) -> np.ndarray: 33 | if x is None and not accepet_none: 34 | raise ValueError("Trying to convert an empty value.") 35 | if isinstance(x, np.ndarray): 36 | return x 37 | elif isinstance(x, int) or isinstance(x, float): 38 | return np.array( 39 | [ 40 | x, 41 | ] 42 | ) 43 | elif isinstance(x, torch.Tensor): 44 | if x.numel() == 0 and accepet_none: 45 | return None 46 | if x.numel() == 0 and not accepet_none: 47 | raise ValueError("Trying to convert an empty value.") 48 | if x.numel() == 1: 49 | return convert_any_to_numpy(x.detach().cpu().item()) 50 | if x.numel() > 1: 51 | return x.detach().cpu().numpy() 52 | elif isinstance(x, list) or isinstance(x, tuple): 53 | return np.array(x) 54 | else: 55 | raise TypeError( 56 | f"input value {x}({type(x)}) can not be converted as numpy type." 57 | ) 58 | 59 | def read_image(path): 60 | mean = [123.675, 116.28, 103.53] 61 | std = [58.395, 57.12, 57.375] 62 | input_w = 960 63 | input_h = 480 64 | 65 | # for onnx inference 66 | mean = np.array(mean) 67 | std = np.array(std) 68 | 69 | # Load by OpenCV 70 | img = cv2.imread(path) 71 | # Convert to RGB 72 | img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) 73 | 74 | img = cv2.resize(img, (input_w, input_h)) 75 | 76 | img = img.astype(np.float32) 77 | 78 | # Norm 79 | for i in range(3): 80 | img[..., i] = (img[..., i] - mean[i]) / std[i] 81 | 82 | # hwc -> nchw 83 | h, w, c = img.shape 84 | img = img.reshape((1, c, h ,w)) 85 | 86 | return np.array(img) 87 | 88 | calibration_files = glob.glob( 89 | os.path.join("/apdcephfs/private_howellyang/data/Calib1k5/", "*.jpg") 90 | )[-100:] 91 | 92 | SAMPLES = [ 93 | read_image(path) for path in calibration_files 94 | ] # rewirte this to use real data. 
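# Note: reshape((1, c, h, w)) on an HWC array (as in read_image above) only
# reinterprets the buffer and does not move the channel axis, so the result is
# not a true NCHW tensor. If the engine expects NCHW input, the conversion
# should be an explicit transpose; a minimal sketch:
def hwc_to_nchw(img):
    # img: float32 array of shape (h, w, c) -> returns shape (1, c, h, w)
    return np.expand_dims(np.transpose(img, (2, 0, 1)), axis=0)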
95 | 96 | 97 | DEVICE = "cuda" 98 | FINETUNE = True 99 | EXECUTING_DEVICE = "cuda" 100 | REQUIRE_ANALYSE = True 101 | 102 | # ------------------------------------------------------------------- 103 | # 启动 tensorRT 进行推理,你先装一下 trt 104 | # ------------------------------------------------------------------- 105 | 106 | 107 | def infer_with_trt(trt_int8_path = ""): 108 | import tensorrt as trt 109 | import trt_infer 110 | trt.init_libnvinfer_plugins(None, "") 111 | samples = [convert_any_to_numpy(sample) for sample in SAMPLES] 112 | logger = trt.Logger(trt.Logger.INFO) 113 | with open(trt_int8_path, "rb") as f, trt.Runtime( 114 | logger 115 | ) as runtime: 116 | engine = runtime.deserialize_cuda_engine(f.read()) 117 | 118 | trt_outpus_all = [] 119 | with engine.create_execution_context() as context: 120 | inputs, outputs, bindings, stream, trt_output_names = trt_infer.allocate_buffers( 121 | context.engine 122 | ) 123 | for sample in tqdm(samples, desc="TensorRT is running..."): 124 | # trt infer 125 | inputs[0].host = convert_any_to_numpy(sample) 126 | trt_outputs_list = trt_infer.do_inference( 127 | context, 128 | bindings=bindings, 129 | inputs=inputs, 130 | outputs=outputs, 131 | stream=stream, 132 | batch_size=1, 133 | ) 134 | trt_outputs_dict = { 135 | trt_output_names[i]: trt_outputs_list[i] for i in range(len(trt_output_names)) 136 | } 137 | trt_outpus_all.append(deepcopy(trt_outputs_dict)) 138 | return trt_outpus_all 139 | 140 | 141 | # trt_outpus_all = infer_with_trt("/apdcephfs/private_howellyang/road_service_app/road-service/road_service/engine/mod_road_multi_tasks/model/RMTNet_release20220609.trtmodel") 142 | trt_outpus_all = infer_with_trt("/apdcephfs/private_howellyang/road_service_app/roadseg-infer/res101_ep100.opt.trt_int8_with_1578pics_calib_entropy.trtmodel") # 原始QAT转换的模型 143 | # trt_outpus_all = infer_with_trt("/apdcephfs/private_howellyang/onnx2trt/model.weight_quantized.int8.trtmodel") # 进行虚拟量化转换后的模型 144 | # trt_outpus_all = infer_with_trt("/apdcephfs/private_howellyang/onnx2trt/model.no_weight_quant.int8.trtmodel") # 不虚拟量化,仅使用min max值 145 | # trt_outpus_all = infer_with_trt("/apdcephfs/private_howellyang/onnx2trt/model.weight_quantized_v2.trtmodel") # 进行虚拟量化 146 | # trt_outpus_all = infer_with_trt("/apdcephfs/private_howellyang/onnx2trt/model.weight_quantized_v2.trtmodel") # 进行虚拟量化 147 | trt_outpus_all_fp32 = infer_with_trt("/apdcephfs/private_howellyang/road_service_app/roadseg-infer/res101_ep100.opt.fp16.trtmodel") 148 | 149 | 150 | sims = {} 151 | for i in range(len(trt_outpus_all)): 152 | for output_name, _ in trt_outpus_all[i].items(): 153 | trt_output = np.reshape(trt_outpus_all[i][output_name], (1, -1)) 154 | trt_fp32_output = np.reshape(trt_outpus_all_fp32[i][output_name], (1, -1)) 155 | cos_sim = cosine_similarity(trt_output, trt_fp32_output) 156 | if output_name not in sims: 157 | sims[output_name] = [] 158 | sims[output_name].append(cos_sim.ravel()) 159 | # if cos_sim < 0.985: 160 | # print(output_name, cos_sim) 161 | # print(trt_fp32_output[0, :5]) 162 | # print(trt_output[0, :5]) 163 | 164 | print("===================") 165 | mean_sims = [] 166 | for key, value in sims.items(): 167 | print(key, np.mean(value), np.min(value)) 168 | mean_sims.append(np.mean(value)) 169 | print("average cosine sim = ", np.mean(mean_sims)) -------------------------------------------------------------------------------- /onnx2trt.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | from __future__ import print_function 3 | 4 | 
import argparse 5 | import glob 6 | import os 7 | from tabnanny import verbose 8 | import tensorrt as trt 9 | import pycuda.driver as cuda 10 | import pycuda.autoinit # fix init error of cuda 11 | from google.protobuf.json_format import MessageToDict 12 | import onnx 13 | from onnxsim import simplify 14 | try: 15 | import onnxoptimizer as optimizer 16 | except: 17 | from onnx import optimizer 18 | 19 | from onnxruntime.tools.symbolic_shape_infer import SymbolicShapeInference 20 | import numpy as np 21 | from trt_utils import ( 22 | create_image_stream, 23 | create_calibrator, 24 | create_tensorrt_engine, 25 | evaluate_engine, 26 | ) 27 | 28 | parser = argparse.ArgumentParser(description="Onnx Calibration Params") 29 | parser.add_argument("--onnx", type=str, default=None, required=True, help="原始的onnx路径") 30 | parser.add_argument( 31 | "--trt_engine", type=str, default=None, required=False, help="tensorRT engine的保存路径" 32 | ) 33 | 34 | parser.add_argument( 35 | "--engine_type", 36 | type=str, 37 | default="int8", 38 | choices=["int8", "fp32", "fp16", "best"], 39 | required=False, 40 | help="模型的计算精度", 41 | ) 42 | 43 | parser.add_argument( 44 | "--trt_calib_cache", 45 | type=str, 46 | default="./trt_int8.cache", 47 | required=False, 48 | help="用来存储每个节点动态范围的路径", 49 | ) 50 | parser.add_argument( 51 | "--calib_dir", type=str, default=None, required=False, help="进行精度测试以及量化校准使用的图片路径" 52 | ) 53 | parser.add_argument( 54 | "--calib_algo", 55 | type=str, 56 | default=None, 57 | required=False, 58 | choices=["Search", "TRTEntropy", "TRTMinMax", "TRTPercentile", "ONNXEntropy", "ONNXMinMax", "ONNXPercentile"], 59 | help="""量化校准使用的算法: 60 | Search 进行自动化搜索, 自动选择最终输出的cosine距离最高的校准算法 61 | TRTEntropy 使用交叉熵评估量化前后的量化误差,自动选择误差最小的动态范围值 62 | TRTMinMax 计算每个节点输出的最大最小值,作为最终的动态范围值 63 | TRTPercentile 计算每个节点输出值,然后求其分位点作为动态范围值 64 | """, 65 | ) 66 | 67 | parser.add_argument( 68 | "--channel_order", 69 | type=str, 70 | default="RGB", 71 | required=False, 72 | choices=["RGB", "BGR"], 73 | help="图片的输入顺序, 可选BGR、RGB", 74 | ) 75 | parser.add_argument( 76 | "--means", type=str, default="0.0,0.0,0.0", required=False, help="图片预处理的均值" 77 | ) 78 | parser.add_argument( 79 | "--stds", type=str, default="1.0,1.0,1.0", required=False, help="图片预处理的方差" 80 | ) 81 | parser.add_argument( 82 | "--pixel_type", 83 | type=str, 84 | default="NCHW", 85 | required=False, 86 | choices=["NCHW", "NHWC"], 87 | help="模型输入的通道顺序, 一般而言", 88 | ) 89 | 90 | args = parser.parse_args() 91 | onnx_path = args.onnx 92 | engine_type = args.engine_type 93 | trt_engine = args.trt_engine 94 | calib_algo = args.calib_algo 95 | calib_dir = args.calib_dir 96 | means = args.means 97 | stds = args.stds 98 | pixel_type = args.pixel_type 99 | trt_calib_cache = args.trt_calib_cache 100 | channel_order = args.channel_order 101 | 102 | # 获取输入输出信息 103 | print("[ONNX2TRT] Optimizing Onnx Model....") 104 | INPUT_SHAPES = [] 105 | INPUT_NAMES = [] 106 | onnx_model = onnx.load(onnx_path) 107 | onnx_model, check = simplify(onnx_model) # simplify 108 | optimized_model = optimizer.optimize(onnx_model) # optimize 109 | onnx_model = SymbolicShapeInference.infer_shapes( 110 | onnx_model, 111 | int_max=2**31 - 1, 112 | auto_merge=True, 113 | guess_output_rank=True, 114 | verbose=2 115 | ) 116 | 117 | onnx_path = onnx_path.replace(".onnx", "") + "_with_shape.onnx" 118 | onnx.save(onnx_model, onnx_path) 119 | 120 | input_all = [node.name for node in onnx_model.graph.input] 121 | input_initializer = [node.name for node in onnx_model.graph.initializer] 122 | net_feed_input_names = 
list(set(input_all) - set(input_initializer)) 123 | 124 | for _input in onnx_model.graph.input: 125 | m_dict = MessageToDict(_input) 126 | dim_info = m_dict.get("type").get("tensorType").get("shape").get("dim") 127 | input_shape = [int(d.get("dimValue")) for d in dim_info] # [4,3,384,640] 128 | input_name = m_dict.get("name") 129 | if input_name in net_feed_input_names: 130 | INPUT_SHAPES.append(input_shape) 131 | INPUT_NAMES.append(input_name) 132 | print(INPUT_NAMES[-1], INPUT_SHAPES[-1]) 133 | 134 | if len(INPUT_SHAPES) > 1: 135 | print("模型存在多个输入, 本工具暂不支持多输入模型") 136 | raise NameError("模型存在多个输入, 本工具暂不支持多输入模型") 137 | 138 | elif len(INPUT_SHAPES[0]) != 4: 139 | print("模型的输入不是NCHW或NHWC, 本工具暂不支持这种输入格式") 140 | raise NameError("模型的输入不是NCHW或NHWC, 本工具暂不支持这种输入格式") 141 | 142 | if engine_type == "int8": 143 | if calib_algo == "Search": 144 | search_types = ["TRTEntropy", "TRTMinMax", "TRTPercentile"] 145 | else: 146 | search_types = [calib_algo] 147 | image_stream = create_image_stream( 148 | calib_dir, INPUT_SHAPES[0], means, stds, pixel_type, channel_order 149 | ) 150 | final_cos_similarity = -1.0 151 | final_engine = None 152 | print("[ONNX2TRT] Start Calibration with {}".format(search_types)) 153 | for calibrator_type in search_types: 154 | calibrator = create_calibrator( 155 | image_stream, INPUT_NAMES, trt_calib_cache, calib_algo, onnx_path 156 | ) 157 | engine = create_tensorrt_engine(onnx_path, engine_type, calibrator) 158 | cos_similarity, infer_time = evaluate_engine(onnx_path, engine, image_stream) 159 | if cos_similarity > final_cos_similarity: 160 | final_cos_similarity = cos_similarity 161 | final_engine = engine 162 | final_infer_time = infer_time 163 | print("[ONNX2TRT] INFO: 校准算法 = ", calib_algo) 164 | print("[ONNX2TRT] INFO: 与onnx输出的cos相似度 = ", cos_similarity) 165 | print("[ONNX2TRT] INFO: 模型infer的平均耗时 = ", infer_time) 166 | 167 | else: 168 | final_engine = create_tensorrt_engine(onnx_path, engine_type) 169 | if calib_dir != "": 170 | image_stream = create_image_stream( 171 | calib_dir, INPUT_SHAPES[0], means, stds, pixel_type, channel_order 172 | ) 173 | cos_similarity, infer_time = evaluate_engine( 174 | onnx_path, final_engine, image_stream 175 | ) 176 | print("[ONNX2TRT] INFO: 校准算法 = ", None) 177 | print("[ONNX2TRT] INFO: 与onnx输出的cos相似度 = ", cos_similarity) 178 | print("[ONNX2TRT] INFO: 模型infer的平均耗时 = ", infer_time) 179 | 180 | # 将trt engine写入文件 181 | print("[ONNX2TRT] INFO: 模型构建完成, 将模型写入路径 = ", trt_engine) 182 | if not os.path.exists(os.path.dirname(trt_engine)): 183 | os.makedirs(os.path.dirname(trt_engine), exist_ok=True) 184 | with open(trt_engine, "wb") as f: 185 | f.write(final_engine.serialize()) 186 | -------------------------------------------------------------------------------- /quantization/ptq/quantization_filter.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | from matplotlib.pyplot import axis 3 | from trt_utils import read_calib_cache 4 | from data_loader import DataLoader 5 | from onnx_model import OnnxModel 6 | from queue import Queue 7 | import numpy as np 8 | from onnx import numpy_helper 9 | import onnx 10 | from time import time 11 | from sklearn.metrics.pairwise import cosine_similarity 12 | 13 | class QuantizaitonFilter: 14 | def __init__(self, model_path: str, calib_path: str): 15 | self.model_path = model_path 16 | self.data_loader = DataLoader() 17 | self.model = OnnxModel(model_path) 18 | self.calib_path = calib_path 19 | OnnxModel.quantize_weights(self.model.qdq_model) 20 | 
self.fp32_weight_name2tensor = {} 21 | for weight in self.model.fp32_model.graph.initializer: 22 | self.fp32_weight_name2tensor[weight.name] = weight 23 | 24 | self.int8_weight_name2tensor = {} 25 | for weight in self.model.qdq_model.graph.initializer: 26 | self.int8_weight_name2tensor[weight.name] = weight 27 | 28 | def is_conv_output(self, onnx_model, tensor_name): 29 | pre_node = None 30 | for node in onnx_model.graph.node: 31 | for output in node.output: 32 | if output == tensor_name: 33 | pre_node = node 34 | break 35 | 36 | if pre_node is None: 37 | return False 38 | 39 | if pre_node.op_type == "Conv": 40 | return True 41 | elif pre_node.op_type == "Relu": 42 | return self.is_conv_output(onnx_model, pre_node.input[0]) 43 | elif pre_node.op_type == "Concat": 44 | ret = False 45 | for input in pre_node.input[0]: 46 | ret |= self.is_conv_output(onnx_model, input) 47 | return ret 48 | 49 | def get_conv_tensors(self, onnx_model, act_scale_map): 50 | conv_out_scale_map = {} 51 | conv_out_to_bias = {} 52 | for node in onnx_model.graph.node: 53 | for tensor_name in node.output: 54 | if tensor_name not in act_scale_map: 55 | continue 56 | else: 57 | if node.op_type == "Conv": 58 | if len(node.input) >= 3: # with bias 59 | conv_out_scale_map[tensor_name] = act_scale_map[tensor_name] 60 | conv_out_to_bias[tensor_name] = node.input[2] 61 | elif node.op_type == "Relu": 62 | pre_nodes = OnnxModel.get_previous_nodes(onnx_model, node.input[0]) 63 | assert len(pre_nodes) == 1, "Relu should only have one input" 64 | if pre_nodes[0].op_type == "Conv" and len(pre_nodes[0].input) >= 3: 65 | conv_out_scale_map[node.input[0]] = act_scale_map[tensor_name] 66 | conv_out_to_bias[node.input[0]] = pre_nodes[0].input[2] 67 | elif node.op_type == "Concat": 68 | for input in node.input: 69 | pre_nodes = OnnxModel.get_previous_nodes(onnx_model, input) 70 | assert len(pre_nodes) == 1, "each input shold corespond to one node" 71 | if pre_nodes[0].op_type == "Conv" and len(pre_nodes[0].input) >= 3: 72 | conv_out_scale_map[input] = act_scale_map[tensor_name] 73 | conv_out_to_bias[input] = pre_nodes[0].input[2] 74 | elif pre_nodes[0].op_type == "Relu": 75 | nodes_before_relu = OnnxModel.get_previous_nodes(onnx_model, pre_nodes[0].input[0]) 76 | assert len(nodes_before_relu) == 1, "Relu should only have one input" 77 | if nodes_before_relu[0].op_type == "Conv" and len(nodes_before_relu[0].input) >= 3: 78 | conv_out_scale_map[pre_nodes[0].input[0]] = act_scale_map[tensor_name] 79 | conv_out_to_bias[pre_nodes[0].input[0]] = nodes_before_relu[0].input[2] 80 | return conv_out_scale_map, conv_out_to_bias 81 | 82 | def eval_quantize(self, fp32_output, int8_output): 83 | sims = [] 84 | diffs = [] 85 | rel_diffs = [] 86 | for fp32, int8 in zip(fp32_output, int8_output): 87 | fp32 = np.reshape(fp32, (1, -1)) 88 | int8 = np.reshape(int8, (1, -1)) 89 | sim = cosine_similarity(fp32, int8) 90 | diff = np.abs(fp32 - int8) 91 | rel_diff = diff / (np.abs(fp32) + 1e-8) 92 | sims.append(sim) 93 | diffs.append(np.median(diff)) 94 | rel_diffs.append(np.median(rel_diff)) 95 | return np.mean(sims), np.mean(diffs), np.mean(rel_diffs) 96 | 97 | def process(self): 98 | # Step 01. read input data 99 | input_data = self.data_loader.get_numpy_data(image_num=100) 100 | 101 | # Step 02. 
read calibration cache 102 | act_scale_map = read_calib_cache(self.calib_path) 103 | act_scale_map = {name: value for name, value in act_scale_map.items() if name in self.model.all_tensor_names} 104 | qdq_model = deepcopy(self.model.qdq_model) 105 | for tensor_name, scale_value in act_scale_map.items(): 106 | OnnxModel.add_act_dqd_node(qdq_model, tensor_name, scale_value) 107 | onnx.save(qdq_model, self.model_path + "_qdq100.onnx") 108 | 109 | # Step 03. caculate snrs 110 | # fp32_outputs = OnnxModel.get_onnx_outputs(self.model.fp32_model, list(act_scale_map.keys()), input_data) 111 | # snrs = {} 112 | # for name, fp32_output in fp32_outputs.items(): 113 | # snrs[name] = self.caculate_snr(fp32_outputs[name], act_scale_map[name]) 114 | 115 | return self.model.fp32_model, self.model.qdq_model 116 | 117 | 118 | if __name__ == "__main__": 119 | import sys 120 | onnx_path = "/mapai/howellyang/code/onnx2trt/RMTNet_release20220609_mm2conv.optimized.rm_inits.onnx" 121 | calib_path = "/mapai/howellyang/code/onnx2trt/RMTNet_release20220609_mm2conv.optimized.trt_int8_with_531pics_calib_percentile595.calib_cache" 122 | BS = QuantizaitonFilter(onnx_path, calib_path) 123 | onnx_model, qdq_model = BS.process() 124 | # onnx.save(onnx_model, "/mapai/howellyang/code/onnx2trt/RMTNet_release20220609_mm2conv.optimized.bias_correction_v1.onnx") 125 | # onnx.save(qdq_model, "/mapai/howellyang/code/onnx2trt/RMTNet_release20220609_mm2conv.optimized.bias_correction_v1.qdq.onnx") -------------------------------------------------------------------------------- /quantization/P02_MT_onnx2tensorRT_int8.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 #指定解释器 2 | # encoding:utf-8 3 | 4 | import sys 5 | 6 | print(sys.getdefaultencoding()) 7 | s = "中文乱码问题解决" 8 | print(s) 9 | 10 | # --------------------------------------------------------------- 11 | # 这个脚本向你展示了如何使用 tensorRT 对 PPQ 导出的模型进行推理 12 | 13 | # This script shows you how to export ppq internal graph to tensorRT 14 | # --------------------------------------------------------------- 15 | 16 | # For this inference test, all test data is randomly picked. 17 | # If you want to use real data, just rewrite the defination of SAMPLES 18 | print("开始import") 19 | import onnxruntime 20 | import torch 21 | from ppq import * 22 | from ppq.api import * 23 | from tqdm import tqdm 24 | import glob 25 | import cv2 26 | import numpy as np 27 | from torchvision import transforms 28 | from PIL import Image 29 | import os 30 | 31 | 32 | def read_image(path): 33 | # 多任务模型 34 | _img_transforms = transforms.Compose( 35 | [ 36 | transforms.Resize((384, 768)), 37 | transforms.ToTensor(), 38 | transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), 39 | ] 40 | ) 41 | img = Image.open(path).convert("RGB") 42 | img_w, img_h = img.size[0], img.size[1] 43 | img = _img_transforms(img) 44 | img = img.unsqueeze(0) 45 | return img 46 | 47 | 48 | QUANT_PLATFROM = TargetPlatform.TRT_INT8 49 | MODEL = "Models/RMTNet_release20220609_v2.opt.onnx" 50 | INPUT_SHAPE = [1, 3, 384, 768] 51 | 52 | calibration_files = glob.glob( 53 | os.path.join("/apdcephfs/private_howellyang/data/Calib1k5/", "*.jpg") 54 | )[:128] 55 | SAMPLES = [ 56 | read_image(path) for path in calibration_files 57 | ] # rewirte this to use real data. 
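# Each sample returned by read_image above is assumed to be a 1x3x384x768
# tensor matching the INPUT_SHAPE passed to quantize_onnx_model further down;
# a quick sanity check before calibration starts:
assert all(list(s.shape) == INPUT_SHAPE for s in SAMPLES)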
58 | 59 | 60 | DEVICE = "cuda" 61 | FINETUNE = True 62 | QS = QuantizationSettingFactory.default_setting() 63 | EXECUTING_DEVICE = "cuda" 64 | REQUIRE_ANALYSE = True 65 | 66 | # ------------------------------------------------------------------- 67 | # 下面向你展示了常用参数调节选项: 68 | # ------------------------------------------------------------------- 69 | if PPQ_CONFIG.USING_CUDA_KERNEL: 70 | print("====== using advanced_optimization =====") 71 | QS.advanced_optimization = FINETUNE # 启动网络再训练过程,降低量化误差 72 | QS.advanced_optimization_setting.steps = 2500 # 再训练步数,影响训练时间,2500步大概几分钟 73 | QS.advanced_optimization_setting.collecting_device = ( 74 | "executor" # 缓存数据放在那,executor 就是放在gpu,如果显存超了你就换成 'cpu' 75 | ) 76 | QS.advanced_optimization_setting.auto_check = ( 77 | False # 打开这个选项则训练过程中会防止过拟合,以及意外情况,通常不需要开。 78 | ) 79 | else: 80 | print("====== using lsq_optimization =====") 81 | QS.lsq_optimization = FINETUNE # 启动网络再训练过程,降低量化误差 82 | QS.lsq_optimization_setting.epochs = 128 # 再训练轮数,影响训练时间,30轮大概几分钟 83 | QS.lsq_optimization_setting.collecting_device = ( 84 | "cuda" # 缓存数据放在那,cuda 就是放在gpu,如果显存超了你就换成 'cpu' 85 | ) 86 | 87 | # 把量化的不太好的算子送回 FP32 88 | # QS.dispatching_table.append(operation="Concat_2420", platform=TargetPlatform.FP32) 89 | 90 | 91 | print("正准备量化你的网络,检查下列设置:") 92 | print(f"TARGET PLATFORM : {QUANT_PLATFROM.name}") 93 | print(f"NETWORK INPUTSHAPE : {INPUT_SHAPE}") 94 | 95 | # ENABLE CUDA KERNEL 会加速量化效率 3x ~ 10x,但是你如果没有装相应编译环境的话是编译不了的 96 | # 你可以尝试安装编译环境,或者在不启动 CUDA KERNEL 的情况下完成量化:移除 with ENABLE_CUDA_KERNEL(): 即可 97 | with ENABLE_CUDA_KERNEL(): 98 | qir = quantize_onnx_model( 99 | onnx_import_file=MODEL, 100 | calib_dataloader=SAMPLES, 101 | calib_steps=128, 102 | setting=QS, 103 | input_shape=INPUT_SHAPE, 104 | collate_fn=lambda x: x.to(EXECUTING_DEVICE), 105 | platform=QUANT_PLATFROM, 106 | do_quantize=True, 107 | ) 108 | 109 | # ------------------------------------------------------------------- 110 | # PPQ 计算量化误差时,使用信噪比的倒数作为指标,即噪声能量 / 信号能量 111 | # 量化误差 0.1 表示在整体信号中,量化噪声的能量约为 10% 112 | # 你应当注意,在 graphwise_error_analyse 分析中,我们衡量的是累计误差 113 | # 网络的最后一层往往都具有较大的累计误差,这些误差是其前面的所有层所共同造成的 114 | # 你需要使用 layerwise_error_analyse 逐层分析误差的来源 115 | # ------------------------------------------------------------------- 116 | print("正计算网络量化误差(SNR),最后一层的误差应小于 0.1 以保证量化精度:") 117 | reports = graphwise_error_analyse( 118 | graph=qir, 119 | running_device=EXECUTING_DEVICE, 120 | steps=32, 121 | dataloader=SAMPLES, 122 | collate_fn=lambda x: x.to(EXECUTING_DEVICE), 123 | ) 124 | for op, snr in reports.items(): 125 | if snr > 0.1: 126 | ppq_warning(f"层 {op} 的累计量化误差显著,请考虑进行优化") 127 | 128 | if REQUIRE_ANALYSE: 129 | print("正计算逐层量化误差(SNR),每一层的独立量化误差应小于 0.1 以保证量化精度:") 130 | layerwise_error_analyse( 131 | graph=qir, 132 | running_device=EXECUTING_DEVICE, 133 | interested_outputs=None, 134 | dataloader=SAMPLES, 135 | collate_fn=lambda x: x.to(EXECUTING_DEVICE), 136 | ) 137 | 138 | print("网络量化结束,正在生成目标文件:") 139 | export_ppq_graph( 140 | graph=qir, 141 | platform=QUANT_PLATFROM, 142 | graph_save_to=MODEL.replace(".onnx", "_v2_int8.onnx"), 143 | ) 144 | 145 | # ------------------------------------------------------------------- 146 | # 记录一下输入输出的名字,onnxruntime 跑的时候需要提供这些名字 147 | # 我写的只是单输出单输入的版本,多输出多输入你得自己改改 148 | # ------------------------------------------------------------------- 149 | int8_input_names = [name for name, _ in qir.inputs.items()] 150 | int8_output_names = [name for name, _ in qir.outputs.items()] 151 | 152 | # ------------------------------------------------------------------- 153 | # 启动 tensorRT 进行推理,你先装一下 trt 154 
| # ------------------------------------------------------------------- 155 | import tensorrt as trt 156 | import trt_infer 157 | 158 | samples = [convert_any_to_numpy(sample) for sample in SAMPLES] 159 | logger = trt.Logger(trt.Logger.INFO) 160 | with open(MODEL.replace(".onnx", "_v2_int8.engine"), "rb") as f, trt.Runtime( 161 | logger 162 | ) as runtime: 163 | engine = runtime.deserialize_cuda_engine(f.read()) 164 | 165 | results = [] 166 | with engine.create_execution_context() as context: 167 | inputs, outputs, bindings, stream, _ = trt_infer.allocate_buffers( 168 | context.engine 169 | ) 170 | for sample in tqdm(samples, desc="TensorRT is running..."): 171 | inputs[0].host = convert_any_to_numpy(sample) 172 | output = trt_infer.do_inference( 173 | context, 174 | bindings=bindings, 175 | inputs=inputs, 176 | outputs=outputs, 177 | stream=stream, 178 | batch_size=1, 179 | ) 180 | # results.append(convert_any_to_torch_tensor(output).reshape([-1, 1000])) 181 | -------------------------------------------------------------------------------- /quantization/onnx2tensorRT_adaround.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 #指定解释器 2 | # encoding:utf-8 3 | 4 | import sys 5 | 6 | print(sys.getdefaultencoding()) 7 | s = "中文乱码问题解决" 8 | print(s) 9 | 10 | # --------------------------------------------------------------- 11 | # 这个脚本向你展示了如何使用 tensorRT 对 PPQ 导出的模型进行推理 12 | 13 | # This script shows you how to export ppq internal graph to tensorRT 14 | # --------------------------------------------------------------- 15 | 16 | # For this inference test, all test data is randomly picked. 17 | # If you want to use real data, just rewrite the defination of SAMPLES 18 | print("开始import") 19 | import onnxruntime 20 | import torch 21 | from ppq import * 22 | from ppq.api import * 23 | from tqdm import tqdm 24 | import glob 25 | import cv2 26 | import numpy as np 27 | from torchvision import transforms 28 | from PIL import Image 29 | import os 30 | 31 | 32 | def read_image(path): 33 | # 多任务模型 34 | _img_transforms = transforms.Compose( 35 | [ 36 | transforms.Resize((384, 768)), 37 | transforms.ToTensor(), 38 | transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), 39 | ] 40 | ) 41 | img = Image.open(path).convert("RGB") 42 | img_w, img_h = img.size[0], img.size[1] 43 | img = _img_transforms(img) 44 | img = img.unsqueeze(0) 45 | return img 46 | 47 | 48 | QUANT_PLATFROM = TargetPlatform.TRT_INT8 49 | MODEL = "model_copy_adaround.onnx" 50 | INPUT_SHAPE = [1, 3, 384, 768] 51 | 52 | calibration_files = glob.glob( 53 | os.path.join("/apdcephfs/private_howellyang/data/Calib1k5/", "*.jpg") 54 | )[:128] 55 | SAMPLES = [ 56 | read_image(path) for path in calibration_files 57 | ] # rewirte this to use real data. 
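# For a quick FP32 reference on the same calibration samples (e.g. to compare
# against the TensorRT outputs printed at the end of this script), onnxruntime
# can be used directly; a minimal sketch assuming a single-input model:
def onnx_reference_outputs(onnx_path, samples):
    sess = onnxruntime.InferenceSession(onnx_path, providers=["CPUExecutionProvider"])
    input_name = sess.get_inputs()[0].name
    return [sess.run(None, {input_name: convert_any_to_numpy(s)}) for s in samples]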
58 | 59 | 60 | DEVICE = "cuda" 61 | FINETUNE = True 62 | QS = QuantizationSettingFactory.default_setting() 63 | EXECUTING_DEVICE = "cuda" 64 | REQUIRE_ANALYSE = True 65 | 66 | # ------------------------------------------------------------------- 67 | # 下面向你展示了常用参数调节选项: 68 | # ------------------------------------------------------------------- 69 | if True: # PPQ_CONFIG.USING_CUDA_KERNEL: 70 | print("====== using advanced_optimization =====") 71 | QS.advanced_optimization = FINETUNE # 启动网络再训练过程,降低量化误差 72 | QS.advanced_optimization_setting.steps = 2500 # 再训练步数,影响训练时间,2500步大概几分钟 73 | QS.advanced_optimization_setting.collecting_device = ( 74 | "executor" # 缓存数据放在那,executor 就是放在gpu,如果显存超了你就换成 'cpu' 75 | ) 76 | QS.advanced_optimization_setting.auto_check = ( 77 | False # 打开这个选项则训练过程中会防止过拟合,以及意外情况,通常不需要开。 78 | ) 79 | else: 80 | print("====== using lsq_optimization =====") 81 | QS.lsq_optimization = FINETUNE # 启动网络再训练过程,降低量化误差 82 | QS.lsq_optimization_setting.epochs = 32 # 再训练轮数,影响训练时间,30轮大概几分钟 83 | QS.lsq_optimization_setting.collecting_device = ( 84 | "cuda" # 缓存数据放在那,cuda 就是放在gpu,如果显存超了你就换成 'cpu' 85 | ) 86 | 87 | QS.dispatching_table.append( 88 | operation="Sigmoid", platform=TargetPlatform.FP32 89 | ) # 把量化的不太好的算子送回 FP32 90 | 91 | print("正准备量化你的网络,检查下列设置:") 92 | print(f"TARGET PLATFORM : {QUANT_PLATFROM.name}") 93 | print(f"NETWORK INPUTSHAPE : {INPUT_SHAPE}") 94 | 95 | # ENABLE CUDA KERNEL 会加速量化效率 3x ~ 10x,但是你如果没有装相应编译环境的话是编译不了的 96 | # 你可以尝试安装编译环境,或者在不启动 CUDA KERNEL 的情况下完成量化:移除 with ENABLE_CUDA_KERNEL(): 即可 97 | with ENABLE_CUDA_KERNEL(): 98 | qir = quantize_onnx_model( 99 | onnx_import_file=MODEL, 100 | calib_dataloader=SAMPLES, 101 | calib_steps=32, 102 | setting=QS, 103 | input_shape=INPUT_SHAPE, 104 | collate_fn=lambda x: x.to(EXECUTING_DEVICE), 105 | platform=QUANT_PLATFROM, 106 | do_quantize=True, 107 | ) 108 | 109 | # ------------------------------------------------------------------- 110 | # PPQ 计算量化误差时,使用信噪比的倒数作为指标,即噪声能量 / 信号能量 111 | # 量化误差 0.1 表示在整体信号中,量化噪声的能量约为 10% 112 | # 你应当注意,在 graphwise_error_analyse 分析中,我们衡量的是累计误差 113 | # 网络的最后一层往往都具有较大的累计误差,这些误差是其前面的所有层所共同造成的 114 | # 你需要使用 layerwise_error_analyse 逐层分析误差的来源 115 | # ------------------------------------------------------------------- 116 | print("正计算网络量化误差(SNR),最后一层的误差应小于 0.1 以保证量化精度:") 117 | reports = graphwise_error_analyse( 118 | graph=qir, 119 | running_device=EXECUTING_DEVICE, 120 | steps=32, 121 | dataloader=SAMPLES, 122 | collate_fn=lambda x: x.to(EXECUTING_DEVICE), 123 | ) 124 | for op, snr in reports.items(): 125 | if snr > 0.1: 126 | ppq_warning(f"层 {op} 的累计量化误差显著,请考虑进行优化") 127 | 128 | if REQUIRE_ANALYSE: 129 | print("正计算逐层量化误差(SNR),每一层的独立量化误差应小于 0.1 以保证量化精度:") 130 | layerwise_error_analyse( 131 | graph=qir, 132 | running_device=EXECUTING_DEVICE, 133 | interested_outputs=None, 134 | dataloader=SAMPLES, 135 | collate_fn=lambda x: x.to(EXECUTING_DEVICE), 136 | ) 137 | 138 | print("网络量化结束,正在生成目标文件:") 139 | export_ppq_graph( 140 | graph=qir, 141 | platform=QUANT_PLATFROM, 142 | graph_save_to="model_copy_adaround_int8.onnx", 143 | ) 144 | 145 | # ------------------------------------------------------------------- 146 | # 记录一下输入输出的名字,onnxruntime 跑的时候需要提供这些名字 147 | # 我写的只是单输出单输入的版本,多输出多输入你得自己改改 148 | # ------------------------------------------------------------------- 149 | int8_input_names = [name for name, _ in qir.inputs.items()] 150 | int8_output_names = [name for name, _ in qir.outputs.items()] 151 | 152 | # ------------------------------------------------------------------- 153 | # 启动 tensorRT 进行推理,你先装一下 trt 154 | # 
------------------------------------------------------------------- 155 | import tensorrt as trt 156 | import trt_infer 157 | 158 | samples = [convert_any_to_numpy(sample) for sample in SAMPLES] 159 | logger = trt.Logger(trt.Logger.INFO) 160 | with open("model_copy_adaround_int8.engine", "rb") as f, trt.Runtime( 161 | logger 162 | ) as runtime: 163 | engine = runtime.deserialize_cuda_engine(f.read()) 164 | 165 | results = [] 166 | with engine.create_execution_context() as context: 167 | inputs, outputs, bindings, stream, output_names = trt_infer.allocate_buffers( 168 | context.engine 169 | ) 170 | for sample in tqdm(samples, desc="TensorRT is running..."): 171 | inputs[0].host = convert_any_to_numpy(sample) 172 | outputs_list = trt_infer.do_inference( 173 | context, 174 | bindings=bindings, 175 | inputs=inputs, 176 | outputs=outputs, 177 | stream=stream, 178 | batch_size=1, 179 | ) 180 | for output in outputs_list: 181 | print(np.reshape(output, (1, -1))[0, :10]) 182 | -------------------------------------------------------------------------------- /quantization/P01_MT_onnx2tensorRT_int8.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 #指定解释器 2 | # encoding:utf-8 3 | 4 | import sys 5 | 6 | print(sys.getdefaultencoding()) 7 | s = "中文乱码问题解决" 8 | print(s) 9 | 10 | # --------------------------------------------------------------- 11 | # 这个脚本向你展示了如何使用 tensorRT 对 PPQ 导出的模型进行推理 12 | 13 | # This script shows you how to export ppq internal graph to tensorRT 14 | # --------------------------------------------------------------- 15 | 16 | # For this inference test, all test data is randomly picked. 17 | # If you want to use real data, just rewrite the defination of SAMPLES 18 | print("开始import") 19 | import onnxruntime 20 | import torch 21 | from ppq import * 22 | from ppq.api import * 23 | from tqdm import tqdm 24 | import glob 25 | import cv2 26 | import numpy as np 27 | from torchvision import transforms 28 | from PIL import Image 29 | import os 30 | 31 | 32 | def read_image(path): 33 | # 多任务模型 34 | _img_transforms = transforms.Compose( 35 | [ 36 | transforms.Resize((384, 768)), 37 | transforms.ToTensor(), 38 | transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), 39 | ] 40 | ) 41 | img = Image.open(path).convert("RGB") 42 | img_w, img_h = img.size[0], img.size[1] 43 | img = _img_transforms(img) 44 | img = img.unsqueeze(0) 45 | return img 46 | 47 | 48 | QUANT_PLATFROM = TargetPlatform.TRT_INT8 49 | MODEL = "/apdcephfs/private_howellyang/onnx2trt/model_T01/model.onnx" 50 | INPUT_SHAPE = [1, 3, 384, 768] 51 | 52 | calibration_files = glob.glob( 53 | os.path.join("/apdcephfs/private_howellyang/data/Calib1k5/", "*.jpg") 54 | )[:128] 55 | SAMPLES = [ 56 | read_image(path) for path in calibration_files 57 | ] # rewirte this to use real data. 
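# For reference, the symmetric fake-quant/dequant round trip that INT8
# quantization applies to a tensor, written in the same convention as the
# weight fake_quant helper used elsewhere in this repo (clip to the int8
# range, then rescale); the gap between x and the return value is the
# per-tensor quantization error being analysed below:
def fake_quant_dequant(x, scale):
    return np.clip(np.round(np.asarray(x, dtype=np.float32) / scale), -128, 127) * scale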
58 | 59 | 60 | DEVICE = "cuda" 61 | FINETUNE = True 62 | QS = QuantizationSettingFactory.default_setting() 63 | EXECUTING_DEVICE = "cuda" 64 | REQUIRE_ANALYSE = True 65 | 66 | # ------------------------------------------------------------------- 67 | # 下面向你展示了常用参数调节选项: 68 | # ------------------------------------------------------------------- 69 | if PPQ_CONFIG.USING_CUDA_KERNEL: 70 | print("====== using advanced_optimization =====") 71 | QS.advanced_optimization = FINETUNE # 启动网络再训练过程,降低量化误差 72 | QS.advanced_optimization_setting.steps = 2500 # 再训练步数,影响训练时间,2500步大概几分钟 73 | QS.advanced_optimization_setting.collecting_device = ( 74 | "executor" # 缓存数据放在那,executor 就是放在gpu,如果显存超了你就换成 'cpu' 75 | ) 76 | QS.advanced_optimization_setting.auto_check = ( 77 | False # 打开这个选项则训练过程中会防止过拟合,以及意外情况,通常不需要开。 78 | ) 79 | else: 80 | print("====== using lsq_optimization =====") 81 | QS.lsq_optimization = FINETUNE # 启动网络再训练过程,降低量化误差 82 | QS.lsq_optimization_setting.epochs = 128 # 再训练轮数,影响训练时间,30轮大概几分钟 83 | QS.lsq_optimization_setting.collecting_device = ( 84 | "cuda" # 缓存数据放在那,cuda 就是放在gpu,如果显存超了你就换成 'cpu' 85 | ) 86 | 87 | # 把量化的不太好的算子送回 FP32 88 | # QS.dispatching_table.append(operation="Concat_2420", platform=TargetPlatform.FP32) 89 | 90 | 91 | print("正准备量化你的网络,检查下列设置:") 92 | print(f"TARGET PLATFORM : {QUANT_PLATFROM.name}") 93 | print(f"NETWORK INPUTSHAPE : {INPUT_SHAPE}") 94 | 95 | # ENABLE CUDA KERNEL 会加速量化效率 3x ~ 10x,但是你如果没有装相应编译环境的话是编译不了的 96 | # 你可以尝试安装编译环境,或者在不启动 CUDA KERNEL 的情况下完成量化:移除 with ENABLE_CUDA_KERNEL(): 即可 97 | with ENABLE_CUDA_KERNEL(): 98 | qir = quantize_onnx_model( 99 | onnx_import_file=MODEL, 100 | calib_dataloader=SAMPLES, 101 | calib_steps=128, 102 | setting=QS, 103 | input_shape=INPUT_SHAPE, 104 | collate_fn=lambda x: x.to(EXECUTING_DEVICE), 105 | platform=QUANT_PLATFROM, 106 | do_quantize=True, 107 | ) 108 | 109 | # ------------------------------------------------------------------- 110 | # PPQ 计算量化误差时,使用信噪比的倒数作为指标,即噪声能量 / 信号能量 111 | # 量化误差 0.1 表示在整体信号中,量化噪声的能量约为 10% 112 | # 你应当注意,在 graphwise_error_analyse 分析中,我们衡量的是累计误差 113 | # 网络的最后一层往往都具有较大的累计误差,这些误差是其前面的所有层所共同造成的 114 | # 你需要使用 layerwise_error_analyse 逐层分析误差的来源 115 | # ------------------------------------------------------------------- 116 | print("正计算网络量化误差(SNR),最后一层的误差应小于 0.1 以保证量化精度:") 117 | reports = graphwise_error_analyse( 118 | graph=qir, 119 | running_device=EXECUTING_DEVICE, 120 | steps=32, 121 | dataloader=SAMPLES, 122 | collate_fn=lambda x: x.to(EXECUTING_DEVICE), 123 | ) 124 | for op, snr in reports.items(): 125 | if snr > 0.1: 126 | ppq_warning(f"层 {op} 的累计量化误差显著,请考虑进行优化") 127 | 128 | if REQUIRE_ANALYSE: 129 | print("正计算逐层量化误差(SNR),每一层的独立量化误差应小于 0.1 以保证量化精度:") 130 | layerwise_error_analyse( 131 | graph=qir, 132 | running_device=EXECUTING_DEVICE, 133 | interested_outputs=None, 134 | dataloader=SAMPLES, 135 | collate_fn=lambda x: x.to(EXECUTING_DEVICE), 136 | ) 137 | 138 | print("网络量化结束,正在生成目标文件:") 139 | export_ppq_graph( 140 | graph=qir, 141 | platform=QUANT_PLATFROM, 142 | graph_save_to=MODEL.replace(".onnx", "_v2_int8.onnx"), 143 | ) 144 | 145 | # ------------------------------------------------------------------- 146 | # 记录一下输入输出的名字,onnxruntime 跑的时候需要提供这些名字 147 | # 我写的只是单输出单输入的版本,多输出多输入你得自己改改 148 | # ------------------------------------------------------------------- 149 | int8_input_names = [name for name, _ in qir.inputs.items()] 150 | int8_output_names = [name for name, _ in qir.outputs.items()] 151 | 152 | # ------------------------------------------------------------------- 153 | # 启动 tensorRT 进行推理,你先装一下 trt 154 
| # ------------------------------------------------------------------- 155 | import tensorrt as trt 156 | import trt_infer 157 | 158 | samples = [convert_any_to_numpy(sample) for sample in SAMPLES] 159 | logger = trt.Logger(trt.Logger.INFO) 160 | with open(MODEL.replace(".onnx", "_v2_int8.engine"), "rb") as f, trt.Runtime( 161 | logger 162 | ) as runtime: 163 | engine = runtime.deserialize_cuda_engine(f.read()) 164 | 165 | results = [] 166 | with engine.create_execution_context() as context: 167 | inputs, outputs, bindings, stream, _ = trt_infer.allocate_buffers( 168 | context.engine 169 | ) 170 | for sample in tqdm(samples, desc="TensorRT is running..."): 171 | inputs[0].host = convert_any_to_numpy(sample) 172 | output = trt_infer.do_inference( 173 | context, 174 | bindings=bindings, 175 | inputs=inputs, 176 | outputs=outputs, 177 | stream=stream, 178 | batch_size=1, 179 | ) 180 | # results.append(convert_any_to_torch_tensor(output).reshape([-1, 1000])) 181 | -------------------------------------------------------------------------------- /quantization/ptq/P01_MT_onnx2tensorRT_int8.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 #指定解释器 2 | # encoding:utf-8 3 | 4 | import sys 5 | 6 | print(sys.getdefaultencoding()) 7 | s = "中文乱码问题解决" 8 | print(s) 9 | 10 | # --------------------------------------------------------------- 11 | # 这个脚本向你展示了如何使用 tensorRT 对 PPQ 导出的模型进行推理 12 | 13 | # This script shows you how to export ppq internal graph to tensorRT 14 | # --------------------------------------------------------------- 15 | 16 | # For this inference test, all test data is randomly picked. 17 | # If you want to use real data, just rewrite the defination of SAMPLES 18 | print("开始import") 19 | import onnxruntime 20 | import torch 21 | from ppq import * 22 | from ppq.api import * 23 | from tqdm import tqdm 24 | import glob 25 | import cv2 26 | import numpy as np 27 | from torchvision import transforms 28 | from PIL import Image 29 | import os 30 | 31 | 32 | def read_image(path): 33 | # 多任务模型 34 | _img_transforms = transforms.Compose( 35 | [ 36 | transforms.Resize((384, 768)), 37 | transforms.ToTensor(), 38 | transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), 39 | ] 40 | ) 41 | img = Image.open(path).convert("RGB") 42 | img_w, img_h = img.size[0], img.size[1] 43 | img = _img_transforms(img) 44 | img = img.unsqueeze(0) 45 | return img 46 | 47 | 48 | QUANT_PLATFROM = TargetPlatform.TRT_INT8 49 | MODEL = "/apdcephfs/private_howellyang/onnx2trt/model_T01/model.onnx" 50 | INPUT_SHAPE = [1, 3, 384, 768] 51 | 52 | calibration_files = glob.glob( 53 | os.path.join("/apdcephfs/private_howellyang/data/Calib1k5/", "*.jpg") 54 | )[:128] 55 | SAMPLES = [ 56 | read_image(path) for path in calibration_files 57 | ] # rewirte this to use real data. 
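# A related reference: deriving per-output-channel weight scales in the
# "direct conversion" convention mentioned in the weight fake_quant comments
# elsewhere in this repo (symmetric range clipped to [-127, 127]); W is
# assumed to be an OIHW convolution weight:
def per_channel_weight_scales(W):
    W = np.asarray(W, dtype=np.float32)
    return np.max(np.abs(W.reshape(W.shape[0], -1)), axis=1) / 127.0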
58 | 59 | 60 | DEVICE = "cuda" 61 | FINETUNE = True 62 | QS = QuantizationSettingFactory.default_setting() 63 | EXECUTING_DEVICE = "cuda" 64 | REQUIRE_ANALYSE = True 65 | 66 | # ------------------------------------------------------------------- 67 | # 下面向你展示了常用参数调节选项: 68 | # ------------------------------------------------------------------- 69 | if PPQ_CONFIG.USING_CUDA_KERNEL: 70 | print("====== using advanced_optimization =====") 71 | QS.advanced_optimization = FINETUNE # 启动网络再训练过程,降低量化误差 72 | QS.advanced_optimization_setting.steps = 2500 # 再训练步数,影响训练时间,2500步大概几分钟 73 | QS.advanced_optimization_setting.collecting_device = ( 74 | "executor" # 缓存数据放在那,executor 就是放在gpu,如果显存超了你就换成 'cpu' 75 | ) 76 | QS.advanced_optimization_setting.auto_check = ( 77 | False # 打开这个选项则训练过程中会防止过拟合,以及意外情况,通常不需要开。 78 | ) 79 | else: 80 | print("====== using lsq_optimization =====") 81 | QS.lsq_optimization = FINETUNE # 启动网络再训练过程,降低量化误差 82 | QS.lsq_optimization_setting.epochs = 128 # 再训练轮数,影响训练时间,30轮大概几分钟 83 | QS.lsq_optimization_setting.collecting_device = ( 84 | "cuda" # 缓存数据放在那,cuda 就是放在gpu,如果显存超了你就换成 'cpu' 85 | ) 86 | 87 | # 把量化的不太好的算子送回 FP32 88 | # QS.dispatching_table.append(operation="Concat_2420", platform=TargetPlatform.FP32) 89 | 90 | 91 | print("正准备量化你的网络,检查下列设置:") 92 | print(f"TARGET PLATFORM : {QUANT_PLATFROM.name}") 93 | print(f"NETWORK INPUTSHAPE : {INPUT_SHAPE}") 94 | 95 | # ENABLE CUDA KERNEL 会加速量化效率 3x ~ 10x,但是你如果没有装相应编译环境的话是编译不了的 96 | # 你可以尝试安装编译环境,或者在不启动 CUDA KERNEL 的情况下完成量化:移除 with ENABLE_CUDA_KERNEL(): 即可 97 | with ENABLE_CUDA_KERNEL(): 98 | qir = quantize_onnx_model( 99 | onnx_import_file=MODEL, 100 | calib_dataloader=SAMPLES, 101 | calib_steps=128, 102 | setting=QS, 103 | input_shape=INPUT_SHAPE, 104 | collate_fn=lambda x: x.to(EXECUTING_DEVICE), 105 | platform=QUANT_PLATFROM, 106 | do_quantize=True, 107 | ) 108 | 109 | # ------------------------------------------------------------------- 110 | # PPQ 计算量化误差时,使用信噪比的倒数作为指标,即噪声能量 / 信号能量 111 | # 量化误差 0.1 表示在整体信号中,量化噪声的能量约为 10% 112 | # 你应当注意,在 graphwise_error_analyse 分析中,我们衡量的是累计误差 113 | # 网络的最后一层往往都具有较大的累计误差,这些误差是其前面的所有层所共同造成的 114 | # 你需要使用 layerwise_error_analyse 逐层分析误差的来源 115 | # ------------------------------------------------------------------- 116 | print("正计算网络量化误差(SNR),最后一层的误差应小于 0.1 以保证量化精度:") 117 | reports = graphwise_error_analyse( 118 | graph=qir, 119 | running_device=EXECUTING_DEVICE, 120 | steps=32, 121 | dataloader=SAMPLES, 122 | collate_fn=lambda x: x.to(EXECUTING_DEVICE), 123 | ) 124 | for op, snr in reports.items(): 125 | if snr > 0.1: 126 | ppq_warning(f"层 {op} 的累计量化误差显著,请考虑进行优化") 127 | 128 | if REQUIRE_ANALYSE: 129 | print("正计算逐层量化误差(SNR),每一层的独立量化误差应小于 0.1 以保证量化精度:") 130 | layerwise_error_analyse( 131 | graph=qir, 132 | running_device=EXECUTING_DEVICE, 133 | interested_outputs=None, 134 | dataloader=SAMPLES, 135 | collate_fn=lambda x: x.to(EXECUTING_DEVICE), 136 | ) 137 | 138 | print("网络量化结束,正在生成目标文件:") 139 | export_ppq_graph( 140 | graph=qir, 141 | platform=QUANT_PLATFROM, 142 | graph_save_to=MODEL.replace(".onnx", "_v2_int8.onnx"), 143 | ) 144 | 145 | # ------------------------------------------------------------------- 146 | # 记录一下输入输出的名字,onnxruntime 跑的时候需要提供这些名字 147 | # 我写的只是单输出单输入的版本,多输出多输入你得自己改改 148 | # ------------------------------------------------------------------- 149 | int8_input_names = [name for name, _ in qir.inputs.items()] 150 | int8_output_names = [name for name, _ in qir.outputs.items()] 151 | 152 | # ------------------------------------------------------------------- 153 | # 启动 tensorRT 进行推理,你先装一下 trt 154 
| # ------------------------------------------------------------------- 155 | import tensorrt as trt 156 | import trt_infer 157 | 158 | samples = [convert_any_to_numpy(sample) for sample in SAMPLES] 159 | logger = trt.Logger(trt.Logger.INFO) 160 | with open(MODEL.replace(".onnx", "_v2_int8.engine"), "rb") as f, trt.Runtime( 161 | logger 162 | ) as runtime: 163 | engine = runtime.deserialize_cuda_engine(f.read()) 164 | 165 | results = [] 166 | with engine.create_execution_context() as context: 167 | inputs, outputs, bindings, stream, _ = trt_infer.allocate_buffers( 168 | context.engine 169 | ) 170 | for sample in tqdm(samples, desc="TensorRT is running..."): 171 | inputs[0].host = convert_any_to_numpy(sample) 172 | output = trt_infer.do_inference( 173 | context, 174 | bindings=bindings, 175 | inputs=inputs, 176 | outputs=outputs, 177 | stream=stream, 178 | batch_size=1, 179 | ) 180 | # results.append(convert_any_to_torch_tensor(output).reshape([-1, 1000])) 181 | -------------------------------------------------------------------------------- /quantization/P01_MT_onnx2tensorRT_int8_sample.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 #指定解释器 2 | # encoding:utf-8 3 | 4 | import sys 5 | 6 | print(sys.getdefaultencoding()) 7 | s = "中文乱码问题解决" 8 | print(s) 9 | 10 | # --------------------------------------------------------------- 11 | # 这个脚本向你展示了如何使用 tensorRT 对 PPQ 导出的模型进行推理 12 | 13 | # This script shows you how to export ppq internal graph to tensorRT 14 | # --------------------------------------------------------------- 15 | 16 | # For this inference test, all test data is randomly picked. 17 | # If you want to use real data, just rewrite the defination of SAMPLES 18 | print("开始import") 19 | import onnxruntime 20 | import torch 21 | from ppq import * 22 | from ppq.api import * 23 | from tqdm import tqdm 24 | import glob 25 | import cv2 26 | import numpy as np 27 | from torchvision import transforms 28 | from PIL import Image 29 | import os 30 | 31 | 32 | def read_image(path): 33 | # 多任务模型 34 | _img_transforms = transforms.Compose( 35 | [ 36 | transforms.Resize((384, 768)), 37 | transforms.ToTensor(), 38 | transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), 39 | ] 40 | ) 41 | img = Image.open(path).convert("RGB") 42 | img_w, img_h = img.size[0], img.size[1] 43 | img = _img_transforms(img) 44 | img = img.unsqueeze(0) 45 | return img 46 | 47 | 48 | QUANT_PLATFROM = TargetPlatform.TRT_INT8 49 | MODEL = "Models/RMTNet_release20220609_v2.opt.onnx" 50 | INPUT_SHAPE = [1, 3, 384, 768] 51 | 52 | calibration_files = glob.glob( 53 | os.path.join("/apdcephfs/private_howellyang/data/Calib1k5/", "*.jpg") 54 | )[:2] 55 | SAMPLES = [ 56 | read_image(path) for path in calibration_files 57 | ] # rewirte this to use real data. 
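# -------------------------------------------------------------------
# A minimal sketch, for illustration only: the comments at the top of this
# file note that the test data may be randomly picked and that SAMPLES should
# be rewritten to use real data. If the calibration image directory above is
# unavailable, SAMPLES could fall back to random tensors of the declared
# INPUT_SHAPE; real calibration images remain preferable for meaningful PTQ.
if len(SAMPLES) == 0:  # hypothetical fallback, guarded so real data wins when present
    SAMPLES = [torch.rand(*INPUT_SHAPE) for _ in range(32)]
# -------------------------------------------------------------------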
58 | 59 | 60 | DEVICE = "cuda" 61 | FINETUNE = True 62 | QS = QuantizationSettingFactory.default_setting() 63 | EXECUTING_DEVICE = "cuda" 64 | REQUIRE_ANALYSE = True 65 | 66 | # ------------------------------------------------------------------- 67 | # 下面向你展示了常用参数调节选项: 68 | # ------------------------------------------------------------------- 69 | if PPQ_CONFIG.USING_CUDA_KERNEL: 70 | print("====== using advanced_optimization =====") 71 | QS.advanced_optimization = FINETUNE # 启动网络再训练过程,降低量化误差 72 | QS.advanced_optimization_setting.steps = 2500 # 再训练步数,影响训练时间,2500步大概几分钟 73 | QS.advanced_optimization_setting.collecting_device = ( 74 | "executor" # 缓存数据放在那,executor 就是放在gpu,如果显存超了你就换成 'cpu' 75 | ) 76 | QS.advanced_optimization_setting.auto_check = ( 77 | False # 打开这个选项则训练过程中会防止过拟合,以及意外情况,通常不需要开。 78 | ) 79 | else: 80 | print("====== using lsq_optimization =====") 81 | QS.lsq_optimization = FINETUNE # 启动网络再训练过程,降低量化误差 82 | QS.lsq_optimization_setting.epochs = 4 # 再训练轮数,影响训练时间,30轮大概几分钟 83 | QS.lsq_optimization_setting.collecting_device = ( 84 | "cuda" # 缓存数据放在那,cuda 就是放在gpu,如果显存超了你就换成 'cpu' 85 | ) 86 | 87 | # 把量化的不太好的算子送回 FP32 88 | QS.dispatching_table.append(operation="Conv_3342", platform=TargetPlatform.FP32) 89 | QS.dispatching_table.append(operation="Relu_3343", platform=TargetPlatform.FP32) 90 | QS.dispatching_table.append(operation="Conv_2523", platform=TargetPlatform.FP32) 91 | 92 | print("正准备量化你的网络,检查下列设置:") 93 | print(f"TARGET PLATFORM : {QUANT_PLATFROM.name}") 94 | print(f"NETWORK INPUTSHAPE : {INPUT_SHAPE}") 95 | 96 | # ENABLE CUDA KERNEL 会加速量化效率 3x ~ 10x,但是你如果没有装相应编译环境的话是编译不了的 97 | # 你可以尝试安装编译环境,或者在不启动 CUDA KERNEL 的情况下完成量化:移除 with ENABLE_CUDA_KERNEL(): 即可 98 | with ENABLE_CUDA_KERNEL(): 99 | qir = quantize_onnx_model( 100 | onnx_import_file=MODEL, 101 | calib_dataloader=SAMPLES, 102 | calib_steps=16, 103 | setting=QS, 104 | input_shape=INPUT_SHAPE, 105 | collate_fn=lambda x: x.to(EXECUTING_DEVICE), 106 | platform=QUANT_PLATFROM, 107 | do_quantize=True, 108 | ) 109 | 110 | # ------------------------------------------------------------------- 111 | # PPQ 计算量化误差时,使用信噪比的倒数作为指标,即噪声能量 / 信号能量 112 | # 量化误差 0.1 表示在整体信号中,量化噪声的能量约为 10% 113 | # 你应当注意,在 graphwise_error_analyse 分析中,我们衡量的是累计误差 114 | # 网络的最后一层往往都具有较大的累计误差,这些误差是其前面的所有层所共同造成的 115 | # 你需要使用 layerwise_error_analyse 逐层分析误差的来源 116 | # ------------------------------------------------------------------- 117 | print("正计算网络量化误差(SNR),最后一层的误差应小于 0.1 以保证量化精度:") 118 | reports = graphwise_error_analyse( 119 | graph=qir, 120 | running_device=EXECUTING_DEVICE, 121 | steps=32, 122 | dataloader=SAMPLES, 123 | collate_fn=lambda x: x.to(EXECUTING_DEVICE), 124 | ) 125 | for op, snr in reports.items(): 126 | if snr > 0.1: 127 | ppq_warning(f"层 {op} 的累计量化误差显著,请考虑进行优化") 128 | 129 | if REQUIRE_ANALYSE: 130 | print("正计算逐层量化误差(SNR),每一层的独立量化误差应小于 0.1 以保证量化精度:") 131 | layerwise_error_analyse( 132 | graph=qir, 133 | running_device=EXECUTING_DEVICE, 134 | interested_outputs=None, 135 | dataloader=SAMPLES, 136 | collate_fn=lambda x: x.to(EXECUTING_DEVICE), 137 | ) 138 | 139 | print("网络量化结束,正在生成目标文件:") 140 | export_ppq_graph( 141 | graph=qir, 142 | platform=QUANT_PLATFROM, 143 | graph_save_to=MODEL.replace(".onnx", "_int8_sample2.onnx"), 144 | ) 145 | 146 | # ------------------------------------------------------------------- 147 | # 记录一下输入输出的名字,onnxruntime 跑的时候需要提供这些名字 148 | # 我写的只是单输出单输入的版本,多输出多输入你得自己改改 149 | # ------------------------------------------------------------------- 150 | int8_input_names = [name for name, _ in qir.inputs.items()] 151 | int8_output_names 
= [name for name, _ in qir.outputs.items()] 152 | 153 | # ------------------------------------------------------------------- 154 | # 启动 tensorRT 进行推理,你先装一下 trt 155 | # ------------------------------------------------------------------- 156 | import tensorrt as trt 157 | import trt_infer 158 | 159 | samples = [convert_any_to_numpy(sample) for sample in SAMPLES] 160 | logger = trt.Logger(trt.Logger.INFO) 161 | with open(MODEL.replace(".onnx", "_int8_sample2.engine"), "rb") as f, trt.Runtime( 162 | logger 163 | ) as runtime: 164 | engine = runtime.deserialize_cuda_engine(f.read()) 165 | 166 | results = [] 167 | with engine.create_execution_context() as context: 168 | inputs, outputs, bindings, stream = trt_infer.allocate_buffers(context.engine) 169 | for sample in tqdm(samples, desc="TensorRT is running..."): 170 | inputs[0].host = convert_any_to_numpy(sample) 171 | output = trt_infer.do_inference( 172 | context, 173 | bindings=bindings, 174 | inputs=inputs, 175 | outputs=outputs, 176 | stream=stream, 177 | batch_size=1, 178 | ) 179 | # results.append(convert_any_to_torch_tensor(output).reshape([-1, 1000])) 180 | -------------------------------------------------------------------------------- /onnx_calibrator.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import abc 3 | import json 4 | import numpy as np 5 | import tensorrt as trt 6 | import tensorrt as trt 7 | import pycuda.driver as cuda 8 | import pycuda.autoinit # fix init error of cuda 9 | import os 10 | import onnx 11 | import struct 12 | from onnxruntime.quantization.calibrate import ( 13 | CalibrationDataReader, 14 | MinMaxCalibrater, 15 | EntropyCalibrater, 16 | PercentileCalibrater, 17 | ) 18 | 19 | # 使用onnx的quantize tools生成每个节点的scales和zero point 20 | # 并转换为tensorRT可用的calibration cache file 21 | # 后续需要用tensorrt模型转换工具生成trt engine 22 | class ONNXDataReader(CalibrationDataReader): 23 | def __init__(self, input_name, image_stream, max_iter_num=None): 24 | super(ONNXDataReader).__init__() 25 | self.input_name = input_name 26 | self.image_stream = image_stream 27 | self.max_iter_num = max_iter_num 28 | self.iter_num = 0 29 | 30 | def get_next(self) -> dict: 31 | self.iter_num += 1 32 | if self.iter_num > self.max_iter_num: 33 | return None 34 | batch = self.image_stream.next_batch() 35 | if not batch.size: 36 | return None 37 | """generate the input data dict for ONNXinferenceSession run""" 38 | return { 39 | self.input_name: batch, 40 | # "image_shape": np.asarray([[self.image_stream.WIDTH, self.image_stream.HEIGHT]], dtype=np.float32), 41 | } 42 | 43 | 44 | class ONNXCalibrator(trt.IInt8EntropyCalibrator2): 45 | def __init__(self, input_layers, stream, cache_file, calib_algo, onnx_model_path): 46 | super(ONNXCalibrator, self).__init__() 47 | self.input_layers = input_layers 48 | 49 | # 数据读取的类, 等同于图片处理的回调 50 | self.stream = stream 51 | 52 | # 分配GPU 53 | self.d_input = cuda.mem_alloc(self.stream.calibration_data.nbytes) 54 | 55 | # cache路径 56 | self.cache_file = cache_file 57 | 58 | # 重置校准集 59 | self.stream.reset() 60 | 61 | # 使用onnx的calibrator来统计每个节点的dynamic range 62 | calibrator = self.create_calibrator(calib_algo, onnx_model_path) 63 | calibrator.set_execution_providers( 64 | ["CUDAExecutionProvider", "CPUExecutionProvider"] 65 | ) 66 | each_iter_num = 1 67 | for i in range(self.stream.max_batches // each_iter_num): 68 | data_reader = ONNXDataReader( 69 | self.input_layers[0], self.stream, each_iter_num 70 | ) 71 | calibrator.collect_data(data_reader) 72 | 
self.write_calibration_table(calibrator.compute_range(), self.cache_file) 73 | 74 | @staticmethod 75 | def write_calibration_table(calibration_cache, save_path): 76 | """ 77 | Helper function to write calibration table to files. 78 | """ 79 | with open(save_path + "_calib_cache.json", "w") as file: 80 | file.write( 81 | json.dumps(calibration_cache) 82 | ) # use `json.loads` to do the reverse 83 | 84 | # write plain text: tensorRT需要对结果做转换 85 | # TRT-8400-EntropyCalibration2 86 | # input.1: 3ca94044 87 | # 9131: 3cf4f8d5 88 | # 加密 hex(struct.unpack(' np.ndarray: 34 | if x is None and not accepet_none: 35 | raise ValueError("Trying to convert an empty value.") 36 | if isinstance(x, np.ndarray): 37 | return x 38 | elif isinstance(x, int) or isinstance(x, float): 39 | return np.array( 40 | [ 41 | x, 42 | ] 43 | ) 44 | elif isinstance(x, torch.Tensor): 45 | if x.numel() == 0 and accepet_none: 46 | return None 47 | if x.numel() == 0 and not accepet_none: 48 | raise ValueError("Trying to convert an empty value.") 49 | if x.numel() == 1: 50 | return convert_any_to_numpy(x.detach().cpu().item()) 51 | if x.numel() > 1: 52 | return x.detach().cpu().numpy() 53 | elif isinstance(x, list) or isinstance(x, tuple): 54 | return np.array(x) 55 | else: 56 | raise TypeError( 57 | f"input value {x}({type(x)}) can not be converted as numpy type." 58 | ) 59 | 60 | def read_image(path): 61 | mean_val = [103.53, 116.28, 123.675] 62 | std_val = [57.375, 57.12, 58.395] 63 | input_size = [768, 448] 64 | 65 | # img = np.random.randint(255, size=input_size + [3]).astype(np.uint8) 66 | img_raw = cv2.imread(path) 67 | img = cv2.resize(img_raw, (input_size[0],input_size[1])).astype(np.float32) 68 | img -= mean_val 69 | img /= std_val 70 | img = np.transpose(img, (2, 0, 1)).astype(np.float32) 71 | img = np.expand_dims(img, axis=0) 72 | 73 | img = np.ascontiguousarray(img, dtype=np.float32) 74 | return img 75 | 76 | calibration_files = glob.glob( 77 | os.path.join("/apdcephfs/private_howellyang/data/Calib1k5/", "*.jpg") 78 | )[-100:] 79 | SAMPLES = [ 80 | read_image(path) for path in calibration_files 81 | ] # rewirte this to use real data. 
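# -------------------------------------------------------------------
# A minimal sketch, for illustration only: the write_calibration_table helper
# in onnx_calibrator.py above describes converting each tensor scale into
# TensorRT's plain-text calibration cache ("TRT-8400-EntropyCalibration2"
# header, then one "tensor_name: hex" line per tensor). read_calib_cache() in
# ptq/ppq_optimize.py decodes such a line with
# struct.unpack("!f", bytes.fromhex(value)), so the encoder is simply the
# inverse of that. The two function names below are ours, and the header
# string is an assumption tied to the TensorRT version in use.
import struct


def encode_trt_scale(scale: float) -> str:
    """Hex string holding the big-endian IEEE-754 bits of a float32 scale."""
    return struct.pack("!f", scale).hex()


def write_trt_calib_cache(scale_map, path, header="TRT-8400-EntropyCalibration2"):
    """Write a {tensor_name: float_scale} dict as a plain-text TRT calibration cache."""
    with open(path, "w") as fw:
        fw.write(header + "\n")
        for name, scale in scale_map.items():
            fw.write("{}: {}\n".format(name, encode_trt_scale(scale)))
# -------------------------------------------------------------------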
82 | 83 | 84 | DEVICE = "cuda" 85 | FINETUNE = True 86 | EXECUTING_DEVICE = "cuda" 87 | REQUIRE_ANALYSE = True 88 | 89 | # ------------------------------------------------------------------- 90 | # 启动 tensorRT 进行推理,你先装一下 trt 91 | # ------------------------------------------------------------------- 92 | 93 | 94 | def infer_with_trt(trt_int8_path = ""): 95 | import tensorrt as trt 96 | import trt_infer 97 | trt.init_libnvinfer_plugins(None, "") 98 | 99 | samples = [convert_any_to_numpy(sample) for sample in SAMPLES] 100 | logger = trt.Logger(trt.Logger.INFO) 101 | with open(trt_int8_path, "rb") as f, trt.Runtime( 102 | logger 103 | ) as runtime: 104 | engine = runtime.deserialize_cuda_engine(f.read()) 105 | 106 | trt_outpus_all = [] 107 | with engine.create_execution_context() as context: 108 | inputs, outputs, bindings, stream, trt_output_names = trt_infer.allocate_buffers( 109 | context.engine 110 | ) 111 | for sample in tqdm(samples, desc="TensorRT is running..."): 112 | # trt infer 113 | inputs[0].host = convert_any_to_numpy(sample) 114 | trt_outputs_list = trt_infer.do_inference( 115 | context, 116 | bindings=bindings, 117 | inputs=inputs, 118 | outputs=outputs, 119 | stream=stream, 120 | batch_size=1, 121 | ) 122 | trt_outputs_dict = { 123 | trt_output_names[i]: trt_outputs_list[i] for i in range(len(trt_output_names)) 124 | } 125 | trt_outpus_all.append(deepcopy(trt_outputs_dict)) 126 | return trt_outpus_all 127 | 128 | 129 | def infer_with_onnx(onnx_path = ""): 130 | 131 | sess = onnxruntime.InferenceSession( 132 | onnx_path, providers=["CUDAExecutionProvider", "CPUExecutionProvider"] 133 | ) 134 | input_name = sess.get_inputs()[0].name 135 | onnx_output_names = [output.name for output in sess.get_outputs()] 136 | samples = [convert_any_to_numpy(sample) for sample in SAMPLES] 137 | 138 | onnx_outpus_all = [] 139 | for sample in tqdm(samples, desc="Onnx is running..."): 140 | onnx_outputs = sess.run(onnx_output_names, {input_name: sample}) 141 | onnx_outputs_dict = { 142 | onnx_output_names[i]: onnx_outputs[i] for i in range(len(onnx_output_names)) 143 | } 144 | onnx_outpus_all.append(deepcopy(onnx_outputs_dict)) 145 | return onnx_outpus_all 146 | 147 | 148 | import sys 149 | 150 | if len(sys.argv) > 2: 151 | onnx_path = sys.argv[1] 152 | trt_path = sys.argv[2] 153 | else: 154 | onnx_path = "/apdcephfs/private_howellyang/onnx2trt/Models_Fp16/RMTNet_release20220609_v2.opt.onnx" 155 | trt_path = "/apdcephfs/private_howellyang/onnx2trt/Models_Fp16/RMTNet_release20220609.fp16.trtmodel" 156 | 157 | trt_outpus_all = infer_with_trt(trt_path) 158 | onnx_outputs_all = infer_with_onnx(onnx_path) 159 | 160 | sims = {} 161 | diffs = {} 162 | for i in range(len(trt_outpus_all)): 163 | for output_name, _ in trt_outpus_all[i].items(): 164 | trt_output = np.reshape(trt_outpus_all[i][output_name], (1, -1)) 165 | trt_fp32_output = np.reshape(onnx_outputs_all[i][output_name], (1, -1)) 166 | cos_sim = cosine_similarity(trt_output, trt_fp32_output) 167 | abs_diff_mean = np.mean(np.abs(trt_output - trt_fp32_output)) 168 | if output_name not in sims: 169 | sims[output_name] = [] 170 | diffs[output_name] = [] 171 | sims[output_name].append(cos_sim.ravel()) 172 | diffs[output_name].append(abs_diff_mean.ravel()) 173 | # if cos_sim < 0.985: 174 | # print(output_name, cos_sim) 175 | # print(trt_fp32_output[0, :5]) 176 | # print(trt_output[0, :5]) 177 | 178 | print("===================") 179 | mean_sims = [] 180 | mean_diffs = [] 181 | for key, value in sims.items(): 182 | print(key, np.mean(value), 
np.min(value), np.mean(diffs[key]), np.max(diffs[key])) 183 | mean_sims.append(np.mean(value)) 184 | mean_diffs.append(np.mean(diffs[key])) 185 | print("average cosine sim = ", np.mean(mean_sims)) 186 | print("average dff abs = ", np.mean(mean_diffs)) -------------------------------------------------------------------------------- /quantization/onnx_calibrator.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import abc 3 | import json 4 | import numpy as np 5 | import tensorrt as trt 6 | import tensorrt as trt 7 | import pycuda.driver as cuda 8 | import pycuda.autoinit # fix init error of cuda 9 | import os 10 | import onnx 11 | import struct 12 | from onnxruntime.quantization.calibrate import ( 13 | CalibrationDataReader, 14 | MinMaxCalibrater, 15 | EntropyCalibrater, 16 | PercentileCalibrater, 17 | ) 18 | 19 | # 使用onnx的quantize tools生成每个节点的scales和zero point 20 | # 并转换为tensorRT可用的calibration cache file 21 | # 后续需要用tensorrt模型转换工具生成trt engine 22 | class ONNXDataReader(CalibrationDataReader): 23 | def __init__(self, input_name, image_stream, max_iter_num=None): 24 | super(ONNXDataReader).__init__() 25 | self.input_name = input_name 26 | self.image_stream = image_stream 27 | self.max_iter_num = max_iter_num 28 | self.iter_num = 0 29 | 30 | def get_next(self) -> dict: 31 | self.iter_num += 1 32 | if self.iter_num > self.max_iter_num: 33 | return None 34 | batch = self.image_stream.next_batch() 35 | if not batch.size: 36 | return None 37 | """generate the input data dict for ONNXinferenceSession run""" 38 | return { 39 | self.input_name: batch, 40 | # "image_shape": np.asarray([[self.image_stream.WIDTH, self.image_stream.HEIGHT]], dtype=np.float32), 41 | } 42 | 43 | 44 | class ONNXCalibrator(trt.IInt8EntropyCalibrator2): 45 | def __init__(self, input_layers, stream, cache_file, calib_algo, onnx_model_path): 46 | super(ONNXCalibrator, self).__init__() 47 | self.input_layers = input_layers 48 | 49 | # 数据读取的类, 等同于图片处理的回调 50 | self.stream = stream 51 | 52 | # 分配GPU 53 | self.d_input = cuda.mem_alloc(self.stream.calibration_data.nbytes) 54 | 55 | # cache路径 56 | self.cache_file = cache_file 57 | 58 | # 重置校准集 59 | self.stream.reset() 60 | 61 | # 使用onnx的calibrator来统计每个节点的dynamic range 62 | calibrator = self.create_calibrator(calib_algo, onnx_model_path) 63 | # calibrator.set_execution_providers( 64 | # ["CUDAExecutionProvider", "CPUExecutionProvider"] 65 | # ) 66 | calibrator.set_execution_providers( 67 | ["CPUExecutionProvider"] 68 | ) 69 | each_iter_num = 1 70 | for i in range(self.stream.max_batches // each_iter_num): 71 | data_reader = ONNXDataReader( 72 | self.input_layers[0], self.stream, each_iter_num 73 | ) 74 | calibrator.collect_data(data_reader) 75 | self.write_calibration_table(calibrator.compute_range(), self.cache_file) 76 | 77 | @staticmethod 78 | def write_calibration_table(calibration_cache, save_path): 79 | """ 80 | Helper function to write calibration table to files. 
81 | """ 82 | with open(save_path + "_calib_cache.json", "w") as file: 83 | file.write( 84 | json.dumps(calibration_cache) 85 | ) # use `json.loads` to do the reverse 86 | 87 | # write plain text: tensorRT需要对结果做转换 88 | # TRT-8400-EntropyCalibration2 89 | # input.1: 3ca94044 90 | # 9131: 3cf4f8d5 91 | # 加密 hex(struct.unpack(' np.ndarray: 33 | if x is None and not accepet_none: 34 | raise ValueError("Trying to convert an empty value.") 35 | if isinstance(x, np.ndarray): 36 | return x 37 | elif isinstance(x, int) or isinstance(x, float): 38 | return np.array( 39 | [ 40 | x, 41 | ] 42 | ) 43 | elif isinstance(x, torch.Tensor): 44 | if x.numel() == 0 and accepet_none: 45 | return None 46 | if x.numel() == 0 and not accepet_none: 47 | raise ValueError("Trying to convert an empty value.") 48 | if x.numel() == 1: 49 | return convert_any_to_numpy(x.detach().cpu().item()) 50 | if x.numel() > 1: 51 | return x.detach().cpu().numpy() 52 | elif isinstance(x, list) or isinstance(x, tuple): 53 | return np.array(x) 54 | else: 55 | raise TypeError( 56 | f"input value {x}({type(x)}) can not be converted as numpy type." 57 | ) 58 | def read_image(path): 59 | mean = [123.675, 116.28, 103.53] 60 | std = [58.395, 57.12, 57.375] 61 | input_w = 960 62 | input_h = 480 63 | 64 | # for onnx inference 65 | mean = np.array(mean) 66 | std = np.array(std) 67 | 68 | # Load by OpenCV 69 | img = cv2.imread(path) 70 | # Convert to RGB 71 | img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) 72 | 73 | img = cv2.resize(img, (input_w, input_h)) 74 | 75 | img = img.astype(np.float32) 76 | 77 | # Norm 78 | for i in range(3): 79 | img[..., i] = (img[..., i] - mean[i]) / std[i] 80 | 81 | # hwc -> nchw 82 | h, w, c = img.shape 83 | img = img.reshape((1, c, h ,w)) 84 | 85 | return np.array(img) 86 | 87 | calibration_files = glob.glob( 88 | os.path.join("/apdcephfs/private_howellyang/data/Calib1k5/", "*.jpg") 89 | )[-100:] 90 | SAMPLES = [ 91 | read_image(path) for path in calibration_files 92 | ] # rewirte this to use real data. 
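# -------------------------------------------------------------------
# A minimal sketch, for illustration only: the PPQ scripts in this repo
# describe their quantization error as noise energy divided by signal energy
# (so an error of 0.1 means the quantization noise carries roughly 10% of the
# signal energy). The helper below computes that ratio with numpy and could be
# used alongside the cosine-similarity and mean-absolute-difference statistics
# gathered later in this script; the function name is ours, not a PPQ API.
def quant_noise_ratio(fp_output, quant_output):
    """Return ||quant - fp||^2 / ||fp||^2, i.e. noise energy over signal energy."""
    fp = np.asarray(fp_output, dtype=np.float64).ravel()
    q = np.asarray(quant_output, dtype=np.float64).ravel()
    noise_energy = float(np.sum((q - fp) ** 2))
    signal_energy = float(np.sum(fp ** 2)) + 1e-12  # guard against an all-zero reference
    return noise_energy / signal_energy
# -------------------------------------------------------------------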
93 | 94 | 95 | DEVICE = "cuda" 96 | FINETUNE = True 97 | EXECUTING_DEVICE = "cuda" 98 | REQUIRE_ANALYSE = True 99 | 100 | # ------------------------------------------------------------------- 101 | # 启动 tensorRT 进行推理,你先装一下 trt 102 | # ------------------------------------------------------------------- 103 | 104 | 105 | def infer_with_trt(trt_int8_path = ""): 106 | import tensorrt as trt 107 | import trt_infer 108 | 109 | samples = [convert_any_to_numpy(sample) for sample in SAMPLES] 110 | logger = trt.Logger(trt.Logger.INFO) 111 | with open(trt_int8_path, "rb") as f, trt.Runtime( 112 | logger 113 | ) as runtime: 114 | engine = runtime.deserialize_cuda_engine(f.read()) 115 | 116 | trt_outpus_all = [] 117 | with engine.create_execution_context() as context: 118 | inputs, outputs, bindings, stream, trt_output_names = trt_infer.allocate_buffers( 119 | context.engine 120 | ) 121 | for sample in tqdm(samples, desc="TensorRT is running..."): 122 | # trt infer 123 | inputs[0].host = convert_any_to_numpy(sample) 124 | trt_outputs_list = trt_infer.do_inference( 125 | context, 126 | bindings=bindings, 127 | inputs=inputs, 128 | outputs=outputs, 129 | stream=stream, 130 | batch_size=1, 131 | ) 132 | trt_outputs_dict = { 133 | trt_output_names[i]: trt_outputs_list[i] for i in range(len(trt_output_names)) 134 | } 135 | trt_outpus_all.append(deepcopy(trt_outputs_dict)) 136 | return trt_outpus_all 137 | 138 | 139 | def infer_with_onnx(onnx_path = ""): 140 | 141 | sess = onnxruntime.InferenceSession( 142 | onnx_path, providers=["CUDAExecutionProvider", "CPUExecutionProvider"] 143 | ) 144 | input_name = sess.get_inputs()[0].name 145 | onnx_output_names = [output.name for output in sess.get_outputs()] 146 | samples = [convert_any_to_numpy(sample) for sample in SAMPLES] 147 | 148 | onnx_outpus_all = [] 149 | for sample in tqdm(samples, desc="Onnx is running..."): 150 | onnx_outputs = sess.run(onnx_output_names, {input_name: sample}) 151 | onnx_outputs_dict = { 152 | onnx_output_names[i]: onnx_outputs[i] for i in range(len(onnx_output_names)) 153 | } 154 | onnx_outpus_all.append(deepcopy(onnx_outputs_dict)) 155 | return onnx_outpus_all 156 | 157 | 158 | import sys 159 | 160 | if len(sys.argv) > 2: 161 | onnx_path = sys.argv[1] 162 | trt_path = sys.argv[2] 163 | else: 164 | onnx_path = "/apdcephfs/private_howellyang/onnx2trt/Models_Fp16/RMTNet_release20220609_v2.opt.onnx" 165 | trt_path = "/apdcephfs/private_howellyang/onnx2trt/model.weight_quantized_v2.onnx" 166 | 167 | trt_outpus_all = infer_with_onnx(trt_path) 168 | onnx_outputs_all = infer_with_onnx(onnx_path) 169 | 170 | sims = {} 171 | diffs = {} 172 | for i in range(len(trt_outpus_all)): 173 | for output_name, _ in trt_outpus_all[i].items(): 174 | trt_output = np.reshape(trt_outpus_all[i][output_name], (1, -1)) 175 | trt_fp32_output = np.reshape(onnx_outputs_all[i][output_name], (1, -1)) 176 | cos_sim = cosine_similarity(trt_output, trt_fp32_output) 177 | abs_diff_mean = np.mean(np.abs(trt_output - trt_fp32_output)) 178 | if output_name not in sims: 179 | sims[output_name] = [] 180 | diffs[output_name] = [] 181 | sims[output_name].append(cos_sim.ravel()) 182 | diffs[output_name].append(abs_diff_mean.ravel()) 183 | # if cos_sim < 0.985: 184 | # print(output_name, cos_sim) 185 | # print(trt_fp32_output[0, :5]) 186 | # print(trt_output[0, :5]) 187 | 188 | print("===================") 189 | mean_sims = [] 190 | mean_diffs = [] 191 | for key, value in sims.items(): 192 | print(key, np.mean(value), np.min(value), np.mean(diffs[key]), np.max(diffs[key])) 193 | 
mean_sims.append(np.mean(value)) 194 | mean_diffs.append(np.mean(diffs[key])) 195 | print("average cosine sim = ", np.mean(mean_sims)) 196 | print("average dff abs = ", np.mean(mean_diffs)) -------------------------------------------------------------------------------- /quantization/onnx2trt.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | from __future__ import print_function 3 | 4 | import argparse 5 | import glob 6 | import os 7 | from tabnanny import verbose 8 | import tensorrt as trt 9 | import pycuda.driver as cuda 10 | import pycuda.autoinit # fix init error of cuda 11 | from google.protobuf.json_format import MessageToDict 12 | import onnx 13 | from onnxsim import simplify 14 | try: 15 | import onnxoptimizer as optimizer 16 | except: 17 | from onnx import optimizer 18 | 19 | from onnxruntime.tools.symbolic_shape_infer import SymbolicShapeInference 20 | import numpy as np 21 | from trt_utils import ( 22 | create_image_stream, 23 | create_calibrator, 24 | create_tensorrt_engine, 25 | evaluate_engine, 26 | ) 27 | 28 | parser = argparse.ArgumentParser(description="Onnx Calibration Params") 29 | parser.add_argument("--onnx", type=str, default=None, required=True, help="原始的onnx路径") 30 | parser.add_argument( 31 | "--trt_engine", type=str, default=None, required=True, help="tensorRT engine的保存路径" 32 | ) 33 | 34 | parser.add_argument( 35 | "--engine_type", 36 | type=str, 37 | default="int8", 38 | choices=["int8", "fp32", "fp16", "best"], 39 | required=False, 40 | help="模型的计算精度", 41 | ) 42 | 43 | parser.add_argument( 44 | "--trt_calib_cache", 45 | type=str, 46 | default="./trt_int8.cache", 47 | required=False, 48 | help="用来存储每个节点动态范围的路径", 49 | ) 50 | parser.add_argument( 51 | "--calib_dir", type=str, default=None, required=False, help="进行精度测试以及量化校准使用的图片路径" 52 | ) 53 | parser.add_argument( 54 | "--calib_algo", 55 | type=str, 56 | default="TRTEntropy", 57 | required=False, 58 | choices=["Search", "TRTEntropy", "TRTMinMax", "TRTPercentile", "ONNXEntropy", "ONNXMinMax", "ONNXPercentile"], 59 | help="""量化校准使用的算法: 60 | Search 进行自动化搜索, 自动选择最终输出的cosine距离最高的校准算法 61 | TRTEntropy 使用KL散度评估量化前后的量化误差,自动选择误差最小的动态范围值 62 | TRTMinMax 计算每个节点输出的最大最小值,作为最终的动态范围值 63 | TRTPercentile 计算每个节点输出值,然后求其分位点作为动态范围值 64 | ONNXEntropy 计算原理同TRTEntropy,采用onnx quantization的工程实现 65 | ONNXMinMax 计算原理同TRTMinMax,采用onnx quantization的工程实现 66 | ONNXPercentile 计算原理同TRTPercentile,采用onnx quantization的工程实现 67 | """, 68 | ) 69 | 70 | parser.add_argument( 71 | "--channel_order", 72 | type=str, 73 | default="RGB", 74 | required=False, 75 | choices=["RGB", "BGR"], 76 | help="图片的输入顺序, 可选BGR、RGB", 77 | ) 78 | parser.add_argument( 79 | "--means", type=str, default="0.0,0.0,0.0", required=False, help="图片预处理的均值" 80 | ) 81 | parser.add_argument( 82 | "--stds", type=str, default="1.0,1.0,1.0", required=False, help="图片预处理的方差" 83 | ) 84 | parser.add_argument( 85 | "--pixel_type", 86 | type=str, 87 | default="NCHW", 88 | required=False, 89 | choices=["NCHW", "NHWC"], 90 | help="模型输入的通道顺序, 一般而言", 91 | ) 92 | 93 | args = parser.parse_args() 94 | onnx_path = args.onnx 95 | engine_type = args.engine_type 96 | trt_engine = args.trt_engine 97 | calib_algo = args.calib_algo 98 | calib_dir = args.calib_dir 99 | means = args.means 100 | stds = args.stds 101 | pixel_type = args.pixel_type 102 | trt_calib_cache = args.trt_calib_cache 103 | channel_order = args.channel_order 104 | 105 | # 获取输入输出信息 106 | print("[ONNX2TRT] INFO: Optimizing Onnx Model....") 107 | INPUT_SHAPES = [] 108 | INPUT_NAMES = [] 109 | 
onnx_model = onnx.load(onnx_path) 110 | onnx_model, check = simplify(onnx_model) # simplify 111 | optimized_model = optimizer.optimize(onnx_model) # optimize 112 | onnx_model = SymbolicShapeInference.infer_shapes( 113 | onnx_model, 114 | int_max=2**31 - 1, 115 | auto_merge=True, 116 | guess_output_rank=True, 117 | verbose=2 118 | ) 119 | 120 | onnx_path = onnx_path.replace(".onnx", "") + "_with_shape.onnx" 121 | onnx.save(onnx_model, onnx_path) 122 | 123 | input_all = [node.name for node in onnx_model.graph.input] 124 | input_initializer = [node.name for node in onnx_model.graph.initializer] 125 | net_feed_input_names = list(set(input_all) - set(input_initializer)) 126 | 127 | for _input in onnx_model.graph.input: 128 | m_dict = MessageToDict(_input) 129 | dim_info = m_dict.get("type").get("tensorType").get("shape").get("dim") 130 | input_shape = [int(d.get("dimValue")) for d in dim_info] # [4,3,384,640] 131 | input_name = m_dict.get("name") 132 | if input_name in net_feed_input_names: 133 | INPUT_SHAPES.append(input_shape) 134 | INPUT_NAMES.append(input_name) 135 | print("[ONNX2TRT] INFO: 模型输入 ", INPUT_NAMES[-1], INPUT_SHAPES[-1]) 136 | 137 | if len(INPUT_SHAPES) > 1: 138 | print("模型存在多个输入, 本工具暂不支持多输入模型") 139 | raise NameError("模型存在多个输入, 本工具暂不支持多输入模型") 140 | 141 | elif len(INPUT_SHAPES[0]) != 4: 142 | print("模型的输入不是NCHW或NHWC, 本工具暂不支持这种输入格式") 143 | raise NameError("模型的输入不是NCHW或NHWC, 本工具暂不支持这种输入格式") 144 | 145 | if engine_type == "int8": 146 | if calib_algo == "Search": 147 | search_types = ["TRTEntropy", "TRTMinMax", "TRTPercentile"] 148 | else: 149 | search_types = [calib_algo] 150 | image_stream = create_image_stream( 151 | calib_dir, INPUT_SHAPES[0], means, stds, pixel_type, channel_order 152 | ) 153 | final_cos_similarity = -1.0 154 | final_engine = None 155 | print("[ONNX2TRT] INFO: Search Best Calibration in {}".format(search_types)) 156 | for calib_algo in search_types: 157 | print("[ONNX2TRT] INFO: Start Calibration with {}".format(calib_algo)) 158 | calibrator = create_calibrator( 159 | image_stream, INPUT_NAMES, trt_calib_cache, calib_algo, onnx_path 160 | ) 161 | print("[ONNX2TRT] INFO: Start Create TensorRT Engine with {}".format(calib_algo)) 162 | engine = create_tensorrt_engine(onnx_path, engine_type, calibrator) 163 | print("[ONNX2TRT] INFO: Start Evaluation of {}".format(calib_algo)) 164 | cos_similarity, infer_time = evaluate_engine(onnx_path, engine, image_stream) 165 | if cos_similarity > final_cos_similarity: 166 | final_cos_similarity = cos_similarity 167 | final_engine = engine 168 | final_infer_time = infer_time 169 | print("[ONNX2TRT] INFO: 校准算法 = ", calib_algo) 170 | print("[ONNX2TRT] INFO: 与onnx输出的cos相似度 = ", cos_similarity) 171 | print("[ONNX2TRT] INFO: 模型infer的平均耗时 = ", infer_time) 172 | 173 | else: 174 | final_engine = create_tensorrt_engine(onnx_path, engine_type) 175 | if calib_dir != "": 176 | image_stream = create_image_stream( 177 | calib_dir, INPUT_SHAPES[0], means, stds, pixel_type, channel_order 178 | ) 179 | cos_similarity, infer_time = evaluate_engine( 180 | onnx_path, final_engine, image_stream 181 | ) 182 | print("[ONNX2TRT] INFO: 校准算法 = ", None) 183 | print("[ONNX2TRT] INFO: 与onnx输出的cos相似度 = ", cos_similarity) 184 | print("[ONNX2TRT] INFO: 模型infer的平均耗时 = ", infer_time) 185 | 186 | # 将trt engine写入文件 187 | print("[ONNX2TRT] INFO: 模型构建完成, 将模型写入路径 = ", trt_engine) 188 | if not os.path.exists(os.path.dirname(trt_engine)): 189 | os.makedirs(os.path.dirname(trt_engine), exist_ok=True) 190 | with open(trt_engine, "wb") as f: 191 | 
f.write(final_engine.serialize()) 192 | -------------------------------------------------------------------------------- /quantization/P03_MT_onnx2tensorRT_int8.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 #指定解释器 2 | # encoding:utf-8 3 | 4 | import sys 5 | 6 | print(sys.getdefaultencoding()) 7 | s = "中文乱码问题解决" 8 | print(s) 9 | 10 | # --------------------------------------------------------------- 11 | # 这个脚本向你展示了如何使用 tensorRT 对 PPQ 导出的模型进行推理 12 | 13 | # This script shows you how to export ppq internal graph to tensorRT 14 | # --------------------------------------------------------------- 15 | 16 | # For this inference test, all test data is randomly picked. 17 | # If you want to use real data, just rewrite the defination of SAMPLES 18 | print("开始import") 19 | import onnxruntime 20 | import torch 21 | from ppq import * 22 | from ppq.api import * 23 | from tqdm import tqdm 24 | import glob 25 | import cv2 26 | import numpy as np 27 | from torchvision import transforms 28 | from PIL import Image 29 | import os 30 | 31 | 32 | 33 | def read_image(path): 34 | mean_val = [103.53, 116.28, 123.675] 35 | std_val = [57.375, 57.12, 58.395] 36 | input_size = [768, 448] 37 | 38 | # img = np.random.randint(255, size=input_size + [3]).astype(np.uint8) 39 | img_raw = cv2.imread(path) 40 | img = cv2.resize(img_raw, (input_size[0],input_size[1])).astype(np.float32) 41 | img -= mean_val 42 | img /= std_val 43 | img = np.transpose(img, (2, 0, 1)).astype(np.float32) 44 | img = np.expand_dims(img, axis=0) 45 | 46 | img = np.ascontiguousarray(img, dtype=np.float32) 47 | img_tensor = torch.from_numpy(img) 48 | print("======", np.shape(img_tensor)) 49 | # dummy_input = torch.autograd.Variable(img_tensor) 50 | return img_tensor 51 | 52 | 53 | QUANT_PLATFROM = TargetPlatform.TRT_INT8 54 | MODEL = "/apdcephfs/private_howellyang/road_service_app/LaneModel/onnx_infer/model/epoch_390_mm2conv.opt.onnx" 55 | INPUT_SHAPE = [1, 3, 448, 768] 56 | 57 | calibration_files = glob.glob( 58 | os.path.join("/apdcephfs/private_howellyang/data/Calib1k5/", "*.jpg") 59 | )[:128] 60 | SAMPLES = [ 61 | read_image(path) for path in calibration_files 62 | ] # rewirte this to use real data. 
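# -------------------------------------------------------------------
# A minimal sketch, for illustration only: further down this script the
# commented-out dispatching_table lines show how poorly quantized operators can
# be sent back to FP32, and the graphwise_error_analyse loop warns about layers
# whose cumulative error exceeds 0.1. The helper below only reuses the
# dispatching_table.append(operation=..., platform=TargetPlatform.FP32) call
# already used in this repo; the 0.1 threshold is an assumption, and since the
# graphwise reports measure cumulative error, the comments below rightly
# suggest a layerwise analysis before deciding which ops to dispatch.
def dispatch_noisy_ops_to_fp32(setting, reports, threshold=0.1):
    """Append every op whose reported error exceeds `threshold` to the FP32 table."""
    for op_name, error in reports.items():
        if error > threshold:
            setting.dispatching_table.append(operation=op_name, platform=TargetPlatform.FP32)
    return setting
# -------------------------------------------------------------------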
63 | 64 | 65 | DEVICE = "cuda" 66 | FINETUNE = True 67 | QS = QuantizationSettingFactory.default_setting() 68 | EXECUTING_DEVICE = "cuda" 69 | REQUIRE_ANALYSE = True 70 | 71 | # ------------------------------------------------------------------- 72 | # 下面向你展示了常用参数调节选项: 73 | # ------------------------------------------------------------------- 74 | if PPQ_CONFIG.USING_CUDA_KERNEL: 75 | print("====== using advanced_optimization =====") 76 | QS.advanced_optimization = FINETUNE # 启动网络再训练过程,降低量化误差 77 | QS.advanced_optimization_setting.steps = 2500 # 再训练步数,影响训练时间,2500步大概几分钟 78 | QS.advanced_optimization_setting.collecting_device = ( 79 | # "executor" # 缓存数据放在那,executor 就是放在gpu,如果显存超了你就换成 'cpu' 80 | "cpu" 81 | ) 82 | QS.advanced_optimization_setting.auto_check = ( 83 | False # 打开这个选项则训练过程中会防止过拟合,以及意外情况,通常不需要开。 84 | ) 85 | else: 86 | print("====== using lsq_optimization =====") 87 | QS.lsq_optimization = FINETUNE # 启动网络再训练过程,降低量化误差 88 | QS.lsq_optimization_setting.epochs = 128 # 再训练轮数,影响训练时间,30轮大概几分钟 89 | QS.lsq_optimization_setting.collecting_device = ( 90 | "cuda" # 缓存数据放在那,cuda 就是放在gpu,如果显存超了你就换成 'cpu' 91 | ) 92 | 93 | # 把量化的不太好的算子送回 FP32 94 | # QS.dispatching_table.append(operation="Conv_3342", platform=TargetPlatform.FP32) 95 | # QS.dispatching_table.append(operation="Relu_3343", platform=TargetPlatform.FP32) 96 | # QS.dispatching_table.append(operation="Conv_2523", platform=TargetPlatform.FP32) 97 | 98 | print("正准备量化你的网络,检查下列设置:") 99 | print(f"TARGET PLATFORM : {QUANT_PLATFROM.name}") 100 | print(f"NETWORK INPUTSHAPE : {INPUT_SHAPE}") 101 | 102 | # ENABLE CUDA KERNEL 会加速量化效率 3x ~ 10x,但是你如果没有装相应编译环境的话是编译不了的 103 | # 你可以尝试安装编译环境,或者在不启动 CUDA KERNEL 的情况下完成量化:移除 with ENABLE_CUDA_KERNEL(): 即可 104 | with ENABLE_CUDA_KERNEL(): 105 | qir = quantize_onnx_model( 106 | onnx_import_file=MODEL, 107 | calib_dataloader=SAMPLES, 108 | calib_steps=128, 109 | setting=QS, 110 | input_shape=INPUT_SHAPE, 111 | collate_fn=lambda x: x.to(EXECUTING_DEVICE), 112 | platform=QUANT_PLATFROM, 113 | do_quantize=True, 114 | ) 115 | 116 | # ------------------------------------------------------------------- 117 | # PPQ 计算量化误差时,使用信噪比的倒数作为指标,即噪声能量 / 信号能量 118 | # 量化误差 0.1 表示在整体信号中,量化噪声的能量约为 10% 119 | # 你应当注意,在 graphwise_error_analyse 分析中,我们衡量的是累计误差 120 | # 网络的最后一层往往都具有较大的累计误差,这些误差是其前面的所有层所共同造成的 121 | # 你需要使用 layerwise_error_analyse 逐层分析误差的来源 122 | # ------------------------------------------------------------------- 123 | print("正计算网络量化误差(SNR),最后一层的误差应小于 0.1 以保证量化精度:") 124 | reports = graphwise_error_analyse( 125 | graph=qir, 126 | running_device=EXECUTING_DEVICE, 127 | steps=32, 128 | dataloader=SAMPLES, 129 | collate_fn=lambda x: x.to(EXECUTING_DEVICE), 130 | ) 131 | for op, snr in reports.items(): 132 | if snr > 0.1: 133 | ppq_warning(f"层 {op} 的累计量化误差显著,请考虑进行优化") 134 | 135 | if REQUIRE_ANALYSE: 136 | print("正计算逐层量化误差(SNR),每一层的独立量化误差应小于 0.1 以保证量化精度:") 137 | layerwise_error_analyse( 138 | graph=qir, 139 | running_device=EXECUTING_DEVICE, 140 | interested_outputs=None, 141 | dataloader=SAMPLES, 142 | collate_fn=lambda x: x.to(EXECUTING_DEVICE), 143 | ) 144 | 145 | print("网络量化结束,正在生成目标文件:") 146 | export_ppq_graph( 147 | graph=qir, 148 | platform=QUANT_PLATFROM, 149 | graph_save_to=MODEL.replace(".onnx", "_v2_int8.onnx"), 150 | ) 151 | 152 | # ------------------------------------------------------------------- 153 | # 记录一下输入输出的名字,onnxruntime 跑的时候需要提供这些名字 154 | # 我写的只是单输出单输入的版本,多输出多输入你得自己改改 155 | # ------------------------------------------------------------------- 156 | int8_input_names = [name for name, _ in qir.inputs.items()] 
157 | int8_output_names = [name for name, _ in qir.outputs.items()] 158 | 159 | # ------------------------------------------------------------------- 160 | # 启动 tensorRT 进行推理,你先装一下 trt 161 | # ------------------------------------------------------------------- 162 | import tensorrt as trt 163 | import trt_infer 164 | 165 | samples = [convert_any_to_numpy(sample) for sample in SAMPLES] 166 | logger = trt.Logger(trt.Logger.INFO) 167 | with open(MODEL.replace(".onnx", "_v2_int8.engine"), "rb") as f, trt.Runtime( 168 | logger 169 | ) as runtime: 170 | engine = runtime.deserialize_cuda_engine(f.read()) 171 | 172 | results = [] 173 | with engine.create_execution_context() as context: 174 | inputs, outputs, bindings, stream, _ = trt_infer.allocate_buffers(context.engine) 175 | for sample in tqdm(samples, desc="TensorRT is running..."): 176 | inputs[0].host = convert_any_to_numpy(sample) 177 | output = trt_infer.do_inference( 178 | context, 179 | bindings=bindings, 180 | inputs=inputs, 181 | outputs=outputs, 182 | stream=stream, 183 | batch_size=1, 184 | ) 185 | # results.append(convert_any_to_torch_tensor(output).reshape([-1, 1000])) 186 | -------------------------------------------------------------------------------- /quantization/compare_onnx_onnx_v3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 #指定解释器 2 | # encoding:utf-8 3 | 4 | import sys 5 | 6 | print(sys.getdefaultencoding()) 7 | s = "中文乱码问题解决" 8 | print(s) 9 | 10 | # --------------------------------------------------------------- 11 | # 这个脚本向你展示了如何使用 tensorRT 对 PPQ 导出的模型进行推理 12 | 13 | # This script shows you how to export ppq internal graph to tensorRT 14 | # --------------------------------------------------------------- 15 | 16 | # For this inference test, all test data is randomly picked. 17 | # If you want to use real data, just rewrite the defination of SAMPLES 18 | print("开始import") 19 | import onnxruntime 20 | import torch 21 | from tqdm import tqdm 22 | import glob 23 | import cv2 24 | import numpy as np 25 | from torchvision import transforms 26 | from PIL import Image 27 | import os 28 | from sklearn.metrics.pairwise import cosine_similarity 29 | import onnx 30 | from copy import deepcopy 31 | 32 | def convert_any_to_numpy(x, accepet_none: bool = True) -> np.ndarray: 33 | if x is None and not accepet_none: 34 | raise ValueError("Trying to convert an empty value.") 35 | if isinstance(x, np.ndarray): 36 | return x 37 | elif isinstance(x, int) or isinstance(x, float): 38 | return np.array( 39 | [ 40 | x, 41 | ] 42 | ) 43 | elif isinstance(x, torch.Tensor): 44 | if x.numel() == 0 and accepet_none: 45 | return None 46 | if x.numel() == 0 and not accepet_none: 47 | raise ValueError("Trying to convert an empty value.") 48 | if x.numel() == 1: 49 | return convert_any_to_numpy(x.detach().cpu().item()) 50 | if x.numel() > 1: 51 | return x.detach().cpu().numpy() 52 | elif isinstance(x, list) or isinstance(x, tuple): 53 | return np.array(x) 54 | else: 55 | raise TypeError( 56 | f"input value {x}({type(x)}) can not be converted as numpy type." 
57 | ) 58 | 59 | def read_image(path): 60 | # 多任务模型 61 | # _img_transforms = transforms.Compose( 62 | # [ 63 | # transforms.Resize((384, 768)), 64 | # transforms.ToTensor(), 65 | # transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), 66 | # ] 67 | # ) 68 | # img = Image.open(path).convert("RGB") 69 | # img_w, img_h = img.size[0], img.size[1] 70 | # img = _img_transforms(img) 71 | # img = img.unsqueeze(0) 72 | # return img 73 | mean_val = [103.53, 116.28, 123.675] 74 | std_val = [57.375, 57.12, 58.395] 75 | input_size = [768, 448] 76 | 77 | # img = np.random.randint(255, size=input_size + [3]).astype(np.uint8) 78 | img_raw = cv2.imread(path) 79 | img = cv2.resize(img_raw, (input_size[0],input_size[1])).astype(np.float32) 80 | img -= mean_val 81 | img /= std_val 82 | img = np.transpose(img, (2, 0, 1)).astype(np.float32) 83 | img = np.expand_dims(img, axis=0) 84 | 85 | img = np.ascontiguousarray(img, dtype=np.float32) 86 | # img_tensor = torch.from_numpy(img) 87 | return img 88 | 89 | calibration_files = glob.glob( 90 | os.path.join("/apdcephfs/private_howellyang/data/Calib1k5/", "*.jpg") 91 | )[-100:] 92 | SAMPLES = [ 93 | read_image(path) for path in calibration_files 94 | ] # rewirte this to use real data. 95 | 96 | 97 | DEVICE = "cuda" 98 | FINETUNE = True 99 | EXECUTING_DEVICE = "cuda" 100 | REQUIRE_ANALYSE = True 101 | 102 | # ------------------------------------------------------------------- 103 | # 启动 tensorRT 进行推理,你先装一下 trt 104 | # ------------------------------------------------------------------- 105 | 106 | 107 | def infer_with_trt(trt_int8_path = ""): 108 | import tensorrt as trt 109 | import trt_infer 110 | 111 | samples = [convert_any_to_numpy(sample) for sample in SAMPLES] 112 | logger = trt.Logger(trt.Logger.INFO) 113 | with open(trt_int8_path, "rb") as f, trt.Runtime( 114 | logger 115 | ) as runtime: 116 | engine = runtime.deserialize_cuda_engine(f.read()) 117 | 118 | trt_outpus_all = [] 119 | with engine.create_execution_context() as context: 120 | inputs, outputs, bindings, stream, trt_output_names = trt_infer.allocate_buffers( 121 | context.engine 122 | ) 123 | for sample in tqdm(samples, desc="TensorRT is running..."): 124 | # trt infer 125 | inputs[0].host = convert_any_to_numpy(sample) 126 | trt_outputs_list = trt_infer.do_inference( 127 | context, 128 | bindings=bindings, 129 | inputs=inputs, 130 | outputs=outputs, 131 | stream=stream, 132 | batch_size=1, 133 | ) 134 | trt_outputs_dict = { 135 | trt_output_names[i]: trt_outputs_list[i] for i in range(len(trt_output_names)) 136 | } 137 | trt_outpus_all.append(deepcopy(trt_outputs_dict)) 138 | return trt_outpus_all 139 | 140 | 141 | def infer_with_onnx(onnx_path = ""): 142 | 143 | sess = onnxruntime.InferenceSession( 144 | onnx_path, providers=["CUDAExecutionProvider", "CPUExecutionProvider"] 145 | ) 146 | input_name = sess.get_inputs()[0].name 147 | onnx_output_names = [output.name for output in sess.get_outputs()] 148 | samples = [convert_any_to_numpy(sample) for sample in SAMPLES] 149 | 150 | onnx_outpus_all = [] 151 | for sample in tqdm(samples, desc="Onnx is running..."): 152 | onnx_outputs = sess.run(onnx_output_names, {input_name: sample}) 153 | onnx_outputs_dict = { 154 | onnx_output_names[i]: onnx_outputs[i] for i in range(len(onnx_output_names)) 155 | } 156 | onnx_outpus_all.append(deepcopy(onnx_outputs_dict)) 157 | return onnx_outpus_all 158 | 159 | 160 | import sys 161 | 162 | if len(sys.argv) > 2: 163 | onnx_path = sys.argv[1] 164 | trt_path = sys.argv[2] 165 | else: 166 | onnx_path = 
"/apdcephfs/private_howellyang/onnx2trt/Models_Fp16/RMTNet_release20220609_v2.opt.onnx" 167 | trt_path = "/apdcephfs/private_howellyang/onnx2trt/model.weight_quantized_v2.onnx" 168 | 169 | trt_outpus_all = infer_with_onnx(trt_path) 170 | onnx_outputs_all = infer_with_onnx(onnx_path) 171 | 172 | sims = {} 173 | diffs = {} 174 | for i in range(len(trt_outpus_all)): 175 | for output_name, _ in trt_outpus_all[i].items(): 176 | trt_output = np.reshape(trt_outpus_all[i][output_name], (1, -1)) 177 | trt_fp32_output = np.reshape(onnx_outputs_all[i][output_name], (1, -1)) 178 | cos_sim = cosine_similarity(trt_output, trt_fp32_output) 179 | abs_diff_mean = np.mean(np.abs(trt_output - trt_fp32_output)) 180 | if output_name not in sims: 181 | sims[output_name] = [] 182 | diffs[output_name] = [] 183 | sims[output_name].append(cos_sim.ravel()) 184 | diffs[output_name].append(abs_diff_mean.ravel()) 185 | # if cos_sim < 0.985: 186 | # print(output_name, cos_sim) 187 | # print(trt_fp32_output[0, :5]) 188 | # print(trt_output[0, :5]) 189 | 190 | print("===================") 191 | mean_sims = [] 192 | mean_diffs = [] 193 | for key, value in sims.items(): 194 | print(key, np.mean(value), np.min(value), np.mean(diffs[key]), np.max(diffs[key])) 195 | mean_sims.append(np.mean(value)) 196 | mean_diffs.append(np.mean(diffs[key])) 197 | print("average cosine sim = ", np.mean(mean_sims)) 198 | print("average dff abs = ", np.mean(mean_diffs)) -------------------------------------------------------------------------------- /quantization/compare_onnx_trt_v1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 #指定解释器 2 | # encoding:utf-8 3 | 4 | import sys 5 | 6 | print(sys.getdefaultencoding()) 7 | s = "中文乱码问题解决" 8 | print(s) 9 | 10 | # --------------------------------------------------------------- 11 | # 这个脚本向你展示了如何使用 tensorRT 对 PPQ 导出的模型进行推理 12 | 13 | # This script shows you how to export ppq internal graph to tensorRT 14 | # --------------------------------------------------------------- 15 | 16 | # For this inference test, all test data is randomly picked. 17 | # If you want to use real data, just rewrite the defination of SAMPLES 18 | print("开始import") 19 | import onnxruntime 20 | import torch 21 | from tqdm import tqdm 22 | import glob 23 | import cv2 24 | import numpy as np 25 | from torchvision import transforms 26 | from PIL import Image 27 | import os 28 | from sklearn.metrics.pairwise import cosine_similarity 29 | import onnx 30 | from copy import deepcopy 31 | 32 | def convert_any_to_numpy(x, accepet_none: bool = True) -> np.ndarray: 33 | if x is None and not accepet_none: 34 | raise ValueError("Trying to convert an empty value.") 35 | if isinstance(x, np.ndarray): 36 | return x 37 | elif isinstance(x, int) or isinstance(x, float): 38 | return np.array( 39 | [ 40 | x, 41 | ] 42 | ) 43 | elif isinstance(x, torch.Tensor): 44 | if x.numel() == 0 and accepet_none: 45 | return None 46 | if x.numel() == 0 and not accepet_none: 47 | raise ValueError("Trying to convert an empty value.") 48 | if x.numel() == 1: 49 | return convert_any_to_numpy(x.detach().cpu().item()) 50 | if x.numel() > 1: 51 | return x.detach().cpu().numpy() 52 | elif isinstance(x, list) or isinstance(x, tuple): 53 | return np.array(x) 54 | else: 55 | raise TypeError( 56 | f"input value {x}({type(x)}) can not be converted as numpy type." 
57 | ) 58 | 59 | def read_image(path): 60 | # 多任务模型 61 | _img_transforms = transforms.Compose( 62 | [ 63 | transforms.Resize((384, 768)), 64 | transforms.ToTensor(), 65 | transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), 66 | ] 67 | ) 68 | img = Image.open(path).convert("RGB") 69 | img_w, img_h = img.size[0], img.size[1] 70 | img = _img_transforms(img) 71 | img = img.unsqueeze(0) 72 | return img 73 | 74 | calibration_files = glob.glob( 75 | os.path.join("/mapai/howellyang/code/road-service/road_service/calib_images/", "*.jpg") 76 | )[:100] 77 | 78 | 79 | SAMPLES = [ 80 | read_image(path) for path in calibration_files 81 | ] # rewirte this to use real data. 82 | 83 | 84 | DEVICE = "cuda" 85 | FINETUNE = True 86 | EXECUTING_DEVICE = "cuda" 87 | REQUIRE_ANALYSE = True 88 | 89 | # ------------------------------------------------------------------- 90 | # 启动 tensorRT 进行推理,你先装一下 trt 91 | # ------------------------------------------------------------------- 92 | 93 | 94 | def infer_with_trt(trt_int8_path = ""): 95 | import tensorrt as trt 96 | import trt_infer 97 | trt.init_libnvinfer_plugins(None, "") 98 | samples = [convert_any_to_numpy(sample) for sample in SAMPLES] 99 | logger = trt.Logger(trt.Logger.INFO) 100 | with open(trt_int8_path, "rb") as f, trt.Runtime(logger) as runtime: 101 | engine = runtime.deserialize_cuda_engine(f.read()) 102 | 103 | trt_outpus_all = [] 104 | with engine.create_execution_context() as context: 105 | inputs, outputs, bindings, stream, trt_output_names = trt_infer.allocate_buffers( 106 | context.engine 107 | ) 108 | for k, sample in enumerate(tqdm(samples, desc="TensorRT is running...")): 109 | # trt infer 110 | inputs[0].host = convert_any_to_numpy(sample) 111 | trt_outputs_list = trt_infer.do_inference( 112 | context, 113 | bindings=bindings, 114 | inputs=inputs, 115 | outputs=outputs, 116 | stream=stream, 117 | batch_size=1, 118 | ) 119 | 120 | sample_base = os.path.basename(calibration_files[k]) 121 | # for i in range(len(trt_output_names)): 122 | # save_path = os.path.join("/mapai/howellyang/code/road-service/road_service/engine/mod_road_multi_tasks/outputs_trt", sample_base + "_{}.npy".format(i)) 123 | # np.save(save_path, trt_outputs_list[i]) 124 | 125 | trt_outputs_dict = { 126 | trt_output_names[i]: trt_outputs_list[i] for i in range(len(trt_output_names)) 127 | } 128 | trt_outpus_all.append(deepcopy(trt_outputs_dict)) 129 | return trt_outpus_all 130 | 131 | 132 | def infer_with_onnx(onnx_path = ""): 133 | options = onnxruntime.SessionOptions() 134 | options.intra_op_num_threads = 1 135 | options.inter_op_num_threads = 1 136 | sess = onnxruntime.InferenceSession( 137 | onnx_path, providers=["CUDAExecutionProvider", "CPUExecutionProvider"], sess_options=options 138 | ) 139 | input_name = sess.get_inputs()[0].name 140 | onnx_output_names = [output.name for output in sess.get_outputs()] 141 | samples = [convert_any_to_numpy(sample) for sample in SAMPLES] 142 | 143 | onnx_outpus_all = [] 144 | for k, sample in enumerate(tqdm(samples, desc="Onnx is running...")): 145 | onnx_outputs = sess.run(onnx_output_names, {input_name: sample}) 146 | 147 | sample_base = os.path.basename(calibration_files[k]) 148 | # for i in range(len(onnx_output_names)): 149 | # save_path = os.path.join("/mapai/howellyang/code/road-service/road_service/engine/mod_road_multi_tasks/outputs_onnx", sample_base + "_{}.npy".format(i)) 150 | # np.save(save_path, onnx_outputs[i]) 151 | 152 | onnx_outputs_dict = { 153 | onnx_output_names[i]: onnx_outputs[i] for i in 
range(len(onnx_output_names)) 154 | } 155 | onnx_outpus_all.append(deepcopy(onnx_outputs_dict)) 156 | return onnx_outpus_all 157 | 158 | 159 | 160 | import sys 161 | 162 | if len(sys.argv) > 2: 163 | onnx_path = sys.argv[1] 164 | trt_path = sys.argv[2] 165 | else: 166 | onnx_path = "/apdcephfs/private_howellyang/onnx2trt/Models_Fp16/RMTNet_release20220609_v2.opt.onnx" 167 | trt_path = "/apdcephfs/private_howellyang/onnx2trt/Models_Fp16/RMTNet_release20220609.fp16.trtmodel" 168 | 169 | trt_outpus_all = infer_with_trt(trt_path) 170 | onnx_outputs_all = infer_with_onnx(onnx_path) 171 | 172 | sims = {} 173 | diffs = {} 174 | for i in range(len(trt_outpus_all)): 175 | for output_name, _ in trt_outpus_all[i].items(): 176 | trt_output = np.reshape(trt_outpus_all[i][output_name], (1, -1)) 177 | trt_fp32_output = np.reshape(onnx_outputs_all[i][output_name], (1, -1)) 178 | cos_sim = cosine_similarity(trt_output, trt_fp32_output) 179 | abs_diff_mean = np.mean(np.abs(trt_output - trt_fp32_output)) 180 | if output_name not in sims: 181 | sims[output_name] = [] 182 | diffs[output_name] = [] 183 | sims[output_name].append(cos_sim.ravel()) 184 | diffs[output_name].append(abs_diff_mean.ravel()) 185 | # if cos_sim < 0.985: 186 | # print(output_name, cos_sim) 187 | # print(trt_fp32_output[0, :5]) 188 | # print(trt_output[0, :5]) 189 | 190 | print("===================") 191 | mean_sims = [] 192 | mean_diffs = [] 193 | for key, value in sims.items(): 194 | print(key, np.mean(value), np.min(value), np.mean(diffs[key]), np.max(diffs[key])) 195 | mean_sims.append(np.mean(value)) 196 | mean_diffs.append(np.mean(diffs[key])) 197 | print("average cosine sim = ", np.mean(mean_sims)) 198 | print("average dff abs = ", np.mean(mean_diffs)) 199 | -------------------------------------------------------------------------------- /quantization/ptq/ppq_optimize.py: -------------------------------------------------------------------------------- 1 | import onnx 2 | from onnx import numpy_helper 3 | import numpy as np 4 | import json 5 | import sys 6 | 7 | 8 | def get_post_nodes(onnx_model, tensor_name): 9 | post_nodes = [] 10 | for node in onnx_model.graph.node: 11 | for input_tensor in node.input: 12 | if input_tensor == tensor_name: 13 | post_nodes.append(node) 14 | break 15 | return post_nodes 16 | 17 | 18 | def remove_qdq(onnx_model, node): 19 | nodes_to_remove = [] 20 | assert node.op_type == "QuantizeLinear" 21 | nodes_to_remove.append(node) 22 | for dq_node in onnx_model.graph.node: 23 | if dq_node.input[0] == node.output[0]: 24 | assert dq_node.op_type == "DequantizeLinear" 25 | nodes_to_remove.append(dq_node) 26 | for post_node in onnx_model.graph.node: 27 | for i, input in enumerate(post_node.input): 28 | if input == dq_node.output[0]: 29 | post_node.input[i] = node.input[0] 30 | return nodes_to_remove 31 | 32 | 33 | def create_act_initializer_tensor( 34 | name, 35 | tensor_array, 36 | data_type=onnx.TensorProto.FLOAT, 37 | ): 38 | 39 | # (TensorProto) 40 | initializer_tensor = onnx.helper.make_tensor( 41 | name=name, 42 | data_type=data_type, 43 | dims=(), # [1], 44 | vals=[tensor_array], 45 | ) 46 | 47 | return initializer_tensor 48 | 49 | 50 | def add_act_dqd_node(qdq_model, tensor_name, scale): 51 | flag_found = False 52 | for node in qdq_model.graph.node: 53 | for j in range(len(node.input)): 54 | if node.input[j] == tensor_name: 55 | flag_found = True 56 | if not flag_found: 57 | return None 58 | 59 | quant_node_name = tensor_name + "_QuantizeLinear" 60 | dequant_node_name = tensor_name + "_DequantizeLinear" 61 
| q_input = tensor_name 62 | q_output = tensor_name + "_QuantizeLinear" 63 | dq_input = q_output 64 | dq_output = tensor_name + "_DequantizeLinear" 65 | 66 | scale_name = tensor_name + "_QuantizeScale" 67 | zp_name = tensor_name + "_QuantizeZp" 68 | qlinear_node = onnx.helper.make_node( 69 | "QuantizeLinear", 70 | [q_input, scale_name, zp_name], 71 | [q_output], 72 | quant_node_name, 73 | ) 74 | dequant_node = onnx.helper.make_node( 75 | "DequantizeLinear", 76 | [dq_input, scale_name, zp_name], 77 | [dq_output], 78 | dequant_node_name, 79 | ) 80 | 81 | for node in qdq_model.graph.node: 82 | for j in range(len(node.input)): 83 | if node.input[j] == tensor_name: 84 | node.input[j] = dq_output 85 | 86 | qdq_model.graph.node.extend([qlinear_node, dequant_node]) 87 | 88 | scale_initializer_tensor = create_act_initializer_tensor( 89 | name=scale_name, tensor_array=scale, data_type=onnx.TensorProto.FLOAT 90 | ) 91 | 92 | zp_initializer_tensor = create_act_initializer_tensor( 93 | name=zp_name, tensor_array=0, data_type=onnx.TensorProto.INT8 94 | ) 95 | 96 | qdq_model.graph.initializer.append(scale_initializer_tensor) 97 | qdq_model.graph.initializer.append(zp_initializer_tensor) 98 | return qdq_model 99 | 100 | # Step 01. Move QDQ forward 101 | int8_model_path = sys.argv[1] # onnx.load("/mapai/howellyang/code/onnx2trt/RMTNet_release20220609_mm2conv.optimized.rm_inits.onnx.model_int8.onnx") 102 | int8_model = onnx.load(int8_model_path) 103 | weight_name2tensor = {} 104 | for weight in int8_model.graph.initializer: 105 | weight_name2tensor[weight.name] = weight 106 | 107 | nodes_to_remove = [] 108 | scale_map = {} 109 | scale_map_final = {} 110 | for node in int8_model.graph.node: 111 | output_tensor = node.output[0] 112 | post_nodes = get_post_nodes(int8_model, output_tensor) 113 | 114 | QDQ_count = 0 115 | for post_node in post_nodes: 116 | if post_node.op_type in ["QuantizeLinear"]: 117 | QDQ_count += 1 118 | 119 | # 第一种情况: 存在QDQ, 但是与后续节点个数不同 120 | # 第二种情况: 存在多于1个QDQ 121 | if node.op_type not in ["Concat"] and (QDQ_count > 0 and QDQ_count != len(post_nodes)) or QDQ_count > 1: 122 | scale_values = [] 123 | for post_node in post_nodes: 124 | if post_node.op_type in ["QuantizeLinear"]: 125 | scale_name = post_node.input[1] 126 | scale_tensor = weight_name2tensor[scale_name] 127 | scale_value = numpy_helper.to_array(scale_tensor) # out_ch, in_ch, ker, ker 128 | scale_values.append(float(scale_value.ravel())) 129 | nodes_to_remove.extend(remove_qdq(int8_model, post_node)) 130 | print(node.name, QDQ_count, len(post_nodes), scale_values) 131 | scale_map[node.output[0]] = np.mean(scale_values) 132 | elif QDQ_count == 1 and len(post_nodes) == 1: 133 | scale_values = [] 134 | for post_node in post_nodes: 135 | if post_node.op_type in ["QuantizeLinear"]: 136 | scale_name = post_node.input[1] 137 | scale_tensor = weight_name2tensor[scale_name] 138 | scale_value = numpy_helper.to_array(scale_tensor) # out_ch, in_ch, ker, ker 139 | scale_values.append(float(scale_value.ravel())) 140 | assert len(scale_values) == 1 141 | scale_map_final[node.output[0]] = np.mean(scale_values) 142 | 143 | for node in nodes_to_remove: 144 | int8_model.graph.node.remove(node) 145 | 146 | for tensor_name, scale in scale_map.items(): 147 | add_act_dqd_node(int8_model, tensor_name, scale) 148 | 149 | onnx.save(int8_model, int8_model_path + ".opt_step1.onnx") 150 | 151 | scale_map_final.update(scale_map) 152 | with open(int8_model_path + ".opt_step1.scale_map.json", "w") as fw: 153 | json.dump(scale_map_final, fw, indent=4) 154 | 155 
156 | # Step 02. Add QDQ nodes to the model from the TensorRT calibration cache
157 | def read_calib_cache(calib_cache):
158 |     import struct
159 |     scale_map = {}
160 |     with open(calib_cache) as fr:
161 |         for line in fr.readlines()[1:]:
162 |             print(line.strip())
163 |             name, value = line.strip().split(": ")
164 |             name = name.strip(":")
165 |             value = value.strip(":")
166 |             if value.strip() == "0":
167 |                 val = 0.0
168 |             else:
169 |                 val = struct.unpack("!f", bytes.fromhex(value.strip()))[0]
170 |             scale_map[name] = val
171 |     scale_map = {k: scale_map[k] for k in sorted(scale_map)}
172 |     return scale_map
173 | 
174 | 
175 | # int8_model = onnx.load(
176 | #     "/mapai/howellyang/code/onnx2trt/RMTNet_release20220609_mm2conv.optimized.rm_inits.onnx.model_int8.opt_step1.onnx")
177 | 
178 | weight_name2tensor = {}
179 | for weight in int8_model.graph.initializer:
180 |     weight_name2tensor[weight.name] = weight
181 | 
182 | QDQ_scales = {}
183 | for node in int8_model.graph.node:
184 |     if node.op_type in ["QuantizeLinear"]:
185 |         scale_name = node.input[1]
186 |         scale_tensor = weight_name2tensor[scale_name]
187 |         scale_value = numpy_helper.to_array(
188 |             scale_tensor)  # per-channel (weight) scales are skipped below
189 |         if np.size(scale_value) > 1:
190 |             continue
191 |         scale_value = float(scale_value.ravel())
192 |         QDQ_scales[node.input[0]] = scale_value
193 | 
194 | calib_cache = "/mapai/howellyang/code/onnx2trt/RMTNet_release20220609_mm2conv.optimized.rm_inits.trt_int8_with_1687pics_calib_percentile595.calib_cache"
195 | full_scale_map = read_calib_cache(calib_cache)
196 | 
197 | for tensor_name, scale in full_scale_map.items():
198 |     if tensor_name in QDQ_scales:
199 |         print(tensor_name, scale, QDQ_scales[tensor_name])
200 |     else:
201 |         print(tensor_name, "has no QDQ yet, adding one from the calib cache")
202 |         scale = max(scale, 1e-8)
203 |         add_act_dqd_node(int8_model, tensor_name, scale)
204 | 
205 | 
206 | onnx.save(int8_model, int8_model_path + ".opt_step2.onnx")
207 | 
208 | # print(QDQ_scales)
209 | # print(full_scale_map)
210 | 
--------------------------------------------------------------------------------
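For reference, read_calib_cache above parses TensorRT's text calibration cache: one header line, then one "tensor_name: hexvalue" entry per tensor, where the hex value is a big-endian float32 scale. Below is a minimal sketch of the inverse direction, writing a scale map back out in the same format; write_calib_cache is a hypothetical helper and the header string is only a placeholder (real caches carry the TensorRT version and calibrator name):

import struct

def write_calib_cache(scale_map, path, header="TRT-0000-EntropyCalibration2"):
    # Mirror the struct.unpack("!f", bytes.fromhex(...)) decoding used by
    # read_calib_cache: encode each scale as a big-endian float32 hex string.
    with open(path, "w") as fw:
        fw.write(header + "\n")
        for name, scale in sorted(scale_map.items()):
            fw.write("%s: %s\n" % (name, struct.pack("!f", float(scale)).hex()))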
/quantization/qat/ppq_optimize.py:
--------------------------------------------------------------------------------
1 | import onnx
2 | from onnx import numpy_helper
3 | import numpy as np
4 | import json
5 | import sys
6 | 
7 | 
8 | def get_post_nodes(onnx_model, tensor_name):
9 |     post_nodes = []
10 |     for node in onnx_model.graph.node:
11 |         for input_tensor in node.input:
12 |             if input_tensor == tensor_name:
13 |                 post_nodes.append(node)
14 |                 break
15 |     return post_nodes
16 | 
17 | 
18 | def remove_qdq(onnx_model, node):
19 |     nodes_to_remove = []
20 |     assert node.op_type == "QuantizeLinear"
21 |     nodes_to_remove.append(node)
22 |     for dq_node in onnx_model.graph.node:
23 |         if dq_node.input[0] == node.output[0]:
24 |             assert dq_node.op_type == "DequantizeLinear"
25 |             nodes_to_remove.append(dq_node)
26 |             for post_node in onnx_model.graph.node:
27 |                 for i, input in enumerate(post_node.input):
28 |                     if input == dq_node.output[0]:
29 |                         post_node.input[i] = node.input[0]
30 |     return nodes_to_remove
31 | 
32 | 
33 | def create_act_initializer_tensor(
34 |     name,
35 |     tensor_array,
36 |     data_type=onnx.TensorProto.FLOAT,
37 | ):
38 | 
39 |     # scalar initializer (TensorProto) for a scale or zero-point
40 |     initializer_tensor = onnx.helper.make_tensor(
41 |         name=name,
42 |         data_type=data_type,
43 |         dims=(),  # [1],
44 |         vals=[tensor_array],
45 |     )
46 | 
47 |     return initializer_tensor
48 | 
49 | 
50 | def add_act_dqd_node(qdq_model, tensor_name, scale):
51 |     flag_found = False
52 |     for node in qdq_model.graph.node:
53 |         for j in range(len(node.input)):
54 |             if node.input[j] == tensor_name:
55 |                 flag_found = True
56 |     if not flag_found:
57 |         return None
58 | 
59 |     quant_node_name = tensor_name + "_QuantizeLinear"
60 |     dequant_node_name = tensor_name + "_DequantizeLinear"
61 |     q_input = tensor_name
62 |     q_output = tensor_name + "_QuantizeLinear"
63 |     dq_input = q_output
64 |     dq_output = tensor_name + "_DequantizeLinear"
65 | 
66 |     scale_name = tensor_name + "_QuantizeScale"
67 |     zp_name = tensor_name + "_QuantizeZp"
68 |     qlinear_node = onnx.helper.make_node(
69 |         "QuantizeLinear",
70 |         [q_input, scale_name, zp_name],
71 |         [q_output],
72 |         quant_node_name,
73 |     )
74 |     dequant_node = onnx.helper.make_node(
75 |         "DequantizeLinear",
76 |         [dq_input, scale_name, zp_name],
77 |         [dq_output],
78 |         dequant_node_name,
79 |     )
80 | 
81 |     for node in qdq_model.graph.node:
82 |         for j in range(len(node.input)):
83 |             if node.input[j] == tensor_name:
84 |                 node.input[j] = dq_output
85 | 
86 |     qdq_model.graph.node.extend([qlinear_node, dequant_node])
87 | 
88 |     scale_initializer_tensor = create_act_initializer_tensor(
89 |         name=scale_name, tensor_array=scale, data_type=onnx.TensorProto.FLOAT
90 |     )
91 | 
92 |     zp_initializer_tensor = create_act_initializer_tensor(
93 |         name=zp_name, tensor_array=0, data_type=onnx.TensorProto.INT8
94 |     )
95 | 
96 |     qdq_model.graph.initializer.append(scale_initializer_tensor)
97 |     qdq_model.graph.initializer.append(zp_initializer_tensor)
98 |     return qdq_model
99 | 
100 | # Step 01. Move QDQ forward: merge the per-branch QDQ pairs of a tensor into one
101 | int8_model_path = sys.argv[1]  # onnx.load("/mapai/howellyang/code/onnx2trt/RMTNet_release20220609_mm2conv.optimized.rm_inits.onnx.model_int8.onnx")
102 | int8_model = onnx.load(int8_model_path)
103 | weight_name2tensor = {}
104 | for weight in int8_model.graph.initializer:
105 |     weight_name2tensor[weight.name] = weight
106 | 
107 | nodes_to_remove = []
108 | scale_map = {}
109 | scale_map_final = {}
110 | for node in int8_model.graph.node:
111 |     output_tensor = node.output[0]
112 |     post_nodes = get_post_nodes(int8_model, output_tensor)
113 | 
114 |     QDQ_count = 0
115 |     for post_node in post_nodes:
116 |         if post_node.op_type in ["QuantizeLinear"]:
117 |             QDQ_count += 1
118 | 
119 |     # Case 1: QDQ consumers exist, but their count differs from the number of downstream nodes
120 |     # Case 2: more than one QDQ consumer exists
121 |     if (node.op_type not in ["Concat"] and QDQ_count > 0 and QDQ_count != len(post_nodes)) or QDQ_count > 1:
122 |         scale_values = []
123 |         for post_node in post_nodes:
124 |             if post_node.op_type in ["QuantizeLinear"]:
125 |                 scale_name = post_node.input[1]
126 |                 scale_tensor = weight_name2tensor[scale_name]
127 |                 scale_value = numpy_helper.to_array(scale_tensor)  # per-tensor activation scale (scalar)
128 |                 scale_values.append(float(scale_value.ravel()))
129 |                 nodes_to_remove.extend(remove_qdq(int8_model, post_node))
130 |         print(node.name, QDQ_count, len(post_nodes), scale_values)
131 |         scale_map[node.output[0]] = np.mean(scale_values)  # one shared scale: the mean of the per-branch scales
132 |     elif QDQ_count == 1 and len(post_nodes) == 1:
133 |         scale_values = []
134 |         for post_node in post_nodes:
135 |             if post_node.op_type in ["QuantizeLinear"]:
136 |                 scale_name = post_node.input[1]
137 |                 scale_tensor = weight_name2tensor[scale_name]
138 |                 scale_value = numpy_helper.to_array(scale_tensor)  # per-tensor activation scale (scalar)
139 |                 scale_values.append(float(scale_value.ravel()))
140 |         assert len(scale_values) == 1
141 |         scale_map_final[node.output[0]] = np.mean(scale_values)
142 | 
143 | for node in nodes_to_remove:
144 |     int8_model.graph.node.remove(node)
145 | 
146 | for tensor_name, scale in scale_map.items():
147 |     add_act_dqd_node(int8_model, tensor_name, scale)
148 | 
149 | onnx.save(int8_model, int8_model_path + ".opt_step1.onnx")
150 | 
151 | scale_map_final.update(scale_map)
152 | with open(int8_model_path + ".opt_step1.scale_map.json", "w") as fw:
153 |     json.dump(scale_map_final, fw, indent=4)
154 | 
155 | 
156 | # Step 02. Add QDQ nodes to the model from the TensorRT calibration cache
157 | def read_calib_cache(calib_cache):
158 |     import struct
159 |     scale_map = {}
160 |     with open(calib_cache) as fr:
161 |         for line in fr.readlines()[1:]:
162 |             print(line.strip())
163 |             name, value = line.strip().split(": ")
164 |             name = name.strip(":")
165 |             value = value.strip(":")
166 |             if value.strip() == "0":
167 |                 val = 0.0
168 |             else:
169 |                 val = struct.unpack("!f", bytes.fromhex(value.strip()))[0]
170 |             scale_map[name] = val
171 |     scale_map = {k: scale_map[k] for k in sorted(scale_map)}
172 |     return scale_map
173 | 
174 | 
175 | # int8_model = onnx.load(
176 | #     "/mapai/howellyang/code/onnx2trt/RMTNet_release20220609_mm2conv.optimized.rm_inits.onnx.model_int8.opt_step1.onnx")
177 | 
178 | weight_name2tensor = {}
179 | for weight in int8_model.graph.initializer:
180 |     weight_name2tensor[weight.name] = weight
181 | 
182 | QDQ_scales = {}
183 | for node in int8_model.graph.node:
184 |     if node.op_type in ["QuantizeLinear"]:
185 |         scale_name = node.input[1]
186 |         scale_tensor = weight_name2tensor[scale_name]
187 |         scale_value = numpy_helper.to_array(
188 |             scale_tensor)  # per-channel (weight) scales are skipped below
189 |         if np.size(scale_value) > 1:
190 |             continue
191 |         scale_value = float(scale_value.ravel())
192 |         QDQ_scales[node.input[0]] = scale_value
193 | 
194 | calib_cache = "/mapai/howellyang/code/onnx2trt/RMTNet_release20220609_mm2conv.optimized.rm_inits.trt_int8_with_1687pics_calib_percentile595.calib_cache"
195 | full_scale_map = read_calib_cache(calib_cache)
196 | 
197 | for tensor_name, scale in full_scale_map.items():
198 |     if tensor_name in QDQ_scales:
199 |         print(tensor_name, scale, QDQ_scales[tensor_name])
200 |     else:
201 |         print(tensor_name, "has no QDQ yet, adding one from the calib cache")
202 |         scale = max(scale, 1e-8)
203 |         add_act_dqd_node(int8_model, tensor_name, scale)
204 | 
205 | 
206 | onnx.save(int8_model, int8_model_path + ".opt_step2.onnx")
207 | 
208 | # print(QDQ_scales)
209 | # print(full_scale_map)
210 | 
--------------------------------------------------------------------------------
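A usage sketch for ppq_optimize.py above: running, for example, python quantization/qat/ppq_optimize.py model_int8.onnx writes model_int8.onnx.opt_step1.onnx, model_int8.onnx.opt_step1.scale_map.json and model_int8.onnx.opt_step2.onnx next to the input; the calib_cache path in Step 02 is hardcoded and has to be edited first. A quick way to inspect the result is sketched below; the file name is an example only:

import onnx

m = onnx.load("model_int8.onnx.opt_step2.onnx")  # example output of Step 02
num_q = sum(1 for n in m.graph.node if n.op_type == "QuantizeLinear")
num_dq = sum(1 for n in m.graph.node if n.op_type == "DequantizeLinear")
# Each QDQ pair inserted by add_act_dqd_node contributes one of each.
print("QuantizeLinear nodes:", num_q, "DequantizeLinear nodes:", num_dq)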