├── quantization ├── __init__.py ├── ptq │ ├── channel_wise_correction.py │ ├── onnx2trt_step04_ppq_training.py │ ├── onnx2trt_step01_merge_mean_scale.py │ ├── onnx2trt_step05_pdq_optimize.py │ ├── onnx2trt_step03_weight_equalization.py │ ├── onnx2trt_step02_remove_output_sigmoid.py │ ├── onnx2trt_step06_extra_pdq_init.py │ ├── onnx2trt_step07_extra_pdq_training.py │ ├── remove_inititalizers_from_inputs.py │ ├── filter_cache.py │ ├── trt_utils.py │ ├── remove_edge_logits_QDQ.py │ ├── data_loader.py │ ├── create_json_inputs.py │ ├── quantization_filter.py │ ├── P01_MT_onnx2tensorRT_int8.py │ └── ppq_optimize.py ├── README.md ├── ppq_optimize.py ├── calib_cache2json.py ├── calib_json2calib.py ├── onnx_optimize.py ├── fiter_scales.py ├── onnx_change_resize_mode.py ├── calib_merge_json2calib.py ├── calib_filter_Sigmoid.py ├── find_nodes_onnx.py ├── onnx_remove_split_qdq.py ├── json_filter.py ├── trt_calibrator.py ├── onnx_export_v2.py ├── onnx_export.py ├── onnx_move_qdq_relu_forward.py ├── onnx_remove_dup_qdqs.py ├── compare_trt_trt.py ├── onnx2trt_lsq.py ├── C03_compare_trt_fp32_int8.py ├── C02_compare_trt_fp32_int8.py ├── P02_MT_onnx2tensorRT_int8.py ├── onnx2tensorRT_adaround.py ├── P01_MT_onnx2tensorRT_int8.py ├── P01_MT_onnx2tensorRT_int8_sample.py ├── compare_onnx_trt_v3.py ├── onnx_calibrator.py ├── compare_onnx_onnx_v2.py ├── onnx2trt.py ├── P03_MT_onnx2tensorRT_int8.py ├── compare_onnx_onnx_v3.py ├── compare_onnx_trt_v1.py └── qat │ └── ppq_optimize.py ├── onnx_optimize ├── README.md ├── __init__.py ├── step05_fuse_repconvs.py ├── step04_extract_sub_graph.py ├── step01_export_torch_to_onnx.py ├── step02_onnx_simplify.py └── step03_fuse_normalize_to_conv.py ├── .vscode └── settings.json ├── prune ├── README_dev.md └── README.md ├── 01.install.md ├── README.md ├── trt_calibrator.py ├── onnx2trt.py └── onnx_calibrator.py /quantization/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /quantization/ptq/channel_wise_correction.py: -------------------------------------------------------------------------------- 1 | # 调整 -------------------------------------------------------------------------------- /quantization/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # 量化流程 4 | 5 | 6 | 7 | 1. 
简单量化 -------------------------------------------------------------------------------- /quantization/ppq_optimize.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Step 01- 如果当前节点有多个后续节点,那么将QDQ移动到前面 -------------------------------------------------------------------------------- /quantization/ptq/onnx2trt_step04_ppq_training.py: -------------------------------------------------------------------------------- 1 | # 进行PPQ训练,获取好的结果 2 | 3 | -------------------------------------------------------------------------------- /quantization/ptq/onnx2trt_step01_merge_mean_scale.py: -------------------------------------------------------------------------------- 1 | # 将Step01的结果合入第一层Conv/或者强制添加一层1x1卷积 -------------------------------------------------------------------------------- /quantization/ptq/onnx2trt_step05_pdq_optimize.py: -------------------------------------------------------------------------------- 1 | # 对PPQ训练的模型,进行QDQ节点位置的调整优化 2 | 3 | -------------------------------------------------------------------------------- /quantization/ptq/onnx2trt_step03_weight_equalization.py: -------------------------------------------------------------------------------- 1 | # 连续的两个卷积层,进行输出部分的channel平衡(需要有输入数据) -------------------------------------------------------------------------------- /quantization/ptq/onnx2trt_step02_remove_output_sigmoid.py: -------------------------------------------------------------------------------- 1 | # 移除输出部分的sigmoid, reshape, transpose等结构 -------------------------------------------------------------------------------- /quantization/ptq/onnx2trt_step06_extra_pdq_init.py: -------------------------------------------------------------------------------- 1 | # 对PPQ训练的模型,添加部分QDQ节点,进一步优化QDQ模型的效果 2 | 3 | -------------------------------------------------------------------------------- /quantization/ptq/onnx2trt_step07_extra_pdq_training.py: -------------------------------------------------------------------------------- 1 | # 对PPQ训练的模型,添加部分QDQ节点,进一步优化QDQ模型的效果 2 | 3 | -------------------------------------------------------------------------------- /onnx_optimize/README.md: -------------------------------------------------------------------------------- 1 | ## onnx模型结构优化 2 | 3 | 4 | # 1 通用优化 5 | 6 | 7 | 8 | 9 | # 2. conv-bn merge 10 | 11 | 12 | 13 | 14 | # 3. 
normalize-conv merge -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "workbench.colorCustomizations": { 3 | "editor.lineHighlightBackground": "#1073cf2d", 4 | "editor.lineHighlightBorder": "#9fced11f", 5 | "activityBar.background": "#58024F", 6 | "titleBar.activeBackground": "#7B036E", 7 | "titleBar.activeForeground": "#FFF8FE" 8 | }, 9 | "python.formatting.provider": "black" 10 | } -------------------------------------------------------------------------------- /onnx_optimize/__init__.py: -------------------------------------------------------------------------------- 1 | from step01_export_torch_to_onnx import export_torch_to_onnx 2 | from step02_onnx_simplify import optimize_onnx_model 3 | from step03_fuse_normalize_to_conv import fuse_normalize_to_conv 4 | from step04_extract_sub_graph import extract_sub_graph 5 | 6 | __all__ = ["export_torch_to_onnx", 7 | "optimize_onnx_model", 8 | "fuse_normalize_to_conv", 9 | "extract_sub_graph", 10 | ] 11 | -------------------------------------------------------------------------------- /onnx_optimize/step05_fuse_repconvs.py: -------------------------------------------------------------------------------- 1 | import onnx 2 | 3 | 4 | def find_common_input(onnx_model, node): 5 | pass 6 | 7 | def fuse_repconvs(onnx_model): 8 | # Step 01: find rep convs 9 | for node in onnx_model.graph.node: 10 | if node.op_type == "Add": 11 | find_common_input(onnx_model, node) 12 | 13 | 14 | # step 02: merge rep conv weights 15 | 16 | 17 | # step 03: remove extra conv and bn nodes -------------------------------------------------------------------------------- /onnx_optimize/step04_extract_sub_graph.py: -------------------------------------------------------------------------------- 1 | import onnx 2 | 3 | 4 | def extract_sub_graph(input_path, output_path, input_names=None, output_names=None): 5 | onnx.utils.extract_model(input_path, output_path, input_names, output_names) 6 | 7 | 8 | if __name__ == "__main__": 9 | import sys 10 | 11 | input_path = sys.argv[1] 12 | output_path = sys.argv[2] 13 | input_names = ["input.1"] 14 | output_names = ["1080"] 15 | extract_sub_graph(input_path, output_path, input_names, output_names) 16 | -------------------------------------------------------------------------------- /quantization/calib_cache2json.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import struct 3 | import json 4 | 5 | 6 | scale_map = {} 7 | with open(sys.argv[1]) as fr: 8 | for line in fr.readlines()[1:]: 9 | name, value = line.strip().split(":") 10 | if value.strip() == "0": 11 | val = 0.0 12 | else: 13 | val = struct.unpack("!f", bytes.fromhex(value.strip()))[0] 14 | 15 | scale_map[name] = val 16 | 17 | scale_map = {k: scale_map[k] for k in sorted(scale_map)} 18 | 19 | with open(".".join(sys.argv[1].split(".")[:-1]) + ".json", "w") as fw: 20 | json.dump(scale_map, fw, indent=4) 21 | -------------------------------------------------------------------------------- /quantization/calib_json2calib.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import struct 3 | import json 4 | 5 | 6 | with open(sys.argv[1]) as fr: 7 | scale_map = json.load(fr) 8 | 9 | scale_map = {k: scale_map[k] for k in sorted(scale_map)} 10 | with open(".".join(sys.argv[1].split(".")[:-1]) + ".cache", "w") as file: 11 | 
file.write("TRT-8400-EntropyCalibration2\n") 12 | for key in sorted(scale_map.keys()): 13 | value = scale_map[key] 14 | scale = float(value) 15 | scale_hex = hex(struct.unpack(" 0.5: 18 | print(name, val) 19 | else: 20 | lines.append(line) 21 | 22 | with open(calib_cache + "_filter_scale05.calib_cache", "w") as fw: 23 | fw.writelines(lines) -------------------------------------------------------------------------------- /quantization/onnx_change_resize_mode.py: -------------------------------------------------------------------------------- 1 | import onnx 2 | import sys 3 | import onnxoptimizer 4 | from onnx import helper, shape_inference 5 | from onnxsim import simplify 6 | from onnx import numpy_helper 7 | import numpy as np 8 | 9 | onnx_model = onnx.load(sys.argv[1]) 10 | 11 | for node in onnx_model.graph.node: 12 | if node.op_type == "Resize": 13 | for attr in node.attribute: 14 | if (attr.name == "coordinate_transformation_mode"): 15 | attr.s = "half_pixel".encode("UTF-8") 16 | elif attr.name == "mode": 17 | attr.s = "linear".encode("UTF-8") 18 | elif attr.name == "nearest_mode": 19 | attr.s = "round_prefer_floor".encode("UTF-8") 20 | 21 | model_opt = onnxoptimizer.optimize(onnx_model) 22 | # model_simp, check = simplify(model_opt) 23 | model_simp = shape_inference.infer_shapes(model_opt) 24 | onnx.save(model_simp, sys.argv[2]) 25 | -------------------------------------------------------------------------------- /prune/README_dev.md: -------------------------------------------------------------------------------- 1 | 1. 剪枝概述 2 | 3 | 4 | 5 | 6 | 2. 常用工具 7 | 8 | - nni: 基于torch.fx,在某些复杂网络如nanodet上失效; 9 | - pytorch原生工具: 难以获取网络的拓扑结构,对于包含Concat等结构的网络剪枝困难; 10 | - torch.fx工具虽然提供了相关的功能,但是对于其中的shape计算、特殊算子等支持并不友好(其实是我太懒,不想新学一种表示) 11 | - 更简单的办法是,通过onnx作为中间层,使用onnx simpliier工具优化后,读取模型的拓扑结构; 12 | - 根据onnx模型中的拓扑结构,来进行结构化的剪枝;保存模型权重,输出模型结构; 13 | - 最后进行模型的finetune,输出最终的模型; 14 | - 问题: onnx会修改权重的名称,无法根据名称找到对应关系; 15 | - 解决方案: 16 | - 通过计算相似度等方式找到权重对应关系 ----> 设置training为True后,权重名称也得以与state_dict中一致 17 | - 通过设置export的training=True来避免Conv-BN融合 18 | - 问题: 对于Add、Concat、Channel Shuffle结构,需要进行针对性的识别和处理 ----> 复杂度较高 19 | - 解决方案: 20 | - 使用torch的hook机制,对特定类型的节点进行hook,从而获得每个op的mask 21 | - 然后根据mask的结果,对权重进行prune(整个流程可能跟nni中的torch.fx是一致的) 22 | - Add、Concat模块没办法直接进行Hook,需要使用自定义的module实现后才能完成hook功能 23 | - 为了便于后续的研究,将采用复杂的方案来进行剪枝;而不是每个模块设置一个prune函数 24 | 25 | 3. 剪枝工具开发 26 | - shufflenet剪枝: channel shuffle 27 | - FPN剪枝: Add层的处理 28 | 29 | 30 | 4. 
实验记录 31 | 32 | - 方案一: 直接根据BN层的scale参数的l1-norm大小来剪枝 33 | - 方案二: 根据当前层的输出的l1-norm大小来剪枝 34 | - -------------------------------------------------------------------------------- /quantization/ptq/remove_inititalizers_from_inputs.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import onnx 4 | 5 | 6 | def get_args(): 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument("--input", required=True, help="input model") 9 | parser.add_argument("--output", required=True, help="output model") 10 | args = parser.parse_args() 11 | return args 12 | 13 | 14 | def remove_initializer_from_input(): 15 | args = get_args() 16 | 17 | model = onnx.load(args.input) 18 | if model.ir_version < 4: 19 | print("Model with ir_version below 4 requires to include initilizer in graph input") 20 | return 21 | 22 | inputs = model.graph.input 23 | name_to_input = {} 24 | for input in inputs: 25 | name_to_input[input.name] = input 26 | 27 | for initializer in model.graph.initializer: 28 | if initializer.name in name_to_input: 29 | inputs.remove(name_to_input[initializer.name]) 30 | 31 | onnx.save(model, args.output) 32 | 33 | 34 | if __name__ == "__main__": 35 | remove_initializer_from_input() -------------------------------------------------------------------------------- /quantization/ptq/filter_cache.py: -------------------------------------------------------------------------------- 1 | from trt_utils import read_calib_cache 2 | from trt_utils import write_cache_to_json 3 | import onnx 4 | 5 | 6 | # 过滤掉不输入onnx节点的量化值,以及数字过大的量化值 7 | onnx_model = onnx.load("/mapai/howellyang/code/onnx2trt/RMTNet_release20220609_mm2conv.optimized.rm_inits.onnx") 8 | onnx_output_names = [] 9 | for node in onnx_model.graph.node: 10 | for o in node.output: 11 | onnx_output_names.append(o) 12 | 13 | scale_map = read_calib_cache("/mapai/howellyang/code/onnx2trt/RMTNet_release20220609_mm2conv.optimized.trt_int8_with_531pics_calib_percentile595.calib_cache") 14 | calib_cache = "/mapai/howellyang/code/onnx2trt/RMTNet_release20220609_mm2conv.optimized.trt_int8_with_531pics_calib_percentile595.filtered.calib_cache" 15 | 16 | scale_map = {k: scale_map[k] for k in sorted(scale_map)} 17 | with open(calib_cache, "w") as file: 18 | file.write("TRT-8400-EntropyCalibration2\n") 19 | for key in sorted(scale_map.keys()): 20 | value = scale_map[key] 21 | scale = float(value) 22 | scale_hex = hex(struct.unpack(" 转换容易失败 23 | same_name_map = {} 24 | for value, names in same_value_map.items(): 25 | if len(names) > 1: 26 | print(names) 27 | 28 | exit(0) 29 | 30 | scale_map_etp.update(scale_map_etp) 31 | 32 | 33 | with open(sys.argv[3], "w") as file: 34 | file.write("TRT-8400-EntropyCalibration2\n") 35 | for key in sorted(scale_map_etp.keys()): 36 | scale = scale_map_etp[key] 37 | # if scale > 0.5: 38 | # print("scale过大, 建议不量化:", key, scale, 128.0 * scale) 39 | # continue 40 | if len(key) > 5: 41 | print(key, scale) 42 | continue 43 | scale_hex = hex(struct.unpack("
6 | 7 | ### 1. 问题描述 8 | 9 | 常见的剪枝工具没有处理特殊结构的能力: 10 | - depthwise convolution: 普通卷积的权重的shape是[Co, Ci, Kw, Kh], 深度分离卷积的权重shape是[Co, 1, Kw, Kh]; 进行channel-wise剪枝时,需要根据前置节点的channel,决定当前节点的channel选择,并且决定了后置节点的输入channel的选择; 11 | 12 | - channel shuffle: 通道shuffle后的卷积的输入权重,剪枝通道要与通道shuffle前的通道对应上;需要识别不同卷积层之间的通道对应关系; 13 | 14 | - Add结构: 输入的多个卷积层,其剪枝的channel需要保持一致,否则将会出现Add的channel之间的不对应的问题; 15 | 16 | - Slice结构: slice前整个feature map的有效channel数是原模型的1/2,但是slice之后的两个feature map中有效channel数不是原来的1/2了;导致模型结构不符合预期; 17 | 18 |

19 | ### 2. 解决方案 20 | 21 | 放弃的解决方案: 22 | - 使用已有的nni等工具: 23 | - 基于torch.fx来识别不同模块之间的关联关系;然后开发对应的剪枝功能; 24 | - torch.fx会转换所有的表达式,粒度很细;学习成本有点高; 25 | - 使用onnx来获取拓扑结构: 26 | - onnx能够很简单的识别模块间的关联关系 27 | - onnx对于channel间的转换的识别能力不够(onnx无法单独设置每一层的输入) 28 | 29 | 30 | 最终选择的解决方案: 31 | - 使用torch的hook机制,自行构建mask和剪枝流程: 32 | - 首先,使用hook获取模块的id,用于后续构架关联关系 33 | - 其次,识别需要联合剪枝的模块(在这里主要是指Add) 34 | - 然后,构建剪枝用到的mask 35 | - 最后,对权重进行修改 36 | 37 | 38 | 特殊要素的处理: 39 | - depthwise convolution: 根据输入来决定输出channel的选择; 40 | - channel shuffle: 通过输出置零来实现需要剪枝的channel之间的传递; 41 | - add: 记录add相关的模块id,计算权重的均值来剪枝 42 | - slice: 针对shufflenet的结构,将channel划分为4组,分别进行channel选择; 43 | 44 |
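A minimal sketch of the hook/mask flow described above, assuming an L1 (|gamma|) criterion on BatchNorm scales; the function names and the pruning ratio are illustrative placeholders, not this repo's actual API:

```
import torch
import torch.nn as nn


def collect_bn_masks(model: nn.Module, prune_ratio: float = 0.5):
    """For every BatchNorm2d, keep the channels with the largest |gamma|.

    Masks are keyed by id(module) so that modules feeding the same Add node
    can later be forced to share a single mask.
    """
    masks = {}
    for m in model.modules():
        if isinstance(m, nn.BatchNorm2d):
            gamma = m.weight.detach().abs()
            n_keep = max(1, int(gamma.numel() * (1.0 - prune_ratio)))
            keep_idx = torch.topk(gamma, n_keep).indices
            mask = torch.zeros_like(gamma, dtype=torch.bool)
            mask[keep_idx] = True
            masks[id(m)] = mask
    return masks


def unify_masks_for_add(masks, add_input_module_ids):
    """Modules whose outputs are summed by an Add must be pruned consistently;
    here the individual masks are simply OR-ed into one shared mask."""
    merged = None
    for mid in add_input_module_ids:
        merged = masks[mid] if merged is None else merged | masks[mid]
    for mid in add_input_module_ids:
        masks[mid] = merged
    return masks
```

In the actual flow, the ids of the Conv/BN modules feeding each Add (or Concat/Slice) would first be collected with forward hooks, the masks unified as above, and the pruned model fine-tuned afterwards.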

45 | 46 | ### 3. 实际效果 47 | 48 | 剪枝工具开发 49 | - shufflenet剪枝: channel shuffle 50 | - FPN剪枝: Add层的处理 -------------------------------------------------------------------------------- /quantization/calib_filter_Sigmoid.py: -------------------------------------------------------------------------------- 1 | # 过滤掉calib_cache中的sigmoid层 2 | import sys 3 | import onnx 4 | 5 | onnx_model = onnx.load(sys.argv[1]) 6 | 7 | 8 | # Sigmoid\HardSigmoid 9 | sigmoid_inputs = [] 10 | sigmoid_outputs = [] 11 | 12 | add_inputs = [] 13 | add_outputs = [] 14 | 15 | for node in onnx_model.graph.node: 16 | if node.op_type in ["HardSigmoid", "Sigmoid"]: 17 | input_name = node.input[0] 18 | sigmoid_inputs.append(input_name) 19 | output_name = node.output[0] 20 | sigmoid_outputs.append(output_name) 21 | elif node.op_type in ["Mul", "Add", "Concat"]: 22 | input_name = node.input[0] 23 | add_inputs.append(input_name) 24 | input_name = node.input[1] 25 | add_inputs.append(input_name) 26 | 27 | output_name = node.output[0] 28 | add_outputs.append(output_name) 29 | 30 | # print(sigmoid_outputs) 31 | 32 | # 过滤Sigmoid的输出 33 | lines = [] 34 | total_sigmoids = 0 35 | total_nodes = 0 36 | with open(sys.argv[2]) as fr: 37 | for i, line in enumerate(fr.readlines()): 38 | if i == 0: 39 | lines.append(line) 40 | else: 41 | total_nodes += 1 42 | 43 | name, value = line.strip().split(":") 44 | name = name.strip(" ") 45 | if name in sigmoid_outputs or name in add_outputs or name in sigmoid_inputs or name in add_inputs: 46 | total_sigmoids += 1 47 | continue 48 | else: 49 | lines.append(line) 50 | 51 | 52 | print("total nodes", total_nodes) 53 | print("sigmoids ", total_sigmoids) 54 | print("final nodes", len(lines) - 1) 55 | 56 | with open(sys.argv[3], "w") as fw: 57 | fw.writelines(lines) -------------------------------------------------------------------------------- /01.install.md: -------------------------------------------------------------------------------- 1 | # onnx2trt 2 | 3 | 4 | 【DEPRECATED】 开发过程中,发现了一个整体思路与我这个repo类似,但功能更完善,且实现了一些高级算法的repo,建议使用这个repo来进行模型量化和部署; https://github.com/openppl-public/ppq 5 | 6 | 7 | 8 | onnx2trt是用于进行tensorRT的int8模型量化的工具; 在进行int8模型量化时,某些int8 tensorRT模型的精度会出现一定程度的下降。而当前tensorRT默认使用的校准算法是Entropy, 为此特意开发onnx2trt工具来优化量化模型的精度。 9 | 10 | 11 | 12 | ## 安装 13 | python36 (py37会遇到pycuda安装的问题) 14 | 15 | pip install nvidia-pyindex 16 | pip install nvidia-tensorrt 17 | pip install pycuda 18 | pip install sympy 19 | 20 | 21 | 22 | ## tensorRT量化存在的问题 23 | 24 | 1. 大模型的量化误差累积 25 | 在进行模型的量化校准时,通常的做法是先用fp32模型进行一遍infer,然后统计每个节点的动态范围。这样的做法简单快捷,做一遍infer即可得到整个模型所有节点的动态范围。 26 | 但是,当层数较多时,量化的误差会不断累积;距离模型输入越远,这种量化误差越大。 27 | 28 | 29 | 2. 量化后阈值偏移 30 | 当某个节点的输出数量比较小时,节点输出的cosine相似度已经很高,但是却出现了阈值偏移; 31 | 例如:fp32_out = [-6.223839, 3.5978181, -2.4270086], int8_out = [-2.37992859, 1.80094731, -1.93005347] 32 | 如果这个输出后接的是softmax结构的话,这种阈值偏移对最终精度的影响会比较小; 33 | 但是如果这里输出的是分数score的话,就会带来一些不利于实际部署的结果:例如recall降低,precision升高的变化;这时需要重新调整阈值来维持precision或者recall不变,以保证模型部署的效果。 34 | bias correction相关: https://zhuanlan.zhihu.com/p/450227567 35 | 36 | 37 | TODO: 38 | - [ ] QDQ量化工具: 使用QDQ方式进行tensorRT的模型量化. 39 | - image 40 | - 需要直到tensorRT做了哪些网络节点的优化,才能方便地插入QDQ节点; 41 | - [ ] 量化精度损失分析工具: 42 | - tensorRT自带量化分析工具polygraph: https://zhuanlan.zhihu.com/p/535021438 43 | - 分析流程示例:https://blog.csdn.net/TracelessLe/article/details/120656484 44 | - 给定每个节点的量化scale值,计算每一层的量化前后的cosine值. 45 | - 给定每个节点的量化scale值,计算这一层量化对最终输出的莲花cosine值. 46 | - [ ] 自定义scale计算工具/自定义calibrator: 47 | - 用于trt exec生成trt engine(隐式设置精度). 48 | - 用于QDQ生成trt engine(显式设置精度). 
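As a starting point for the accuracy-analysis item in the list above, a self-contained sketch of the per-layer check (simulated int8 quantization plus cosine similarity); the max-based scale here is only for illustration and is not the calibrator the TODO refers to:

```
import numpy as np


def fake_quant(x: np.ndarray, scale: float) -> np.ndarray:
    # symmetric int8 quant/dequant: q = clip(round(x / scale), -128, 127), x' = q * scale
    q = np.clip(np.round(x / scale), -128, 127)
    return q * scale


def cosine(a: np.ndarray, b: np.ndarray) -> float:
    a, b = a.ravel(), b.ravel()
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-12))


# given one layer's fp32 activation and its calibration scale,
# measure how much quantizing this single layer distorts it
activation = np.random.randn(1, 64, 48, 96).astype(np.float32)
scale = float(np.abs(activation).max() / 127.0)
print("per-layer cosine:", cosine(activation, fake_quant(activation, scale)))
```

Applying the same comparison to the network outputs of the fp32 and int8 engines gives the end-to-end cosine mentioned in the TODO list.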
49 | 50 | -------------------------------------------------------------------------------- /quantization/find_nodes_onnx.py: -------------------------------------------------------------------------------- 1 | # 找出所有A-->B之间的节点 2 | from numpy import linspace 3 | import onnx 4 | 5 | # A = "8305" 6 | # B = "8527" 7 | # E = ["8302", "8305", "8530"] 8 | 9 | # A = "8835" 10 | # B = "8897" 11 | 12 | A = "8772" 13 | B = "8834" 14 | 15 | Final_Nodes = [A, B] 16 | 17 | input_path = "/apdcephfs/private_howellyang/onnx2trt/model.onnx" 18 | output_path = "/apdcephfs/private_howellyang/onnx2trt/model_{}_to_{}.onnx".format(A, B) 19 | calib_cache = "/apdcephfs/private_howellyang/onnx2trt/model.trt_int8_with_1578pics_calib_entropy.calib_cache" 20 | input_names = [A] 21 | output_names = [B] 22 | 23 | onnx.utils.extract_model(input_path, output_path, input_names, output_names) 24 | 25 | onnx_model = onnx.load(output_path) 26 | 27 | for node in onnx_model.graph.node: 28 | Final_Nodes.extend(node.output) 29 | # if node.op_type in ["QuantizeLinear"]: 30 | # act_name = node.input[0] 31 | # scale_name = node.input[1] 32 | # scale_value = scales_map[scale_name] 33 | # if act_name in weights_map: # 权重量化 34 | # if act_name not in weights_scale_map: 35 | # weights_scale_map[act_name] = [] 36 | 37 | # weights_scale_map[act_name].append(scale_value) 38 | # else: # act 量化 39 | # if act_name not in acts_scale_map: 40 | # acts_scale_map[act_name] = [] 41 | 42 | # acts_scale_map[act_name].append(scale_value) 43 | print(Final_Nodes) 44 | 45 | lines = [] 46 | with open(calib_cache) as fr: 47 | for i, line in enumerate(fr.readlines()): 48 | if i == 0: 49 | lines.append(line) 50 | else: 51 | node_name, hex_value = line.strip().split(":") 52 | if node_name in Final_Nodes: 53 | continue 54 | else: 55 | lines.append(line) 56 | 57 | with open(calib_cache + "_remove_{}_to_{}.calib_cache".format(A, B), "w") as fw: 58 | fw.writelines(lines) 59 | -------------------------------------------------------------------------------- /quantization/ptq/trt_utils.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import struct 4 | from matplotlib import scale 5 | import numpy as np 6 | 7 | def read_calib_cache(calib_cache): 8 | scale_map = {} 9 | with open(calib_cache) as fr: 10 | for line in fr.readlines()[1:]: 11 | name, value = line.strip().split(": ") 12 | name = name.strip(":") 13 | value = value.strip(":") 14 | if value.strip() == "0": 15 | val = 0.0 16 | else: 17 | val = struct.unpack("!f", bytes.fromhex(value.strip()))[0] 18 | 19 | scale_map[name] = val 20 | 21 | scale_map = {k: scale_map[k] for k in sorted(scale_map)} 22 | return scale_map 23 | 24 | 25 | def write_cache_to_json(scale_map, calib_cache): 26 | scale_map = {k: scale_map[k] for k in sorted(scale_map)} 27 | with open(calib_cache, "w") as file: 28 | file.write("TRT-8400-EntropyCalibration2\n") 29 | for key in sorted(scale_map.keys()): 30 | value = scale_map[key] 31 | scale = float(value) 32 | scale_hex = hex(struct.unpack(" np.ndarray: 47 | if x is None and not accepet_none: 48 | raise ValueError("Trying to convert an empty value.") 49 | if isinstance(x, np.ndarray): 50 | return x 51 | elif isinstance(x, int) or isinstance(x, float): 52 | return np.array( 53 | [ 54 | x, 55 | ] 56 | ) 57 | elif isinstance(x, torch.Tensor): 58 | if x.numel() == 0 and accepet_none: 59 | return None 60 | if x.numel() == 0 and not accepet_none: 61 | raise ValueError("Trying to convert an empty value.") 62 | if x.numel() == 1: 63 | return 
DataLoader.convert_any_to_numpy(x.detach().cpu().item()) 64 | if x.numel() > 1: 65 | return x.detach().cpu().numpy() 66 | elif isinstance(x, list) or isinstance(x, tuple): 67 | return np.array(x) 68 | else: 69 | raise TypeError( 70 | f"input value {x}({type(x)}) can not be converted as numpy type." 71 | ) 72 | 73 | 74 | def read_image(path): 75 | # 多任务模型 76 | _img_transforms = transforms.Compose( 77 | [ 78 | transforms.Resize((384, 768)), 79 | transforms.ToTensor(), 80 | transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), 81 | ] 82 | ) 83 | img = Image.open(path).convert("RGB") 84 | img_w, img_h = img.size[0], img.size[1] 85 | img = _img_transforms(img) 86 | img = img.unsqueeze(0) 87 | return img 88 | -------------------------------------------------------------------------------- /quantization/json_filter.py: -------------------------------------------------------------------------------- 1 | 2 | failed_tasks = { 3 | "code": 0, 4 | "msg": "", 5 | "detail": [ 6 | { 7 | "pkg_task_id": "999180333210000000161202285_null_210000000161202285", 8 | "status": -1, 9 | "status_landmark": 2, 10 | "status_land_line": 2, 11 | "status_scene": 2, 12 | "status_camera": 2, 13 | "status_traffic_light": -1, 14 | "status_img": -1 15 | }, 16 | { 17 | "pkg_task_id": "999180100210000000177231923_null_210000000177231923", 18 | "status": -1, 19 | "status_landmark": 2, 20 | "status_land_line": 2, 21 | "status_scene": 2, 22 | "status_camera": 2, 23 | "status_traffic_light": -1, 24 | "status_img": -1 25 | }, 26 | { 27 | "pkg_task_id": "999180203210000000161217375_null_210000000161217375", 28 | "status": -1, 29 | "status_landmark": 2, 30 | "status_land_line": 2, 31 | "status_scene": 2, 32 | "status_camera": 2, 33 | "status_traffic_light": -1, 34 | "status_img": -1 35 | }, 36 | { 37 | "pkg_task_id": "999180148210000000161242266_null_210000000161242266", 38 | "status": -1, 39 | "status_landmark": 2, 40 | "status_land_line": 2, 41 | "status_scene": 2, 42 | "status_camera": 2, 43 | "status_traffic_light": -1, 44 | "status_img": -1 45 | }, 46 | { 47 | "pkg_task_id": "999204618210000000161212957_null_210000000161212957", 48 | "status": -1, 49 | "status_landmark": 2, 50 | "status_land_line": 2, 51 | "status_scene": 2, 52 | "status_camera": 2, 53 | "status_traffic_light": -1, 54 | "status_img": -1 55 | }, 56 | { 57 | "pkg_task_id": "999195621210000000177226766_null_210000000177226766", 58 | "status": -1, 59 | "status_landmark": 2, 60 | "status_land_line": 2, 61 | "status_scene": 2, 62 | "status_camera": 2, 63 | "status_traffic_light": -1, 64 | "status_img": -1 65 | }, 66 | { 67 | "pkg_task_id": "999202427210000000161237174_null_210000000161237174", 68 | "status": -1, 69 | "status_landmark": 2, 70 | "status_land_line": 2, 71 | "status_scene": 2, 72 | "status_camera": 2, 73 | "status_traffic_light": -1, 74 | "status_img": -1 75 | }, 76 | { 77 | "pkg_task_id": "999202404210000000177220569_null_210000000177220569", 78 | "status": -1, 79 | "status_landmark": 2, 80 | "status_land_line": 2, 81 | "status_scene": 2, 82 | "status_camera": 2, 83 | "status_traffic_light": -1, 84 | "status_img": -1 85 | }, 86 | { 87 | "pkg_task_id": "999192126210000000177226943_null_210000000177226943", 88 | "status": -1, 89 | "status_landmark": 2, 90 | "status_land_line": 2, 91 | "status_scene": 2, 92 | "status_camera": 2, 93 | "status_traffic_light": -1, 94 | "status_img": -1 95 | }, 96 | { 97 | "pkg_task_id": "999194214210000000177357511_null_210000000177357511", 98 | "status": -1, 99 | "status_landmark": 2, 100 | "status_land_line": 2, 
101 | "status_scene": 2, 102 | "status_camera": 2, 103 | "status_traffic_light": -1, 104 | "status_img": -1 105 | }, 106 | { 107 | "pkg_task_id": "999194403210000000177230387_null_210000000177230387", 108 | "status": -1, 109 | "status_landmark": 2, 110 | "status_land_line": 2, 111 | "status_scene": 2, 112 | "status_camera": 2, 113 | "status_traffic_light": -1, 114 | "status_img": -1 115 | } 116 | ], 117 | "detail_explain": "pkg_task_id,status,status_landmark,status_land_line,status_scene,status_camera,status_traffic_light,status_img" 118 | } 119 | 120 | for info in failed_tasks["detail"]: 121 | print('"{}"'.format(info["pkg_task_id"]), end = ",") 122 | 123 | print("\n"*4) -------------------------------------------------------------------------------- /onnx_optimize/step03_fuse_normalize_to_conv.py: -------------------------------------------------------------------------------- 1 | import onnx 2 | import numpy as np 3 | from onnx import numpy_helper 4 | from step02_onnx_simplify import get_post_nodes 5 | 6 | 7 | def fuse_normalize_to_conv(onnx_model, means, scales, input_tensor_name=None): 8 | # Y = (x - means)/scales 9 | initializer_names = [ 10 | initializer.name for initializer in onnx_model.graph.initializer 11 | ] 12 | inputs = [inp for inp in onnx_model.graph.input if inp not in initializer_names] 13 | if input_tensor_name is None: 14 | assert ( 15 | len(inputs) == 1 16 | ), "if multiple input exists, please specify input_tensor_name" 17 | input_tensor_name = inputs[0] 18 | 19 | weight_name2tensor = {} 20 | for weight in onnx_model.graph.initializer: 21 | weight_name2tensor[weight.name] = weight 22 | 23 | # find post nodes 24 | post_nodes = get_post_nodes(onnx_model, input_tensor_name) 25 | for post_node in post_nodes: 26 | if post_node.op_type != "Conv": 27 | raise NameError( 28 | "the input tensor is used by non-Conv node, normalize process can't be fused" 29 | ) 30 | 31 | paddings = [0] 32 | for attr in post_node.attribute: 33 | if attr.name == "pads": 34 | paddings = attr.ints 35 | break 36 | 37 | for pad in paddings: 38 | if pad != 0: 39 | raise NameError( 40 | "the conv after input has padding, normalize process can't be fused" 41 | ) 42 | 43 | group_num = 1 44 | for attr in post_node.attribute: 45 | if attr.name == "group": 46 | group_num = attr.i 47 | if group_num > 1: 48 | raise NameError( 49 | "the conv after input has group > 1, normalize process can't be fused" 50 | ) 51 | 52 | # fuse normalize-conv 53 | assert ( 54 | len(post_node.input) == 3 55 | ), " conv node must has bias for normalize fusion" 56 | weight_name = post_node.input[1] 57 | weight_tensor = weight_name2tensor[weight_name] 58 | weight_value = numpy_helper.to_array(weight_tensor) # out_ch, in_ch, ker, ker 59 | 60 | bias_name = post_node.input[2] 61 | bias_tensor = weight_name2tensor[bias_name] 62 | bias_value = numpy_helper.to_array(bias_tensor) # out_ch, in_ch, ker, ker 63 | 64 | assert ( 65 | len(means) == len(scales) == np.shape(weight_value)[1] 66 | ), "mean and scale value mismatch the input channel num" 67 | 68 | means = np.reshape(np.array(means), (1, -1, 1, 1)) 69 | scales = np.reshape(np.array(means), (1, -1, 1, 1)) 70 | 71 | new_weight_value = np.array(weight_value / scales).astype(np.float32) 72 | raw_shape = tuple([i for i in weight_tensor.dims]) 73 | new_shape = np.shape(new_weight_value) 74 | assert new_shape == raw_shape 75 | weight_tensor.ClearField("float_data") 76 | weight_tensor.ClearField("int32_data") 77 | weight_tensor.ClearField("int64_data") 78 | weight_tensor.raw_data = 
79 | 
80 |         new_bias_value = np.array(
81 |             bias_value - np.sum(weight_value * means / scales, axis=(1, 2, 3))
82 |         ).astype(np.float32)
83 |         raw_shape = tuple([i for i in bias_tensor.dims])
84 |         new_shape = np.shape(new_bias_value)
85 |         assert new_shape == raw_shape
86 |         bias_tensor.ClearField("float_data")
87 |         bias_tensor.ClearField("int32_data")
88 |         bias_tensor.ClearField("int64_data")
89 |         bias_tensor.raw_data = new_bias_value.tobytes()
90 | 
91 |     return onnx_model
92 | 
93 | 
94 | if __name__ == "__main__":
95 |     import sys
96 | 
97 |     onnx_model = onnx.load(sys.argv[1])
98 |     means = eval(sys.argv[2])
99 |     scales = eval(sys.argv[3])
100 |     onnx_model = fuse_normalize_to_conv(onnx_model, means, scales)
101 | 
--------------------------------------------------------------------------------
/quantization/ptq/create_json_inputs.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python3 #指定解释器
2 | # encoding:utf-8
3 | 
4 | import sys
5 | from polygraphy.json import save_json
6 | 
7 | print(sys.getdefaultencoding())
8 | s = "中文乱码问题解决"
9 | print(s)
10 | 
11 | # ---------------------------------------------------------------
12 | # 这个脚本向你展示了如何使用 tensorRT 对 PPQ 导出的模型进行推理
13 | 
14 | # This script shows you how to export ppq internal graph to tensorRT
15 | # ---------------------------------------------------------------
16 | 
17 | # For this inference test, all test data is randomly picked.
18 | # If you want to use real data, just rewrite the definition of SAMPLES
19 | print("开始import")
20 | import onnxruntime
21 | import torch
22 | from tqdm import tqdm
23 | import glob
24 | import cv2
25 | import numpy as np
26 | from torchvision import transforms
27 | from PIL import Image
28 | import os
29 | from sklearn.metrics.pairwise import cosine_similarity
30 | import onnx
31 | from copy import deepcopy
32 | 
33 | def convert_any_to_numpy(x, accepet_none: bool = True) -> np.ndarray:
34 |     if x is None and not accepet_none:
35 |         raise ValueError("Trying to convert an empty value.")
36 |     if isinstance(x, np.ndarray):
37 |         return x
38 |     elif isinstance(x, int) or isinstance(x, float):
39 |         return np.array(
40 |             [
41 |                 x,
42 |             ]
43 |         )
44 |     elif isinstance(x, torch.Tensor):
45 |         if x.numel() == 0 and accepet_none:
46 |             return None
47 |         if x.numel() == 0 and not accepet_none:
48 |             raise ValueError("Trying to convert an empty value.")
49 |         if x.numel() == 1:
50 |             return convert_any_to_numpy(x.detach().cpu().item())
51 |         if x.numel() > 1:
52 |             return x.detach().cpu().numpy()
53 |     elif isinstance(x, list) or isinstance(x, tuple):
54 |         return np.array(x)
55 |     else:
56 |         raise TypeError(
57 |             f"input value {x}({type(x)}) can not be converted as numpy type."
58 | ) 59 | 60 | def read_image(path): 61 | # 多任务模型 62 | _img_transforms = transforms.Compose( 63 | [ 64 | transforms.Resize((384, 768)), 65 | transforms.ToTensor(), 66 | transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), 67 | ] 68 | ) 69 | img = Image.open(path).convert("RGB") 70 | img_w, img_h = img.size[0], img.size[1] 71 | img = _img_transforms(img) 72 | img = img.unsqueeze(0) 73 | return img 74 | 75 | 76 | def read_image_v2(path): 77 | mean = [123.675, 116.28, 103.53] 78 | std = [58.395, 57.12, 57.375] 79 | input_w = 960 80 | input_h = 480 81 | mean = np.array(mean) 82 | std = np.array(std) 83 | img = cv2.imread(path) 84 | img = cv2.resize(img, (input_w, input_h)) 85 | img = img.astype(np.float32) 86 | img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) 87 | 88 | # Norm 89 | for i in range(3): 90 | img[..., i] = (img[..., i] - mean[i]) / std[i] 91 | 92 | # hwc -> nchw ----> 这里输入方式不对 93 | # h, w, c = img.shape 94 | # img = img.reshape((1, c, h ,w)) 95 | img = np.transpose(img, (2, 0, 1)).astype(np.float32) 96 | img = np.expand_dims(img, axis=0) 97 | return np.ascontiguousarray(img, dtype=np.float32) 98 | 99 | calibration_files = glob.glob( 100 | os.path.join("/mapai/howellyang/code/road-service/road_service/calib_images/", "*.jpg") 101 | )[:1] 102 | 103 | 104 | SAMPLES = [ 105 | read_image_v2(path) for path in calibration_files 106 | ] # rewirte this to use real data. 107 | 108 | # List[Dict[str, numpy.ndarray]] 109 | import json 110 | from json import JSONEncoder 111 | import numpy 112 | class NumpyArrayEncoder(JSONEncoder): 113 | def default(self, obj): 114 | if isinstance(obj, numpy.ndarray): 115 | return obj.tolist() 116 | return JSONEncoder.default(self, obj) 117 | 118 | feed_dict_list = [{"input.1": np.array(read_image_v2(path))} for path in calibration_files] 119 | 120 | 121 | save_json(feed_dict_list, "calibration_data_1k5.json") 122 | 123 | # with open( ,"w") as fw: 124 | # json.dump(feed_dict_list, fw, cls=NumpyArrayEncoder, indent=4) 125 | # # encodedNumpyData = json.dumps(feed_dict_list, cls=NumpyArrayEncoder) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ONNX2TRT: 端上模型部署[整理中] 2 | 3 | 4 | ## 1. 概述 5 | 6 | 模型的压缩(蒸馏、剪枝、量化)和部署,是模型在自动驾驶和物联网产业落地中的重要步骤。端上的设备 7 | 8 | 在实际工作过程中,我们遇到了很多的困难: 文档缺失、依赖库冲突、算子不支持、精度差、速度慢等。 9 | 10 | 因此,我将我在实际工作过程中的一些经验,整理成文档记录在这里,供其它开发者参考。同时,我会将过程中用到的一些脚本,整理成一些独立的工具脚本,方便大家使用。 11 | 12 |
13 | 14 | ## 2. 模型部署流程 15 | 16 | 模型部署的一般步骤为: 17 | - 模型导出onnx 18 | - onnx模型结构优化 19 | - 模型量化,构建tensorRT的engine 20 | - tensorRT模型部署 21 | - 精度和速度测试 22 | - 问题排查与分析 23 | 24 | 接下来,我将给出相关的工具,并对其中的关键步骤进行详细说明; 25 | 26 |
27 | 
28 | ### 2.1 模型导出
29 | onnx是一种模型表示方式,能够将不同框架下的模型,统一表示为同一种形式;因此,常常被用来作为模型转换的中间节点;目前,tensorRT已经支持了直接用torch转成tensorRT的engine;但是其它的SDK框架,如MNN、TNN、Paddle-Lite、OpenVino等仍然只支持onnx格式的模型转换;并且,onnx本身也是一种很好用的模型表示格式,可以很方便地在上面做开发;
30 | 
31 | ```
32 | import torch
33 | # 加载你的模型
34 | model = build_model(config.model)
35 | checkpoint = torch.load(model_path, map_location=lambda storage, loc: storage)
36 | load_model_weight(model, checkpoint)
37 | 
38 | # 设置输入的大小
39 | input_shape = (320, 192)  # W H
40 | dummy_input = torch.randn(1, 3, input_shape[1], input_shape[0])  # N, C, H, W
41 | 
42 | # 设置输出节点名称,便于后续部署
43 | output_names = ["s8_cls", "s8_reg", "s16_cls", "s16_reg"]
44 | model.eval()
45 | torch.onnx.export(
46 |     model,
47 |     dummy_input,
48 |     output_path,
49 |     verbose=True,
50 |     keep_initializers_as_inputs=False,
51 |     do_constant_folding=True,
52 |     training=False,
53 |     opset_version=11,
54 |     output_names=output_names,
55 | )
56 | 
57 | ```
58 | 
59 | 其它常用框架基本都有导出为onnx模型的代码,可以通过搜索引擎很容易得到相关结果,在此不作介绍。
60 | 
61 | 导出模型为onnx以后,如果不需要做模型量化,可以直接将onnx模型转换为所需的格式后进行模型部署;如果想快速完成部署,可以使用在线模型转换的工具来完成模型转换 https://convertmodel.com/;
62 | 
63 | 
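After exporting, it is worth checking that the ONNX graph still matches the PyTorch model numerically. A minimal sketch, assuming `model`, `dummy_input` and `output_path` from the snippet above and a model that returns a list/tuple of tensors:

```
import numpy as np
import onnxruntime as ort
import torch

# run the original torch model once
with torch.no_grad():
    torch_outs = model(dummy_input)

# run the exported onnx model with onnxruntime
sess = ort.InferenceSession(output_path, providers=["CPUExecutionProvider"])
ort_outs = sess.run(None, {sess.get_inputs()[0].name: dummy_input.numpy()})

# outputs should agree up to numerical noise
for torch_out, ort_out in zip(torch_outs, ort_outs):
    np.testing.assert_allclose(torch_out.numpy(), ort_out, rtol=1e-3, atol=1e-5)
print("torch and onnxruntime outputs match")
```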
64 | 65 | ### 2.2 onnx模型结构优化 ### 66 | 67 | onnx模型结构优化,一方面是为后续的模型量化做准备;另一方面是减少了输入和输出部分的计算,这部分计算对云端的算力而言可能是无关紧要的,但是对端上的微弱算力而言,这部分计算能省则省。 68 | 69 |
70 | 71 | *2.2.1 onnx-simplify和optimize* 72 | 73 | optimize的目的是进行算子的融合, 从而减少计算量;例如fuse_bn_into_conv, fuse_concat_into_reshape; 详见[onnx-optimizer](https://github.com/onnx/optimizer); 74 | 75 | ![fuse_bn](https://pic1.zhimg.com/v2-98dbfa847caf6d9c9d411348592c8815_1440w.jpg) 76 | 77 |
78 | simplify的目的是消除onnx模型中的多余算子。从torch得到的onnx模型中,会存在一些从tensor计算出常量的操作,例如Reshape算子会从tensor中获取形状后给Resize算子;这就导致onnx模型中存在某些不必要的节点(最常见的是Gather节点);因此,[onnx-simplifier](https://github.com/daquexian/onnx-simplifier)会对整个网络进行一次推理,然后将这类多余的算子替换成常量. 79 | 80 | ![simplify](https://github.com/daquexian/onnx-simplifier/raw/master/imgs/complicated_reshape.png) 81 | 82 |
83 | 84 | 使用在线网站,可以便捷地进行以上操作:https://www.convertmodel.com/#input=onnx&output=onnx; 85 | 86 |
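If the online converter is not an option, the same two passes can also be run locally with the onnxoptimizer and onnx-simplifier packages that other scripts in this repo already rely on; a minimal sketch (file paths are placeholders):

```
import onnx
import onnxoptimizer
from onnxsim import simplify

model = onnx.load("model.onnx")
model = onnxoptimizer.optimize(model)   # graph-level fusions, e.g. fuse_bn_into_conv
model_simp, ok = simplify(model)        # fold shape logic into constants, drop redundant Gather/Shape nodes
assert ok, "simplified model failed the checker"
onnx.save(model_simp, "model.opt.onnx")
```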
87 | 88 | *2.2.2 预处理融合* 89 | 90 | 在onnx-optimizer中,有一个操作是将Conv-BN结构中的BN层融合进Conv中,其原理可以简单理解为: 91 | - Conv: Y = k * x + b 92 | - BN: Z = (Y - m)/s 93 | - Conv-BN: Z = (k * x + b - m)/s = k/s * x + (b - m)/s 94 | - new Conv: k1 = k/s, b1 = (b-m)/s, Z = k1 * x + b1 95 | 96 | 那么,在某些模型中BN是放在Conv的,这种BN-Conv是否可以进行融合呢?答案是当Conv层没有padding(padding=0)时,也是可以融合的;但是当Conv层有padding时,BN-Conv的融合会导致输出的feature map与原始输出相比,在边界上存在diff;具体原理可以通过分析BN-Conv的计算过程得到,在此不作推导; 97 | 98 | 在将图片输入到模型前,常常会进行减均值除方差(normalize)的操作;基于BN-Conv层融合的原理,这个normalize过程也同样可以融合到Conv层中(需要Conv层不带padding);在端上硬件算力很小的情况下,这一融合也是十分有必要的; 99 | 100 |
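The same algebra applied to the normalize step, written out as a NumPy sketch (shapes and the quick numerical check are illustrative; the repo's step03_fuse_normalize_to_conv.py performs this directly on the ONNX initializers):

```
import numpy as np


def fold_normalize_into_conv(weight, bias, mean, std):
    """Fold y = Conv((x - mean) / std) into new Conv weights/bias.

    weight: (Co, Ci, Kh, Kw), bias: (Co,), mean/std: (Ci,)
    k1 = k / std (per input channel)
    b1 = b - sum_{ci,kh,kw} k[co, ci, kh, kw] * mean[ci] / std[ci]
    Only valid when the conv has no padding, as noted above.
    """
    mean = np.asarray(mean, dtype=np.float32).reshape(1, -1, 1, 1)
    std = np.asarray(std, dtype=np.float32).reshape(1, -1, 1, 1)
    new_weight = weight / std
    new_bias = bias - np.sum(weight * mean / std, axis=(1, 2, 3))
    return new_weight.astype(np.float32), new_bias.astype(np.float32)


# quick check on a kernel-sized input (no padding), where the conv reduces to a dot product
Co, Ci, K = 8, 3, 3
w = np.random.randn(Co, Ci, K, K).astype(np.float32)
b = np.random.randn(Co).astype(np.float32)
mean, std = np.array([0.485, 0.456, 0.406]), np.array([0.229, 0.224, 0.225])
x = np.random.randn(1, Ci, K, K).astype(np.float32)

x_norm = (x - mean.reshape(1, -1, 1, 1)) / std.reshape(1, -1, 1, 1)
ref = w.reshape(Co, -1) @ x_norm.reshape(-1) + b
w1, b1 = fold_normalize_into_conv(w, b, mean, std)
fused = w1.reshape(Co, -1) @ x.reshape(-1) + b1
assert np.allclose(ref, fused, atol=1e-4)
```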
101 | 102 | *2.2.3 sigmoid移除* 103 | 104 | sigmoid函数中的exp计算以及除法运算,是比较耗时的;当模型最后输出的feature map比较大时,这个过程的耗时就会更加明显;当这个feature map是输出一个置信度时,可以通过计算sigmoid的反函数,提前计算好置信度阈值,从而省掉这个sigmoid的计算;为此,在实际部署时,常常会去掉模型输出前的sigmoid节点;同时,一些transpose、resize等操作,也可以在后处理流程中通过直接访问相应位置的元素来实现,不需要在模型中进行这一步额外的计算; 105 | 106 |
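A small illustration of the inverse-sigmoid trick (the threshold value and output shape below are made up for the example):

```
import numpy as np


def logit(p: float) -> float:
    # inverse of sigmoid: sigmoid(logit(p)) == p
    return float(np.log(p / (1.0 - p)))


score_threshold = 0.3
logit_threshold = logit(score_threshold)        # ~ -0.847

raw_scores = np.random.randn(1, 80, 48, 96)     # head output with the Sigmoid node removed
keep = raw_scores > logit_threshold             # same mask as sigmoid(raw_scores) > 0.3, without exp/div
```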
107 | 108 | *2.2.4 RepConv融合* 109 | 110 | RepConv是一种有效增加模型容量的技术。在训练时添加额外的卷积层,在部署时通过权重融合去掉这部分计算。通常情况下,RepConv的权重融合是在pytorch层面做的,但是当训练代码比较复杂或者重复代码较多时,在onnx层面进行权重的融合,可能是一个更好的选择;相关原理参见论文: [RepVGG: Making VGG-style ConvNets Great Again](https://arxiv.org/abs/2101.03697). 111 | 112 | ![RepConv](https://miro.medium.com/max/1400/1*87dCul2yHq0_dRfV3nEubg.png) 113 | 114 | 115 | ### 2.3 量化 116 | 117 |
118 | 119 | *2.3.1 量化的理论基础* 120 | 121 | 122 | *2.3.2 量化的计算过程* 123 | 124 | 125 | *2.3.3 常用的量化工具箱* 126 | 127 | *2.3.4 PTQ量化* 128 | 129 | - 简单量化 130 | - balance vector(weight equalization) 131 | - bias correction 132 | 133 | 134 | *2.3.5 QAT量化* 135 | 136 | - QDQ模式介绍 137 | - QDQ流程优化 138 | 139 | 140 | ### 2.4 剪枝 141 | 142 |
143 | 144 | 145 | ### 2.5 蒸馏 146 | 147 |
148 | 149 | ## 3. 参考 150 | 1. tiny-tensorRT: https://github.com/zerollzeng/tiny-tensorrt 151 | 2. micronet: https://github.com/666DZY666/micronet 152 | 3. ppq: https://github.com/openppl-public/ppq 153 | 4. onnx-runtime quantization: https://onnxruntime.ai/docs/performance/quantization.html 154 | 5. polygraphy: https://github.com/NVIDIA/TensorRT/tree/main/tools/Polygraphy 155 | -------------------------------------------------------------------------------- /trt_calibrator.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import os 3 | import tensorrt as trt 4 | import pycuda.driver as cuda 5 | import pycuda.autoinit # fix init error of cuda 6 | 7 | # __all__ = [ 8 | # "TRTPercentileCalibrator", 9 | # "TRTEntropyCalibrator", 10 | # "TRTMinMaxCalibrator", 11 | # ] 12 | 13 | 14 | class TRTEntropyCalibrator(trt.IInt8EntropyCalibrator2): 15 | def __init__(self, input_layers, stream, cache_file): 16 | super(TRTEntropyCalibrator, self).__init__() 17 | self.input_layers = input_layers 18 | 19 | # 数据读取的类, 等同于图片处理的回调 20 | self.stream = stream 21 | 22 | # 分配GPU 23 | self.d_input = cuda.mem_alloc(self.stream.calibration_data.nbytes) 24 | 25 | # cache路径 26 | self.cache_file = cache_file 27 | 28 | # 重置校准集 29 | self.stream.reset() 30 | 31 | def get_batch_size(self): 32 | return self.stream.batch_size 33 | 34 | def get_batch(self, names): 35 | try: 36 | batch = self.stream.next_batch() 37 | if not batch.size: 38 | return None 39 | cuda.memcpy_htod(self.d_input, batch) 40 | return [int(self.d_input)] 41 | except StopIteration: 42 | return None 43 | 44 | def read_calibration_cache(self): 45 | # If there is a cache, use it instead of calibrating again. Otherwise, implicitly return None. 46 | if os.path.exists(self.cache_file): 47 | with open(self.cache_file, "rb") as f: 48 | return f.read() 49 | else: 50 | return None 51 | 52 | def write_calibration_cache(self, cache): 53 | # cache = ctypes.c_char_p(int(ptr)) 54 | with open(self.cache_file, "wb") as f: 55 | f.write(cache) 56 | 57 | 58 | class TRTMinMaxCalibrator(trt.IInt8MinMaxCalibrator): 59 | def __init__(self, input_layers, stream, cache_file): 60 | super(TRTMinMaxCalibrator, self).__init__() 61 | self.input_layers = input_layers 62 | 63 | # 数据读取的类, 等同于图片处理的回调 64 | self.stream = stream 65 | 66 | # 分配GP 67 | self.d_input = cuda.mem_alloc(self.stream.calibration_data.nbytes) 68 | 69 | # cache路径 70 | self.cache_file = cache_file 71 | 72 | # 重置校准集 73 | self.stream.reset() 74 | 75 | def get_batch_size(self): 76 | return self.stream.batch_size 77 | 78 | def get_batch(self, names): 79 | try: 80 | batch = self.stream.next_batch() 81 | if not batch.size: 82 | return None 83 | cuda.memcpy_htod(self.d_input, batch) 84 | return [int(self.d_input)] 85 | except StopIteration: 86 | return None 87 | 88 | def read_calibration_cache(self): 89 | # If there is a cache, use it instead of calibrating again. Otherwise, implicitly return None. 
90 | if os.path.exists(self.cache_file): 91 | with open(self.cache_file, "rb") as f: 92 | return f.read() 93 | else: 94 | return None 95 | 96 | def write_calibration_cache(self, cache): 97 | # cache = ctypes.c_char_p(int(ptr)) 98 | with open(self.cache_file, "wb") as f: 99 | f.write(cache) 100 | 101 | 102 | class TRTPercentileCalibrator(trt.IInt8LegacyCalibrator): 103 | def __init__( 104 | self, input_layers, stream, cache_file, quantile=0.9995, regression_cutoff=1.0 105 | ): 106 | super(TRTPercentileCalibrator, self).__init__() 107 | self.input_layers = input_layers 108 | self.stream = stream 109 | self.d_input = cuda.mem_alloc(self.stream.calibration_data.nbytes) 110 | self.cache_file = cache_file 111 | self.stream.reset() 112 | self.quantile = quantile 113 | self.regression_cutoff = regression_cutoff 114 | 115 | def get_batch_size(self): 116 | return self.stream.batch_size 117 | 118 | def get_batch(self, names): 119 | try: 120 | batch = self.stream.next_batch() 121 | if not batch.size: 122 | return None 123 | cuda.memcpy_htod(self.d_input, batch) 124 | return [int(self.d_input)] 125 | except StopIteration: 126 | return None 127 | 128 | def read_calibration_cache(self): 129 | # If there is a cache, use it instead of calibrating again. Otherwise, implicitly return None. 130 | if os.path.exists(self.cache_file): 131 | with open(self.cache_file, "rb") as f: 132 | return f.read() 133 | else: 134 | return None 135 | 136 | def write_calibration_cache(self, cache): 137 | # cache = ctypes.c_char_p(int(ptr)) 138 | with open(self.cache_file, "wb") as f: 139 | f.write(cache) 140 | 141 | def get_quantile(self): 142 | return self.quantile 143 | 144 | def get_regression_cutoff(self): 145 | return self.regression_cutoff 146 | 147 | def read_histogram_cache(self, length): 148 | return None 149 | 150 | def write_histogram_cache(self, ptr, length): 151 | return None 152 | -------------------------------------------------------------------------------- /quantization/trt_calibrator.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import os 3 | import tensorrt as trt 4 | import pycuda.driver as cuda 5 | import pycuda.autoinit # fix init error of cuda 6 | 7 | # __all__ = [ 8 | # "TRTPercentileCalibrator", 9 | # "TRTEntropyCalibrator", 10 | # "TRTMinMaxCalibrator", 11 | # ] 12 | 13 | 14 | class TRTEntropyCalibrator(trt.IInt8EntropyCalibrator2): 15 | def __init__(self, input_layers, stream, cache_file): 16 | super(TRTEntropyCalibrator, self).__init__() 17 | self.input_layers = input_layers 18 | 19 | # 数据读取的类, 等同于图片处理的回调 20 | self.stream = stream 21 | 22 | # 分配GPU 23 | self.d_input = cuda.mem_alloc(self.stream.calibration_data.nbytes) 24 | 25 | # cache路径 26 | self.cache_file = cache_file 27 | 28 | # 重置校准集 29 | self.stream.reset() 30 | 31 | def get_batch_size(self): 32 | return self.stream.batch_size 33 | 34 | def get_batch(self, names): 35 | try: 36 | batch = self.stream.next_batch() 37 | if not batch.size: 38 | return None 39 | cuda.memcpy_htod(self.d_input, batch) 40 | return [int(self.d_input)] 41 | except StopIteration: 42 | return None 43 | 44 | def read_calibration_cache(self): 45 | # If there is a cache, use it instead of calibrating again. Otherwise, implicitly return None. 
46 | if os.path.exists(self.cache_file): 47 | with open(self.cache_file, "rb") as f: 48 | return f.read() 49 | else: 50 | return None 51 | 52 | def write_calibration_cache(self, cache): 53 | # cache = ctypes.c_char_p(int(ptr)) 54 | with open(self.cache_file, "wb") as f: 55 | f.write(cache) 56 | 57 | 58 | class TRTMinMaxCalibrator(trt.IInt8MinMaxCalibrator): 59 | def __init__(self, input_layers, stream, cache_file): 60 | super(TRTMinMaxCalibrator, self).__init__() 61 | self.input_layers = input_layers 62 | 63 | # 数据读取的类, 等同于图片处理的回调 64 | self.stream = stream 65 | 66 | # 分配GP 67 | self.d_input = cuda.mem_alloc(self.stream.calibration_data.nbytes) 68 | 69 | # cache路径 70 | self.cache_file = cache_file 71 | 72 | # 重置校准集 73 | self.stream.reset() 74 | 75 | def get_batch_size(self): 76 | return self.stream.batch_size 77 | 78 | def get_batch(self, names): 79 | try: 80 | batch = self.stream.next_batch() 81 | if not batch.size: 82 | return None 83 | cuda.memcpy_htod(self.d_input, batch) 84 | return [int(self.d_input)] 85 | except StopIteration: 86 | return None 87 | 88 | def read_calibration_cache(self): 89 | # If there is a cache, use it instead of calibrating again. Otherwise, implicitly return None. 90 | if os.path.exists(self.cache_file): 91 | with open(self.cache_file, "rb") as f: 92 | return f.read() 93 | else: 94 | return None 95 | 96 | def write_calibration_cache(self, cache): 97 | # cache = ctypes.c_char_p(int(ptr)) 98 | with open(self.cache_file, "wb") as f: 99 | f.write(cache) 100 | 101 | 102 | class TRTPercentileCalibrator(trt.IInt8LegacyCalibrator): 103 | def __init__( 104 | self, input_layers, stream, cache_file, quantile=0.9995, regression_cutoff=1.0 105 | ): 106 | super(TRTPercentileCalibrator, self).__init__() 107 | self.input_layers = input_layers 108 | self.stream = stream 109 | self.d_input = cuda.mem_alloc(self.stream.calibration_data.nbytes) 110 | self.cache_file = cache_file 111 | self.stream.reset() 112 | self.quantile = quantile 113 | self.regression_cutoff = regression_cutoff 114 | 115 | def get_batch_size(self): 116 | return self.stream.batch_size 117 | 118 | def get_batch(self, names): 119 | try: 120 | batch = self.stream.next_batch() 121 | if not batch.size: 122 | return None 123 | cuda.memcpy_htod(self.d_input, batch) 124 | return [int(self.d_input)] 125 | except StopIteration: 126 | return None 127 | 128 | def read_calibration_cache(self): 129 | # If there is a cache, use it instead of calibrating again. Otherwise, implicitly return None. 
130 | if os.path.exists(self.cache_file): 131 | with open(self.cache_file, "rb") as f: 132 | return f.read() 133 | else: 134 | return None 135 | 136 | def write_calibration_cache(self, cache): 137 | # cache = ctypes.c_char_p(int(ptr)) 138 | with open(self.cache_file, "wb") as f: 139 | f.write(cache) 140 | 141 | def get_quantile(self): 142 | return self.quantile 143 | 144 | def get_regression_cutoff(self): 145 | return self.regression_cutoff 146 | 147 | def read_histogram_cache(self, length): 148 | return None 149 | 150 | def write_histogram_cache(self, ptr, length): 151 | return None 152 | -------------------------------------------------------------------------------- /quantization/onnx_export_v2.py: -------------------------------------------------------------------------------- 1 | from unicodedata import name 2 | import onnx 3 | from onnx import numpy_helper 4 | import numpy as np 5 | from torch import init_num_threads 6 | import json 7 | import struct 8 | 9 | from sklearn.metrics.pairwise import cosine_similarity 10 | 11 | fp32_model = "/apdcephfs/private_howellyang/road_service_app/LaneModel/onnx_infer/model/epoch_390_mm2conv.opt.onnx" 12 | onnx_updated_model_path = fp32_model.replace(".onnx", ".weight_quantized_v4.onnx") 13 | int8_qat_model = "/apdcephfs/private_howellyang/road_service_app/LaneModel/onnx_infer/model/epoch_390_mm2conv.opt_int8.onnx" 14 | 15 | 16 | # 第一步, 从int8-qat模型中取出所有的zero points和scales 17 | print("[Step1] read scales values from model ") 18 | onnx_model = onnx.load(int8_qat_model) 19 | inits = onnx_model.graph.initializer 20 | scales_map = {} 21 | weights_map = {} 22 | for init in inits: 23 | if "PPQ_Variable" in init.name: 24 | W = numpy_helper.to_array(init) 25 | scales_map[init.name] = W 26 | else: 27 | W = numpy_helper.to_array(init) 28 | weights_map[init.name] = W 29 | 30 | 31 | # {'Relu', 'Mul', 'MaxPool', 'GlobalAveragePool', 'Conv', \ 32 | # 'QuantizeLinear', 'Resize', 'Add', 'Concat', 'HardSigmoid', 'DequantizeLinear', 'Sigmoid'} 33 | 34 | # 第二步, 统计权重和输出的scale值 35 | print("[Step2] Collect scales and average ") 36 | acts_scale_map = {} 37 | weights_scale_map = {} 38 | for node in onnx_model.graph.node: 39 | if node.op_type in ["QuantizeLinear"]: 40 | act_name = node.input[0] 41 | scale_name = node.input[1] 42 | scale_value = scales_map[scale_name] 43 | if act_name in weights_map: # 权重量化 44 | if act_name not in weights_scale_map: 45 | weights_scale_map[act_name] = [] 46 | 47 | weights_scale_map[act_name].append(scale_value) 48 | else: # act 量化 49 | if act_name not in acts_scale_map: 50 | acts_scale_map[act_name] = [] 51 | 52 | acts_scale_map[act_name].append(scale_value) 53 | 54 | 55 | for key, value in acts_scale_map.items(): 56 | assert isinstance(value, list), " {} {}".format(key, value) 57 | assert isinstance(value[0], float) or np.size(value[0]) == 1, " {} {}".format( 58 | key, value 59 | ) 60 | 61 | acts_scale_map[key] = float(np.median(value)) 62 | # act_min_q = -128 63 | # act_max_q = 127 64 | # act_min = act_min_q * float(np.median(value)) 65 | # act_max = act_max_q * float(np.median(value)) 66 | # acts_scale_map[key] = max(abs(act_min), abs(act_max)) 67 | # 这里是scale值 q = x/scale ---> -128, 127 68 | # 转换为min max值需要乘以128.0 69 | 70 | 71 | for key, value in weights_scale_map.items(): 72 | assert isinstance(value, list), " {} {}".format(key, value) 73 | weights_scale_map[key] = np.median(value, axis=0, keepdims=False) 74 | 75 | 76 | 77 | # 第三步, 对权重部分,进行fakequant后,放回onnx模型中; 78 | print("[Step3] Fake quant weights ") 79 | 80 | 81 | def 
fake_quant(weight, scales): 82 | weight = np.array(weight) 83 | scales = np.array(scales) 84 | assert np.shape(weight)[0] == len(scales) 85 | # 权重量化在QAT中是-128, 127; 但是在直接转换中是-127,127 86 | quantized_weight = np.clip(np.round(weight / scales.reshape(-1, 1, 1, 1) + 0.0), -128, 127) 87 | # output = clamp(round(input / scale) + zeroPt) 88 | 89 | # 反量化 90 | weight_r = (quantized_weight.astype(np.float32) - 0.0) * scales.reshape(-1, 1, 1, 1) 91 | 92 | quant_output = np.reshape(weight, (1, -1)) 93 | origin_output = np.reshape(weight_r, (1, -1)) 94 | cos_sim = cosine_similarity(quant_output, origin_output) 95 | assert cos_sim > 0.99, " {} {} {}".format( 96 | cos_sim, scales.reshape((-1,))[:5], weight_r.reshape((-1,))[:5] 97 | ) 98 | return weight_r 99 | 100 | 101 | onnx_model = onnx.load(fp32_model) # 主要目的是获取模型结构 102 | inits = onnx_model.graph.initializer 103 | for idx, init in enumerate(inits): 104 | if init.name in weights_scale_map: 105 | # 需要使用LSQ更新后的权重和scale 106 | W_new = fake_quant(weights_map[init.name], weights_scale_map[init.name]) 107 | print(init.name, np.shape(W_new)) 108 | tensor = numpy_helper.from_array(W_new, init.name) 109 | onnx_model.graph.initializer[idx].CopyFrom(tensor) 110 | # else: 111 | # print(init.name) 112 | 113 | onnx.save(onnx_model, onnx_updated_model_path) 114 | 115 | # 第三步,对act部分,记录scale值, 生成calib.cache文件 116 | print("[Step4] Dump act scales") 117 | with open(onnx_updated_model_path + "_calib_cache.json", "w") as file: 118 | file.write(json.dumps(acts_scale_map, indent=4)) # use `json.loads` to do the reverse 119 | 120 | # write plain text: tensorRT需要对结果做转换 121 | # TRT-8400-EntropyCalibration2 122 | # input.1: 3ca94044 123 | # 9131: 3cf4f8d5 124 | # 加密 hex(struct.unpack(' 0.5: 131 | # print("scale过大, 建议不量化:", key, scale, 128.0 * scale) 132 | # continue 133 | scale_hex = hex(struct.unpack(" -128, 127 70 | # 转换为min max值需要乘以128.0 71 | 72 | 73 | for key, value in weights_scale_map.items(): 74 | assert isinstance(value, list), " {} {}".format(key, value) 75 | weights_scale_map[key] = np.median(value, axis=0, keepdims=False) 76 | 77 | 78 | 79 | # 第三步, 对权重部分,进行fakequant后,放回onnx模型中; 80 | print("[Step3] Fake quant weights ") 81 | 82 | 83 | def fake_quant(weight, scales): 84 | weight = np.array(weight) 85 | scales = np.array(scales) 86 | assert np.shape(weight)[0] == len(scales) 87 | # 权重量化在QAT中是-128, 127; 但是在直接转换中是-127,127 88 | quantized_weight = np.clip(np.round(weight / scales.reshape(-1, 1, 1, 1) + 0.0), -128, 127) 89 | # output = clamp(round(input / scale) + zeroPt) 90 | 91 | # 反量化 92 | weight_r = (quantized_weight.astype(np.float32) - 0.0) * scales.reshape(-1, 1, 1, 1) 93 | 94 | quant_output = np.reshape(weight, (1, -1)) 95 | origin_output = np.reshape(weight_r, (1, -1)) 96 | cos_sim = cosine_similarity(quant_output, origin_output) 97 | assert cos_sim > 0.99, " {} {} {}".format( 98 | cos_sim, scales.reshape((-1,))[:5], weight_r.reshape((-1,))[:5] 99 | ) 100 | return weight_r 101 | 102 | 103 | onnx_model = onnx.load(fp32_model) # 主要目的是获取模型结构 104 | inits = onnx_model.graph.initializer 105 | for idx, init in enumerate(inits): 106 | if init.name in weights_scale_map: 107 | # 需要使用LSQ更新后的权重和scale 108 | W_new = fake_quant(weights_map[init.name], weights_scale_map[init.name]) 109 | print(init.name, np.shape(W_new)) 110 | tensor = numpy_helper.from_array(W_new, init.name) 111 | onnx_model.graph.initializer[idx].CopyFrom(tensor) 112 | # else: 113 | # print(init.name) 114 | 115 | onnx.save(onnx_model, onnx_updated_model_path) 116 | 117 | 118 | acts_scale_map = {k: 
acts_scale_map[k] for k in sorted(acts_scale_map)} 119 | 120 | 121 | # 第三步,对act部分,记录scale值, 生成calib.cache文件 122 | print("[Step4] Dump act scales") 123 | with open(onnx_updated_model_path + "_calib_cache.json", "w") as file: 124 | file.write(json.dumps(acts_scale_map, indent=4)) # use `json.loads` to do the reverse 125 | 126 | # write plain text: tensorRT需要对结果做转换 127 | # TRT-8400-EntropyCalibration2 128 | # input.1: 3ca94044 129 | # 9131: 3cf4f8d5 130 | # 加密 hex(struct.unpack(' 0.5: 137 | # print("scale过大, 建议不量化:", key, scale, 128.0 * scale) 138 | # continue 139 | scale_hex = hex(struct.unpack(" 1: 77 | print(onnx_model.graph.node[i].name) 78 | qdq_indexes = relu2qde[onnx_model.graph.node[i].name] 79 | 80 | # 取多个scale的均值 81 | q_vals = [] 82 | dq_vals = [] 83 | q_init_names = [] 84 | dq_init_names = [] 85 | for idx in qdq_indexes: 86 | q_node_o = onnx_model.graph.node[idx].output[0] 87 | dq_o_index, dq_o_index_i, dq_index = find_dq_node_output_node(onnx_model, q_node_o) 88 | q_val = scales_map[onnx_model.graph.node[idx].input[1]] 89 | dq_val = scales_map[onnx_model.graph.node[dq_index].input[1]] 90 | q_init_names.append(onnx_model.graph.node[idx].input[1]) 91 | dq_init_names.append(onnx_model.graph.node[dq_index].input[1]) 92 | q_vals.append(q_val) 93 | dq_vals.append(dq_val) 94 | 95 | # 给权重重新赋值 96 | for idx, init in enumerate(inits): 97 | if init.name in q_init_names: 98 | W_new = np.mean(q_vals, axis=0) 99 | tensor = numpy_helper.from_array(W_new, init.name) 100 | onnx_model.graph.initializer[idx].CopyFrom(tensor) 101 | elif init.name in dq_init_names: 102 | W_new = np.mean(dq_vals, axis=0) 103 | tensor = numpy_helper.from_array(W_new, init.name) 104 | onnx_model.graph.initializer[idx].CopyFrom(tensor) 105 | 106 | 107 | # 修改移除后的输入输出,并记录需要移除的点 108 | for idx in qdq_indexes[1:]: 109 | remove_nodes.append(onnx_model.graph.node[idx].name) 110 | # 找到后续的dq节点 111 | q_node_o = onnx_model.graph.node[idx].output[0] 112 | dq_o_index, dq_o_index_i, dq_index = find_dq_node_output_node(onnx_model, q_node_o) 113 | remove_nodes.append(onnx_model.graph.node[dq_index].name) 114 | onnx_model.graph.node[dq_o_index].input[dq_o_index_i] = onnx_model.graph.node[idx].input[0] 115 | else: 116 | print("Relu wo QDQ", onnx_model.graph.node[i].name) 117 | 118 | # 删除多余的节点 119 | for rm_name in remove_nodes: 120 | for i in range(len(onnx_model.graph.node)): 121 | if onnx_model.graph.node[i].name == rm_name: 122 | old_node = onnx_model.graph.node[i] 123 | print("remove", old_node.name) 124 | onnx_model.graph.node.remove(old_node) # 删除旧节点 125 | break 126 | 127 | model_opt = onnxoptimizer.optimize(onnx_model) 128 | # model_simp, check = simplify(model_opt) 129 | model_simp = shape_inference.infer_shapes(model_opt) 130 | onnx.save(model_simp, sys.argv[2]) 131 | -------------------------------------------------------------------------------- /quantization/compare_trt_trt.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 #指定解释器 2 | # encoding:utf-8 3 | 4 | import sys 5 | 6 | print(sys.getdefaultencoding()) 7 | s = "中文乱码问题解决" 8 | print(s) 9 | 10 | # --------------------------------------------------------------- 11 | # 这个脚本向你展示了如何使用 tensorRT 对 PPQ 导出的模型进行推理 12 | 13 | # This script shows you how to export ppq internal graph to tensorRT 14 | # --------------------------------------------------------------- 15 | 16 | # For this inference test, all test data is randomly picked. 
17 | # If you want to use real data, just rewrite the defination of SAMPLES 18 | print("开始import") 19 | import onnxruntime 20 | import torch 21 | from tqdm import tqdm 22 | import glob 23 | import cv2 24 | import numpy as np 25 | from torchvision import transforms 26 | from PIL import Image 27 | import os 28 | from sklearn.metrics.pairwise import cosine_similarity 29 | import onnx 30 | from copy import deepcopy 31 | 32 | def convert_any_to_numpy(x, accepet_none: bool = True) -> np.ndarray: 33 | if x is None and not accepet_none: 34 | raise ValueError("Trying to convert an empty value.") 35 | if isinstance(x, np.ndarray): 36 | return x 37 | elif isinstance(x, int) or isinstance(x, float): 38 | return np.array( 39 | [ 40 | x, 41 | ] 42 | ) 43 | elif isinstance(x, torch.Tensor): 44 | if x.numel() == 0 and accepet_none: 45 | return None 46 | if x.numel() == 0 and not accepet_none: 47 | raise ValueError("Trying to convert an empty value.") 48 | if x.numel() == 1: 49 | return convert_any_to_numpy(x.detach().cpu().item()) 50 | if x.numel() > 1: 51 | return x.detach().cpu().numpy() 52 | elif isinstance(x, list) or isinstance(x, tuple): 53 | return np.array(x) 54 | else: 55 | raise TypeError( 56 | f"input value {x}({type(x)}) can not be converted as numpy type." 57 | ) 58 | 59 | def read_image(path): 60 | # 多任务模型 61 | _img_transforms = transforms.Compose( 62 | [ 63 | transforms.Resize((384, 768)), 64 | transforms.ToTensor(), 65 | transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), 66 | ] 67 | ) 68 | img = Image.open(path).convert("RGB") 69 | img_w, img_h = img.size[0], img.size[1] 70 | img = _img_transforms(img) 71 | img = img.unsqueeze(0) 72 | return img 73 | 74 | calibration_files = glob.glob( 75 | os.path.join("/apdcephfs/private_howellyang/data/Calib1k5/", "*.jpg") 76 | )[-100:] 77 | 78 | SAMPLES = [ 79 | read_image(path) for path in calibration_files 80 | ] # rewirte this to use real data. 
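# The PPQ scripts in this repo describe quantization error as noise energy over
# signal energy (an inverse SNR). A minimal numpy sketch of that metric, usable
# alongside the cosine-similarity check at the bottom of this script; both
# arguments are assumed to be the same output tensor from two engines:
def relative_noise_energy(reference, candidate):
    reference = np.reshape(reference, (-1,)).astype(np.float64)
    candidate = np.reshape(candidate, (-1,)).astype(np.float64)
    noise = np.sum((reference - candidate) ** 2)    # noise energy
    signal = np.sum(reference ** 2) + 1e-12         # signal energy (guard against /0)
    return float(noise / signal)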
81 | 82 | 83 | DEVICE = "cuda" 84 | FINETUNE = True 85 | EXECUTING_DEVICE = "cuda" 86 | REQUIRE_ANALYSE = True 87 | 88 | # ------------------------------------------------------------------- 89 | # 启动 tensorRT 进行推理,你先装一下 trt 90 | # ------------------------------------------------------------------- 91 | 92 | 93 | def infer_with_trt(trt_int8_path = ""): 94 | import tensorrt as trt 95 | import trt_infer 96 | 97 | samples = [convert_any_to_numpy(sample) for sample in SAMPLES] 98 | logger = trt.Logger(trt.Logger.INFO) 99 | with open(trt_int8_path, "rb") as f, trt.Runtime( 100 | logger 101 | ) as runtime: 102 | engine = runtime.deserialize_cuda_engine(f.read()) 103 | 104 | trt_outpus_all = [] 105 | with engine.create_execution_context() as context: 106 | inputs, outputs, bindings, stream, trt_output_names = trt_infer.allocate_buffers( 107 | context.engine 108 | ) 109 | for sample in tqdm(samples, desc="TensorRT is running..."): 110 | # trt infer 111 | inputs[0].host = convert_any_to_numpy(sample) 112 | trt_outputs_list = trt_infer.do_inference( 113 | context, 114 | bindings=bindings, 115 | inputs=inputs, 116 | outputs=outputs, 117 | stream=stream, 118 | batch_size=1, 119 | ) 120 | trt_outputs_dict = { 121 | trt_output_names[i]: trt_outputs_list[i] for i in range(len(trt_output_names)) 122 | } 123 | trt_outpus_all.append(deepcopy(trt_outputs_dict)) 124 | return trt_outpus_all 125 | 126 | 127 | # trt_outpus_all = infer_with_trt("/apdcephfs/private_howellyang/road_service_app/road-service/road_service/engine/mod_road_multi_tasks/model/RMTNet_release20220609.trtmodel") 128 | # trt_outpus_all = infer_with_trt("/apdcephfs/private_howellyang/onnx2trt/model.trt_int8_with_1578pics_calib_entropy_less_int8_v1.trtmodel") # 原始QAT转换的模型 129 | trt_outpus_all = infer_with_trt("/apdcephfs/private_howellyang/onnx2trt/model.trt_int8_with_1578pics_calib_entropy.trtmodel") # 进行虚拟量化转换后的模型 130 | # trt_outpus_all = infer_with_trt("/apdcephfs/private_howellyang/onnx2trt/model.no_weight_quant.int8.trtmodel") # 不虚拟量化,仅使用min max值 131 | # trt_outpus_all = infer_with_trt("/apdcephfs/private_howellyang/onnx2trt/model.weight_quantized_v2.trtmodel") # 进行虚拟量化 132 | # trt_outpus_all = infer_with_trt("/apdcephfs/private_howellyang/onnx2trt/model.weight_quantized_v2.trtmodel") # 进行虚拟量化 133 | trt_outpus_all_fp32 = infer_with_trt("/apdcephfs/private_howellyang/road_service_app/road-service/road_service/engine/mod_road_multi_tasks/model/RMTNet_release20220609.fp16.trtmodel") 134 | 135 | 136 | sims = {} 137 | for i in range(len(trt_outpus_all)): 138 | for output_name, _ in trt_outpus_all[i].items(): 139 | trt_output = np.reshape(trt_outpus_all[i][output_name], (1, -1)) 140 | trt_fp32_output = np.reshape(trt_outpus_all_fp32[i][output_name], (1, -1)) 141 | cos_sim = cosine_similarity(trt_output, trt_fp32_output) 142 | if output_name not in sims: 143 | sims[output_name] = [] 144 | sims[output_name].append(cos_sim.ravel()) 145 | # if cos_sim < 0.985: 146 | # print(output_name, cos_sim) 147 | # print(trt_fp32_output[0, :5]) 148 | # print(trt_output[0, :5]) 149 | 150 | print("===================") 151 | mean_sims = [] 152 | for key, value in sims.items(): 153 | print(key, np.mean(value), np.min(value)) 154 | mean_sims.append(np.mean(value)) 155 | print("average cosine sim = ", np.mean(mean_sims)) -------------------------------------------------------------------------------- /quantization/onnx2trt_lsq.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 #指定解释器 2 | # encoding:utf-8 3 | 4 
| import sys 5 | print(sys.getdefaultencoding()) 6 | s = "中文乱码问题解决" 7 | print(s) 8 | 9 | # --------------------------------------------------------------- 10 | # 这个脚本向你展示了如何使用 tensorRT 对 PPQ 导出的模型进行推理 11 | 12 | # This script shows you how to export ppq internal graph to tensorRT 13 | # --------------------------------------------------------------- 14 | 15 | # For this inference test, all test data is randomly picked. 16 | # If you want to use real data, just rewrite the defination of SAMPLES 17 | print("开始import") 18 | import onnxruntime 19 | import torch 20 | from ppq import * 21 | from ppq.api import * 22 | from tqdm import tqdm 23 | import glob 24 | import cv2 25 | import numpy as np 26 | from torchvision import transforms 27 | from PIL import Image 28 | import os 29 | 30 | def read_image(path): 31 | # 多任务模型 32 | _img_transforms = transforms.Compose([ 33 | transforms.Resize((384, 768)), 34 | transforms.ToTensor(), 35 | transforms.Normalize((.485, .456, .406), (.229, .224, .225)) 36 | ]) 37 | img = Image.open(path).convert('RGB') 38 | img_w, img_h = img.size[0], img.size[1] 39 | img = _img_transforms(img) 40 | img = img.unsqueeze(0) 41 | return img 42 | 43 | QUANT_PLATFROM = TargetPlatform.TRT_INT8 44 | MODEL = 'model_lsq.onnx' 45 | INPUT_SHAPE = [1, 3, 384, 768] 46 | 47 | calibration_files = glob.glob(os.path.join("/apdcephfs/private_howellyang/data/Calib1k5/", '*.jpg'))[:256] 48 | SAMPLES = [read_image(path) for path in calibration_files] # rewirte this to use real data. 49 | 50 | 51 | DEVICE = 'cuda' 52 | FINETUNE = True 53 | QS = QuantizationSettingFactory.default_setting() 54 | EXECUTING_DEVICE = 'cuda' 55 | REQUIRE_ANALYSE = True 56 | 57 | # ------------------------------------------------------------------- 58 | # 下面向你展示了常用参数调节选项: 59 | # ------------------------------------------------------------------- 60 | if PPQ_CONFIG.USING_CUDA_KERNEL: 61 | print("====== using advanced_optimization =====") 62 | QS.advanced_optimization = FINETUNE # 启动网络再训练过程,降低量化误差 63 | QS.advanced_optimization_setting.steps = 2500 # 再训练步数,影响训练时间,2500步大概几分钟 64 | QS.advanced_optimization_setting.collecting_device = 'executor' # 缓存数据放在那,executor 就是放在gpu,如果显存超了你就换成 'cpu' 65 | QS.advanced_optimization_setting.auto_check = False # 打开这个选项则训练过程中会防止过拟合,以及意外情况,通常不需要开。 66 | else: 67 | print("====== using lsq_optimization =====") 68 | QS.lsq_optimization = FINETUNE # 启动网络再训练过程,降低量化误差 69 | QS.lsq_optimization_setting.epochs = 64 # 再训练轮数,影响训练时间,30轮大概几分钟 70 | QS.lsq_optimization_setting.collecting_device = 'cuda' # 缓存数据放在那,cuda 就是放在gpu,如果显存超了你就换成 'cpu' 71 | 72 | QS.dispatching_table.append(operation='Sigmoid', platform=TargetPlatform.FP32) # 把量化的不太好的算子送回 FP32 73 | 74 | print('正准备量化你的网络,检查下列设置:') 75 | print(f'TARGET PLATFORM : {QUANT_PLATFROM.name}') 76 | print(f'NETWORK INPUTSHAPE : {INPUT_SHAPE}') 77 | 78 | # ENABLE CUDA KERNEL 会加速量化效率 3x ~ 10x,但是你如果没有装相应编译环境的话是编译不了的 79 | # 你可以尝试安装编译环境,或者在不启动 CUDA KERNEL 的情况下完成量化:移除 with ENABLE_CUDA_KERNEL(): 即可 80 | with ENABLE_CUDA_KERNEL(): 81 | qir = quantize_onnx_model( 82 | onnx_import_file=MODEL, calib_dataloader=SAMPLES, calib_steps=128, setting=QS, 83 | input_shape=INPUT_SHAPE, collate_fn=lambda x: x.to(EXECUTING_DEVICE), 84 | platform=QUANT_PLATFROM, do_quantize=True) 85 | 86 | # ------------------------------------------------------------------- 87 | # PPQ 计算量化误差时,使用信噪比的倒数作为指标,即噪声能量 / 信号能量 88 | # 量化误差 0.1 表示在整体信号中,量化噪声的能量约为 10% 89 | # 你应当注意,在 graphwise_error_analyse 分析中,我们衡量的是累计误差 90 | # 网络的最后一层往往都具有较大的累计误差,这些误差是其前面的所有层所共同造成的 91 | # 你需要使用 layerwise_error_analyse 逐层分析误差的来源 92 
| # ------------------------------------------------------------------- 93 | print('正计算网络量化误差(SNR),最后一层的误差应小于 0.1 以保证量化精度:') 94 | reports = graphwise_error_analyse( 95 | graph=qir, running_device=EXECUTING_DEVICE, steps=32, 96 | dataloader=SAMPLES, collate_fn=lambda x: x.to(EXECUTING_DEVICE)) 97 | for op, snr in reports.items(): 98 | if snr > 0.1: ppq_warning(f'层 {op} 的累计量化误差显著,请考虑进行优化') 99 | 100 | if REQUIRE_ANALYSE: 101 | print('正计算逐层量化误差(SNR),每一层的独立量化误差应小于 0.1 以保证量化精度:') 102 | layerwise_error_analyse(graph=qir, running_device=EXECUTING_DEVICE, 103 | interested_outputs=None, 104 | dataloader=SAMPLES, collate_fn=lambda x: x.to(EXECUTING_DEVICE)) 105 | 106 | print('网络量化结束,正在生成目标文件:') 107 | export_ppq_graph( 108 | graph=qir, platform=QUANT_PLATFROM, 109 | graph_save_to = 'model_copy_int8.onnx') 110 | 111 | # ------------------------------------------------------------------- 112 | # 记录一下输入输出的名字,onnxruntime 跑的时候需要提供这些名字 113 | # 我写的只是单输出单输入的版本,多输出多输入你得自己改改 114 | # ------------------------------------------------------------------- 115 | int8_input_names = [name for name, _ in qir.inputs.items()] 116 | int8_output_names = [name for name, _ in qir.outputs.items()] 117 | 118 | # ------------------------------------------------------------------- 119 | # 启动 tensorRT 进行推理,你先装一下 trt 120 | # ------------------------------------------------------------------- 121 | import tensorrt as trt 122 | import trt_infer 123 | 124 | samples = [convert_any_to_numpy(sample) for sample in SAMPLES] 125 | logger = trt.Logger(trt.Logger.INFO) 126 | with open('model_copy_int8.engine', 'rb') as f, trt.Runtime(logger) as runtime: 127 | engine = runtime.deserialize_cuda_engine(f.read()) 128 | 129 | results = [] 130 | with engine.create_execution_context() as context: 131 | inputs, outputs, bindings, stream = trt_infer.allocate_buffers(context.engine) 132 | for sample in tqdm(samples, desc='TensorRT is running...'): 133 | inputs[0].host = convert_any_to_numpy(sample) 134 | [output] = trt_infer.do_inference( 135 | context, bindings=bindings, inputs=inputs, 136 | outputs=outputs, stream=stream, batch_size=1) 137 | results.append(convert_any_to_torch_tensor(output).reshape([-1, 1000])) -------------------------------------------------------------------------------- /quantization/C03_compare_trt_fp32_int8.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 #指定解释器 2 | # encoding:utf-8 3 | 4 | import sys 5 | 6 | print(sys.getdefaultencoding()) 7 | s = "中文乱码问题解决" 8 | print(s) 9 | 10 | # --------------------------------------------------------------- 11 | # 这个脚本向你展示了如何使用 tensorRT 对 PPQ 导出的模型进行推理 12 | 13 | # This script shows you how to export ppq internal graph to tensorRT 14 | # --------------------------------------------------------------- 15 | 16 | # For this inference test, all test data is randomly picked. 
17 | # If you want to use real data, just rewrite the defination of SAMPLES 18 | print("开始import") 19 | import onnxruntime 20 | import torch 21 | from tqdm import tqdm 22 | import glob 23 | import cv2 24 | import numpy as np 25 | from torchvision import transforms 26 | from PIL import Image 27 | import os 28 | from sklearn.metrics.pairwise import cosine_similarity 29 | import onnx 30 | from copy import deepcopy 31 | 32 | def convert_any_to_numpy(x, accepet_none: bool = True) -> np.ndarray: 33 | if x is None and not accepet_none: 34 | raise ValueError("Trying to convert an empty value.") 35 | if isinstance(x, np.ndarray): 36 | return x 37 | elif isinstance(x, int) or isinstance(x, float): 38 | return np.array( 39 | [ 40 | x, 41 | ] 42 | ) 43 | elif isinstance(x, torch.Tensor): 44 | if x.numel() == 0 and accepet_none: 45 | return None 46 | if x.numel() == 0 and not accepet_none: 47 | raise ValueError("Trying to convert an empty value.") 48 | if x.numel() == 1: 49 | return convert_any_to_numpy(x.detach().cpu().item()) 50 | if x.numel() > 1: 51 | return x.detach().cpu().numpy() 52 | elif isinstance(x, list) or isinstance(x, tuple): 53 | return np.array(x) 54 | else: 55 | raise TypeError( 56 | f"input value {x}({type(x)}) can not be converted as numpy type." 57 | ) 58 | 59 | def read_image(path): 60 | mean_val = [103.53, 116.28, 123.675] 61 | std_val = [57.375, 57.12, 58.395] 62 | input_size = [768, 448] 63 | 64 | # img = np.random.randint(255, size=input_size + [3]).astype(np.uint8) 65 | img_raw = cv2.imread(path) 66 | img = cv2.resize(img_raw, (input_size[0],input_size[1])).astype(np.float32) 67 | img -= mean_val 68 | img /= std_val 69 | img = np.transpose(img, (2, 0, 1)).astype(np.float32) 70 | img = np.expand_dims(img, axis=0) 71 | 72 | img = np.ascontiguousarray(img, dtype=np.float32) 73 | # img_tensor = torch.from_numpy(img) 74 | # dummy_input = torch.autograd.Variable(img_tensor) 75 | return img 76 | 77 | calibration_files = glob.glob( 78 | os.path.join("/apdcephfs/private_howellyang/data/Calib1k5/", "*.jpg") 79 | )[-100:] 80 | 81 | SAMPLES = [ 82 | read_image(path) for path in calibration_files 83 | ] # rewirte this to use real data. 
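# Sanity notes on the preprocessing above: mean_val/std_val are the ImageNet
# statistics in BGR order scaled to the 0-255 range used by cv2.imread (the
# torchvision-based readers in this repo use the same statistics in 0-1 scale),
# and cv2.resize takes (width, height), so input_size = [768, 448] yields a
# 448x768 image and each sample has shape (1, 3, 448, 768). Quick checks:
assert np.allclose(np.array([0.406, 0.456, 0.485]) * 255.0, [103.53, 116.28, 123.675])
assert np.allclose(np.array([0.225, 0.224, 0.229]) * 255.0, [57.375, 57.12, 58.395])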
84 | 85 | 86 | DEVICE = "cuda" 87 | FINETUNE = True 88 | EXECUTING_DEVICE = "cuda" 89 | REQUIRE_ANALYSE = True 90 | 91 | # ------------------------------------------------------------------- 92 | # 启动 tensorRT 进行推理,你先装一下 trt 93 | # ------------------------------------------------------------------- 94 | 95 | 96 | def infer_with_trt(trt_int8_path = ""): 97 | import tensorrt as trt 98 | import trt_infer 99 | trt.init_libnvinfer_plugins(None, "") 100 | 101 | samples = [convert_any_to_numpy(sample) for sample in SAMPLES] 102 | logger = trt.Logger(trt.Logger.INFO) 103 | with open(trt_int8_path, "rb") as f, trt.Runtime( 104 | logger 105 | ) as runtime: 106 | engine = runtime.deserialize_cuda_engine(f.read()) 107 | 108 | trt_outpus_all = [] 109 | with engine.create_execution_context() as context: 110 | inputs, outputs, bindings, stream, trt_output_names = trt_infer.allocate_buffers( 111 | context.engine 112 | ) 113 | for sample in tqdm(samples, desc="TensorRT is running..."): 114 | # trt infer 115 | inputs[0].host = convert_any_to_numpy(sample) 116 | trt_outputs_list = trt_infer.do_inference( 117 | context, 118 | bindings=bindings, 119 | inputs=inputs, 120 | outputs=outputs, 121 | stream=stream, 122 | batch_size=1, 123 | ) 124 | trt_outputs_dict = { 125 | trt_output_names[i]: trt_outputs_list[i] for i in range(len(trt_output_names)) 126 | } 127 | trt_outpus_all.append(deepcopy(trt_outputs_dict)) 128 | return trt_outpus_all 129 | 130 | 131 | # trt_outpus_all = infer_with_trt("/apdcephfs/private_howellyang/road_service_app/road-service/road_service/engine/mod_road_multi_tasks/model/RMTNet_release20220609.trtmodel") 132 | trt_outpus_all = infer_with_trt("/apdcephfs/private_howellyang/road_service_app/LaneModel/onnx_infer/model/epoch_390_mm2conv.opt.trt_int8_with_1578pics_calib_entropy.trtmodel") # 原始QAT转换的模型 133 | # trt_outpus_all = infer_with_trt("/apdcephfs/private_howellyang/onnx2trt/model.weight_quantized.int8.trtmodel") # 进行虚拟量化转换后的模型 134 | # trt_outpus_all = infer_with_trt("/apdcephfs/private_howellyang/onnx2trt/model.no_weight_quant.int8.trtmodel") # 不虚拟量化,仅使用min max值 135 | # trt_outpus_all = infer_with_trt("/apdcephfs/private_howellyang/onnx2trt/model.weight_quantized_v2.trtmodel") # 进行虚拟量化 136 | # trt_outpus_all = infer_with_trt("/apdcephfs/private_howellyang/onnx2trt/model.weight_quantized_v2.trtmodel") # 进行虚拟量化 137 | trt_outpus_all_fp32 = infer_with_trt("/apdcephfs/private_howellyang/road_service_app/LaneModel/onnx_infer/model/epoch_390_mm2conv.opt.fp16.trtmodel") 138 | 139 | 140 | sims = {} 141 | for i in range(len(trt_outpus_all)): 142 | for output_name, _ in trt_outpus_all[i].items(): 143 | trt_output = np.reshape(trt_outpus_all[i][output_name], (1, -1)) 144 | trt_fp32_output = np.reshape(trt_outpus_all_fp32[i][output_name], (1, -1)) 145 | cos_sim = cosine_similarity(trt_output, trt_fp32_output) 146 | if output_name not in sims: 147 | sims[output_name] = [] 148 | sims[output_name].append(cos_sim.ravel()) 149 | # if cos_sim < 0.985: 150 | # print(output_name, cos_sim) 151 | # print(trt_fp32_output[0, :5]) 152 | # print(trt_output[0, :5]) 153 | 154 | print("===================") 155 | mean_sims = [] 156 | for key, value in sims.items(): 157 | print(key, np.mean(value), np.min(value)) 158 | mean_sims.append(np.mean(value)) 159 | print("average cosine sim = ", np.mean(mean_sims)) -------------------------------------------------------------------------------- /quantization/C02_compare_trt_fp32_int8.py: -------------------------------------------------------------------------------- 1 | 
#!/usr/bin/python3 #指定解释器 2 | # encoding:utf-8 3 | 4 | import sys 5 | 6 | print(sys.getdefaultencoding()) 7 | s = "中文乱码问题解决" 8 | print(s) 9 | 10 | # --------------------------------------------------------------- 11 | # 这个脚本向你展示了如何使用 tensorRT 对 PPQ 导出的模型进行推理 12 | 13 | # This script shows you how to export ppq internal graph to tensorRT 14 | # --------------------------------------------------------------- 15 | 16 | # For this inference test, all test data is randomly picked. 17 | # If you want to use real data, just rewrite the defination of SAMPLES 18 | print("开始import") 19 | import onnxruntime 20 | import torch 21 | from tqdm import tqdm 22 | import glob 23 | import cv2 24 | import numpy as np 25 | from torchvision import transforms 26 | from PIL import Image 27 | import os 28 | from sklearn.metrics.pairwise import cosine_similarity 29 | import onnx 30 | from copy import deepcopy 31 | 32 | def convert_any_to_numpy(x, accepet_none: bool = True) -> np.ndarray: 33 | if x is None and not accepet_none: 34 | raise ValueError("Trying to convert an empty value.") 35 | if isinstance(x, np.ndarray): 36 | return x 37 | elif isinstance(x, int) or isinstance(x, float): 38 | return np.array( 39 | [ 40 | x, 41 | ] 42 | ) 43 | elif isinstance(x, torch.Tensor): 44 | if x.numel() == 0 and accepet_none: 45 | return None 46 | if x.numel() == 0 and not accepet_none: 47 | raise ValueError("Trying to convert an empty value.") 48 | if x.numel() == 1: 49 | return convert_any_to_numpy(x.detach().cpu().item()) 50 | if x.numel() > 1: 51 | return x.detach().cpu().numpy() 52 | elif isinstance(x, list) or isinstance(x, tuple): 53 | return np.array(x) 54 | else: 55 | raise TypeError( 56 | f"input value {x}({type(x)}) can not be converted as numpy type." 57 | ) 58 | 59 | def read_image(path): 60 | mean = [123.675, 116.28, 103.53] 61 | std = [58.395, 57.12, 57.375] 62 | input_w = 960 63 | input_h = 480 64 | 65 | # for onnx inference 66 | mean = np.array(mean) 67 | std = np.array(std) 68 | 69 | # Load by OpenCV 70 | img = cv2.imread(path) 71 | # Convert to RGB 72 | img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) 73 | 74 | img = cv2.resize(img, (input_w, input_h)) 75 | 76 | img = img.astype(np.float32) 77 | 78 | # Norm 79 | for i in range(3): 80 | img[..., i] = (img[..., i] - mean[i]) / std[i] 81 | 82 | # hwc -> nchw 83 | h, w, c = img.shape 84 | img = img.reshape((1, c, h ,w)) 85 | 86 | return np.array(img) 87 | 88 | calibration_files = glob.glob( 89 | os.path.join("/apdcephfs/private_howellyang/data/Calib1k5/", "*.jpg") 90 | )[-100:] 91 | 92 | SAMPLES = [ 93 | read_image(path) for path in calibration_files 94 | ] # rewirte this to use real data. 
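# Note: reshape((1, c, h, w)) on an HWC array (as in read_image above) only
# reinterprets the buffer and does not move the channel axis, so the result is
# not a true NCHW tensor. If the engine expects NCHW input, the conversion
# should be an explicit transpose; a minimal sketch:
def hwc_to_nchw(img):
    # img: float32 array of shape (h, w, c) -> returns shape (1, c, h, w)
    return np.expand_dims(np.transpose(img, (2, 0, 1)), axis=0)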
95 | 96 | 97 | DEVICE = "cuda" 98 | FINETUNE = True 99 | EXECUTING_DEVICE = "cuda" 100 | REQUIRE_ANALYSE = True 101 | 102 | # ------------------------------------------------------------------- 103 | # 启动 tensorRT 进行推理,你先装一下 trt 104 | # ------------------------------------------------------------------- 105 | 106 | 107 | def infer_with_trt(trt_int8_path = ""): 108 | import tensorrt as trt 109 | import trt_infer 110 | trt.init_libnvinfer_plugins(None, "") 111 | samples = [convert_any_to_numpy(sample) for sample in SAMPLES] 112 | logger = trt.Logger(trt.Logger.INFO) 113 | with open(trt_int8_path, "rb") as f, trt.Runtime( 114 | logger 115 | ) as runtime: 116 | engine = runtime.deserialize_cuda_engine(f.read()) 117 | 118 | trt_outpus_all = [] 119 | with engine.create_execution_context() as context: 120 | inputs, outputs, bindings, stream, trt_output_names = trt_infer.allocate_buffers( 121 | context.engine 122 | ) 123 | for sample in tqdm(samples, desc="TensorRT is running..."): 124 | # trt infer 125 | inputs[0].host = convert_any_to_numpy(sample) 126 | trt_outputs_list = trt_infer.do_inference( 127 | context, 128 | bindings=bindings, 129 | inputs=inputs, 130 | outputs=outputs, 131 | stream=stream, 132 | batch_size=1, 133 | ) 134 | trt_outputs_dict = { 135 | trt_output_names[i]: trt_outputs_list[i] for i in range(len(trt_output_names)) 136 | } 137 | trt_outpus_all.append(deepcopy(trt_outputs_dict)) 138 | return trt_outpus_all 139 | 140 | 141 | # trt_outpus_all = infer_with_trt("/apdcephfs/private_howellyang/road_service_app/road-service/road_service/engine/mod_road_multi_tasks/model/RMTNet_release20220609.trtmodel") 142 | trt_outpus_all = infer_with_trt("/apdcephfs/private_howellyang/road_service_app/roadseg-infer/res101_ep100.opt.trt_int8_with_1578pics_calib_entropy.trtmodel") # 原始QAT转换的模型 143 | # trt_outpus_all = infer_with_trt("/apdcephfs/private_howellyang/onnx2trt/model.weight_quantized.int8.trtmodel") # 进行虚拟量化转换后的模型 144 | # trt_outpus_all = infer_with_trt("/apdcephfs/private_howellyang/onnx2trt/model.no_weight_quant.int8.trtmodel") # 不虚拟量化,仅使用min max值 145 | # trt_outpus_all = infer_with_trt("/apdcephfs/private_howellyang/onnx2trt/model.weight_quantized_v2.trtmodel") # 进行虚拟量化 146 | # trt_outpus_all = infer_with_trt("/apdcephfs/private_howellyang/onnx2trt/model.weight_quantized_v2.trtmodel") # 进行虚拟量化 147 | trt_outpus_all_fp32 = infer_with_trt("/apdcephfs/private_howellyang/road_service_app/roadseg-infer/res101_ep100.opt.fp16.trtmodel") 148 | 149 | 150 | sims = {} 151 | for i in range(len(trt_outpus_all)): 152 | for output_name, _ in trt_outpus_all[i].items(): 153 | trt_output = np.reshape(trt_outpus_all[i][output_name], (1, -1)) 154 | trt_fp32_output = np.reshape(trt_outpus_all_fp32[i][output_name], (1, -1)) 155 | cos_sim = cosine_similarity(trt_output, trt_fp32_output) 156 | if output_name not in sims: 157 | sims[output_name] = [] 158 | sims[output_name].append(cos_sim.ravel()) 159 | # if cos_sim < 0.985: 160 | # print(output_name, cos_sim) 161 | # print(trt_fp32_output[0, :5]) 162 | # print(trt_output[0, :5]) 163 | 164 | print("===================") 165 | mean_sims = [] 166 | for key, value in sims.items(): 167 | print(key, np.mean(value), np.min(value)) 168 | mean_sims.append(np.mean(value)) 169 | print("average cosine sim = ", np.mean(mean_sims)) -------------------------------------------------------------------------------- /onnx2trt.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | from __future__ import print_function 3 | 4 | 
import argparse 5 | import glob 6 | import os 7 | from tabnanny import verbose 8 | import tensorrt as trt 9 | import pycuda.driver as cuda 10 | import pycuda.autoinit # fix init error of cuda 11 | from google.protobuf.json_format import MessageToDict 12 | import onnx 13 | from onnxsim import simplify 14 | try: 15 | import onnxoptimizer as optimizer 16 | except: 17 | from onnx import optimizer 18 | 19 | from onnxruntime.tools.symbolic_shape_infer import SymbolicShapeInference 20 | import numpy as np 21 | from trt_utils import ( 22 | create_image_stream, 23 | create_calibrator, 24 | create_tensorrt_engine, 25 | evaluate_engine, 26 | ) 27 | 28 | parser = argparse.ArgumentParser(description="Onnx Calibration Params") 29 | parser.add_argument("--onnx", type=str, default=None, required=True, help="原始的onnx路径") 30 | parser.add_argument( 31 | "--trt_engine", type=str, default=None, required=False, help="tensorRT engine的保存路径" 32 | ) 33 | 34 | parser.add_argument( 35 | "--engine_type", 36 | type=str, 37 | default="int8", 38 | choices=["int8", "fp32", "fp16", "best"], 39 | required=False, 40 | help="模型的计算精度", 41 | ) 42 | 43 | parser.add_argument( 44 | "--trt_calib_cache", 45 | type=str, 46 | default="./trt_int8.cache", 47 | required=False, 48 | help="用来存储每个节点动态范围的路径", 49 | ) 50 | parser.add_argument( 51 | "--calib_dir", type=str, default=None, required=False, help="进行精度测试以及量化校准使用的图片路径" 52 | ) 53 | parser.add_argument( 54 | "--calib_algo", 55 | type=str, 56 | default=None, 57 | required=False, 58 | choices=["Search", "TRTEntropy", "TRTMinMax", "TRTPercentile", "ONNXEntropy", "ONNXMinMax", "ONNXPercentile"], 59 | help="""量化校准使用的算法: 60 | Search 进行自动化搜索, 自动选择最终输出的cosine距离最高的校准算法 61 | TRTEntropy 使用交叉熵评估量化前后的量化误差,自动选择误差最小的动态范围值 62 | TRTMinMax 计算每个节点输出的最大最小值,作为最终的动态范围值 63 | TRTPercentile 计算每个节点输出值,然后求其分位点作为动态范围值 64 | """, 65 | ) 66 | 67 | parser.add_argument( 68 | "--channel_order", 69 | type=str, 70 | default="RGB", 71 | required=False, 72 | choices=["RGB", "BGR"], 73 | help="图片的输入顺序, 可选BGR、RGB", 74 | ) 75 | parser.add_argument( 76 | "--means", type=str, default="0.0,0.0,0.0", required=False, help="图片预处理的均值" 77 | ) 78 | parser.add_argument( 79 | "--stds", type=str, default="1.0,1.0,1.0", required=False, help="图片预处理的方差" 80 | ) 81 | parser.add_argument( 82 | "--pixel_type", 83 | type=str, 84 | default="NCHW", 85 | required=False, 86 | choices=["NCHW", "NHWC"], 87 | help="模型输入的通道顺序, 一般而言", 88 | ) 89 | 90 | args = parser.parse_args() 91 | onnx_path = args.onnx 92 | engine_type = args.engine_type 93 | trt_engine = args.trt_engine 94 | calib_algo = args.calib_algo 95 | calib_dir = args.calib_dir 96 | means = args.means 97 | stds = args.stds 98 | pixel_type = args.pixel_type 99 | trt_calib_cache = args.trt_calib_cache 100 | channel_order = args.channel_order 101 | 102 | # 获取输入输出信息 103 | print("[ONNX2TRT] Optimizing Onnx Model....") 104 | INPUT_SHAPES = [] 105 | INPUT_NAMES = [] 106 | onnx_model = onnx.load(onnx_path) 107 | onnx_model, check = simplify(onnx_model) # simplify 108 | optimized_model = optimizer.optimize(onnx_model) # optimize 109 | onnx_model = SymbolicShapeInference.infer_shapes( 110 | onnx_model, 111 | int_max=2**31 - 1, 112 | auto_merge=True, 113 | guess_output_rank=True, 114 | verbose=2 115 | ) 116 | 117 | onnx_path = onnx_path.replace(".onnx", "") + "_with_shape.onnx" 118 | onnx.save(onnx_model, onnx_path) 119 | 120 | input_all = [node.name for node in onnx_model.graph.input] 121 | input_initializer = [node.name for node in onnx_model.graph.initializer] 122 | net_feed_input_names = 
list(set(input_all) - set(input_initializer)) 123 | 124 | for _input in onnx_model.graph.input: 125 | m_dict = MessageToDict(_input) 126 | dim_info = m_dict.get("type").get("tensorType").get("shape").get("dim") 127 | input_shape = [int(d.get("dimValue")) for d in dim_info] # [4,3,384,640] 128 | input_name = m_dict.get("name") 129 | if input_name in net_feed_input_names: 130 | INPUT_SHAPES.append(input_shape) 131 | INPUT_NAMES.append(input_name) 132 | print(INPUT_NAMES[-1], INPUT_SHAPES[-1]) 133 | 134 | if len(INPUT_SHAPES) > 1: 135 | print("模型存在多个输入, 本工具暂不支持多输入模型") 136 | raise NameError("模型存在多个输入, 本工具暂不支持多输入模型") 137 | 138 | elif len(INPUT_SHAPES[0]) != 4: 139 | print("模型的输入不是NCHW或NHWC, 本工具暂不支持这种输入格式") 140 | raise NameError("模型的输入不是NCHW或NHWC, 本工具暂不支持这种输入格式") 141 | 142 | if engine_type == "int8": 143 | if calib_algo == "Search": 144 | search_types = ["TRTEntropy", "TRTMinMax", "TRTPercentile"] 145 | else: 146 | search_types = [calib_algo] 147 | image_stream = create_image_stream( 148 | calib_dir, INPUT_SHAPES[0], means, stds, pixel_type, channel_order 149 | ) 150 | final_cos_similarity = -1.0 151 | final_engine = None 152 | print("[ONNX2TRT] Start Calibration with {}".format(search_types)) 153 | for calibrator_type in search_types: 154 | calibrator = create_calibrator( 155 | image_stream, INPUT_NAMES, trt_calib_cache, calib_algo, onnx_path 156 | ) 157 | engine = create_tensorrt_engine(onnx_path, engine_type, calibrator) 158 | cos_similarity, infer_time = evaluate_engine(onnx_path, engine, image_stream) 159 | if cos_similarity > final_cos_similarity: 160 | final_cos_similarity = cos_similarity 161 | final_engine = engine 162 | final_infer_time = infer_time 163 | print("[ONNX2TRT] INFO: 校准算法 = ", calib_algo) 164 | print("[ONNX2TRT] INFO: 与onnx输出的cos相似度 = ", cos_similarity) 165 | print("[ONNX2TRT] INFO: 模型infer的平均耗时 = ", infer_time) 166 | 167 | else: 168 | final_engine = create_tensorrt_engine(onnx_path, engine_type) 169 | if calib_dir != "": 170 | image_stream = create_image_stream( 171 | calib_dir, INPUT_SHAPES[0], means, stds, pixel_type, channel_order 172 | ) 173 | cos_similarity, infer_time = evaluate_engine( 174 | onnx_path, final_engine, image_stream 175 | ) 176 | print("[ONNX2TRT] INFO: 校准算法 = ", None) 177 | print("[ONNX2TRT] INFO: 与onnx输出的cos相似度 = ", cos_similarity) 178 | print("[ONNX2TRT] INFO: 模型infer的平均耗时 = ", infer_time) 179 | 180 | # 将trt engine写入文件 181 | print("[ONNX2TRT] INFO: 模型构建完成, 将模型写入路径 = ", trt_engine) 182 | if not os.path.exists(os.path.dirname(trt_engine)): 183 | os.makedirs(os.path.dirname(trt_engine), exist_ok=True) 184 | with open(trt_engine, "wb") as f: 185 | f.write(final_engine.serialize()) 186 | -------------------------------------------------------------------------------- /quantization/ptq/quantization_filter.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | from matplotlib.pyplot import axis 3 | from trt_utils import read_calib_cache 4 | from data_loader import DataLoader 5 | from onnx_model import OnnxModel 6 | from queue import Queue 7 | import numpy as np 8 | from onnx import numpy_helper 9 | import onnx 10 | from time import time 11 | from sklearn.metrics.pairwise import cosine_similarity 12 | 13 | class QuantizaitonFilter: 14 | def __init__(self, model_path: str, calib_path: str): 15 | self.model_path = model_path 16 | self.data_loader = DataLoader() 17 | self.model = OnnxModel(model_path) 18 | self.calib_path = calib_path 19 | OnnxModel.quantize_weights(self.model.qdq_model) 20 | 
self.fp32_weight_name2tensor = {} 21 | for weight in self.model.fp32_model.graph.initializer: 22 | self.fp32_weight_name2tensor[weight.name] = weight 23 | 24 | self.int8_weight_name2tensor = {} 25 | for weight in self.model.qdq_model.graph.initializer: 26 | self.int8_weight_name2tensor[weight.name] = weight 27 | 28 | def is_conv_output(self, onnx_model, tensor_name): 29 | pre_node = None 30 | for node in onnx_model.graph.node: 31 | for output in node.output: 32 | if output == tensor_name: 33 | pre_node = node 34 | break 35 | 36 | if pre_node is None: 37 | return False 38 | 39 | if pre_node.op_type == "Conv": 40 | return True 41 | elif pre_node.op_type == "Relu": 42 | return self.is_conv_output(onnx_model, pre_node.input[0]) 43 | elif pre_node.op_type == "Concat": 44 | ret = False 45 | for input in pre_node.input[0]: 46 | ret |= self.is_conv_output(onnx_model, input) 47 | return ret 48 | 49 | def get_conv_tensors(self, onnx_model, act_scale_map): 50 | conv_out_scale_map = {} 51 | conv_out_to_bias = {} 52 | for node in onnx_model.graph.node: 53 | for tensor_name in node.output: 54 | if tensor_name not in act_scale_map: 55 | continue 56 | else: 57 | if node.op_type == "Conv": 58 | if len(node.input) >= 3: # with bias 59 | conv_out_scale_map[tensor_name] = act_scale_map[tensor_name] 60 | conv_out_to_bias[tensor_name] = node.input[2] 61 | elif node.op_type == "Relu": 62 | pre_nodes = OnnxModel.get_previous_nodes(onnx_model, node.input[0]) 63 | assert len(pre_nodes) == 1, "Relu should only have one input" 64 | if pre_nodes[0].op_type == "Conv" and len(pre_nodes[0].input) >= 3: 65 | conv_out_scale_map[node.input[0]] = act_scale_map[tensor_name] 66 | conv_out_to_bias[node.input[0]] = pre_nodes[0].input[2] 67 | elif node.op_type == "Concat": 68 | for input in node.input: 69 | pre_nodes = OnnxModel.get_previous_nodes(onnx_model, input) 70 | assert len(pre_nodes) == 1, "each input shold corespond to one node" 71 | if pre_nodes[0].op_type == "Conv" and len(pre_nodes[0].input) >= 3: 72 | conv_out_scale_map[input] = act_scale_map[tensor_name] 73 | conv_out_to_bias[input] = pre_nodes[0].input[2] 74 | elif pre_nodes[0].op_type == "Relu": 75 | nodes_before_relu = OnnxModel.get_previous_nodes(onnx_model, pre_nodes[0].input[0]) 76 | assert len(nodes_before_relu) == 1, "Relu should only have one input" 77 | if nodes_before_relu[0].op_type == "Conv" and len(nodes_before_relu[0].input) >= 3: 78 | conv_out_scale_map[pre_nodes[0].input[0]] = act_scale_map[tensor_name] 79 | conv_out_to_bias[pre_nodes[0].input[0]] = nodes_before_relu[0].input[2] 80 | return conv_out_scale_map, conv_out_to_bias 81 | 82 | def eval_quantize(self, fp32_output, int8_output): 83 | sims = [] 84 | diffs = [] 85 | rel_diffs = [] 86 | for fp32, int8 in zip(fp32_output, int8_output): 87 | fp32 = np.reshape(fp32, (1, -1)) 88 | int8 = np.reshape(int8, (1, -1)) 89 | sim = cosine_similarity(fp32, int8) 90 | diff = np.abs(fp32 - int8) 91 | rel_diff = diff / (np.abs(fp32) + 1e-8) 92 | sims.append(sim) 93 | diffs.append(np.median(diff)) 94 | rel_diffs.append(np.median(rel_diff)) 95 | return np.mean(sims), np.mean(diffs), np.mean(rel_diffs) 96 | 97 | def process(self): 98 | # Step 01. read input data 99 | input_data = self.data_loader.get_numpy_data(image_num=100) 100 | 101 | # Step 02. 
read calibration cache 102 | act_scale_map = read_calib_cache(self.calib_path) 103 | act_scale_map = {name: value for name, value in act_scale_map.items() if name in self.model.all_tensor_names} 104 | qdq_model = deepcopy(self.model.qdq_model) 105 | for tensor_name, scale_value in act_scale_map.items(): 106 | OnnxModel.add_act_dqd_node(qdq_model, tensor_name, scale_value) 107 | onnx.save(qdq_model, self.model_path + "_qdq100.onnx") 108 | 109 | # Step 03. caculate snrs 110 | # fp32_outputs = OnnxModel.get_onnx_outputs(self.model.fp32_model, list(act_scale_map.keys()), input_data) 111 | # snrs = {} 112 | # for name, fp32_output in fp32_outputs.items(): 113 | # snrs[name] = self.caculate_snr(fp32_outputs[name], act_scale_map[name]) 114 | 115 | return self.model.fp32_model, self.model.qdq_model 116 | 117 | 118 | if __name__ == "__main__": 119 | import sys 120 | onnx_path = "/mapai/howellyang/code/onnx2trt/RMTNet_release20220609_mm2conv.optimized.rm_inits.onnx" 121 | calib_path = "/mapai/howellyang/code/onnx2trt/RMTNet_release20220609_mm2conv.optimized.trt_int8_with_531pics_calib_percentile595.calib_cache" 122 | BS = QuantizaitonFilter(onnx_path, calib_path) 123 | onnx_model, qdq_model = BS.process() 124 | # onnx.save(onnx_model, "/mapai/howellyang/code/onnx2trt/RMTNet_release20220609_mm2conv.optimized.bias_correction_v1.onnx") 125 | # onnx.save(qdq_model, "/mapai/howellyang/code/onnx2trt/RMTNet_release20220609_mm2conv.optimized.bias_correction_v1.qdq.onnx") -------------------------------------------------------------------------------- /quantization/P02_MT_onnx2tensorRT_int8.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 #指定解释器 2 | # encoding:utf-8 3 | 4 | import sys 5 | 6 | print(sys.getdefaultencoding()) 7 | s = "中文乱码问题解决" 8 | print(s) 9 | 10 | # --------------------------------------------------------------- 11 | # 这个脚本向你展示了如何使用 tensorRT 对 PPQ 导出的模型进行推理 12 | 13 | # This script shows you how to export ppq internal graph to tensorRT 14 | # --------------------------------------------------------------- 15 | 16 | # For this inference test, all test data is randomly picked. 17 | # If you want to use real data, just rewrite the defination of SAMPLES 18 | print("开始import") 19 | import onnxruntime 20 | import torch 21 | from ppq import * 22 | from ppq.api import * 23 | from tqdm import tqdm 24 | import glob 25 | import cv2 26 | import numpy as np 27 | from torchvision import transforms 28 | from PIL import Image 29 | import os 30 | 31 | 32 | def read_image(path): 33 | # 多任务模型 34 | _img_transforms = transforms.Compose( 35 | [ 36 | transforms.Resize((384, 768)), 37 | transforms.ToTensor(), 38 | transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), 39 | ] 40 | ) 41 | img = Image.open(path).convert("RGB") 42 | img_w, img_h = img.size[0], img.size[1] 43 | img = _img_transforms(img) 44 | img = img.unsqueeze(0) 45 | return img 46 | 47 | 48 | QUANT_PLATFROM = TargetPlatform.TRT_INT8 49 | MODEL = "Models/RMTNet_release20220609_v2.opt.onnx" 50 | INPUT_SHAPE = [1, 3, 384, 768] 51 | 52 | calibration_files = glob.glob( 53 | os.path.join("/apdcephfs/private_howellyang/data/Calib1k5/", "*.jpg") 54 | )[:128] 55 | SAMPLES = [ 56 | read_image(path) for path in calibration_files 57 | ] # rewirte this to use real data. 
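# Each sample returned by read_image above is assumed to be a 1x3x384x768
# tensor matching the INPUT_SHAPE passed to quantize_onnx_model further down;
# a quick sanity check before calibration starts:
assert all(list(s.shape) == INPUT_SHAPE for s in SAMPLES)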
58 | 59 | 60 | DEVICE = "cuda" 61 | FINETUNE = True 62 | QS = QuantizationSettingFactory.default_setting() 63 | EXECUTING_DEVICE = "cuda" 64 | REQUIRE_ANALYSE = True 65 | 66 | # ------------------------------------------------------------------- 67 | # 下面向你展示了常用参数调节选项: 68 | # ------------------------------------------------------------------- 69 | if PPQ_CONFIG.USING_CUDA_KERNEL: 70 | print("====== using advanced_optimization =====") 71 | QS.advanced_optimization = FINETUNE # 启动网络再训练过程,降低量化误差 72 | QS.advanced_optimization_setting.steps = 2500 # 再训练步数,影响训练时间,2500步大概几分钟 73 | QS.advanced_optimization_setting.collecting_device = ( 74 | "executor" # 缓存数据放在那,executor 就是放在gpu,如果显存超了你就换成 'cpu' 75 | ) 76 | QS.advanced_optimization_setting.auto_check = ( 77 | False # 打开这个选项则训练过程中会防止过拟合,以及意外情况,通常不需要开。 78 | ) 79 | else: 80 | print("====== using lsq_optimization =====") 81 | QS.lsq_optimization = FINETUNE # 启动网络再训练过程,降低量化误差 82 | QS.lsq_optimization_setting.epochs = 128 # 再训练轮数,影响训练时间,30轮大概几分钟 83 | QS.lsq_optimization_setting.collecting_device = ( 84 | "cuda" # 缓存数据放在那,cuda 就是放在gpu,如果显存超了你就换成 'cpu' 85 | ) 86 | 87 | # 把量化的不太好的算子送回 FP32 88 | # QS.dispatching_table.append(operation="Concat_2420", platform=TargetPlatform.FP32) 89 | 90 | 91 | print("正准备量化你的网络,检查下列设置:") 92 | print(f"TARGET PLATFORM : {QUANT_PLATFROM.name}") 93 | print(f"NETWORK INPUTSHAPE : {INPUT_SHAPE}") 94 | 95 | # ENABLE CUDA KERNEL 会加速量化效率 3x ~ 10x,但是你如果没有装相应编译环境的话是编译不了的 96 | # 你可以尝试安装编译环境,或者在不启动 CUDA KERNEL 的情况下完成量化:移除 with ENABLE_CUDA_KERNEL(): 即可 97 | with ENABLE_CUDA_KERNEL(): 98 | qir = quantize_onnx_model( 99 | onnx_import_file=MODEL, 100 | calib_dataloader=SAMPLES, 101 | calib_steps=128, 102 | setting=QS, 103 | input_shape=INPUT_SHAPE, 104 | collate_fn=lambda x: x.to(EXECUTING_DEVICE), 105 | platform=QUANT_PLATFROM, 106 | do_quantize=True, 107 | ) 108 | 109 | # ------------------------------------------------------------------- 110 | # PPQ 计算量化误差时,使用信噪比的倒数作为指标,即噪声能量 / 信号能量 111 | # 量化误差 0.1 表示在整体信号中,量化噪声的能量约为 10% 112 | # 你应当注意,在 graphwise_error_analyse 分析中,我们衡量的是累计误差 113 | # 网络的最后一层往往都具有较大的累计误差,这些误差是其前面的所有层所共同造成的 114 | # 你需要使用 layerwise_error_analyse 逐层分析误差的来源 115 | # ------------------------------------------------------------------- 116 | print("正计算网络量化误差(SNR),最后一层的误差应小于 0.1 以保证量化精度:") 117 | reports = graphwise_error_analyse( 118 | graph=qir, 119 | running_device=EXECUTING_DEVICE, 120 | steps=32, 121 | dataloader=SAMPLES, 122 | collate_fn=lambda x: x.to(EXECUTING_DEVICE), 123 | ) 124 | for op, snr in reports.items(): 125 | if snr > 0.1: 126 | ppq_warning(f"层 {op} 的累计量化误差显著,请考虑进行优化") 127 | 128 | if REQUIRE_ANALYSE: 129 | print("正计算逐层量化误差(SNR),每一层的独立量化误差应小于 0.1 以保证量化精度:") 130 | layerwise_error_analyse( 131 | graph=qir, 132 | running_device=EXECUTING_DEVICE, 133 | interested_outputs=None, 134 | dataloader=SAMPLES, 135 | collate_fn=lambda x: x.to(EXECUTING_DEVICE), 136 | ) 137 | 138 | print("网络量化结束,正在生成目标文件:") 139 | export_ppq_graph( 140 | graph=qir, 141 | platform=QUANT_PLATFROM, 142 | graph_save_to=MODEL.replace(".onnx", "_v2_int8.onnx"), 143 | ) 144 | 145 | # ------------------------------------------------------------------- 146 | # 记录一下输入输出的名字,onnxruntime 跑的时候需要提供这些名字 147 | # 我写的只是单输出单输入的版本,多输出多输入你得自己改改 148 | # ------------------------------------------------------------------- 149 | int8_input_names = [name for name, _ in qir.inputs.items()] 150 | int8_output_names = [name for name, _ in qir.outputs.items()] 151 | 152 | # ------------------------------------------------------------------- 153 | # 启动 tensorRT 进行推理,你先装一下 trt 154 
| # ------------------------------------------------------------------- 155 | import tensorrt as trt 156 | import trt_infer 157 | 158 | samples = [convert_any_to_numpy(sample) for sample in SAMPLES] 159 | logger = trt.Logger(trt.Logger.INFO) 160 | with open(MODEL.replace(".onnx", "_v2_int8.engine"), "rb") as f, trt.Runtime( 161 | logger 162 | ) as runtime: 163 | engine = runtime.deserialize_cuda_engine(f.read()) 164 | 165 | results = [] 166 | with engine.create_execution_context() as context: 167 | inputs, outputs, bindings, stream, _ = trt_infer.allocate_buffers( 168 | context.engine 169 | ) 170 | for sample in tqdm(samples, desc="TensorRT is running..."): 171 | inputs[0].host = convert_any_to_numpy(sample) 172 | output = trt_infer.do_inference( 173 | context, 174 | bindings=bindings, 175 | inputs=inputs, 176 | outputs=outputs, 177 | stream=stream, 178 | batch_size=1, 179 | ) 180 | # results.append(convert_any_to_torch_tensor(output).reshape([-1, 1000])) 181 | -------------------------------------------------------------------------------- /quantization/onnx2tensorRT_adaround.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 #指定解释器 2 | # encoding:utf-8 3 | 4 | import sys 5 | 6 | print(sys.getdefaultencoding()) 7 | s = "中文乱码问题解决" 8 | print(s) 9 | 10 | # --------------------------------------------------------------- 11 | # 这个脚本向你展示了如何使用 tensorRT 对 PPQ 导出的模型进行推理 12 | 13 | # This script shows you how to export ppq internal graph to tensorRT 14 | # --------------------------------------------------------------- 15 | 16 | # For this inference test, all test data is randomly picked. 17 | # If you want to use real data, just rewrite the defination of SAMPLES 18 | print("开始import") 19 | import onnxruntime 20 | import torch 21 | from ppq import * 22 | from ppq.api import * 23 | from tqdm import tqdm 24 | import glob 25 | import cv2 26 | import numpy as np 27 | from torchvision import transforms 28 | from PIL import Image 29 | import os 30 | 31 | 32 | def read_image(path): 33 | # 多任务模型 34 | _img_transforms = transforms.Compose( 35 | [ 36 | transforms.Resize((384, 768)), 37 | transforms.ToTensor(), 38 | transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), 39 | ] 40 | ) 41 | img = Image.open(path).convert("RGB") 42 | img_w, img_h = img.size[0], img.size[1] 43 | img = _img_transforms(img) 44 | img = img.unsqueeze(0) 45 | return img 46 | 47 | 48 | QUANT_PLATFROM = TargetPlatform.TRT_INT8 49 | MODEL = "model_copy_adaround.onnx" 50 | INPUT_SHAPE = [1, 3, 384, 768] 51 | 52 | calibration_files = glob.glob( 53 | os.path.join("/apdcephfs/private_howellyang/data/Calib1k5/", "*.jpg") 54 | )[:128] 55 | SAMPLES = [ 56 | read_image(path) for path in calibration_files 57 | ] # rewirte this to use real data. 
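# For a quick FP32 reference on the same calibration samples (e.g. to compare
# against the TensorRT outputs printed at the end of this script), onnxruntime
# can be used directly; a minimal sketch assuming a single-input model:
def onnx_reference_outputs(onnx_path, samples):
    sess = onnxruntime.InferenceSession(onnx_path, providers=["CPUExecutionProvider"])
    input_name = sess.get_inputs()[0].name
    return [sess.run(None, {input_name: convert_any_to_numpy(s)}) for s in samples]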
58 | 59 | 60 | DEVICE = "cuda" 61 | FINETUNE = True 62 | QS = QuantizationSettingFactory.default_setting() 63 | EXECUTING_DEVICE = "cuda" 64 | REQUIRE_ANALYSE = True 65 | 66 | # ------------------------------------------------------------------- 67 | # 下面向你展示了常用参数调节选项: 68 | # ------------------------------------------------------------------- 69 | if True: # PPQ_CONFIG.USING_CUDA_KERNEL: 70 | print("====== using advanced_optimization =====") 71 | QS.advanced_optimization = FINETUNE # 启动网络再训练过程,降低量化误差 72 | QS.advanced_optimization_setting.steps = 2500 # 再训练步数,影响训练时间,2500步大概几分钟 73 | QS.advanced_optimization_setting.collecting_device = ( 74 | "executor" # 缓存数据放在那,executor 就是放在gpu,如果显存超了你就换成 'cpu' 75 | ) 76 | QS.advanced_optimization_setting.auto_check = ( 77 | False # 打开这个选项则训练过程中会防止过拟合,以及意外情况,通常不需要开。 78 | ) 79 | else: 80 | print("====== using lsq_optimization =====") 81 | QS.lsq_optimization = FINETUNE # 启动网络再训练过程,降低量化误差 82 | QS.lsq_optimization_setting.epochs = 32 # 再训练轮数,影响训练时间,30轮大概几分钟 83 | QS.lsq_optimization_setting.collecting_device = ( 84 | "cuda" # 缓存数据放在那,cuda 就是放在gpu,如果显存超了你就换成 'cpu' 85 | ) 86 | 87 | QS.dispatching_table.append( 88 | operation="Sigmoid", platform=TargetPlatform.FP32 89 | ) # 把量化的不太好的算子送回 FP32 90 | 91 | print("正准备量化你的网络,检查下列设置:") 92 | print(f"TARGET PLATFORM : {QUANT_PLATFROM.name}") 93 | print(f"NETWORK INPUTSHAPE : {INPUT_SHAPE}") 94 | 95 | # ENABLE CUDA KERNEL 会加速量化效率 3x ~ 10x,但是你如果没有装相应编译环境的话是编译不了的 96 | # 你可以尝试安装编译环境,或者在不启动 CUDA KERNEL 的情况下完成量化:移除 with ENABLE_CUDA_KERNEL(): 即可 97 | with ENABLE_CUDA_KERNEL(): 98 | qir = quantize_onnx_model( 99 | onnx_import_file=MODEL, 100 | calib_dataloader=SAMPLES, 101 | calib_steps=32, 102 | setting=QS, 103 | input_shape=INPUT_SHAPE, 104 | collate_fn=lambda x: x.to(EXECUTING_DEVICE), 105 | platform=QUANT_PLATFROM, 106 | do_quantize=True, 107 | ) 108 | 109 | # ------------------------------------------------------------------- 110 | # PPQ 计算量化误差时,使用信噪比的倒数作为指标,即噪声能量 / 信号能量 111 | # 量化误差 0.1 表示在整体信号中,量化噪声的能量约为 10% 112 | # 你应当注意,在 graphwise_error_analyse 分析中,我们衡量的是累计误差 113 | # 网络的最后一层往往都具有较大的累计误差,这些误差是其前面的所有层所共同造成的 114 | # 你需要使用 layerwise_error_analyse 逐层分析误差的来源 115 | # ------------------------------------------------------------------- 116 | print("正计算网络量化误差(SNR),最后一层的误差应小于 0.1 以保证量化精度:") 117 | reports = graphwise_error_analyse( 118 | graph=qir, 119 | running_device=EXECUTING_DEVICE, 120 | steps=32, 121 | dataloader=SAMPLES, 122 | collate_fn=lambda x: x.to(EXECUTING_DEVICE), 123 | ) 124 | for op, snr in reports.items(): 125 | if snr > 0.1: 126 | ppq_warning(f"层 {op} 的累计量化误差显著,请考虑进行优化") 127 | 128 | if REQUIRE_ANALYSE: 129 | print("正计算逐层量化误差(SNR),每一层的独立量化误差应小于 0.1 以保证量化精度:") 130 | layerwise_error_analyse( 131 | graph=qir, 132 | running_device=EXECUTING_DEVICE, 133 | interested_outputs=None, 134 | dataloader=SAMPLES, 135 | collate_fn=lambda x: x.to(EXECUTING_DEVICE), 136 | ) 137 | 138 | print("网络量化结束,正在生成目标文件:") 139 | export_ppq_graph( 140 | graph=qir, 141 | platform=QUANT_PLATFROM, 142 | graph_save_to="model_copy_adaround_int8.onnx", 143 | ) 144 | 145 | # ------------------------------------------------------------------- 146 | # 记录一下输入输出的名字,onnxruntime 跑的时候需要提供这些名字 147 | # 我写的只是单输出单输入的版本,多输出多输入你得自己改改 148 | # ------------------------------------------------------------------- 149 | int8_input_names = [name for name, _ in qir.inputs.items()] 150 | int8_output_names = [name for name, _ in qir.outputs.items()] 151 | 152 | # ------------------------------------------------------------------- 153 | # 启动 tensorRT 进行推理,你先装一下 trt 154 | # 
------------------------------------------------------------------- 155 | import tensorrt as trt 156 | import trt_infer 157 | 158 | samples = [convert_any_to_numpy(sample) for sample in SAMPLES] 159 | logger = trt.Logger(trt.Logger.INFO) 160 | with open("model_copy_adaround_int8.engine", "rb") as f, trt.Runtime( 161 | logger 162 | ) as runtime: 163 | engine = runtime.deserialize_cuda_engine(f.read()) 164 | 165 | results = [] 166 | with engine.create_execution_context() as context: 167 | inputs, outputs, bindings, stream, output_names = trt_infer.allocate_buffers( 168 | context.engine 169 | ) 170 | for sample in tqdm(samples, desc="TensorRT is running..."): 171 | inputs[0].host = convert_any_to_numpy(sample) 172 | outputs_list = trt_infer.do_inference( 173 | context, 174 | bindings=bindings, 175 | inputs=inputs, 176 | outputs=outputs, 177 | stream=stream, 178 | batch_size=1, 179 | ) 180 | for output in outputs_list: 181 | print(np.reshape(output, (1, -1))[0, :10]) 182 | -------------------------------------------------------------------------------- /quantization/P01_MT_onnx2tensorRT_int8.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 #指定解释器 2 | # encoding:utf-8 3 | 4 | import sys 5 | 6 | print(sys.getdefaultencoding()) 7 | s = "中文乱码问题解决" 8 | print(s) 9 | 10 | # --------------------------------------------------------------- 11 | # 这个脚本向你展示了如何使用 tensorRT 对 PPQ 导出的模型进行推理 12 | 13 | # This script shows you how to export ppq internal graph to tensorRT 14 | # --------------------------------------------------------------- 15 | 16 | # For this inference test, all test data is randomly picked. 17 | # If you want to use real data, just rewrite the defination of SAMPLES 18 | print("开始import") 19 | import onnxruntime 20 | import torch 21 | from ppq import * 22 | from ppq.api import * 23 | from tqdm import tqdm 24 | import glob 25 | import cv2 26 | import numpy as np 27 | from torchvision import transforms 28 | from PIL import Image 29 | import os 30 | 31 | 32 | def read_image(path): 33 | # 多任务模型 34 | _img_transforms = transforms.Compose( 35 | [ 36 | transforms.Resize((384, 768)), 37 | transforms.ToTensor(), 38 | transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), 39 | ] 40 | ) 41 | img = Image.open(path).convert("RGB") 42 | img_w, img_h = img.size[0], img.size[1] 43 | img = _img_transforms(img) 44 | img = img.unsqueeze(0) 45 | return img 46 | 47 | 48 | QUANT_PLATFROM = TargetPlatform.TRT_INT8 49 | MODEL = "/apdcephfs/private_howellyang/onnx2trt/model_T01/model.onnx" 50 | INPUT_SHAPE = [1, 3, 384, 768] 51 | 52 | calibration_files = glob.glob( 53 | os.path.join("/apdcephfs/private_howellyang/data/Calib1k5/", "*.jpg") 54 | )[:128] 55 | SAMPLES = [ 56 | read_image(path) for path in calibration_files 57 | ] # rewirte this to use real data. 
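# For reference, the symmetric fake-quant/dequant round trip that INT8
# quantization applies to a tensor, written in the same convention as the
# weight fake_quant helper used elsewhere in this repo (clip to the int8
# range, then rescale); the gap between x and the return value is the
# per-tensor quantization error being analysed below:
def fake_quant_dequant(x, scale):
    return np.clip(np.round(np.asarray(x, dtype=np.float32) / scale), -128, 127) * scale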
58 | 59 | 60 | DEVICE = "cuda" 61 | FINETUNE = True 62 | QS = QuantizationSettingFactory.default_setting() 63 | EXECUTING_DEVICE = "cuda" 64 | REQUIRE_ANALYSE = True 65 | 66 | # ------------------------------------------------------------------- 67 | # 下面向你展示了常用参数调节选项: 68 | # ------------------------------------------------------------------- 69 | if PPQ_CONFIG.USING_CUDA_KERNEL: 70 | print("====== using advanced_optimization =====") 71 | QS.advanced_optimization = FINETUNE # 启动网络再训练过程,降低量化误差 72 | QS.advanced_optimization_setting.steps = 2500 # 再训练步数,影响训练时间,2500步大概几分钟 73 | QS.advanced_optimization_setting.collecting_device = ( 74 | "executor" # 缓存数据放在那,executor 就是放在gpu,如果显存超了你就换成 'cpu' 75 | ) 76 | QS.advanced_optimization_setting.auto_check = ( 77 | False # 打开这个选项则训练过程中会防止过拟合,以及意外情况,通常不需要开。 78 | ) 79 | else: 80 | print("====== using lsq_optimization =====") 81 | QS.lsq_optimization = FINETUNE # 启动网络再训练过程,降低量化误差 82 | QS.lsq_optimization_setting.epochs = 128 # 再训练轮数,影响训练时间,30轮大概几分钟 83 | QS.lsq_optimization_setting.collecting_device = ( 84 | "cuda" # 缓存数据放在那,cuda 就是放在gpu,如果显存超了你就换成 'cpu' 85 | ) 86 | 87 | # 把量化的不太好的算子送回 FP32 88 | # QS.dispatching_table.append(operation="Concat_2420", platform=TargetPlatform.FP32) 89 | 90 | 91 | print("正准备量化你的网络,检查下列设置:") 92 | print(f"TARGET PLATFORM : {QUANT_PLATFROM.name}") 93 | print(f"NETWORK INPUTSHAPE : {INPUT_SHAPE}") 94 | 95 | # ENABLE CUDA KERNEL 会加速量化效率 3x ~ 10x,但是你如果没有装相应编译环境的话是编译不了的 96 | # 你可以尝试安装编译环境,或者在不启动 CUDA KERNEL 的情况下完成量化:移除 with ENABLE_CUDA_KERNEL(): 即可 97 | with ENABLE_CUDA_KERNEL(): 98 | qir = quantize_onnx_model( 99 | onnx_import_file=MODEL, 100 | calib_dataloader=SAMPLES, 101 | calib_steps=128, 102 | setting=QS, 103 | input_shape=INPUT_SHAPE, 104 | collate_fn=lambda x: x.to(EXECUTING_DEVICE), 105 | platform=QUANT_PLATFROM, 106 | do_quantize=True, 107 | ) 108 | 109 | # ------------------------------------------------------------------- 110 | # PPQ 计算量化误差时,使用信噪比的倒数作为指标,即噪声能量 / 信号能量 111 | # 量化误差 0.1 表示在整体信号中,量化噪声的能量约为 10% 112 | # 你应当注意,在 graphwise_error_analyse 分析中,我们衡量的是累计误差 113 | # 网络的最后一层往往都具有较大的累计误差,这些误差是其前面的所有层所共同造成的 114 | # 你需要使用 layerwise_error_analyse 逐层分析误差的来源 115 | # ------------------------------------------------------------------- 116 | print("正计算网络量化误差(SNR),最后一层的误差应小于 0.1 以保证量化精度:") 117 | reports = graphwise_error_analyse( 118 | graph=qir, 119 | running_device=EXECUTING_DEVICE, 120 | steps=32, 121 | dataloader=SAMPLES, 122 | collate_fn=lambda x: x.to(EXECUTING_DEVICE), 123 | ) 124 | for op, snr in reports.items(): 125 | if snr > 0.1: 126 | ppq_warning(f"层 {op} 的累计量化误差显著,请考虑进行优化") 127 | 128 | if REQUIRE_ANALYSE: 129 | print("正计算逐层量化误差(SNR),每一层的独立量化误差应小于 0.1 以保证量化精度:") 130 | layerwise_error_analyse( 131 | graph=qir, 132 | running_device=EXECUTING_DEVICE, 133 | interested_outputs=None, 134 | dataloader=SAMPLES, 135 | collate_fn=lambda x: x.to(EXECUTING_DEVICE), 136 | ) 137 | 138 | print("网络量化结束,正在生成目标文件:") 139 | export_ppq_graph( 140 | graph=qir, 141 | platform=QUANT_PLATFROM, 142 | graph_save_to=MODEL.replace(".onnx", "_v2_int8.onnx"), 143 | ) 144 | 145 | # ------------------------------------------------------------------- 146 | # 记录一下输入输出的名字,onnxruntime 跑的时候需要提供这些名字 147 | # 我写的只是单输出单输入的版本,多输出多输入你得自己改改 148 | # ------------------------------------------------------------------- 149 | int8_input_names = [name for name, _ in qir.inputs.items()] 150 | int8_output_names = [name for name, _ in qir.outputs.items()] 151 | 152 | # ------------------------------------------------------------------- 153 | # 启动 tensorRT 进行推理,你先装一下 trt 154 
| # ------------------------------------------------------------------- 155 | import tensorrt as trt 156 | import trt_infer 157 | 158 | samples = [convert_any_to_numpy(sample) for sample in SAMPLES] 159 | logger = trt.Logger(trt.Logger.INFO) 160 | with open(MODEL.replace(".onnx", "_v2_int8.engine"), "rb") as f, trt.Runtime( 161 | logger 162 | ) as runtime: 163 | engine = runtime.deserialize_cuda_engine(f.read()) 164 | 165 | results = [] 166 | with engine.create_execution_context() as context: 167 | inputs, outputs, bindings, stream, _ = trt_infer.allocate_buffers( 168 | context.engine 169 | ) 170 | for sample in tqdm(samples, desc="TensorRT is running..."): 171 | inputs[0].host = convert_any_to_numpy(sample) 172 | output = trt_infer.do_inference( 173 | context, 174 | bindings=bindings, 175 | inputs=inputs, 176 | outputs=outputs, 177 | stream=stream, 178 | batch_size=1, 179 | ) 180 | # results.append(convert_any_to_torch_tensor(output).reshape([-1, 1000])) 181 | -------------------------------------------------------------------------------- /quantization/ptq/P01_MT_onnx2tensorRT_int8.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 #指定解释器 2 | # encoding:utf-8 3 | 4 | import sys 5 | 6 | print(sys.getdefaultencoding()) 7 | s = "中文乱码问题解决" 8 | print(s) 9 | 10 | # --------------------------------------------------------------- 11 | # 这个脚本向你展示了如何使用 tensorRT 对 PPQ 导出的模型进行推理 12 | 13 | # This script shows you how to export ppq internal graph to tensorRT 14 | # --------------------------------------------------------------- 15 | 16 | # For this inference test, all test data is randomly picked. 17 | # If you want to use real data, just rewrite the defination of SAMPLES 18 | print("开始import") 19 | import onnxruntime 20 | import torch 21 | from ppq import * 22 | from ppq.api import * 23 | from tqdm import tqdm 24 | import glob 25 | import cv2 26 | import numpy as np 27 | from torchvision import transforms 28 | from PIL import Image 29 | import os 30 | 31 | 32 | def read_image(path): 33 | # 多任务模型 34 | _img_transforms = transforms.Compose( 35 | [ 36 | transforms.Resize((384, 768)), 37 | transforms.ToTensor(), 38 | transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), 39 | ] 40 | ) 41 | img = Image.open(path).convert("RGB") 42 | img_w, img_h = img.size[0], img.size[1] 43 | img = _img_transforms(img) 44 | img = img.unsqueeze(0) 45 | return img 46 | 47 | 48 | QUANT_PLATFROM = TargetPlatform.TRT_INT8 49 | MODEL = "/apdcephfs/private_howellyang/onnx2trt/model_T01/model.onnx" 50 | INPUT_SHAPE = [1, 3, 384, 768] 51 | 52 | calibration_files = glob.glob( 53 | os.path.join("/apdcephfs/private_howellyang/data/Calib1k5/", "*.jpg") 54 | )[:128] 55 | SAMPLES = [ 56 | read_image(path) for path in calibration_files 57 | ] # rewirte this to use real data. 
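# A related reference: deriving per-output-channel weight scales in the
# "direct conversion" convention mentioned in the weight fake_quant comments
# elsewhere in this repo (symmetric range clipped to [-127, 127]); W is
# assumed to be an OIHW convolution weight:
def per_channel_weight_scales(W):
    W = np.asarray(W, dtype=np.float32)
    return np.max(np.abs(W.reshape(W.shape[0], -1)), axis=1) / 127.0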
58 | 59 | 60 | DEVICE = "cuda" 61 | FINETUNE = True 62 | QS = QuantizationSettingFactory.default_setting() 63 | EXECUTING_DEVICE = "cuda" 64 | REQUIRE_ANALYSE = True 65 | 66 | # ------------------------------------------------------------------- 67 | # 下面向你展示了常用参数调节选项: 68 | # ------------------------------------------------------------------- 69 | if PPQ_CONFIG.USING_CUDA_KERNEL: 70 | print("====== using advanced_optimization =====") 71 | QS.advanced_optimization = FINETUNE # 启动网络再训练过程,降低量化误差 72 | QS.advanced_optimization_setting.steps = 2500 # 再训练步数,影响训练时间,2500步大概几分钟 73 | QS.advanced_optimization_setting.collecting_device = ( 74 | "executor" # 缓存数据放在那,executor 就是放在gpu,如果显存超了你就换成 'cpu' 75 | ) 76 | QS.advanced_optimization_setting.auto_check = ( 77 | False # 打开这个选项则训练过程中会防止过拟合,以及意外情况,通常不需要开。 78 | ) 79 | else: 80 | print("====== using lsq_optimization =====") 81 | QS.lsq_optimization = FINETUNE # 启动网络再训练过程,降低量化误差 82 | QS.lsq_optimization_setting.epochs = 128 # 再训练轮数,影响训练时间,30轮大概几分钟 83 | QS.lsq_optimization_setting.collecting_device = ( 84 | "cuda" # 缓存数据放在那,cuda 就是放在gpu,如果显存超了你就换成 'cpu' 85 | ) 86 | 87 | # 把量化的不太好的算子送回 FP32 88 | # QS.dispatching_table.append(operation="Concat_2420", platform=TargetPlatform.FP32) 89 | 90 | 91 | print("正准备量化你的网络,检查下列设置:") 92 | print(f"TARGET PLATFORM : {QUANT_PLATFROM.name}") 93 | print(f"NETWORK INPUTSHAPE : {INPUT_SHAPE}") 94 | 95 | # ENABLE CUDA KERNEL 会加速量化效率 3x ~ 10x,但是你如果没有装相应编译环境的话是编译不了的 96 | # 你可以尝试安装编译环境,或者在不启动 CUDA KERNEL 的情况下完成量化:移除 with ENABLE_CUDA_KERNEL(): 即可 97 | with ENABLE_CUDA_KERNEL(): 98 | qir = quantize_onnx_model( 99 | onnx_import_file=MODEL, 100 | calib_dataloader=SAMPLES, 101 | calib_steps=128, 102 | setting=QS, 103 | input_shape=INPUT_SHAPE, 104 | collate_fn=lambda x: x.to(EXECUTING_DEVICE), 105 | platform=QUANT_PLATFROM, 106 | do_quantize=True, 107 | ) 108 | 109 | # ------------------------------------------------------------------- 110 | # PPQ 计算量化误差时,使用信噪比的倒数作为指标,即噪声能量 / 信号能量 111 | # 量化误差 0.1 表示在整体信号中,量化噪声的能量约为 10% 112 | # 你应当注意,在 graphwise_error_analyse 分析中,我们衡量的是累计误差 113 | # 网络的最后一层往往都具有较大的累计误差,这些误差是其前面的所有层所共同造成的 114 | # 你需要使用 layerwise_error_analyse 逐层分析误差的来源 115 | # ------------------------------------------------------------------- 116 | print("正计算网络量化误差(SNR),最后一层的误差应小于 0.1 以保证量化精度:") 117 | reports = graphwise_error_analyse( 118 | graph=qir, 119 | running_device=EXECUTING_DEVICE, 120 | steps=32, 121 | dataloader=SAMPLES, 122 | collate_fn=lambda x: x.to(EXECUTING_DEVICE), 123 | ) 124 | for op, snr in reports.items(): 125 | if snr > 0.1: 126 | ppq_warning(f"层 {op} 的累计量化误差显著,请考虑进行优化") 127 | 128 | if REQUIRE_ANALYSE: 129 | print("正计算逐层量化误差(SNR),每一层的独立量化误差应小于 0.1 以保证量化精度:") 130 | layerwise_error_analyse( 131 | graph=qir, 132 | running_device=EXECUTING_DEVICE, 133 | interested_outputs=None, 134 | dataloader=SAMPLES, 135 | collate_fn=lambda x: x.to(EXECUTING_DEVICE), 136 | ) 137 | 138 | print("网络量化结束,正在生成目标文件:") 139 | export_ppq_graph( 140 | graph=qir, 141 | platform=QUANT_PLATFROM, 142 | graph_save_to=MODEL.replace(".onnx", "_v2_int8.onnx"), 143 | ) 144 | 145 | # ------------------------------------------------------------------- 146 | # 记录一下输入输出的名字,onnxruntime 跑的时候需要提供这些名字 147 | # 我写的只是单输出单输入的版本,多输出多输入你得自己改改 148 | # ------------------------------------------------------------------- 149 | int8_input_names = [name for name, _ in qir.inputs.items()] 150 | int8_output_names = [name for name, _ in qir.outputs.items()] 151 | 152 | # ------------------------------------------------------------------- 153 | # 启动 tensorRT 进行推理,你先装一下 trt 154 
| # ------------------------------------------------------------------- 155 | import tensorrt as trt 156 | import trt_infer 157 | 158 | samples = [convert_any_to_numpy(sample) for sample in SAMPLES] 159 | logger = trt.Logger(trt.Logger.INFO) 160 | with open(MODEL.replace(".onnx", "_v2_int8.engine"), "rb") as f, trt.Runtime( 161 | logger 162 | ) as runtime: 163 | engine = runtime.deserialize_cuda_engine(f.read()) 164 | 165 | results = [] 166 | with engine.create_execution_context() as context: 167 | inputs, outputs, bindings, stream, _ = trt_infer.allocate_buffers( 168 | context.engine 169 | ) 170 | for sample in tqdm(samples, desc="TensorRT is running..."): 171 | inputs[0].host = convert_any_to_numpy(sample) 172 | output = trt_infer.do_inference( 173 | context, 174 | bindings=bindings, 175 | inputs=inputs, 176 | outputs=outputs, 177 | stream=stream, 178 | batch_size=1, 179 | ) 180 | # results.append(convert_any_to_torch_tensor(output).reshape([-1, 1000])) 181 | -------------------------------------------------------------------------------- /quantization/P01_MT_onnx2tensorRT_int8_sample.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 #指定解释器 2 | # encoding:utf-8 3 | 4 | import sys 5 | 6 | print(sys.getdefaultencoding()) 7 | s = "中文乱码问题解决" 8 | print(s) 9 | 10 | # --------------------------------------------------------------- 11 | # 这个脚本向你展示了如何使用 tensorRT 对 PPQ 导出的模型进行推理 12 | 13 | # This script shows you how to export ppq internal graph to tensorRT 14 | # --------------------------------------------------------------- 15 | 16 | # For this inference test, all test data is randomly picked. 17 | # If you want to use real data, just rewrite the defination of SAMPLES 18 | print("开始import") 19 | import onnxruntime 20 | import torch 21 | from ppq import * 22 | from ppq.api import * 23 | from tqdm import tqdm 24 | import glob 25 | import cv2 26 | import numpy as np 27 | from torchvision import transforms 28 | from PIL import Image 29 | import os 30 | 31 | 32 | def read_image(path): 33 | # 多任务模型 34 | _img_transforms = transforms.Compose( 35 | [ 36 | transforms.Resize((384, 768)), 37 | transforms.ToTensor(), 38 | transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), 39 | ] 40 | ) 41 | img = Image.open(path).convert("RGB") 42 | img_w, img_h = img.size[0], img.size[1] 43 | img = _img_transforms(img) 44 | img = img.unsqueeze(0) 45 | return img 46 | 47 | 48 | QUANT_PLATFROM = TargetPlatform.TRT_INT8 49 | MODEL = "Models/RMTNet_release20220609_v2.opt.onnx" 50 | INPUT_SHAPE = [1, 3, 384, 768] 51 | 52 | calibration_files = glob.glob( 53 | os.path.join("/apdcephfs/private_howellyang/data/Calib1k5/", "*.jpg") 54 | )[:2] 55 | SAMPLES = [ 56 | read_image(path) for path in calibration_files 57 | ] # rewirte this to use real data. 
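# -------------------------------------------------------------------
# A minimal sketch, for illustration only: the comments at the top of this
# file note that the test data may be randomly picked and that SAMPLES should
# be rewritten to use real data. If the calibration image directory above is
# unavailable, SAMPLES could fall back to random tensors of the declared
# INPUT_SHAPE; real calibration images remain preferable for meaningful PTQ.
if len(SAMPLES) == 0:  # hypothetical fallback, guarded so real data wins when present
    SAMPLES = [torch.rand(*INPUT_SHAPE) for _ in range(32)]
# -------------------------------------------------------------------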
58 | 59 | 60 | DEVICE = "cuda" 61 | FINETUNE = True 62 | QS = QuantizationSettingFactory.default_setting() 63 | EXECUTING_DEVICE = "cuda" 64 | REQUIRE_ANALYSE = True 65 | 66 | # ------------------------------------------------------------------- 67 | # 下面向你展示了常用参数调节选项: 68 | # ------------------------------------------------------------------- 69 | if PPQ_CONFIG.USING_CUDA_KERNEL: 70 | print("====== using advanced_optimization =====") 71 | QS.advanced_optimization = FINETUNE # 启动网络再训练过程,降低量化误差 72 | QS.advanced_optimization_setting.steps = 2500 # 再训练步数,影响训练时间,2500步大概几分钟 73 | QS.advanced_optimization_setting.collecting_device = ( 74 | "executor" # 缓存数据放在那,executor 就是放在gpu,如果显存超了你就换成 'cpu' 75 | ) 76 | QS.advanced_optimization_setting.auto_check = ( 77 | False # 打开这个选项则训练过程中会防止过拟合,以及意外情况,通常不需要开。 78 | ) 79 | else: 80 | print("====== using lsq_optimization =====") 81 | QS.lsq_optimization = FINETUNE # 启动网络再训练过程,降低量化误差 82 | QS.lsq_optimization_setting.epochs = 4 # 再训练轮数,影响训练时间,30轮大概几分钟 83 | QS.lsq_optimization_setting.collecting_device = ( 84 | "cuda" # 缓存数据放在那,cuda 就是放在gpu,如果显存超了你就换成 'cpu' 85 | ) 86 | 87 | # 把量化的不太好的算子送回 FP32 88 | QS.dispatching_table.append(operation="Conv_3342", platform=TargetPlatform.FP32) 89 | QS.dispatching_table.append(operation="Relu_3343", platform=TargetPlatform.FP32) 90 | QS.dispatching_table.append(operation="Conv_2523", platform=TargetPlatform.FP32) 91 | 92 | print("正准备量化你的网络,检查下列设置:") 93 | print(f"TARGET PLATFORM : {QUANT_PLATFROM.name}") 94 | print(f"NETWORK INPUTSHAPE : {INPUT_SHAPE}") 95 | 96 | # ENABLE CUDA KERNEL 会加速量化效率 3x ~ 10x,但是你如果没有装相应编译环境的话是编译不了的 97 | # 你可以尝试安装编译环境,或者在不启动 CUDA KERNEL 的情况下完成量化:移除 with ENABLE_CUDA_KERNEL(): 即可 98 | with ENABLE_CUDA_KERNEL(): 99 | qir = quantize_onnx_model( 100 | onnx_import_file=MODEL, 101 | calib_dataloader=SAMPLES, 102 | calib_steps=16, 103 | setting=QS, 104 | input_shape=INPUT_SHAPE, 105 | collate_fn=lambda x: x.to(EXECUTING_DEVICE), 106 | platform=QUANT_PLATFROM, 107 | do_quantize=True, 108 | ) 109 | 110 | # ------------------------------------------------------------------- 111 | # PPQ 计算量化误差时,使用信噪比的倒数作为指标,即噪声能量 / 信号能量 112 | # 量化误差 0.1 表示在整体信号中,量化噪声的能量约为 10% 113 | # 你应当注意,在 graphwise_error_analyse 分析中,我们衡量的是累计误差 114 | # 网络的最后一层往往都具有较大的累计误差,这些误差是其前面的所有层所共同造成的 115 | # 你需要使用 layerwise_error_analyse 逐层分析误差的来源 116 | # ------------------------------------------------------------------- 117 | print("正计算网络量化误差(SNR),最后一层的误差应小于 0.1 以保证量化精度:") 118 | reports = graphwise_error_analyse( 119 | graph=qir, 120 | running_device=EXECUTING_DEVICE, 121 | steps=32, 122 | dataloader=SAMPLES, 123 | collate_fn=lambda x: x.to(EXECUTING_DEVICE), 124 | ) 125 | for op, snr in reports.items(): 126 | if snr > 0.1: 127 | ppq_warning(f"层 {op} 的累计量化误差显著,请考虑进行优化") 128 | 129 | if REQUIRE_ANALYSE: 130 | print("正计算逐层量化误差(SNR),每一层的独立量化误差应小于 0.1 以保证量化精度:") 131 | layerwise_error_analyse( 132 | graph=qir, 133 | running_device=EXECUTING_DEVICE, 134 | interested_outputs=None, 135 | dataloader=SAMPLES, 136 | collate_fn=lambda x: x.to(EXECUTING_DEVICE), 137 | ) 138 | 139 | print("网络量化结束,正在生成目标文件:") 140 | export_ppq_graph( 141 | graph=qir, 142 | platform=QUANT_PLATFROM, 143 | graph_save_to=MODEL.replace(".onnx", "_int8_sample2.onnx"), 144 | ) 145 | 146 | # ------------------------------------------------------------------- 147 | # 记录一下输入输出的名字,onnxruntime 跑的时候需要提供这些名字 148 | # 我写的只是单输出单输入的版本,多输出多输入你得自己改改 149 | # ------------------------------------------------------------------- 150 | int8_input_names = [name for name, _ in qir.inputs.items()] 151 | int8_output_names 
= [name for name, _ in qir.outputs.items()] 152 | 153 | # ------------------------------------------------------------------- 154 | # 启动 tensorRT 进行推理,你先装一下 trt 155 | # ------------------------------------------------------------------- 156 | import tensorrt as trt 157 | import trt_infer 158 | 159 | samples = [convert_any_to_numpy(sample) for sample in SAMPLES] 160 | logger = trt.Logger(trt.Logger.INFO) 161 | with open(MODEL.replace(".onnx", "_int8_sample2.engine"), "rb") as f, trt.Runtime( 162 | logger 163 | ) as runtime: 164 | engine = runtime.deserialize_cuda_engine(f.read()) 165 | 166 | results = [] 167 | with engine.create_execution_context() as context: 168 | inputs, outputs, bindings, stream = trt_infer.allocate_buffers(context.engine) 169 | for sample in tqdm(samples, desc="TensorRT is running..."): 170 | inputs[0].host = convert_any_to_numpy(sample) 171 | output = trt_infer.do_inference( 172 | context, 173 | bindings=bindings, 174 | inputs=inputs, 175 | outputs=outputs, 176 | stream=stream, 177 | batch_size=1, 178 | ) 179 | # results.append(convert_any_to_torch_tensor(output).reshape([-1, 1000])) 180 | -------------------------------------------------------------------------------- /onnx_calibrator.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import abc 3 | import json 4 | import numpy as np 5 | import tensorrt as trt 6 | import tensorrt as trt 7 | import pycuda.driver as cuda 8 | import pycuda.autoinit # fix init error of cuda 9 | import os 10 | import onnx 11 | import struct 12 | from onnxruntime.quantization.calibrate import ( 13 | CalibrationDataReader, 14 | MinMaxCalibrater, 15 | EntropyCalibrater, 16 | PercentileCalibrater, 17 | ) 18 | 19 | # 使用onnx的quantize tools生成每个节点的scales和zero point 20 | # 并转换为tensorRT可用的calibration cache file 21 | # 后续需要用tensorrt模型转换工具生成trt engine 22 | class ONNXDataReader(CalibrationDataReader): 23 | def __init__(self, input_name, image_stream, max_iter_num=None): 24 | super(ONNXDataReader).__init__() 25 | self.input_name = input_name 26 | self.image_stream = image_stream 27 | self.max_iter_num = max_iter_num 28 | self.iter_num = 0 29 | 30 | def get_next(self) -> dict: 31 | self.iter_num += 1 32 | if self.iter_num > self.max_iter_num: 33 | return None 34 | batch = self.image_stream.next_batch() 35 | if not batch.size: 36 | return None 37 | """generate the input data dict for ONNXinferenceSession run""" 38 | return { 39 | self.input_name: batch, 40 | # "image_shape": np.asarray([[self.image_stream.WIDTH, self.image_stream.HEIGHT]], dtype=np.float32), 41 | } 42 | 43 | 44 | class ONNXCalibrator(trt.IInt8EntropyCalibrator2): 45 | def __init__(self, input_layers, stream, cache_file, calib_algo, onnx_model_path): 46 | super(ONNXCalibrator, self).__init__() 47 | self.input_layers = input_layers 48 | 49 | # 数据读取的类, 等同于图片处理的回调 50 | self.stream = stream 51 | 52 | # 分配GPU 53 | self.d_input = cuda.mem_alloc(self.stream.calibration_data.nbytes) 54 | 55 | # cache路径 56 | self.cache_file = cache_file 57 | 58 | # 重置校准集 59 | self.stream.reset() 60 | 61 | # 使用onnx的calibrator来统计每个节点的dynamic range 62 | calibrator = self.create_calibrator(calib_algo, onnx_model_path) 63 | calibrator.set_execution_providers( 64 | ["CUDAExecutionProvider", "CPUExecutionProvider"] 65 | ) 66 | each_iter_num = 1 67 | for i in range(self.stream.max_batches // each_iter_num): 68 | data_reader = ONNXDataReader( 69 | self.input_layers[0], self.stream, each_iter_num 70 | ) 71 | calibrator.collect_data(data_reader) 72 | 
self.write_calibration_table(calibrator.compute_range(), self.cache_file) 73 | 74 | @staticmethod 75 | def write_calibration_table(calibration_cache, save_path): 76 | """ 77 | Helper function to write calibration table to files. 78 | """ 79 | with open(save_path + "_calib_cache.json", "w") as file: 80 | file.write( 81 | json.dumps(calibration_cache) 82 | ) # use `json.loads` to do the reverse 83 | 84 | # write plain text: tensorRT需要对结果做转换 85 | # TRT-8400-EntropyCalibration2 86 | # input.1: 3ca94044 87 | # 9131: 3cf4f8d5 88 | # 加密 hex(struct.unpack(' np.ndarray: 34 | if x is None and not accepet_none: 35 | raise ValueError("Trying to convert an empty value.") 36 | if isinstance(x, np.ndarray): 37 | return x 38 | elif isinstance(x, int) or isinstance(x, float): 39 | return np.array( 40 | [ 41 | x, 42 | ] 43 | ) 44 | elif isinstance(x, torch.Tensor): 45 | if x.numel() == 0 and accepet_none: 46 | return None 47 | if x.numel() == 0 and not accepet_none: 48 | raise ValueError("Trying to convert an empty value.") 49 | if x.numel() == 1: 50 | return convert_any_to_numpy(x.detach().cpu().item()) 51 | if x.numel() > 1: 52 | return x.detach().cpu().numpy() 53 | elif isinstance(x, list) or isinstance(x, tuple): 54 | return np.array(x) 55 | else: 56 | raise TypeError( 57 | f"input value {x}({type(x)}) can not be converted as numpy type." 58 | ) 59 | 60 | def read_image(path): 61 | mean_val = [103.53, 116.28, 123.675] 62 | std_val = [57.375, 57.12, 58.395] 63 | input_size = [768, 448] 64 | 65 | # img = np.random.randint(255, size=input_size + [3]).astype(np.uint8) 66 | img_raw = cv2.imread(path) 67 | img = cv2.resize(img_raw, (input_size[0],input_size[1])).astype(np.float32) 68 | img -= mean_val 69 | img /= std_val 70 | img = np.transpose(img, (2, 0, 1)).astype(np.float32) 71 | img = np.expand_dims(img, axis=0) 72 | 73 | img = np.ascontiguousarray(img, dtype=np.float32) 74 | return img 75 | 76 | calibration_files = glob.glob( 77 | os.path.join("/apdcephfs/private_howellyang/data/Calib1k5/", "*.jpg") 78 | )[-100:] 79 | SAMPLES = [ 80 | read_image(path) for path in calibration_files 81 | ] # rewirte this to use real data. 
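# -------------------------------------------------------------------
# A minimal sketch, for illustration only: the write_calibration_table helper
# in onnx_calibrator.py above describes converting each tensor scale into
# TensorRT's plain-text calibration cache ("TRT-8400-EntropyCalibration2"
# header, then one "tensor_name: hex" line per tensor). read_calib_cache() in
# ptq/ppq_optimize.py decodes such a line with
# struct.unpack("!f", bytes.fromhex(value)), so the encoder is simply the
# inverse of that. The two function names below are ours, and the header
# string is an assumption tied to the TensorRT version in use.
import struct


def encode_trt_scale(scale: float) -> str:
    """Hex string holding the big-endian IEEE-754 bits of a float32 scale."""
    return struct.pack("!f", scale).hex()


def write_trt_calib_cache(scale_map, path, header="TRT-8400-EntropyCalibration2"):
    """Write a {tensor_name: float_scale} dict as a plain-text TRT calibration cache."""
    with open(path, "w") as fw:
        fw.write(header + "\n")
        for name, scale in scale_map.items():
            fw.write("{}: {}\n".format(name, encode_trt_scale(scale)))
# -------------------------------------------------------------------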
82 | 83 | 84 | DEVICE = "cuda" 85 | FINETUNE = True 86 | EXECUTING_DEVICE = "cuda" 87 | REQUIRE_ANALYSE = True 88 | 89 | # ------------------------------------------------------------------- 90 | # 启动 tensorRT 进行推理,你先装一下 trt 91 | # ------------------------------------------------------------------- 92 | 93 | 94 | def infer_with_trt(trt_int8_path = ""): 95 | import tensorrt as trt 96 | import trt_infer 97 | trt.init_libnvinfer_plugins(None, "") 98 | 99 | samples = [convert_any_to_numpy(sample) for sample in SAMPLES] 100 | logger = trt.Logger(trt.Logger.INFO) 101 | with open(trt_int8_path, "rb") as f, trt.Runtime( 102 | logger 103 | ) as runtime: 104 | engine = runtime.deserialize_cuda_engine(f.read()) 105 | 106 | trt_outpus_all = [] 107 | with engine.create_execution_context() as context: 108 | inputs, outputs, bindings, stream, trt_output_names = trt_infer.allocate_buffers( 109 | context.engine 110 | ) 111 | for sample in tqdm(samples, desc="TensorRT is running..."): 112 | # trt infer 113 | inputs[0].host = convert_any_to_numpy(sample) 114 | trt_outputs_list = trt_infer.do_inference( 115 | context, 116 | bindings=bindings, 117 | inputs=inputs, 118 | outputs=outputs, 119 | stream=stream, 120 | batch_size=1, 121 | ) 122 | trt_outputs_dict = { 123 | trt_output_names[i]: trt_outputs_list[i] for i in range(len(trt_output_names)) 124 | } 125 | trt_outpus_all.append(deepcopy(trt_outputs_dict)) 126 | return trt_outpus_all 127 | 128 | 129 | def infer_with_onnx(onnx_path = ""): 130 | 131 | sess = onnxruntime.InferenceSession( 132 | onnx_path, providers=["CUDAExecutionProvider", "CPUExecutionProvider"] 133 | ) 134 | input_name = sess.get_inputs()[0].name 135 | onnx_output_names = [output.name for output in sess.get_outputs()] 136 | samples = [convert_any_to_numpy(sample) for sample in SAMPLES] 137 | 138 | onnx_outpus_all = [] 139 | for sample in tqdm(samples, desc="Onnx is running..."): 140 | onnx_outputs = sess.run(onnx_output_names, {input_name: sample}) 141 | onnx_outputs_dict = { 142 | onnx_output_names[i]: onnx_outputs[i] for i in range(len(onnx_output_names)) 143 | } 144 | onnx_outpus_all.append(deepcopy(onnx_outputs_dict)) 145 | return onnx_outpus_all 146 | 147 | 148 | import sys 149 | 150 | if len(sys.argv) > 2: 151 | onnx_path = sys.argv[1] 152 | trt_path = sys.argv[2] 153 | else: 154 | onnx_path = "/apdcephfs/private_howellyang/onnx2trt/Models_Fp16/RMTNet_release20220609_v2.opt.onnx" 155 | trt_path = "/apdcephfs/private_howellyang/onnx2trt/Models_Fp16/RMTNet_release20220609.fp16.trtmodel" 156 | 157 | trt_outpus_all = infer_with_trt(trt_path) 158 | onnx_outputs_all = infer_with_onnx(onnx_path) 159 | 160 | sims = {} 161 | diffs = {} 162 | for i in range(len(trt_outpus_all)): 163 | for output_name, _ in trt_outpus_all[i].items(): 164 | trt_output = np.reshape(trt_outpus_all[i][output_name], (1, -1)) 165 | trt_fp32_output = np.reshape(onnx_outputs_all[i][output_name], (1, -1)) 166 | cos_sim = cosine_similarity(trt_output, trt_fp32_output) 167 | abs_diff_mean = np.mean(np.abs(trt_output - trt_fp32_output)) 168 | if output_name not in sims: 169 | sims[output_name] = [] 170 | diffs[output_name] = [] 171 | sims[output_name].append(cos_sim.ravel()) 172 | diffs[output_name].append(abs_diff_mean.ravel()) 173 | # if cos_sim < 0.985: 174 | # print(output_name, cos_sim) 175 | # print(trt_fp32_output[0, :5]) 176 | # print(trt_output[0, :5]) 177 | 178 | print("===================") 179 | mean_sims = [] 180 | mean_diffs = [] 181 | for key, value in sims.items(): 182 | print(key, np.mean(value), 
np.min(value), np.mean(diffs[key]), np.max(diffs[key])) 183 | mean_sims.append(np.mean(value)) 184 | mean_diffs.append(np.mean(diffs[key])) 185 | print("average cosine sim = ", np.mean(mean_sims)) 186 | print("average dff abs = ", np.mean(mean_diffs)) -------------------------------------------------------------------------------- /quantization/onnx_calibrator.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import abc 3 | import json 4 | import numpy as np 5 | import tensorrt as trt 6 | import tensorrt as trt 7 | import pycuda.driver as cuda 8 | import pycuda.autoinit # fix init error of cuda 9 | import os 10 | import onnx 11 | import struct 12 | from onnxruntime.quantization.calibrate import ( 13 | CalibrationDataReader, 14 | MinMaxCalibrater, 15 | EntropyCalibrater, 16 | PercentileCalibrater, 17 | ) 18 | 19 | # 使用onnx的quantize tools生成每个节点的scales和zero point 20 | # 并转换为tensorRT可用的calibration cache file 21 | # 后续需要用tensorrt模型转换工具生成trt engine 22 | class ONNXDataReader(CalibrationDataReader): 23 | def __init__(self, input_name, image_stream, max_iter_num=None): 24 | super(ONNXDataReader).__init__() 25 | self.input_name = input_name 26 | self.image_stream = image_stream 27 | self.max_iter_num = max_iter_num 28 | self.iter_num = 0 29 | 30 | def get_next(self) -> dict: 31 | self.iter_num += 1 32 | if self.iter_num > self.max_iter_num: 33 | return None 34 | batch = self.image_stream.next_batch() 35 | if not batch.size: 36 | return None 37 | """generate the input data dict for ONNXinferenceSession run""" 38 | return { 39 | self.input_name: batch, 40 | # "image_shape": np.asarray([[self.image_stream.WIDTH, self.image_stream.HEIGHT]], dtype=np.float32), 41 | } 42 | 43 | 44 | class ONNXCalibrator(trt.IInt8EntropyCalibrator2): 45 | def __init__(self, input_layers, stream, cache_file, calib_algo, onnx_model_path): 46 | super(ONNXCalibrator, self).__init__() 47 | self.input_layers = input_layers 48 | 49 | # 数据读取的类, 等同于图片处理的回调 50 | self.stream = stream 51 | 52 | # 分配GPU 53 | self.d_input = cuda.mem_alloc(self.stream.calibration_data.nbytes) 54 | 55 | # cache路径 56 | self.cache_file = cache_file 57 | 58 | # 重置校准集 59 | self.stream.reset() 60 | 61 | # 使用onnx的calibrator来统计每个节点的dynamic range 62 | calibrator = self.create_calibrator(calib_algo, onnx_model_path) 63 | # calibrator.set_execution_providers( 64 | # ["CUDAExecutionProvider", "CPUExecutionProvider"] 65 | # ) 66 | calibrator.set_execution_providers( 67 | ["CPUExecutionProvider"] 68 | ) 69 | each_iter_num = 1 70 | for i in range(self.stream.max_batches // each_iter_num): 71 | data_reader = ONNXDataReader( 72 | self.input_layers[0], self.stream, each_iter_num 73 | ) 74 | calibrator.collect_data(data_reader) 75 | self.write_calibration_table(calibrator.compute_range(), self.cache_file) 76 | 77 | @staticmethod 78 | def write_calibration_table(calibration_cache, save_path): 79 | """ 80 | Helper function to write calibration table to files. 
81 | """ 82 | with open(save_path + "_calib_cache.json", "w") as file: 83 | file.write( 84 | json.dumps(calibration_cache) 85 | ) # use `json.loads` to do the reverse 86 | 87 | # write plain text: tensorRT需要对结果做转换 88 | # TRT-8400-EntropyCalibration2 89 | # input.1: 3ca94044 90 | # 9131: 3cf4f8d5 91 | # 加密 hex(struct.unpack(' np.ndarray: 33 | if x is None and not accepet_none: 34 | raise ValueError("Trying to convert an empty value.") 35 | if isinstance(x, np.ndarray): 36 | return x 37 | elif isinstance(x, int) or isinstance(x, float): 38 | return np.array( 39 | [ 40 | x, 41 | ] 42 | ) 43 | elif isinstance(x, torch.Tensor): 44 | if x.numel() == 0 and accepet_none: 45 | return None 46 | if x.numel() == 0 and not accepet_none: 47 | raise ValueError("Trying to convert an empty value.") 48 | if x.numel() == 1: 49 | return convert_any_to_numpy(x.detach().cpu().item()) 50 | if x.numel() > 1: 51 | return x.detach().cpu().numpy() 52 | elif isinstance(x, list) or isinstance(x, tuple): 53 | return np.array(x) 54 | else: 55 | raise TypeError( 56 | f"input value {x}({type(x)}) can not be converted as numpy type." 57 | ) 58 | def read_image(path): 59 | mean = [123.675, 116.28, 103.53] 60 | std = [58.395, 57.12, 57.375] 61 | input_w = 960 62 | input_h = 480 63 | 64 | # for onnx inference 65 | mean = np.array(mean) 66 | std = np.array(std) 67 | 68 | # Load by OpenCV 69 | img = cv2.imread(path) 70 | # Convert to RGB 71 | img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) 72 | 73 | img = cv2.resize(img, (input_w, input_h)) 74 | 75 | img = img.astype(np.float32) 76 | 77 | # Norm 78 | for i in range(3): 79 | img[..., i] = (img[..., i] - mean[i]) / std[i] 80 | 81 | # hwc -> nchw 82 | h, w, c = img.shape 83 | img = img.reshape((1, c, h ,w)) 84 | 85 | return np.array(img) 86 | 87 | calibration_files = glob.glob( 88 | os.path.join("/apdcephfs/private_howellyang/data/Calib1k5/", "*.jpg") 89 | )[-100:] 90 | SAMPLES = [ 91 | read_image(path) for path in calibration_files 92 | ] # rewirte this to use real data. 
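# -------------------------------------------------------------------
# A minimal sketch, for illustration only: the PPQ scripts in this repo
# describe their quantization error as noise energy divided by signal energy
# (so an error of 0.1 means the quantization noise carries roughly 10% of the
# signal energy). The helper below computes that ratio with numpy and could be
# used alongside the cosine-similarity and mean-absolute-difference statistics
# gathered later in this script; the function name is ours, not a PPQ API.
def quant_noise_ratio(fp_output, quant_output):
    """Return ||quant - fp||^2 / ||fp||^2, i.e. noise energy over signal energy."""
    fp = np.asarray(fp_output, dtype=np.float64).ravel()
    q = np.asarray(quant_output, dtype=np.float64).ravel()
    noise_energy = float(np.sum((q - fp) ** 2))
    signal_energy = float(np.sum(fp ** 2)) + 1e-12  # guard against an all-zero reference
    return noise_energy / signal_energy
# -------------------------------------------------------------------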
93 | 94 | 95 | DEVICE = "cuda" 96 | FINETUNE = True 97 | EXECUTING_DEVICE = "cuda" 98 | REQUIRE_ANALYSE = True 99 | 100 | # ------------------------------------------------------------------- 101 | # 启动 tensorRT 进行推理,你先装一下 trt 102 | # ------------------------------------------------------------------- 103 | 104 | 105 | def infer_with_trt(trt_int8_path = ""): 106 | import tensorrt as trt 107 | import trt_infer 108 | 109 | samples = [convert_any_to_numpy(sample) for sample in SAMPLES] 110 | logger = trt.Logger(trt.Logger.INFO) 111 | with open(trt_int8_path, "rb") as f, trt.Runtime( 112 | logger 113 | ) as runtime: 114 | engine = runtime.deserialize_cuda_engine(f.read()) 115 | 116 | trt_outpus_all = [] 117 | with engine.create_execution_context() as context: 118 | inputs, outputs, bindings, stream, trt_output_names = trt_infer.allocate_buffers( 119 | context.engine 120 | ) 121 | for sample in tqdm(samples, desc="TensorRT is running..."): 122 | # trt infer 123 | inputs[0].host = convert_any_to_numpy(sample) 124 | trt_outputs_list = trt_infer.do_inference( 125 | context, 126 | bindings=bindings, 127 | inputs=inputs, 128 | outputs=outputs, 129 | stream=stream, 130 | batch_size=1, 131 | ) 132 | trt_outputs_dict = { 133 | trt_output_names[i]: trt_outputs_list[i] for i in range(len(trt_output_names)) 134 | } 135 | trt_outpus_all.append(deepcopy(trt_outputs_dict)) 136 | return trt_outpus_all 137 | 138 | 139 | def infer_with_onnx(onnx_path = ""): 140 | 141 | sess = onnxruntime.InferenceSession( 142 | onnx_path, providers=["CUDAExecutionProvider", "CPUExecutionProvider"] 143 | ) 144 | input_name = sess.get_inputs()[0].name 145 | onnx_output_names = [output.name for output in sess.get_outputs()] 146 | samples = [convert_any_to_numpy(sample) for sample in SAMPLES] 147 | 148 | onnx_outpus_all = [] 149 | for sample in tqdm(samples, desc="Onnx is running..."): 150 | onnx_outputs = sess.run(onnx_output_names, {input_name: sample}) 151 | onnx_outputs_dict = { 152 | onnx_output_names[i]: onnx_outputs[i] for i in range(len(onnx_output_names)) 153 | } 154 | onnx_outpus_all.append(deepcopy(onnx_outputs_dict)) 155 | return onnx_outpus_all 156 | 157 | 158 | import sys 159 | 160 | if len(sys.argv) > 2: 161 | onnx_path = sys.argv[1] 162 | trt_path = sys.argv[2] 163 | else: 164 | onnx_path = "/apdcephfs/private_howellyang/onnx2trt/Models_Fp16/RMTNet_release20220609_v2.opt.onnx" 165 | trt_path = "/apdcephfs/private_howellyang/onnx2trt/model.weight_quantized_v2.onnx" 166 | 167 | trt_outpus_all = infer_with_onnx(trt_path) 168 | onnx_outputs_all = infer_with_onnx(onnx_path) 169 | 170 | sims = {} 171 | diffs = {} 172 | for i in range(len(trt_outpus_all)): 173 | for output_name, _ in trt_outpus_all[i].items(): 174 | trt_output = np.reshape(trt_outpus_all[i][output_name], (1, -1)) 175 | trt_fp32_output = np.reshape(onnx_outputs_all[i][output_name], (1, -1)) 176 | cos_sim = cosine_similarity(trt_output, trt_fp32_output) 177 | abs_diff_mean = np.mean(np.abs(trt_output - trt_fp32_output)) 178 | if output_name not in sims: 179 | sims[output_name] = [] 180 | diffs[output_name] = [] 181 | sims[output_name].append(cos_sim.ravel()) 182 | diffs[output_name].append(abs_diff_mean.ravel()) 183 | # if cos_sim < 0.985: 184 | # print(output_name, cos_sim) 185 | # print(trt_fp32_output[0, :5]) 186 | # print(trt_output[0, :5]) 187 | 188 | print("===================") 189 | mean_sims = [] 190 | mean_diffs = [] 191 | for key, value in sims.items(): 192 | print(key, np.mean(value), np.min(value), np.mean(diffs[key]), np.max(diffs[key])) 193 | 
mean_sims.append(np.mean(value)) 194 | mean_diffs.append(np.mean(diffs[key])) 195 | print("average cosine sim = ", np.mean(mean_sims)) 196 | print("average dff abs = ", np.mean(mean_diffs)) -------------------------------------------------------------------------------- /quantization/onnx2trt.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | from __future__ import print_function 3 | 4 | import argparse 5 | import glob 6 | import os 7 | from tabnanny import verbose 8 | import tensorrt as trt 9 | import pycuda.driver as cuda 10 | import pycuda.autoinit # fix init error of cuda 11 | from google.protobuf.json_format import MessageToDict 12 | import onnx 13 | from onnxsim import simplify 14 | try: 15 | import onnxoptimizer as optimizer 16 | except: 17 | from onnx import optimizer 18 | 19 | from onnxruntime.tools.symbolic_shape_infer import SymbolicShapeInference 20 | import numpy as np 21 | from trt_utils import ( 22 | create_image_stream, 23 | create_calibrator, 24 | create_tensorrt_engine, 25 | evaluate_engine, 26 | ) 27 | 28 | parser = argparse.ArgumentParser(description="Onnx Calibration Params") 29 | parser.add_argument("--onnx", type=str, default=None, required=True, help="原始的onnx路径") 30 | parser.add_argument( 31 | "--trt_engine", type=str, default=None, required=True, help="tensorRT engine的保存路径" 32 | ) 33 | 34 | parser.add_argument( 35 | "--engine_type", 36 | type=str, 37 | default="int8", 38 | choices=["int8", "fp32", "fp16", "best"], 39 | required=False, 40 | help="模型的计算精度", 41 | ) 42 | 43 | parser.add_argument( 44 | "--trt_calib_cache", 45 | type=str, 46 | default="./trt_int8.cache", 47 | required=False, 48 | help="用来存储每个节点动态范围的路径", 49 | ) 50 | parser.add_argument( 51 | "--calib_dir", type=str, default=None, required=False, help="进行精度测试以及量化校准使用的图片路径" 52 | ) 53 | parser.add_argument( 54 | "--calib_algo", 55 | type=str, 56 | default="TRTEntropy", 57 | required=False, 58 | choices=["Search", "TRTEntropy", "TRTMinMax", "TRTPercentile", "ONNXEntropy", "ONNXMinMax", "ONNXPercentile"], 59 | help="""量化校准使用的算法: 60 | Search 进行自动化搜索, 自动选择最终输出的cosine距离最高的校准算法 61 | TRTEntropy 使用KL散度评估量化前后的量化误差,自动选择误差最小的动态范围值 62 | TRTMinMax 计算每个节点输出的最大最小值,作为最终的动态范围值 63 | TRTPercentile 计算每个节点输出值,然后求其分位点作为动态范围值 64 | ONNXEntropy 计算原理同TRTEntropy,采用onnx quantization的工程实现 65 | ONNXMinMax 计算原理同TRTMinMax,采用onnx quantization的工程实现 66 | ONNXPercentile 计算原理同TRTPercentile,采用onnx quantization的工程实现 67 | """, 68 | ) 69 | 70 | parser.add_argument( 71 | "--channel_order", 72 | type=str, 73 | default="RGB", 74 | required=False, 75 | choices=["RGB", "BGR"], 76 | help="图片的输入顺序, 可选BGR、RGB", 77 | ) 78 | parser.add_argument( 79 | "--means", type=str, default="0.0,0.0,0.0", required=False, help="图片预处理的均值" 80 | ) 81 | parser.add_argument( 82 | "--stds", type=str, default="1.0,1.0,1.0", required=False, help="图片预处理的方差" 83 | ) 84 | parser.add_argument( 85 | "--pixel_type", 86 | type=str, 87 | default="NCHW", 88 | required=False, 89 | choices=["NCHW", "NHWC"], 90 | help="模型输入的通道顺序, 一般而言", 91 | ) 92 | 93 | args = parser.parse_args() 94 | onnx_path = args.onnx 95 | engine_type = args.engine_type 96 | trt_engine = args.trt_engine 97 | calib_algo = args.calib_algo 98 | calib_dir = args.calib_dir 99 | means = args.means 100 | stds = args.stds 101 | pixel_type = args.pixel_type 102 | trt_calib_cache = args.trt_calib_cache 103 | channel_order = args.channel_order 104 | 105 | # 获取输入输出信息 106 | print("[ONNX2TRT] INFO: Optimizing Onnx Model....") 107 | INPUT_SHAPES = [] 108 | INPUT_NAMES = [] 109 | 
onnx_model = onnx.load(onnx_path) 110 | onnx_model, check = simplify(onnx_model) # simplify 111 | optimized_model = optimizer.optimize(onnx_model) # optimize 112 | onnx_model = SymbolicShapeInference.infer_shapes( 113 | onnx_model, 114 | int_max=2**31 - 1, 115 | auto_merge=True, 116 | guess_output_rank=True, 117 | verbose=2 118 | ) 119 | 120 | onnx_path = onnx_path.replace(".onnx", "") + "_with_shape.onnx" 121 | onnx.save(onnx_model, onnx_path) 122 | 123 | input_all = [node.name for node in onnx_model.graph.input] 124 | input_initializer = [node.name for node in onnx_model.graph.initializer] 125 | net_feed_input_names = list(set(input_all) - set(input_initializer)) 126 | 127 | for _input in onnx_model.graph.input: 128 | m_dict = MessageToDict(_input) 129 | dim_info = m_dict.get("type").get("tensorType").get("shape").get("dim") 130 | input_shape = [int(d.get("dimValue")) for d in dim_info] # [4,3,384,640] 131 | input_name = m_dict.get("name") 132 | if input_name in net_feed_input_names: 133 | INPUT_SHAPES.append(input_shape) 134 | INPUT_NAMES.append(input_name) 135 | print("[ONNX2TRT] INFO: 模型输入 ", INPUT_NAMES[-1], INPUT_SHAPES[-1]) 136 | 137 | if len(INPUT_SHAPES) > 1: 138 | print("模型存在多个输入, 本工具暂不支持多输入模型") 139 | raise NameError("模型存在多个输入, 本工具暂不支持多输入模型") 140 | 141 | elif len(INPUT_SHAPES[0]) != 4: 142 | print("模型的输入不是NCHW或NHWC, 本工具暂不支持这种输入格式") 143 | raise NameError("模型的输入不是NCHW或NHWC, 本工具暂不支持这种输入格式") 144 | 145 | if engine_type == "int8": 146 | if calib_algo == "Search": 147 | search_types = ["TRTEntropy", "TRTMinMax", "TRTPercentile"] 148 | else: 149 | search_types = [calib_algo] 150 | image_stream = create_image_stream( 151 | calib_dir, INPUT_SHAPES[0], means, stds, pixel_type, channel_order 152 | ) 153 | final_cos_similarity = -1.0 154 | final_engine = None 155 | print("[ONNX2TRT] INFO: Search Best Calibration in {}".format(search_types)) 156 | for calib_algo in search_types: 157 | print("[ONNX2TRT] INFO: Start Calibration with {}".format(calib_algo)) 158 | calibrator = create_calibrator( 159 | image_stream, INPUT_NAMES, trt_calib_cache, calib_algo, onnx_path 160 | ) 161 | print("[ONNX2TRT] INFO: Start Create TensorRT Engine with {}".format(calib_algo)) 162 | engine = create_tensorrt_engine(onnx_path, engine_type, calibrator) 163 | print("[ONNX2TRT] INFO: Start Evaluation of {}".format(calib_algo)) 164 | cos_similarity, infer_time = evaluate_engine(onnx_path, engine, image_stream) 165 | if cos_similarity > final_cos_similarity: 166 | final_cos_similarity = cos_similarity 167 | final_engine = engine 168 | final_infer_time = infer_time 169 | print("[ONNX2TRT] INFO: 校准算法 = ", calib_algo) 170 | print("[ONNX2TRT] INFO: 与onnx输出的cos相似度 = ", cos_similarity) 171 | print("[ONNX2TRT] INFO: 模型infer的平均耗时 = ", infer_time) 172 | 173 | else: 174 | final_engine = create_tensorrt_engine(onnx_path, engine_type) 175 | if calib_dir != "": 176 | image_stream = create_image_stream( 177 | calib_dir, INPUT_SHAPES[0], means, stds, pixel_type, channel_order 178 | ) 179 | cos_similarity, infer_time = evaluate_engine( 180 | onnx_path, final_engine, image_stream 181 | ) 182 | print("[ONNX2TRT] INFO: 校准算法 = ", None) 183 | print("[ONNX2TRT] INFO: 与onnx输出的cos相似度 = ", cos_similarity) 184 | print("[ONNX2TRT] INFO: 模型infer的平均耗时 = ", infer_time) 185 | 186 | # 将trt engine写入文件 187 | print("[ONNX2TRT] INFO: 模型构建完成, 将模型写入路径 = ", trt_engine) 188 | if not os.path.exists(os.path.dirname(trt_engine)): 189 | os.makedirs(os.path.dirname(trt_engine), exist_ok=True) 190 | with open(trt_engine, "wb") as f: 191 | 
f.write(final_engine.serialize()) 192 | -------------------------------------------------------------------------------- /quantization/P03_MT_onnx2tensorRT_int8.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 #指定解释器 2 | # encoding:utf-8 3 | 4 | import sys 5 | 6 | print(sys.getdefaultencoding()) 7 | s = "中文乱码问题解决" 8 | print(s) 9 | 10 | # --------------------------------------------------------------- 11 | # 这个脚本向你展示了如何使用 tensorRT 对 PPQ 导出的模型进行推理 12 | 13 | # This script shows you how to export ppq internal graph to tensorRT 14 | # --------------------------------------------------------------- 15 | 16 | # For this inference test, all test data is randomly picked. 17 | # If you want to use real data, just rewrite the defination of SAMPLES 18 | print("开始import") 19 | import onnxruntime 20 | import torch 21 | from ppq import * 22 | from ppq.api import * 23 | from tqdm import tqdm 24 | import glob 25 | import cv2 26 | import numpy as np 27 | from torchvision import transforms 28 | from PIL import Image 29 | import os 30 | 31 | 32 | 33 | def read_image(path): 34 | mean_val = [103.53, 116.28, 123.675] 35 | std_val = [57.375, 57.12, 58.395] 36 | input_size = [768, 448] 37 | 38 | # img = np.random.randint(255, size=input_size + [3]).astype(np.uint8) 39 | img_raw = cv2.imread(path) 40 | img = cv2.resize(img_raw, (input_size[0],input_size[1])).astype(np.float32) 41 | img -= mean_val 42 | img /= std_val 43 | img = np.transpose(img, (2, 0, 1)).astype(np.float32) 44 | img = np.expand_dims(img, axis=0) 45 | 46 | img = np.ascontiguousarray(img, dtype=np.float32) 47 | img_tensor = torch.from_numpy(img) 48 | print("======", np.shape(img_tensor)) 49 | # dummy_input = torch.autograd.Variable(img_tensor) 50 | return img_tensor 51 | 52 | 53 | QUANT_PLATFROM = TargetPlatform.TRT_INT8 54 | MODEL = "/apdcephfs/private_howellyang/road_service_app/LaneModel/onnx_infer/model/epoch_390_mm2conv.opt.onnx" 55 | INPUT_SHAPE = [1, 3, 448, 768] 56 | 57 | calibration_files = glob.glob( 58 | os.path.join("/apdcephfs/private_howellyang/data/Calib1k5/", "*.jpg") 59 | )[:128] 60 | SAMPLES = [ 61 | read_image(path) for path in calibration_files 62 | ] # rewirte this to use real data. 
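# -------------------------------------------------------------------
# A minimal sketch, for illustration only: further down this script the
# commented-out dispatching_table lines show how poorly quantized operators can
# be sent back to FP32, and the graphwise_error_analyse loop warns about layers
# whose cumulative error exceeds 0.1. The helper below only reuses the
# dispatching_table.append(operation=..., platform=TargetPlatform.FP32) call
# already used in this repo; the 0.1 threshold is an assumption, and since the
# graphwise reports measure cumulative error, the comments below rightly
# suggest a layerwise analysis before deciding which ops to dispatch.
def dispatch_noisy_ops_to_fp32(setting, reports, threshold=0.1):
    """Append every op whose reported error exceeds `threshold` to the FP32 table."""
    for op_name, error in reports.items():
        if error > threshold:
            setting.dispatching_table.append(operation=op_name, platform=TargetPlatform.FP32)
    return setting
# -------------------------------------------------------------------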
63 | 64 | 65 | DEVICE = "cuda" 66 | FINETUNE = True 67 | QS = QuantizationSettingFactory.default_setting() 68 | EXECUTING_DEVICE = "cuda" 69 | REQUIRE_ANALYSE = True 70 | 71 | # ------------------------------------------------------------------- 72 | # 下面向你展示了常用参数调节选项: 73 | # ------------------------------------------------------------------- 74 | if PPQ_CONFIG.USING_CUDA_KERNEL: 75 | print("====== using advanced_optimization =====") 76 | QS.advanced_optimization = FINETUNE # 启动网络再训练过程,降低量化误差 77 | QS.advanced_optimization_setting.steps = 2500 # 再训练步数,影响训练时间,2500步大概几分钟 78 | QS.advanced_optimization_setting.collecting_device = ( 79 | # "executor" # 缓存数据放在那,executor 就是放在gpu,如果显存超了你就换成 'cpu' 80 | "cpu" 81 | ) 82 | QS.advanced_optimization_setting.auto_check = ( 83 | False # 打开这个选项则训练过程中会防止过拟合,以及意外情况,通常不需要开。 84 | ) 85 | else: 86 | print("====== using lsq_optimization =====") 87 | QS.lsq_optimization = FINETUNE # 启动网络再训练过程,降低量化误差 88 | QS.lsq_optimization_setting.epochs = 128 # 再训练轮数,影响训练时间,30轮大概几分钟 89 | QS.lsq_optimization_setting.collecting_device = ( 90 | "cuda" # 缓存数据放在那,cuda 就是放在gpu,如果显存超了你就换成 'cpu' 91 | ) 92 | 93 | # 把量化的不太好的算子送回 FP32 94 | # QS.dispatching_table.append(operation="Conv_3342", platform=TargetPlatform.FP32) 95 | # QS.dispatching_table.append(operation="Relu_3343", platform=TargetPlatform.FP32) 96 | # QS.dispatching_table.append(operation="Conv_2523", platform=TargetPlatform.FP32) 97 | 98 | print("正准备量化你的网络,检查下列设置:") 99 | print(f"TARGET PLATFORM : {QUANT_PLATFROM.name}") 100 | print(f"NETWORK INPUTSHAPE : {INPUT_SHAPE}") 101 | 102 | # ENABLE CUDA KERNEL 会加速量化效率 3x ~ 10x,但是你如果没有装相应编译环境的话是编译不了的 103 | # 你可以尝试安装编译环境,或者在不启动 CUDA KERNEL 的情况下完成量化:移除 with ENABLE_CUDA_KERNEL(): 即可 104 | with ENABLE_CUDA_KERNEL(): 105 | qir = quantize_onnx_model( 106 | onnx_import_file=MODEL, 107 | calib_dataloader=SAMPLES, 108 | calib_steps=128, 109 | setting=QS, 110 | input_shape=INPUT_SHAPE, 111 | collate_fn=lambda x: x.to(EXECUTING_DEVICE), 112 | platform=QUANT_PLATFROM, 113 | do_quantize=True, 114 | ) 115 | 116 | # ------------------------------------------------------------------- 117 | # PPQ 计算量化误差时,使用信噪比的倒数作为指标,即噪声能量 / 信号能量 118 | # 量化误差 0.1 表示在整体信号中,量化噪声的能量约为 10% 119 | # 你应当注意,在 graphwise_error_analyse 分析中,我们衡量的是累计误差 120 | # 网络的最后一层往往都具有较大的累计误差,这些误差是其前面的所有层所共同造成的 121 | # 你需要使用 layerwise_error_analyse 逐层分析误差的来源 122 | # ------------------------------------------------------------------- 123 | print("正计算网络量化误差(SNR),最后一层的误差应小于 0.1 以保证量化精度:") 124 | reports = graphwise_error_analyse( 125 | graph=qir, 126 | running_device=EXECUTING_DEVICE, 127 | steps=32, 128 | dataloader=SAMPLES, 129 | collate_fn=lambda x: x.to(EXECUTING_DEVICE), 130 | ) 131 | for op, snr in reports.items(): 132 | if snr > 0.1: 133 | ppq_warning(f"层 {op} 的累计量化误差显著,请考虑进行优化") 134 | 135 | if REQUIRE_ANALYSE: 136 | print("正计算逐层量化误差(SNR),每一层的独立量化误差应小于 0.1 以保证量化精度:") 137 | layerwise_error_analyse( 138 | graph=qir, 139 | running_device=EXECUTING_DEVICE, 140 | interested_outputs=None, 141 | dataloader=SAMPLES, 142 | collate_fn=lambda x: x.to(EXECUTING_DEVICE), 143 | ) 144 | 145 | print("网络量化结束,正在生成目标文件:") 146 | export_ppq_graph( 147 | graph=qir, 148 | platform=QUANT_PLATFROM, 149 | graph_save_to=MODEL.replace(".onnx", "_v2_int8.onnx"), 150 | ) 151 | 152 | # ------------------------------------------------------------------- 153 | # 记录一下输入输出的名字,onnxruntime 跑的时候需要提供这些名字 154 | # 我写的只是单输出单输入的版本,多输出多输入你得自己改改 155 | # ------------------------------------------------------------------- 156 | int8_input_names = [name for name, _ in qir.inputs.items()] 
157 | int8_output_names = [name for name, _ in qir.outputs.items()] 158 | 159 | # ------------------------------------------------------------------- 160 | # 启动 tensorRT 进行推理,你先装一下 trt 161 | # ------------------------------------------------------------------- 162 | import tensorrt as trt 163 | import trt_infer 164 | 165 | samples = [convert_any_to_numpy(sample) for sample in SAMPLES] 166 | logger = trt.Logger(trt.Logger.INFO) 167 | with open(MODEL.replace(".onnx", "_v2_int8.engine"), "rb") as f, trt.Runtime( 168 | logger 169 | ) as runtime: 170 | engine = runtime.deserialize_cuda_engine(f.read()) 171 | 172 | results = [] 173 | with engine.create_execution_context() as context: 174 | inputs, outputs, bindings, stream, _ = trt_infer.allocate_buffers(context.engine) 175 | for sample in tqdm(samples, desc="TensorRT is running..."): 176 | inputs[0].host = convert_any_to_numpy(sample) 177 | output = trt_infer.do_inference( 178 | context, 179 | bindings=bindings, 180 | inputs=inputs, 181 | outputs=outputs, 182 | stream=stream, 183 | batch_size=1, 184 | ) 185 | # results.append(convert_any_to_torch_tensor(output).reshape([-1, 1000])) 186 | -------------------------------------------------------------------------------- /quantization/compare_onnx_onnx_v3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 #指定解释器 2 | # encoding:utf-8 3 | 4 | import sys 5 | 6 | print(sys.getdefaultencoding()) 7 | s = "中文乱码问题解决" 8 | print(s) 9 | 10 | # --------------------------------------------------------------- 11 | # 这个脚本向你展示了如何使用 tensorRT 对 PPQ 导出的模型进行推理 12 | 13 | # This script shows you how to export ppq internal graph to tensorRT 14 | # --------------------------------------------------------------- 15 | 16 | # For this inference test, all test data is randomly picked. 17 | # If you want to use real data, just rewrite the defination of SAMPLES 18 | print("开始import") 19 | import onnxruntime 20 | import torch 21 | from tqdm import tqdm 22 | import glob 23 | import cv2 24 | import numpy as np 25 | from torchvision import transforms 26 | from PIL import Image 27 | import os 28 | from sklearn.metrics.pairwise import cosine_similarity 29 | import onnx 30 | from copy import deepcopy 31 | 32 | def convert_any_to_numpy(x, accepet_none: bool = True) -> np.ndarray: 33 | if x is None and not accepet_none: 34 | raise ValueError("Trying to convert an empty value.") 35 | if isinstance(x, np.ndarray): 36 | return x 37 | elif isinstance(x, int) or isinstance(x, float): 38 | return np.array( 39 | [ 40 | x, 41 | ] 42 | ) 43 | elif isinstance(x, torch.Tensor): 44 | if x.numel() == 0 and accepet_none: 45 | return None 46 | if x.numel() == 0 and not accepet_none: 47 | raise ValueError("Trying to convert an empty value.") 48 | if x.numel() == 1: 49 | return convert_any_to_numpy(x.detach().cpu().item()) 50 | if x.numel() > 1: 51 | return x.detach().cpu().numpy() 52 | elif isinstance(x, list) or isinstance(x, tuple): 53 | return np.array(x) 54 | else: 55 | raise TypeError( 56 | f"input value {x}({type(x)}) can not be converted as numpy type." 
57 | ) 58 | 59 | def read_image(path): 60 | # 多任务模型 61 | # _img_transforms = transforms.Compose( 62 | # [ 63 | # transforms.Resize((384, 768)), 64 | # transforms.ToTensor(), 65 | # transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), 66 | # ] 67 | # ) 68 | # img = Image.open(path).convert("RGB") 69 | # img_w, img_h = img.size[0], img.size[1] 70 | # img = _img_transforms(img) 71 | # img = img.unsqueeze(0) 72 | # return img 73 | mean_val = [103.53, 116.28, 123.675] 74 | std_val = [57.375, 57.12, 58.395] 75 | input_size = [768, 448] 76 | 77 | # img = np.random.randint(255, size=input_size + [3]).astype(np.uint8) 78 | img_raw = cv2.imread(path) 79 | img = cv2.resize(img_raw, (input_size[0],input_size[1])).astype(np.float32) 80 | img -= mean_val 81 | img /= std_val 82 | img = np.transpose(img, (2, 0, 1)).astype(np.float32) 83 | img = np.expand_dims(img, axis=0) 84 | 85 | img = np.ascontiguousarray(img, dtype=np.float32) 86 | # img_tensor = torch.from_numpy(img) 87 | return img 88 | 89 | calibration_files = glob.glob( 90 | os.path.join("/apdcephfs/private_howellyang/data/Calib1k5/", "*.jpg") 91 | )[-100:] 92 | SAMPLES = [ 93 | read_image(path) for path in calibration_files 94 | ] # rewirte this to use real data. 95 | 96 | 97 | DEVICE = "cuda" 98 | FINETUNE = True 99 | EXECUTING_DEVICE = "cuda" 100 | REQUIRE_ANALYSE = True 101 | 102 | # ------------------------------------------------------------------- 103 | # 启动 tensorRT 进行推理,你先装一下 trt 104 | # ------------------------------------------------------------------- 105 | 106 | 107 | def infer_with_trt(trt_int8_path = ""): 108 | import tensorrt as trt 109 | import trt_infer 110 | 111 | samples = [convert_any_to_numpy(sample) for sample in SAMPLES] 112 | logger = trt.Logger(trt.Logger.INFO) 113 | with open(trt_int8_path, "rb") as f, trt.Runtime( 114 | logger 115 | ) as runtime: 116 | engine = runtime.deserialize_cuda_engine(f.read()) 117 | 118 | trt_outpus_all = [] 119 | with engine.create_execution_context() as context: 120 | inputs, outputs, bindings, stream, trt_output_names = trt_infer.allocate_buffers( 121 | context.engine 122 | ) 123 | for sample in tqdm(samples, desc="TensorRT is running..."): 124 | # trt infer 125 | inputs[0].host = convert_any_to_numpy(sample) 126 | trt_outputs_list = trt_infer.do_inference( 127 | context, 128 | bindings=bindings, 129 | inputs=inputs, 130 | outputs=outputs, 131 | stream=stream, 132 | batch_size=1, 133 | ) 134 | trt_outputs_dict = { 135 | trt_output_names[i]: trt_outputs_list[i] for i in range(len(trt_output_names)) 136 | } 137 | trt_outpus_all.append(deepcopy(trt_outputs_dict)) 138 | return trt_outpus_all 139 | 140 | 141 | def infer_with_onnx(onnx_path = ""): 142 | 143 | sess = onnxruntime.InferenceSession( 144 | onnx_path, providers=["CUDAExecutionProvider", "CPUExecutionProvider"] 145 | ) 146 | input_name = sess.get_inputs()[0].name 147 | onnx_output_names = [output.name for output in sess.get_outputs()] 148 | samples = [convert_any_to_numpy(sample) for sample in SAMPLES] 149 | 150 | onnx_outpus_all = [] 151 | for sample in tqdm(samples, desc="Onnx is running..."): 152 | onnx_outputs = sess.run(onnx_output_names, {input_name: sample}) 153 | onnx_outputs_dict = { 154 | onnx_output_names[i]: onnx_outputs[i] for i in range(len(onnx_output_names)) 155 | } 156 | onnx_outpus_all.append(deepcopy(onnx_outputs_dict)) 157 | return onnx_outpus_all 158 | 159 | 160 | import sys 161 | 162 | if len(sys.argv) > 2: 163 | onnx_path = sys.argv[1] 164 | trt_path = sys.argv[2] 165 | else: 166 | onnx_path = 
"/apdcephfs/private_howellyang/onnx2trt/Models_Fp16/RMTNet_release20220609_v2.opt.onnx" 167 | trt_path = "/apdcephfs/private_howellyang/onnx2trt/model.weight_quantized_v2.onnx" 168 | 169 | trt_outpus_all = infer_with_onnx(trt_path) 170 | onnx_outputs_all = infer_with_onnx(onnx_path) 171 | 172 | sims = {} 173 | diffs = {} 174 | for i in range(len(trt_outpus_all)): 175 | for output_name, _ in trt_outpus_all[i].items(): 176 | trt_output = np.reshape(trt_outpus_all[i][output_name], (1, -1)) 177 | trt_fp32_output = np.reshape(onnx_outputs_all[i][output_name], (1, -1)) 178 | cos_sim = cosine_similarity(trt_output, trt_fp32_output) 179 | abs_diff_mean = np.mean(np.abs(trt_output - trt_fp32_output)) 180 | if output_name not in sims: 181 | sims[output_name] = [] 182 | diffs[output_name] = [] 183 | sims[output_name].append(cos_sim.ravel()) 184 | diffs[output_name].append(abs_diff_mean.ravel()) 185 | # if cos_sim < 0.985: 186 | # print(output_name, cos_sim) 187 | # print(trt_fp32_output[0, :5]) 188 | # print(trt_output[0, :5]) 189 | 190 | print("===================") 191 | mean_sims = [] 192 | mean_diffs = [] 193 | for key, value in sims.items(): 194 | print(key, np.mean(value), np.min(value), np.mean(diffs[key]), np.max(diffs[key])) 195 | mean_sims.append(np.mean(value)) 196 | mean_diffs.append(np.mean(diffs[key])) 197 | print("average cosine sim = ", np.mean(mean_sims)) 198 | print("average dff abs = ", np.mean(mean_diffs)) -------------------------------------------------------------------------------- /quantization/compare_onnx_trt_v1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 #指定解释器 2 | # encoding:utf-8 3 | 4 | import sys 5 | 6 | print(sys.getdefaultencoding()) 7 | s = "中文乱码问题解决" 8 | print(s) 9 | 10 | # --------------------------------------------------------------- 11 | # 这个脚本向你展示了如何使用 tensorRT 对 PPQ 导出的模型进行推理 12 | 13 | # This script shows you how to export ppq internal graph to tensorRT 14 | # --------------------------------------------------------------- 15 | 16 | # For this inference test, all test data is randomly picked. 17 | # If you want to use real data, just rewrite the defination of SAMPLES 18 | print("开始import") 19 | import onnxruntime 20 | import torch 21 | from tqdm import tqdm 22 | import glob 23 | import cv2 24 | import numpy as np 25 | from torchvision import transforms 26 | from PIL import Image 27 | import os 28 | from sklearn.metrics.pairwise import cosine_similarity 29 | import onnx 30 | from copy import deepcopy 31 | 32 | def convert_any_to_numpy(x, accepet_none: bool = True) -> np.ndarray: 33 | if x is None and not accepet_none: 34 | raise ValueError("Trying to convert an empty value.") 35 | if isinstance(x, np.ndarray): 36 | return x 37 | elif isinstance(x, int) or isinstance(x, float): 38 | return np.array( 39 | [ 40 | x, 41 | ] 42 | ) 43 | elif isinstance(x, torch.Tensor): 44 | if x.numel() == 0 and accepet_none: 45 | return None 46 | if x.numel() == 0 and not accepet_none: 47 | raise ValueError("Trying to convert an empty value.") 48 | if x.numel() == 1: 49 | return convert_any_to_numpy(x.detach().cpu().item()) 50 | if x.numel() > 1: 51 | return x.detach().cpu().numpy() 52 | elif isinstance(x, list) or isinstance(x, tuple): 53 | return np.array(x) 54 | else: 55 | raise TypeError( 56 | f"input value {x}({type(x)}) can not be converted as numpy type." 
57 | ) 58 | 59 | def read_image(path): 60 | # 多任务模型 61 | _img_transforms = transforms.Compose( 62 | [ 63 | transforms.Resize((384, 768)), 64 | transforms.ToTensor(), 65 | transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), 66 | ] 67 | ) 68 | img = Image.open(path).convert("RGB") 69 | img_w, img_h = img.size[0], img.size[1] 70 | img = _img_transforms(img) 71 | img = img.unsqueeze(0) 72 | return img 73 | 74 | calibration_files = glob.glob( 75 | os.path.join("/mapai/howellyang/code/road-service/road_service/calib_images/", "*.jpg") 76 | )[:100] 77 | 78 | 79 | SAMPLES = [ 80 | read_image(path) for path in calibration_files 81 | ] # rewirte this to use real data. 82 | 83 | 84 | DEVICE = "cuda" 85 | FINETUNE = True 86 | EXECUTING_DEVICE = "cuda" 87 | REQUIRE_ANALYSE = True 88 | 89 | # ------------------------------------------------------------------- 90 | # 启动 tensorRT 进行推理,你先装一下 trt 91 | # ------------------------------------------------------------------- 92 | 93 | 94 | def infer_with_trt(trt_int8_path = ""): 95 | import tensorrt as trt 96 | import trt_infer 97 | trt.init_libnvinfer_plugins(None, "") 98 | samples = [convert_any_to_numpy(sample) for sample in SAMPLES] 99 | logger = trt.Logger(trt.Logger.INFO) 100 | with open(trt_int8_path, "rb") as f, trt.Runtime(logger) as runtime: 101 | engine = runtime.deserialize_cuda_engine(f.read()) 102 | 103 | trt_outpus_all = [] 104 | with engine.create_execution_context() as context: 105 | inputs, outputs, bindings, stream, trt_output_names = trt_infer.allocate_buffers( 106 | context.engine 107 | ) 108 | for k, sample in enumerate(tqdm(samples, desc="TensorRT is running...")): 109 | # trt infer 110 | inputs[0].host = convert_any_to_numpy(sample) 111 | trt_outputs_list = trt_infer.do_inference( 112 | context, 113 | bindings=bindings, 114 | inputs=inputs, 115 | outputs=outputs, 116 | stream=stream, 117 | batch_size=1, 118 | ) 119 | 120 | sample_base = os.path.basename(calibration_files[k]) 121 | # for i in range(len(trt_output_names)): 122 | # save_path = os.path.join("/mapai/howellyang/code/road-service/road_service/engine/mod_road_multi_tasks/outputs_trt", sample_base + "_{}.npy".format(i)) 123 | # np.save(save_path, trt_outputs_list[i]) 124 | 125 | trt_outputs_dict = { 126 | trt_output_names[i]: trt_outputs_list[i] for i in range(len(trt_output_names)) 127 | } 128 | trt_outpus_all.append(deepcopy(trt_outputs_dict)) 129 | return trt_outpus_all 130 | 131 | 132 | def infer_with_onnx(onnx_path = ""): 133 | options = onnxruntime.SessionOptions() 134 | options.intra_op_num_threads = 1 135 | options.inter_op_num_threads = 1 136 | sess = onnxruntime.InferenceSession( 137 | onnx_path, providers=["CUDAExecutionProvider", "CPUExecutionProvider"], sess_options=options 138 | ) 139 | input_name = sess.get_inputs()[0].name 140 | onnx_output_names = [output.name for output in sess.get_outputs()] 141 | samples = [convert_any_to_numpy(sample) for sample in SAMPLES] 142 | 143 | onnx_outpus_all = [] 144 | for k, sample in enumerate(tqdm(samples, desc="Onnx is running...")): 145 | onnx_outputs = sess.run(onnx_output_names, {input_name: sample}) 146 | 147 | sample_base = os.path.basename(calibration_files[k]) 148 | # for i in range(len(onnx_output_names)): 149 | # save_path = os.path.join("/mapai/howellyang/code/road-service/road_service/engine/mod_road_multi_tasks/outputs_onnx", sample_base + "_{}.npy".format(i)) 150 | # np.save(save_path, onnx_outputs[i]) 151 | 152 | onnx_outputs_dict = { 153 | onnx_output_names[i]: onnx_outputs[i] for i in 
range(len(onnx_output_names)) 154 | } 155 | onnx_outpus_all.append(deepcopy(onnx_outputs_dict)) 156 | return onnx_outpus_all 157 | 158 | 159 | 160 | import sys 161 | 162 | if len(sys.argv) > 2: 163 | onnx_path = sys.argv[1] 164 | trt_path = sys.argv[2] 165 | else: 166 | onnx_path = "/apdcephfs/private_howellyang/onnx2trt/Models_Fp16/RMTNet_release20220609_v2.opt.onnx" 167 | trt_path = "/apdcephfs/private_howellyang/onnx2trt/Models_Fp16/RMTNet_release20220609.fp16.trtmodel" 168 | 169 | trt_outpus_all = infer_with_trt(trt_path) 170 | onnx_outputs_all = infer_with_onnx(onnx_path) 171 | 172 | sims = {} 173 | diffs = {} 174 | for i in range(len(trt_outpus_all)): 175 | for output_name, _ in trt_outpus_all[i].items(): 176 | trt_output = np.reshape(trt_outpus_all[i][output_name], (1, -1)) 177 | trt_fp32_output = np.reshape(onnx_outputs_all[i][output_name], (1, -1)) 178 | cos_sim = cosine_similarity(trt_output, trt_fp32_output) 179 | abs_diff_mean = np.mean(np.abs(trt_output - trt_fp32_output)) 180 | if output_name not in sims: 181 | sims[output_name] = [] 182 | diffs[output_name] = [] 183 | sims[output_name].append(cos_sim.ravel()) 184 | diffs[output_name].append(abs_diff_mean.ravel()) 185 | # if cos_sim < 0.985: 186 | # print(output_name, cos_sim) 187 | # print(trt_fp32_output[0, :5]) 188 | # print(trt_output[0, :5]) 189 | 190 | print("===================") 191 | mean_sims = [] 192 | mean_diffs = [] 193 | for key, value in sims.items(): 194 | print(key, np.mean(value), np.min(value), np.mean(diffs[key]), np.max(diffs[key])) 195 | mean_sims.append(np.mean(value)) 196 | mean_diffs.append(np.mean(diffs[key])) 197 | print("average cosine sim = ", np.mean(mean_sims)) 198 | print("average dff abs = ", np.mean(mean_diffs)) 199 | -------------------------------------------------------------------------------- /quantization/ptq/ppq_optimize.py: -------------------------------------------------------------------------------- 1 | import onnx 2 | from onnx import numpy_helper 3 | import numpy as np 4 | import json 5 | import sys 6 | 7 | 8 | def get_post_nodes(onnx_model, tensor_name): 9 | post_nodes = [] 10 | for node in onnx_model.graph.node: 11 | for input_tensor in node.input: 12 | if input_tensor == tensor_name: 13 | post_nodes.append(node) 14 | break 15 | return post_nodes 16 | 17 | 18 | def remove_qdq(onnx_model, node): 19 | nodes_to_remove = [] 20 | assert node.op_type == "QuantizeLinear" 21 | nodes_to_remove.append(node) 22 | for dq_node in onnx_model.graph.node: 23 | if dq_node.input[0] == node.output[0]: 24 | assert dq_node.op_type == "DequantizeLinear" 25 | nodes_to_remove.append(dq_node) 26 | for post_node in onnx_model.graph.node: 27 | for i, input in enumerate(post_node.input): 28 | if input == dq_node.output[0]: 29 | post_node.input[i] = node.input[0] 30 | return nodes_to_remove 31 | 32 | 33 | def create_act_initializer_tensor( 34 | name, 35 | tensor_array, 36 | data_type=onnx.TensorProto.FLOAT, 37 | ): 38 | 39 | # (TensorProto) 40 | initializer_tensor = onnx.helper.make_tensor( 41 | name=name, 42 | data_type=data_type, 43 | dims=(), # [1], 44 | vals=[tensor_array], 45 | ) 46 | 47 | return initializer_tensor 48 | 49 | 50 | def add_act_dqd_node(qdq_model, tensor_name, scale): 51 | flag_found = False 52 | for node in qdq_model.graph.node: 53 | for j in range(len(node.input)): 54 | if node.input[j] == tensor_name: 55 | flag_found = True 56 | if not flag_found: 57 | return None 58 | 59 | quant_node_name = tensor_name + "_QuantizeLinear" 60 | dequant_node_name = tensor_name + "_DequantizeLinear" 61 
| q_input = tensor_name 62 | q_output = tensor_name + "_QuantizeLinear" 63 | dq_input = q_output 64 | dq_output = tensor_name + "_DequantizeLinear" 65 | 66 | scale_name = tensor_name + "_QuantizeScale" 67 | zp_name = tensor_name + "_QuantizeZp" 68 | qlinear_node = onnx.helper.make_node( 69 | "QuantizeLinear", 70 | [q_input, scale_name, zp_name], 71 | [q_output], 72 | quant_node_name, 73 | ) 74 | dequant_node = onnx.helper.make_node( 75 | "DequantizeLinear", 76 | [dq_input, scale_name, zp_name], 77 | [dq_output], 78 | dequant_node_name, 79 | ) 80 | 81 | for node in qdq_model.graph.node: 82 | for j in range(len(node.input)): 83 | if node.input[j] == tensor_name: 84 | node.input[j] = dq_output 85 | 86 | qdq_model.graph.node.extend([qlinear_node, dequant_node]) 87 | 88 | scale_initializer_tensor = create_act_initializer_tensor( 89 | name=scale_name, tensor_array=scale, data_type=onnx.TensorProto.FLOAT 90 | ) 91 | 92 | zp_initializer_tensor = create_act_initializer_tensor( 93 | name=zp_name, tensor_array=0, data_type=onnx.TensorProto.INT8 94 | ) 95 | 96 | qdq_model.graph.initializer.append(scale_initializer_tensor) 97 | qdq_model.graph.initializer.append(zp_initializer_tensor) 98 | return qdq_model 99 | 100 | # Step 01. Move QDQ forward 101 | int8_model_path = sys.argv[1] # onnx.load("/mapai/howellyang/code/onnx2trt/RMTNet_release20220609_mm2conv.optimized.rm_inits.onnx.model_int8.onnx") 102 | int8_model = onnx.load(int8_model_path) 103 | weight_name2tensor = {} 104 | for weight in int8_model.graph.initializer: 105 | weight_name2tensor[weight.name] = weight 106 | 107 | nodes_to_remove = [] 108 | scale_map = {} 109 | scale_map_final = {} 110 | for node in int8_model.graph.node: 111 | output_tensor = node.output[0] 112 | post_nodes = get_post_nodes(int8_model, output_tensor) 113 | 114 | QDQ_count = 0 115 | for post_node in post_nodes: 116 | if post_node.op_type in ["QuantizeLinear"]: 117 | QDQ_count += 1 118 | 119 | # 第一种情况: 存在QDQ, 但是与后续节点个数不同 120 | # 第二种情况: 存在多于1个QDQ 121 | if node.op_type not in ["Concat"] and (QDQ_count > 0 and QDQ_count != len(post_nodes)) or QDQ_count > 1: 122 | scale_values = [] 123 | for post_node in post_nodes: 124 | if post_node.op_type in ["QuantizeLinear"]: 125 | scale_name = post_node.input[1] 126 | scale_tensor = weight_name2tensor[scale_name] 127 | scale_value = numpy_helper.to_array(scale_tensor) # out_ch, in_ch, ker, ker 128 | scale_values.append(float(scale_value.ravel())) 129 | nodes_to_remove.extend(remove_qdq(int8_model, post_node)) 130 | print(node.name, QDQ_count, len(post_nodes), scale_values) 131 | scale_map[node.output[0]] = np.mean(scale_values) 132 | elif QDQ_count == 1 and len(post_nodes) == 1: 133 | scale_values = [] 134 | for post_node in post_nodes: 135 | if post_node.op_type in ["QuantizeLinear"]: 136 | scale_name = post_node.input[1] 137 | scale_tensor = weight_name2tensor[scale_name] 138 | scale_value = numpy_helper.to_array(scale_tensor) # out_ch, in_ch, ker, ker 139 | scale_values.append(float(scale_value.ravel())) 140 | assert len(scale_values) == 1 141 | scale_map_final[node.output[0]] = np.mean(scale_values) 142 | 143 | for node in nodes_to_remove: 144 | int8_model.graph.node.remove(node) 145 | 146 | for tensor_name, scale in scale_map.items(): 147 | add_act_dqd_node(int8_model, tensor_name, scale) 148 | 149 | onnx.save(int8_model, int8_model_path + ".opt_step1.onnx") 150 | 151 | scale_map_final.update(scale_map) 152 | with open(int8_model_path + ".opt_step1.scale_map.json", "w") as fw: 153 | json.dump(scale_map_final, fw, indent=4) 154 | 155 
156 | # Step 02. Add QDQ nodes to the model from the TensorRT calibration cache
157 | def read_calib_cache(calib_cache):
158 |     import struct
159 |     scale_map = {}
160 |     with open(calib_cache) as fr:
161 |         for line in fr.readlines()[1:]:
162 |             print(line.strip())
163 |             name, value = line.strip().split(": ")
164 |             name = name.strip(":")
165 |             value = value.strip(":")
166 |             if value.strip() == "0":
167 |                 val = 0.0
168 |             else:
169 |                 val = struct.unpack("!f", bytes.fromhex(value.strip()))[0]
170 |             scale_map[name] = val
171 |     scale_map = {k: scale_map[k] for k in sorted(scale_map)}
172 |     return scale_map
173 | 
174 | 
175 | # int8_model = onnx.load(
176 | #     "/mapai/howellyang/code/onnx2trt/RMTNet_release20220609_mm2conv.optimized.rm_inits.onnx.model_int8.opt_step1.onnx")
177 | 
178 | weight_name2tensor = {}
179 | for weight in int8_model.graph.initializer:
180 |     weight_name2tensor[weight.name] = weight
181 | 
182 | QDQ_scales = {}
183 | for node in int8_model.graph.node:
184 |     if node.op_type in ["QuantizeLinear"]:
185 |         scale_name = node.input[1]
186 |         scale_tensor = weight_name2tensor[scale_name]
187 |         scale_value = numpy_helper.to_array(
188 |             scale_tensor)  # per-channel (weight) scales are skipped below
189 |         if np.size(scale_value) > 1:
190 |             continue
191 |         scale_value = float(scale_value.ravel())
192 |         QDQ_scales[node.input[0]] = scale_value
193 | 
194 | calib_cache = "/mapai/howellyang/code/onnx2trt/RMTNet_release20220609_mm2conv.optimized.rm_inits.trt_int8_with_1687pics_calib_percentile595.calib_cache"
195 | full_scale_map = read_calib_cache(calib_cache)
196 | 
197 | for tensor_name, scale in full_scale_map.items():
198 |     if tensor_name in QDQ_scales:
199 |         print(tensor_name, scale, QDQ_scales[tensor_name])
200 |     else:
201 |         print(tensor_name, "has no QDQ yet, adding one from the calib cache")
202 |         scale = max(scale, 1e-8)
203 |         add_act_dqd_node(int8_model, tensor_name, scale)
204 | 
205 | 
206 | onnx.save(int8_model, int8_model_path + ".opt_step2.onnx")
207 | 
208 | # print(QDQ_scales)
209 | # print(full_scale_map)
210 | 
--------------------------------------------------------------------------------
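For reference, read_calib_cache above parses TensorRT's text calibration cache: one header line, then one "tensor_name: hexvalue" entry per tensor, where the hex value is a big-endian float32 scale. Below is a minimal sketch of the inverse direction, writing a scale map back out in the same format; write_calib_cache is a hypothetical helper and the header string is only a placeholder (real caches carry the TensorRT version and calibrator name):

import struct

def write_calib_cache(scale_map, path, header="TRT-0000-EntropyCalibration2"):
    # Mirror the struct.unpack("!f", bytes.fromhex(...)) decoding used by
    # read_calib_cache: encode each scale as a big-endian float32 hex string.
    with open(path, "w") as fw:
        fw.write(header + "\n")
        for name, scale in sorted(scale_map.items()):
            fw.write("%s: %s\n" % (name, struct.pack("!f", float(scale)).hex()))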
/quantization/qat/ppq_optimize.py:
--------------------------------------------------------------------------------
1 | import onnx
2 | from onnx import numpy_helper
3 | import numpy as np
4 | import json
5 | import sys
6 | 
7 | 
8 | def get_post_nodes(onnx_model, tensor_name):
9 |     post_nodes = []
10 |     for node in onnx_model.graph.node:
11 |         for input_tensor in node.input:
12 |             if input_tensor == tensor_name:
13 |                 post_nodes.append(node)
14 |                 break
15 |     return post_nodes
16 | 
17 | 
18 | def remove_qdq(onnx_model, node):
19 |     nodes_to_remove = []
20 |     assert node.op_type == "QuantizeLinear"
21 |     nodes_to_remove.append(node)
22 |     for dq_node in onnx_model.graph.node:
23 |         if dq_node.input[0] == node.output[0]:
24 |             assert dq_node.op_type == "DequantizeLinear"
25 |             nodes_to_remove.append(dq_node)
26 |             for post_node in onnx_model.graph.node:
27 |                 for i, input in enumerate(post_node.input):
28 |                     if input == dq_node.output[0]:
29 |                         post_node.input[i] = node.input[0]
30 |     return nodes_to_remove
31 | 
32 | 
33 | def create_act_initializer_tensor(
34 |     name,
35 |     tensor_array,
36 |     data_type=onnx.TensorProto.FLOAT,
37 | ):
38 | 
39 |     # scalar initializer (TensorProto) for a scale or zero-point
40 |     initializer_tensor = onnx.helper.make_tensor(
41 |         name=name,
42 |         data_type=data_type,
43 |         dims=(),  # [1],
44 |         vals=[tensor_array],
45 |     )
46 | 
47 |     return initializer_tensor
48 | 
49 | 
50 | def add_act_dqd_node(qdq_model, tensor_name, scale):
51 |     flag_found = False
52 |     for node in qdq_model.graph.node:
53 |         for j in range(len(node.input)):
54 |             if node.input[j] == tensor_name:
55 |                 flag_found = True
56 |     if not flag_found:
57 |         return None
58 | 
59 |     quant_node_name = tensor_name + "_QuantizeLinear"
60 |     dequant_node_name = tensor_name + "_DequantizeLinear"
61 |     q_input = tensor_name
62 |     q_output = tensor_name + "_QuantizeLinear"
63 |     dq_input = q_output
64 |     dq_output = tensor_name + "_DequantizeLinear"
65 | 
66 |     scale_name = tensor_name + "_QuantizeScale"
67 |     zp_name = tensor_name + "_QuantizeZp"
68 |     qlinear_node = onnx.helper.make_node(
69 |         "QuantizeLinear",
70 |         [q_input, scale_name, zp_name],
71 |         [q_output],
72 |         quant_node_name,
73 |     )
74 |     dequant_node = onnx.helper.make_node(
75 |         "DequantizeLinear",
76 |         [dq_input, scale_name, zp_name],
77 |         [dq_output],
78 |         dequant_node_name,
79 |     )
80 | 
81 |     for node in qdq_model.graph.node:
82 |         for j in range(len(node.input)):
83 |             if node.input[j] == tensor_name:
84 |                 node.input[j] = dq_output
85 | 
86 |     qdq_model.graph.node.extend([qlinear_node, dequant_node])
87 | 
88 |     scale_initializer_tensor = create_act_initializer_tensor(
89 |         name=scale_name, tensor_array=scale, data_type=onnx.TensorProto.FLOAT
90 |     )
91 | 
92 |     zp_initializer_tensor = create_act_initializer_tensor(
93 |         name=zp_name, tensor_array=0, data_type=onnx.TensorProto.INT8
94 |     )
95 | 
96 |     qdq_model.graph.initializer.append(scale_initializer_tensor)
97 |     qdq_model.graph.initializer.append(zp_initializer_tensor)
98 |     return qdq_model
99 | 
100 | # Step 01. Move QDQ forward: merge the per-branch QDQ pairs of a tensor into one
101 | int8_model_path = sys.argv[1]  # onnx.load("/mapai/howellyang/code/onnx2trt/RMTNet_release20220609_mm2conv.optimized.rm_inits.onnx.model_int8.onnx")
102 | int8_model = onnx.load(int8_model_path)
103 | weight_name2tensor = {}
104 | for weight in int8_model.graph.initializer:
105 |     weight_name2tensor[weight.name] = weight
106 | 
107 | nodes_to_remove = []
108 | scale_map = {}
109 | scale_map_final = {}
110 | for node in int8_model.graph.node:
111 |     output_tensor = node.output[0]
112 |     post_nodes = get_post_nodes(int8_model, output_tensor)
113 | 
114 |     QDQ_count = 0
115 |     for post_node in post_nodes:
116 |         if post_node.op_type in ["QuantizeLinear"]:
117 |             QDQ_count += 1
118 | 
119 |     # Case 1: QDQ consumers exist, but their count differs from the number of downstream nodes
120 |     # Case 2: more than one QDQ consumer exists
121 |     if (node.op_type not in ["Concat"] and QDQ_count > 0 and QDQ_count != len(post_nodes)) or QDQ_count > 1:
122 |         scale_values = []
123 |         for post_node in post_nodes:
124 |             if post_node.op_type in ["QuantizeLinear"]:
125 |                 scale_name = post_node.input[1]
126 |                 scale_tensor = weight_name2tensor[scale_name]
127 |                 scale_value = numpy_helper.to_array(scale_tensor)  # per-tensor activation scale (scalar)
128 |                 scale_values.append(float(scale_value.ravel()))
129 |                 nodes_to_remove.extend(remove_qdq(int8_model, post_node))
130 |         print(node.name, QDQ_count, len(post_nodes), scale_values)
131 |         scale_map[node.output[0]] = np.mean(scale_values)  # one shared scale: the mean of the per-branch scales
132 |     elif QDQ_count == 1 and len(post_nodes) == 1:
133 |         scale_values = []
134 |         for post_node in post_nodes:
135 |             if post_node.op_type in ["QuantizeLinear"]:
136 |                 scale_name = post_node.input[1]
137 |                 scale_tensor = weight_name2tensor[scale_name]
138 |                 scale_value = numpy_helper.to_array(scale_tensor)  # per-tensor activation scale (scalar)
139 |                 scale_values.append(float(scale_value.ravel()))
140 |         assert len(scale_values) == 1
141 |         scale_map_final[node.output[0]] = np.mean(scale_values)
142 | 
143 | for node in nodes_to_remove:
144 |     int8_model.graph.node.remove(node)
145 | 
146 | for tensor_name, scale in scale_map.items():
147 |     add_act_dqd_node(int8_model, tensor_name, scale)
148 | 
149 | onnx.save(int8_model, int8_model_path + ".opt_step1.onnx")
150 | 
151 | scale_map_final.update(scale_map)
152 | with open(int8_model_path + ".opt_step1.scale_map.json", "w") as fw:
153 |     json.dump(scale_map_final, fw, indent=4)
154 | 
155 | 
156 | # Step 02. Add QDQ nodes to the model from the TensorRT calibration cache
157 | def read_calib_cache(calib_cache):
158 |     import struct
159 |     scale_map = {}
160 |     with open(calib_cache) as fr:
161 |         for line in fr.readlines()[1:]:
162 |             print(line.strip())
163 |             name, value = line.strip().split(": ")
164 |             name = name.strip(":")
165 |             value = value.strip(":")
166 |             if value.strip() == "0":
167 |                 val = 0.0
168 |             else:
169 |                 val = struct.unpack("!f", bytes.fromhex(value.strip()))[0]
170 |             scale_map[name] = val
171 |     scale_map = {k: scale_map[k] for k in sorted(scale_map)}
172 |     return scale_map
173 | 
174 | 
175 | # int8_model = onnx.load(
176 | #     "/mapai/howellyang/code/onnx2trt/RMTNet_release20220609_mm2conv.optimized.rm_inits.onnx.model_int8.opt_step1.onnx")
177 | 
178 | weight_name2tensor = {}
179 | for weight in int8_model.graph.initializer:
180 |     weight_name2tensor[weight.name] = weight
181 | 
182 | QDQ_scales = {}
183 | for node in int8_model.graph.node:
184 |     if node.op_type in ["QuantizeLinear"]:
185 |         scale_name = node.input[1]
186 |         scale_tensor = weight_name2tensor[scale_name]
187 |         scale_value = numpy_helper.to_array(
188 |             scale_tensor)  # per-channel (weight) scales are skipped below
189 |         if np.size(scale_value) > 1:
190 |             continue
191 |         scale_value = float(scale_value.ravel())
192 |         QDQ_scales[node.input[0]] = scale_value
193 | 
194 | calib_cache = "/mapai/howellyang/code/onnx2trt/RMTNet_release20220609_mm2conv.optimized.rm_inits.trt_int8_with_1687pics_calib_percentile595.calib_cache"
195 | full_scale_map = read_calib_cache(calib_cache)
196 | 
197 | for tensor_name, scale in full_scale_map.items():
198 |     if tensor_name in QDQ_scales:
199 |         print(tensor_name, scale, QDQ_scales[tensor_name])
200 |     else:
201 |         print(tensor_name, "has no QDQ yet, adding one from the calib cache")
202 |         scale = max(scale, 1e-8)
203 |         add_act_dqd_node(int8_model, tensor_name, scale)
204 | 
205 | 
206 | onnx.save(int8_model, int8_model_path + ".opt_step2.onnx")
207 | 
208 | # print(QDQ_scales)
209 | # print(full_scale_map)
210 | 
--------------------------------------------------------------------------------
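A usage sketch for ppq_optimize.py above: running, for example, python quantization/qat/ppq_optimize.py model_int8.onnx writes model_int8.onnx.opt_step1.onnx, model_int8.onnx.opt_step1.scale_map.json and model_int8.onnx.opt_step2.onnx next to the input; the calib_cache path in Step 02 is hardcoded and has to be edited first. A quick way to inspect the result is sketched below; the file name is an example only:

import onnx

m = onnx.load("model_int8.onnx.opt_step2.onnx")  # example output of Step 02
num_q = sum(1 for n in m.graph.node if n.op_type == "QuantizeLinear")
num_dq = sum(1 for n in m.graph.node if n.op_type == "DequantizeLinear")
# Each QDQ pair inserted by add_act_dqd_node contributes one of each.
print("QuantizeLinear nodes:", num_q, "DequantizeLinear nodes:", num_dq)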