├── .flake8 ├── .gitignore ├── LICENSE ├── README.md ├── dipoorlet ├── __init__.py ├── __main__.py ├── deploy │ ├── __init__.py │ ├── deploy_atlas.py │ ├── deploy_base.py │ ├── deploy_default.py │ ├── deploy_imx.py │ ├── deploy_magicmind.py │ ├── deploy_rv.py │ ├── deploy_snpe.py │ ├── deploy_stpu.py │ ├── deploy_ti.py │ └── deploy_trt.py ├── dist_helper.py ├── forward_net.py ├── platform_settings.py ├── profiling.py ├── quantize.py ├── tensor_cali │ ├── __init__.py │ ├── basic_algorithm.py │ └── tensor_cali_base.py ├── utils.py └── weight_transform │ ├── __init__.py │ ├── ada_quant_layer.py │ ├── adaround.py │ ├── bias_correction.py │ ├── brecq.py │ ├── sparse_quant.py │ ├── sparse_quant_layer.py │ ├── update_bn.py │ ├── utils.py │ ├── weight_equalization.py │ └── weight_trans_base.py ├── example ├── .gitkeep ├── magicmind.md ├── rv.md ├── snpe.md ├── tensorrt.md └── ti.md ├── requirements.txt └── setup.py /.flake8: -------------------------------------------------------------------------------- 1 | # This is an example .flake8 config, used when developing *Black* itself. 2 | # Keep in sync with setup.cfg which is used for source packages. 
3 | 4 | [flake8] 5 | # W606: reserved keywords 6 | ignore = W292,F403,F405,C901,E741 7 | max-line-length = 130 8 | max-complexity = 18 9 | select = B,C,E,F,W,T4,B9 10 | exclude = ./dipoorlet/proto 11 | per-file-ignores = */__init__.py:F401, ./dipoorlet/deploy/deploy_base.py:F401, ./dipoorlet/tensor_cali/tensor_cali_base.py:F401 ./dipoorlet/main.py:E402 12 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | dipoorlet.egg-info/ 3 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. 
You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. 
(Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Introduction 2 | 3 | Dipoorlet is an offline quantization tool that can perform offline quantization on ONNX model on a given calibration dataset: 4 | 5 | * Support several **Activation Calibration** algorithms: ***Mse, Minmax, Hist, etc***. 6 | * Support **Weight Transformation** to achieve better quantization results: ***BiasCorrection, WeightEqualization, etc.*** 7 | * Supports **SOTA** offline finetune algorithms to improve quantization performance: ***Adaround, Brecq, Qdrop.*** 8 | * Generate **Quantitative Parameters** required for several platforms: ***SNP, TensorRT, STPU, ATLAS, etc.*** 9 | * Provide detailed **Quantitative Analysis** to facilitate the identification of accuracy bottlenecks in model quantization. 

# Installation

```
git clone https://github.com/ModelTC/Dipoorlet.git
cd Dipoorlet
python setup.py install
```

# Environment
### CUDA
The project uses ONNXRuntime as the inference runtime and Pytorch as the training tool, so users have to set the CUDA and CUDNN versions carefully in order to make these two runtimes work.

For example:
`ONNXRuntime==1.10.0` and `Pytorch==1.10.0-1.13.0` can run under `CUDA==11.4 CUDNN==8.2.4`

Please visit [ONNXRuntime](https://onnxruntime.ai/docs/execution-providers/CUDA-ExecutionProvider.html#requirements) and [Pytorch](https://pytorch.org/get-started/previous-versions/).

### Docker
ONNXRuntime has a bug when running in Docker when `cpu-sets` is set.
Please check [issue](https://github.com/microsoft/onnxruntime/issues/8313)


# Usage

## Prepare Calibration Dataset

The pre-processed calibration data needs to be prepared and provided in a specific path form. For example, the model has two input tensors called "input_0" and "input_1", and the file structure is as follows:

```
cali_data_dir
|
├──input_0
│ ├──0.bin
│ ├──1.bin
│ ├──...
│ └──N-1.bin
└──input_1
├──0.bin
├──1.bin
├──...
└──N-1.bin
```


## Running Dipoorlet in Pytorch Distributed Environment
```
python -m torch.distributed.launch --use_env -m dipoorlet -M MODEL_PATH -I INPUT_PATH -N PIC_NUM -A [mse, hist, minmax] -D [trt, snpe, rv, atlas, ti, stpu] [--bc] [--adaround] [--brecq] [--drop]
```

## Running Dipoorlet in Cluster Environment
```
python -m dipoorlet -M MODEL_PATH -I INPUT_PATH -N PIC_NUM -A [mse, hist, minmax] -D [trt, snpe, rv, atlas, ti, stpu] [--bc] [--adaround] [--brecq] [--drop] [--slurm | --mpirun]
```
## Optional

- Using -M to specify ONNX model path.
- Using -A to select activation statistic algorithm, minmax, hist, mse.
- Using -D to select deploy platform, trt, snpe, rv, ti...
- Using -N to specify number of calibration pics.
- Using -I to specify path of calibration pics.
- Using -O to specify output path.
- For hist and kl:
--bins specify histogram bins.
--threshold specify histogram threshold for hist algorithm.
- Using --bc to do Bias Correction algorithm.
- Using --we to do weight equalization.
- Using --adaround to do offline finetune by [Adaround](https://arxiv.org/abs/2004.10568).
- Using --brecq to do offline finetune by [Brecq](https://arxiv.org/abs/2102.05426).
- Using --brecq --drop to do offline finetune by [Qdrop](https://arxiv.org/abs/2203.05740).
- Using --skip_layers to skip quantization of some layers.
- Using --slurm to launch task from slurm.
- Other usage can be obtained by "python -m dipoorlet -h/--help"

## Example

Quantize an onnx model model.onnx, saving 100 calibration data files in workdir/data/, where "data" represents the name of the onnx model. Use the “minmax“ activation value calibration algorithm, use “Qdrop“ to perform unlabeled fine-tuning on the weights, and finally generate the TensorRT quantization configuration information:

##### Calibration Data Path

```
workdir
|
├──data
├──0.bin
├──1.bin
├──...
97 | └──99.bin 98 | 99 | ``` 100 | 101 | ##### Command 102 | 103 | ``` 104 | python -m torch.distributed.launch --use_env -m dipoorlet -M model.onnx -I workdir/ -N 100 -A minmax -D trt 105 | ``` 106 | -------------------------------------------------------------------------------- /dipoorlet/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = '0.1.0' 2 | -------------------------------------------------------------------------------- /dipoorlet/__main__.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import sys 4 | import time 5 | import copy 6 | 7 | import onnx 8 | import torch 9 | import torch.distributed as dist 10 | 11 | from onnxsim import simplify 12 | 13 | from .deploy import to_deploy 14 | from .dist_helper import init_from_mpi, init_from_slurm 15 | from .profiling import (quantize_profiling_multipass, quantize_profiling_transformer, 16 | show_model_profiling_res, show_model_ranges, weight_need_perchannel) 17 | from .tensor_cali import tensor_calibration 18 | from .utils import (ONNXGraph, load_clip_val, logger, reduce_clip_val, 19 | reduce_profiling_res, save_clip_val, save_profiling_res, 20 | setup_logger, deploy_QOperator) 21 | from .weight_transform import weight_calibration 22 | 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument("-M", "--model", help="onnx model") 25 | parser.add_argument("-I", "--input_dir", help="calibration data", required=True) 26 | parser.add_argument("-O", "--output_dir", help="output data path") 27 | parser.add_argument("-N", "--data_num", help="num of calibration pics", type=int, required=True) 28 | parser.add_argument("--we", help="weight euqalization", action="store_true") 29 | parser.add_argument("--bc", help="bias correction", action="store_true") 30 | parser.add_argument("--update_bn", help="update BN", action="store_true") 31 | parser.add_argument("--adaround", 
help="Adaround", action="store_true") 32 | parser.add_argument("--brecq", help="BrecQ", action="store_true") 33 | parser.add_argument("--drop", help="QDrop", action="store_true") 34 | parser.add_argument("-A", "--act_quant", help="algorithm of activation quantization", 35 | choices=['minmax', 'hist', 'mse'], default='mse') 36 | parser.add_argument("-D", "--deploy", help="deploy platform", 37 | choices=['trt', 'stpu', 'magicmind', 'rv', 'atlas', 38 | 'snpe', 'ti', 'imx'], required=True) 39 | parser.add_argument("--bins", help="bins for histogram and kl", default=2048) 40 | parser.add_argument("--threshold", help="threshold for histogram", default=0.99999, type=float) 41 | parser.add_argument("--savefp", help="Save FP output of model.", action="store_true") 42 | parser.add_argument("--ada_bs", help="Batch size for adaround.", type=int, default=64) 43 | parser.add_argument("--ada_epoch", help="Epoch for adaround.", type=int, default=5000) 44 | parser.add_argument("--skip_layers", help="Skip layer name", default=[], type=str, nargs='+') 45 | parser.add_argument("--stpu_wg", help="Enable winograd for stpu.", action="store_true") 46 | parser.add_argument("--skip_prof_layer", help="Skip profiling by layer.", default=False, action='store_true') 47 | parser.add_argument("--slurm", help="Launch task from slurm", default=False, action='store_true') 48 | parser.add_argument("--mpirun", help="Launch task from mpirun", default=False, action='store_true') 49 | parser.add_argument("--sparse", help="Sparse on/off", default=False, action="store_true") 50 | parser.add_argument("--sparse_rate", help="Sparse rate", type=float, default=0.5) 51 | parser.add_argument("--pattern", help="Sparse pattern", choices=["unstruction", "nv24"], default="unstruction") 52 | parser.add_argument("--optim_transformer", help="Transformer model optimization", default=False, action='store_true') 53 | parser.add_argument("--model_type", help="Transformer model type", choices=["unet"], default=None) 54 | 
parser.add_argument("--quant_format", default="QDQ", type=str, choices=["QOP", "QDQ"]) 55 | args = parser.parse_args() 56 | 57 | if args.slurm: 58 | init_from_slurm() 59 | elif args.mpirun: 60 | init_from_mpi() 61 | else: 62 | dist.init_process_group(backend='nccl') 63 | device = dist.get_rank() % torch.cuda.device_count() 64 | torch.cuda.set_device(device) 65 | 66 | if args.output_dir is None: 67 | model_path = ('/').join(args.model.split('/')[:-1]) 68 | output_dir = os.path.join(os.path.abspath(model_path), 'results') 69 | args.output_dir = output_dir 70 | 71 | if args.model_type is not None: 72 | args.optim_transformer = True 73 | args.skip_prof_layer = True 74 | 75 | if dist.get_rank() == 0: 76 | if not os.path.exists(args.output_dir): 77 | os.makedirs(args.output_dir) 78 | setup_logger(args) 79 | 80 | if args.optim_transformer: 81 | model_path = ('/').join(args.model.split('/')[:-1]) 82 | args.infer_shape_dir = os.path.join(os.path.abspath(model_path), "infer_shape.onnx") 83 | onnx.shape_inference.infer_shapes_path(args.model, args.infer_shape_dir) 84 | args.optimzed_model_dir = os.path.join(args.output_dir, 'optim_model.onnx') 85 | os.system("python -m onnxruntime.transformers.optimizer \ 86 | --input {} --output {} --model_type={} \ 87 | --use_external_data_format --disable_packed_qkv \ 88 | --disable_packed_kv --use_gpu --disable_nhwc_conv" 89 | .format(args.infer_shape_dir, args.optimzed_model_dir, args.model_type)) 90 | dist.barrier() 91 | args.optimzed_model_dir = os.path.join(args.output_dir, 'optim_model.onnx') 92 | logger.parent = None 93 | 94 | start = time.time() 95 | if args.optim_transformer: 96 | model = onnx.load(args.optimzed_model_dir) 97 | else: 98 | model = onnx.load(args.model) 99 | if model.opset_import[0].version < 13: 100 | model = onnx.version_converter.convert_version(model, 13) 101 | model, check = simplify(model) 102 | assert check, "Simplified ONNX model could not be validated" 103 | onnx_graph = ONNXGraph(model, args.output_dir, 
args.deploy, args.model_type) 104 | 105 | if dist.get_rank() == 0 and not args.optim_transformer: 106 | try: 107 | onnx.checker.check_model(onnx_graph.model) 108 | except onnx.checker.ValidationError as e: 109 | logger.info("The onnx model is invalid:{}, please rectifie your model and restart Dipoorlet.".format(e)) 110 | sys.exit() 111 | 112 | # Assgin rank index to calibration GPU wise. 113 | # Split the dataset averagly. 114 | setattr(args, 'rank', dist.get_rank()) 115 | setattr(args, 'local_rank', dist.get_rank() % torch.cuda.device_count()) 116 | setattr(args, 'world_size', dist.get_world_size()) 117 | if dist.get_rank() == 0: 118 | logger.info("Do tensor calibration...") 119 | act_clip_val, weight_clip_val = tensor_calibration(onnx_graph, args) 120 | tensor_range = copy.deepcopy(act_clip_val) 121 | save_clip_val(act_clip_val, weight_clip_val, args, 122 | act_fname='act_clip_val.json.rank{}'.format(args.rank), 123 | weight_fname='weight_clip_val.json.rank{}'.format(args.rank)) 124 | dist.barrier() 125 | if dist.get_rank() == 0: 126 | reduce_clip_val(dist.get_world_size(), args) 127 | dist.barrier() 128 | act_clip_val, weight_clip_val = load_clip_val(args) 129 | 130 | # Weight Transform. 131 | if dist.get_rank() == 0: 132 | logger.info("Weight transform...") 133 | graph, graph_ori, act_clip_val, weight_clip_val = \ 134 | weight_calibration(onnx_graph, act_clip_val, weight_clip_val, args) 135 | dist.barrier() 136 | 137 | # Profiling Distributed. 
138 | if dist.get_rank() == 0: 139 | logger.info("Profiling...") 140 | if args.model_type is not None: 141 | layer_cosine_dict, model_cosine_dict, quant_node_list = quantize_profiling_transformer( 142 | graph, graph_ori, act_clip_val, weight_clip_val, args) 143 | else: 144 | layer_cosine_dict, model_cosine_dict, quant_node_list = quantize_profiling_multipass( 145 | graph, graph_ori, act_clip_val, weight_clip_val, args) 146 | save_profiling_res(layer_cosine_dict, model_cosine_dict, args) 147 | dist.barrier() 148 | if dist.get_rank() == 0: 149 | layer_cosine_dict, model_cosine_dict = reduce_profiling_res(dist.get_world_size(), args) 150 | show_model_profiling_res(graph, layer_cosine_dict, model_cosine_dict, quant_node_list, args) 151 | show_model_ranges(graph, act_clip_val, weight_clip_val, args) 152 | weight_need_perchannel(graph, args) 153 | 154 | # Deploy 155 | if dist.get_rank() == 0: 156 | logger.info("Deploy to " + args.deploy + '...') 157 | to_deploy(graph, act_clip_val, weight_clip_val, args) 158 | if args.quant_format == 'QOP' and args.model_type is None: 159 | deploy_QOperator(graph.model, tensor_range, args) 160 | end = time.time() 161 | logger.info("Total time cost: {} seconds.".format(int(end - start))) -------------------------------------------------------------------------------- /dipoorlet/deploy/__init__.py: -------------------------------------------------------------------------------- 1 | from .deploy_base import to_deploy 2 | -------------------------------------------------------------------------------- /dipoorlet/deploy/deploy_atlas.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os.path as osp 3 | 4 | from ..platform_settings import platform_setting_table 5 | from .deploy_default import deploy_dispatcher 6 | 7 | ATLAS_QUANT_LAYER = platform_setting_table['atlas']['quant_nodes'] 8 | 9 | 10 | def get_step_zeropoint(clip_val): 11 | ret = dict() 12 | range_min = min(0, clip_val[0]) 
13 | range_max = max(0, clip_val[1]) 14 | step = (range_max - range_min) / 255. 15 | # Zero point range [-128, 127] to support sym/asym in same time. 16 | if step == 0.0: 17 | step = 1.0 18 | zero_point = round(-range_min / step) - 128 19 | ret.update({'scale': step, 'offset': int(zero_point)}) 20 | return ret 21 | 22 | 23 | @deploy_dispatcher.register("atlas") 24 | def gen_atlas_quant_param(graph, clip_val, args, **kwargs): 25 | res = {} 26 | for node in graph.graph.node: 27 | if node.op_type in ATLAS_QUANT_LAYER: 28 | tensor_name = node.input[0] 29 | res[tensor_name] = get_step_zeropoint(clip_val[tensor_name]) 30 | 31 | with open(osp.join(args.output_dir, 'atlas_quant_param.json'), 'w') as f: 32 | json.dump(res, f, indent=4) 33 | -------------------------------------------------------------------------------- /dipoorlet/deploy/deploy_base.py: -------------------------------------------------------------------------------- 1 | from ..platform_settings import platform_setting_table 2 | from .deploy_atlas import gen_atlas_quant_param 3 | from .deploy_default import deploy_dispatcher 4 | from .deploy_magicmind import gen_magicmind_proto 5 | from .deploy_rv import gen_rv_yaml 6 | from .deploy_snpe import gen_snpe_encodings 7 | from .deploy_stpu import gen_stpu_minmax 8 | from .deploy_ti import gen_ti_json 9 | from .deploy_trt import gen_trt_range 10 | from .deploy_imx import gen_imx_range 11 | 12 | 13 | def to_deploy(graph, act_clip_val, weight_clip_val, args, **kwargs): 14 | if platform_setting_table[args.deploy]['deploy_weight']: 15 | clip_val = act_clip_val.copy() 16 | clip_val.update(weight_clip_val) 17 | else: 18 | clip_val = act_clip_val 19 | deploy_dispatcher(args.deploy, graph, clip_val, args, **kwargs) 20 | -------------------------------------------------------------------------------- /dipoorlet/deploy/deploy_default.py: -------------------------------------------------------------------------------- 1 | from ..utils import dispatch_functool, logger 2 | 3 | 
4 | @dispatch_functool 5 | def deploy_dispatcher(*args, **kwargs): 6 | logger.warning("Deploy Platform Not Found!") 7 | -------------------------------------------------------------------------------- /dipoorlet/deploy/deploy_imx.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import numpy as np 4 | 5 | from .deploy_default import deploy_dispatcher 6 | 7 | 8 | @deploy_dispatcher.register("imx") 9 | def gen_imx_range(graph, clip_val, args, **kwargs): 10 | bit_width = 8 11 | removed_keys = [k for k in clip_val if k.endswith(".bias")] 12 | for k in removed_keys: 13 | del clip_val[k] 14 | for k, v in clip_val.items(): 15 | clip_max = np.max(np.abs(clip_val[k]), axis=0) 16 | q_max = [2 ** (bit_width - 1) - 1] 17 | scale = np.array(clip_max) / q_max 18 | if np.any(scale == 0): 19 | scale = np.where(scale == 0, 1., scale) 20 | 21 | scale = 2 ** np.round(np.log2(scale)) 22 | clip_val[k] = scale.tolist() 23 | imx_blob_json = dict() 24 | imx_blob_json['blob_range'] = clip_val 25 | with open(os.path.join(args.output_dir, 'imx_scale.json'), 'w') as f: 26 | json.dump(imx_blob_json, f, indent=4) 27 | -------------------------------------------------------------------------------- /dipoorlet/deploy/deploy_magicmind.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | import numpy as np 5 | 6 | from .deploy_default import deploy_dispatcher 7 | 8 | 9 | @deploy_dispatcher.register("magicmind") 10 | def gen_magicmind_proto(graph, clip_val, args, **kwargs): 11 | cambricom_quant_param = {} 12 | for k, v in clip_val.items(): 13 | cambricom_quant_param[k] = { 14 | "min": float(np.min(clip_val[k][0])), 15 | "max": float(np.max(clip_val[k][1])) 16 | } 17 | blob_range_json = dict() 18 | blob_range_json['blob_range'] = cambricom_quant_param 19 | with open(os.path.join(args.output_dir, 'magicmind_quant_param.json'), 'wt') as f: 20 | 
json.dump(blob_range_json, f, indent=4) 21 | -------------------------------------------------------------------------------- /dipoorlet/deploy/deploy_rv.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os.path as osp 3 | 4 | import numpy as np 5 | import yaml 6 | 7 | from ..platform_settings import LAYER_HAS_WEIGHT 8 | from .deploy_default import deploy_dispatcher 9 | 10 | 11 | def step_zeropoint(clip_val): 12 | ret = dict() 13 | range_min = min(0, np.min(clip_val[0])) 14 | range_max = max(0, np.max(clip_val[1])) 15 | step = (range_max - range_min) / 255. 16 | if step == 0.0: 17 | step = 1.0 / 255. 18 | zero_point = round(-range_min / step) 19 | ret.update({'scale': [float(step)], 'zero_point': [int(zero_point)]}) 20 | return ret 21 | 22 | 23 | @deploy_dispatcher.register("rv") 24 | def gen_rv_yaml(graph, clip_val, args, **kwargs): 25 | def gen1126(graph, clip_val, args, **kwargs): 26 | res = {'customized_quantize_layers': {}, 27 | 'quantize_parameters': {}} 28 | # Pass concat qparam to input 29 | for node in graph.graph.node: 30 | if node.op_type == 'Concat': 31 | for input_tensor in node.input: 32 | clip_val[input_tensor][0] = clip_val[node.output[0]][0] 33 | clip_val[input_tensor][1] = clip_val[node.output[0]][1] 34 | next_node = graph.get_tensor_consumer(node.output[0]) 35 | for i in graph.network_inputs: 36 | tensor_dict = { 37 | 'dtype': 'asymmetric_affine', 38 | 'method': 'layer', 39 | 'max_value': [max(0., float(clip_val[i][1]))], 40 | 'min_value': [min(0., float(clip_val[i][0]))], 41 | 'qtype': 'u8' 42 | } 43 | key = f'@{i}:out0' 44 | res['quantize_parameters'][key] = tensor_dict 45 | res['quantize_parameters'][key].update(step_zeropoint(clip_val[i])) 46 | for node in graph.graph.node: 47 | # Sigmoid input has specific range -6.3-6.3 48 | next_node = graph.get_tensor_consumer(node.output[0]) 49 | if len(next_node) == 1 and not isinstance(next_node[0], str) and next_node[0].op_type == 
'Sigmoid': 50 | continue 51 | if node.op_type in LAYER_HAS_WEIGHT: 52 | for idx, input_tensor in enumerate(node.input[1:]): 53 | qtype = 'u8' 54 | if idx == 0: 55 | # weight 56 | key = f'@{node.name}:weight' 57 | tensor_dict = { 58 | 'dtype': 'asymmetric_affine', 59 | 'method': 'layer', 60 | 'max_value': [max(0.0, float(np.max(clip_val[input_tensor][1])))], 61 | 'min_value': [min(0.0, float(np.min(clip_val[input_tensor][0])))], 62 | 'qtype': qtype 63 | } 64 | tensor_dict.update(step_zeropoint(clip_val[input_tensor])) 65 | elif idx == 1: 66 | key = f'@{node.name}:bias' 67 | qtype = 'i32' 68 | acts = step_zeropoint(clip_val[node.input[0]])['scale'] 69 | ws = step_zeropoint(clip_val[node.input[1]])['scale'] 70 | tensor_dict = { 71 | 'dtype': 'asymmetric_affine', 72 | 'method': 'layer', 73 | 'max_value': [], 74 | 'min_value': [], 75 | 'zero_point': [0], 76 | 'scale': [ws[0] * acts[0]], 77 | 'qtype': qtype 78 | } 79 | else: 80 | print("We meet unsupported node{}, skip.".format(node.name)) 81 | res['quantize_parameters'][key] = tensor_dict 82 | for idx, output_tensor in enumerate(node.output): 83 | tensor_dict = { 84 | 'dtype': 'asymmetric_affine', 85 | 'method': 'layer', 86 | 'max_value': [max(0., float(np.max(clip_val[output_tensor][1])))], 87 | 'min_value': [min(0., float(np.min(clip_val[output_tensor][0])))], 88 | 'qtype': 'u8' 89 | } 90 | key = f'@{node.name}:out{idx}' 91 | res['quantize_parameters'][key] = tensor_dict 92 | res['quantize_parameters'][key].update(step_zeropoint(clip_val[output_tensor])) 93 | # We need to merge relu. 94 | if node.op_type == 'Relu': 95 | prev_node = graph.get_tensor_producer(node.input[0]) 96 | for prev_key in res['quantize_parameters']: 97 | if prev_node.name in prev_key and 'out' in prev_key: 98 | res['quantize_parameters'][prev_key] = res['quantize_parameters'][key] 99 | # We need to merge BatchNorm and Scale. 
100 | if node.op_type == 'CaffeScale': 101 | prev_node = graph.get_tensor_producer(node.input[0]) 102 | if prev_node.op_type == 'CaffeBatchNorm': 103 | for prev_key in res['quantize_parameters']: 104 | if prev_node.name in prev_key and 'out' in prev_key: 105 | res['quantize_parameters'][prev_key] = res['quantize_parameters'][key] 106 | del res['quantize_parameters'][key] 107 | with open(osp.join(args.output_dir, 'rv_quantized_param.yaml'), 'w') as f: 108 | f.write(yaml.dump(res)) 109 | with open(osp.join(args.output_dir, 'rv_quantized_param.json'), 'w') as f: 110 | json.dump(res, f, indent=4) 111 | 112 | def gen3568(graph, clip_val, args, **kwargs): 113 | res = {'custom_quantize_layers': {}, 114 | 'quantize_parameters': {}} 115 | # Pass concat qparam to input 116 | for node in graph.graph.node: 117 | if node.op_type == 'Concat': 118 | for input_tensor in node.input: 119 | clip_val[input_tensor][0] = clip_val[node.output[0]][0] 120 | clip_val[input_tensor][1] = clip_val[node.output[0]][1] 121 | next_node = graph.get_tensor_consumer(node.output[0]) 122 | for i in graph.network_inputs: 123 | tensor_dict = { 124 | 'max': [max(0., float(clip_val[i][1]))], 125 | 'min': [min(0., float(clip_val[i][0]))], 126 | } 127 | key = f'{i}' 128 | res['quantize_parameters'][key] = tensor_dict 129 | for node in graph.graph.node: 130 | # Sigmoid input has specific range -6.3-6.3 131 | next_node = graph.get_tensor_consumer(node.output[0]) 132 | if len(next_node) == 1 and not isinstance(next_node[0], str) and next_node[0].op_type == 'Sigmoid': 133 | continue 134 | if node.op_type in LAYER_HAS_WEIGHT: 135 | for idx, input_tensor in enumerate(node.input[1:]): 136 | if idx == 0: 137 | # weight 138 | key = f'{node.name}_W' 139 | tensor_dict = { 140 | 'max': [max(0.0, float(np.max(clip_val[input_tensor][1])))], 141 | 'min': [min(0.0, float(np.min(clip_val[input_tensor][0])))], 142 | } 143 | elif idx == 1: 144 | key = f'{node.name}_b' 145 | max_val = np.max(clip_val[node.input[2]]) 146 | 
@deploy_dispatcher.register("snpe")
def gen_snpe_encodings(graph, clip_val, args, **kwargs):
    """Write SNPE 8-bit activation encodings to ``snpe_encodings.json``.

    Only ``activation_encodings`` are populated; ``param_encodings`` are left
    empty for the converter to fill (see the SNPE quantized-model docs).
    ``max`` is clamped so the range is never degenerate: it is at least 0 and
    at least ``min + 0.01``.
    """
    def encode(tensor):
        lo = float(clip_val[tensor][0])
        hi = float(clip_val[tensor][1])
        return [{
            'bitwidth': 8,
            'min': lo,
            'max': max(max(0.0, hi), lo + 0.01)
        }]

    activation_encodings = {}
    for node in graph.graph.node:
        for in_tensor in node.input:
            # Skip missing optional inputs and weights/constants.
            if in_tensor == '' or in_tensor in graph.initializer:
                continue
            activation_encodings[in_tensor] = encode(in_tensor)
    for out_tensor in graph.network_outputs:
        activation_encodings[out_tensor] = encode(out_tensor)
    encodings = {
        'activation_encodings': activation_encodings,
        'param_encodings': {}
    }
    with open(os.path.join(args.output_dir, 'snpe_encodings.json'), 'wt') as f:
        json.dump(encodings, f, indent=4)
def wg_weight_convt(ker):
    """Transform a 3x3 conv kernel into the Winograd weight domain and return
    its value bounds.

    Applies G @ k @ G.T per (out_channel, in_channel) 3x3 slice, producing 4x4
    transformed tiles. Returns ``(max(vals, 0), min(vals, 0))`` so the bounds
    always bracket zero.
    """
    g = np.array([[2, 0, 0], [1, 1, 1], [1, -1, 1], [0, 0, 2]], dtype='float32')
    transformed = np.zeros((*ker.shape[:2], 4, 4))
    for oc in range(ker.shape[0]):
        for ic in range(ker.shape[1]):
            transformed[oc, ic] = g.dot(ker[oc, ic]).dot(g.T)
    return max(transformed.max(), 0), min(transformed.min(), 0)
def find_e(v):
    """Return the IEEE-754 single-precision biased exponent bucket of ``|v|``.

    Returns 0 for zero, 1 for subnormals (``|v| < 2**-126``), 254 for values
    at or beyond the top normal bucket, otherwise the ``e`` in [1, 253] with
    ``2**(e - 127) <= |v| < 2**(e - 126)``.
    """
    v_ = abs(v)
    if v_ == 0:
        return 0

    for e in range(1, 254):
        r_e = e - 127
        # Bug fix: bucket |v| (v_), not the signed v — a negative input
        # previously matched no bucket and was misclassified as subnormal.
        if (v_ >= 2 ** r_e) and (v_ < 2 ** (r_e + 1)):
            return e

    if v_ < 2 ** (-126):
        return 1
    return 254
find_softmax_emin(i_vmax, c) 170 | param[l.output[0]]['emin'] = emin 171 | if l.op_type in ['GlobalAveragePool', 'GlobalMaxPool'] + ['MaxPool', 'AveragePool']: 172 | if l.op_type in ['GlobalAveragePool', 'GlobalMaxPool']: 173 | kernel_h, kernel_w = l.get_attribute_value('kernel_shape') 174 | n = kernel_h * kernel_w 175 | else: 176 | n, c, kernel_h, kernel_w = graph.get_tensor_shape(l.input[0]) 177 | i_vmax = param[l.input[0]]['max'] 178 | o_vmax = param[l.output[0]]['max'] 179 | emin = find_pool_ave_emin(i_vmax, o_vmax, n, 2) 180 | param[l.output[0]]['emin'] = emin 181 | ''' 182 | if l.op_type in ['Conv', 'ConvTranspose']: 183 | weight_shape = graph.get_tensor_shape(l.input[1]) 184 | n = weight_shape[1] * weight_shape[2] * weight_shape[3] 185 | i_vmax = param[l.input[0]]['max'] 186 | o_vmax = param[l.output[0]]['max'] 187 | w_vmax = param[l.name + '_weights']['max'] 188 | emin = find_conv_emin(i_vmax, w_vmax, o_vmax, n, 2) 189 | param[l.output[0]]['emin'] = emin 190 | if l.op_type == 'Gemm': 191 | i_vmax = param[l.input[0]]['max'] 192 | o_vmax = param[l.output[0]]['max'] 193 | w_vmax = param[l.name + '_weights']['max'] 194 | n = np.prod(graph.get_tensor_shape(l.input[0])) 195 | emin = find_conv_emin(i_vmax, w_vmax, o_vmax, n, 2) 196 | param[l.output[0]]['emin'] = emin 197 | ''' 198 | if l.op_type == 'PSROIPool': 199 | i_vmax = param[l.input[0]]['max'] 200 | o_vmax = param[l.output[0]]['max'] 201 | emin = find_psroipooling_emin(i_vmax, o_vmax, 1) 202 | param[l.output[0]]['emin'] = emin 203 | ''' 204 | if l.op_type == 'Corr': 205 | co = l.get_attribute_value('groups') 206 | n = np.prod(graph.get_tensor_shape(l.input[0])) / co 207 | o_vmax = param[l.output[0]]['max'] 208 | emin = find_corr_emin(o_vmax, n, 4) 209 | param[l.output[0]]['emin'] = emin 210 | 211 | 212 | def quant_bias(graph, param, args): 213 | for l in graph.graph.node: 214 | if l.op_type in ['Conv', 'ConvTranspose', 'Gemm'] and len(l.input) == 3: 215 | wmax = param[l.name + '_weights']['max'] 216 | wmin 
@deploy_dispatcher.register("trt")
def gen_trt_range(graph, clip_val, args, **kwargs):
    """Write symmetric per-tensor clip values for TensorRT calibration.

    Each ``[min, max]`` pair collapses to ``max(-min, max)``; ``clip_val`` is
    mutated in place and dumped to ``<output_dir>/trt_clip_val.json`` under the
    ``blob_range`` key.
    """
    for name in clip_val:
        pair = clip_val[name]
        # assumes the pair entries are numpy values (.astype) — float64 keeps
        # them JSON-serializable.
        clip_val[name] = max(-pair[0].astype(float), pair[1].astype(float))
    with open(os.path.join(args.output_dir, 'trt_clip_val.json'), 'w') as f:
        json.dump({'blob_range': clip_val}, f, indent=4)
def init_from_slurm():
    """Initialize torch.distributed (NCCL backend) from SLURM environment.

    Derives a deterministic master port from the job id, extracts the first
    host from ``SLURM_NODELIST``, exports the standard rendezvous variables,
    then pins this process to a GPU by rank.
    """
    job_id = int(os.environ['SLURM_JOB_ID'])
    # Deterministic per job, kept away from well-known low ports.
    port = 24553 + job_id % 10000
    proc_id = int(os.environ['SLURM_PROCID'])
    ntasks = int(os.environ['SLURM_NTASKS'])
    node_list = os.environ['SLURM_NODELIST']
    if '[' in node_list:
        # Compressed form, e.g. "prefix[03-07,11]": keep only the first host id.
        beg = node_list.find('[')
        dash = node_list.find('-', beg)
        comma = node_list.find(',', beg)
        cut = min(dash if dash >= 0 else 1000, comma if comma >= 0 else 1000)
        node_list = node_list[:cut].replace('[', '')
    # NOTE(review): assumes hostnames like "SH-IDC1-10-5-36-01" where the text
    # after the first 8 chars encodes the IP with '-' separators — confirm for
    # the target cluster.
    addr = node_list[8:].replace('-', '.')
    os.environ['MASTER_PORT'] = str(port)
    os.environ['MASTER_ADDR'] = addr
    os.environ['WORLD_SIZE'] = str(ntasks)
    os.environ['RANK'] = str(proc_id)
    dist.init_process_group(backend='nccl')
    device = dist.get_rank() % torch.cuda.device_count()
    torch.cuda.set_device(device)
time 3 | import sys 4 | from collections import OrderedDict 5 | 6 | import numpy as np 7 | import onnx 8 | import onnxruntime as ort 9 | import torch 10 | import torch.distributed as dist 11 | from onnx.helper import make_graph, make_model 12 | from onnx.helper import make_tensor_value_info as mtvi 13 | from tqdm import tqdm 14 | 15 | from .platform_settings import platform_setting_table 16 | from .quantize import QUANT_NODE_NAME_LIST 17 | from .utils import ONNXGraph, logger 18 | 19 | ort.set_default_logger_severity(3) 20 | sys.setrecursionlimit(2000) 21 | 22 | 23 | class ActivationCache(object): 24 | # We assume get tensor by sequence. 25 | def __init__(self, graph, args, st=None, ed=None): 26 | self.graph = copy.deepcopy(graph) 27 | self.graph_list = [] 28 | self.ref_cnt = {} 29 | self.name_to_net = {} 30 | self.name_to_graph_id = {} 31 | self.activation_cache = {} 32 | self.args = args 33 | self.st = st 34 | self.ed = ed 35 | self.providers = [("CUDAExecutionProvider", {'device_id': args.local_rank})] 36 | self.fetch_input() 37 | self._split_network() 38 | self.fill_ref_cnt() 39 | 40 | def reset(self): 41 | self.activation_cache.clear() 42 | self.fetch_input() 43 | self.fill_ref_cnt() 44 | 45 | def fetch_input(self, in_tensor=None): 46 | if in_tensor is None: 47 | # Means We are initializing. 48 | for name in self.graph.network_inputs: 49 | self.activation_cache[name] = [] 50 | if self.st is None: 51 | self.st = 0 52 | self.ed = self.args.data_num 53 | for data in input_data_generator(self.args.input_dir, self.graph.network_inputs, self.st, self.ed): 54 | for name in self.graph.network_inputs: 55 | self.activation_cache[name].append( 56 | data[name][:].reshape(*self.graph.get_tensor_shape(name)).copy()) 57 | else: 58 | # Means We need specific tensor. 
59 | self.activation_cache[in_tensor] = [] 60 | for data in input_data_generator(self.args.input_dir, self.graph.network_inputs, self.st, self.ed): 61 | self.activation_cache[in_tensor].append( 62 | data[in_tensor][:].reshape(*self.graph.get_tensor_shape(in_tensor)).copy()) 63 | 64 | def input_generator(self, tensor_name_list): 65 | # TODO batch generator. 66 | data = {} 67 | for i in range(self.ed - self.st): 68 | for tensor in tensor_name_list: 69 | data[tensor] = self.activation_cache[tensor][i] 70 | yield data 71 | 72 | def __getitem__(self, tensor_name): 73 | if tensor_name in self.graph.initializer: 74 | return self.graph.initializer[tensor_name][0] 75 | if tensor_name not in self.activation_cache: 76 | node = self.graph.get_tensor_producer(tensor_name) 77 | # quantize_output(self.name, 'get item: ', tensor_name, self.activation_cache.keys()) 78 | self.forward_subnet(node.name, node.input) 79 | return self.activation_cache[tensor_name] 80 | 81 | def forward_subnet(self, subnet_name, input_list): 82 | sub_graph = self.graph_list[self.name_to_graph_id[subnet_name]] 83 | for input_tensor in input_list: 84 | if input_tensor == '': 85 | continue 86 | if input_tensor not in sub_graph.initializer and input_tensor not in self.activation_cache: 87 | node = self.graph.get_tensor_producer(input_tensor) 88 | if isinstance(node, str): 89 | # Means We need network input. 90 | self.fetch_input(input_tensor) 91 | else: 92 | self.forward_subnet(node.name, node.input) 93 | 94 | input_generator = self.input_generator(sub_graph.network_inputs) 95 | sub_graph = self.graph_list[self.name_to_graph_id[subnet_name]] 96 | sub_net = sub_graph.model 97 | ort_inputs = {} 98 | ort_session = ort.InferenceSession(sub_net.SerializeToString(), providers=self.providers) 99 | if 'CUDAExecutionProvider' not in ort_session.get_provider_options(): 100 | logger.warning("CUDA may not used. 
Please check your ort/cuda/cudnn version.") 101 | 102 | for data in input_generator: 103 | for name in sub_graph.network_inputs: 104 | if len(data[name].shape) == 0 or sub_graph.get_tensor_shape(name)[0] == 0: 105 | ort_inputs[name] = data[name] 106 | else: 107 | ort_inputs[name] = data[name][:].reshape(*sub_graph.get_tensor_shape(name)) 108 | outputs = [output.name for output in ort_session.get_outputs()] 109 | ort_outputs = ort_session.run(outputs, ort_inputs) 110 | ort_outs = OrderedDict(zip(outputs, ort_outputs)) 111 | 112 | for i in ort_outs: 113 | # There may be dummy outputs, which 114 | # do not needed by any other layers neither is network output. 115 | if i in self.ref_cnt or i in self.graph.network_outputs: 116 | if i in self.activation_cache: 117 | self.activation_cache[i].append(ort_outs[i].copy()) 118 | else: 119 | self.activation_cache[i] = [ort_outs[i].copy()] 120 | # Tensor Wont Be used in this forward. 121 | for input_tensor in input_list: 122 | if input_tensor in sub_graph.initializer: 123 | continue 124 | if input_tensor == '': 125 | continue 126 | self.ref_cnt[input_tensor] -= 1 127 | if self.ref_cnt[input_tensor] == 0: 128 | del (self.activation_cache[input_tensor]) 129 | 130 | def fill_ref_cnt(self): 131 | for node in self.graph.graph.node: 132 | for in_tensor in node.input: 133 | if in_tensor in self.ref_cnt: 134 | self.ref_cnt[in_tensor] += 1 135 | else: 136 | self.ref_cnt[in_tensor] = 1 137 | 138 | def _split_network(self): 139 | for i, node in enumerate(self.graph.graph.node): 140 | inputs = [] 141 | outputs = [] 142 | inits = [] 143 | network_inputs = [] 144 | network_outputs = [] 145 | for input in node.input: 146 | if input == '': 147 | continue 148 | if input not in self.graph.initializer: 149 | in_type = self.graph.get_value_type(input) 150 | shape = self.graph.get_tensor_shape(input) 151 | if shape[0] == 0: 152 | shape = [] 153 | input_value = mtvi(input, in_type, shape) 154 | inputs.append(input_value) 155 | 
def forward_get_minmax(onnx_graph, args):
    """Run the FP32 network over this rank's calibration slice and collect
    per-tensor min/max statistics.

    Every intermediate tensor is promoted to a graph output so onnxruntime
    returns it. Returns ``{tensor_name: {'max': [...], 'min': [...]}}`` with
    one extrema pair per calibration sample.
    """
    net = copy.deepcopy(onnx_graph.model)
    graph = net.graph
    # Expose every intermediate tensor as a model output (prepended, so the
    # original outputs keep their relative order at the back).
    for node in reversed(graph.node):
        for out_name in reversed(node.output):
            if not any(o.name == out_name for o in graph.output):
                graph.output.insert(0, onnx.ValueInfoProto(name=out_name))
    providers = [("CUDAExecutionProvider", {'device_id': args.local_rank})]
    session = ort.InferenceSession(net.SerializeToString(), providers=providers)
    if 'CUDAExecutionProvider' not in session.get_provider_options():
        logger.warning("CUDA may not used. Please check your ort/cuda/cudnn version.")

    statistics = {}

    def record(name, arr):
        # Append this sample's extrema for `name`, creating the slot on first use.
        slot = statistics.setdefault(name, {'max': [], 'min': []})
        slot['max'].append(arr.max())
        slot['min'].append(arr.min())

    # Shard the calibration set evenly across ranks.
    per_rank = args.data_num // args.world_size
    start = args.rank * per_rank
    end = min((args.rank + 1) * per_rank, args.data_num)
    elapsed = 0.
    feed = {}
    for data in tqdm(input_data_generator(args.input_dir, onnx_graph.network_inputs, start, end),
                     desc='Minmax update'):
        for name in onnx_graph.network_inputs:
            feed[name] = data[name][:].reshape(onnx_graph.get_tensor_shape(name))
        tic = time.time()
        out_names = [o.name for o in session.get_outputs()]
        results = session.run(out_names, feed)
        elapsed += time.time() - tic
        for name, arr in feed.items():
            record(name, arr)
        for name, arr in zip(out_names, results):
            record(name, arr)
    logger.info("Forward time: {:.2f} seconds".format(elapsed))
    return statistics
def forward_net_octav(onnx_graph, args):
    """Run the FP32 network over this rank's calibration slice and compute,
    per tensor, the OCTAV optimal clipping scale plus raw min/max.

    Returns ``{tensor_name: {'optimal_s': [...], 'min': [...], 'max': [...]}}``
    with one entry per calibration sample.
    """
    net = copy.deepcopy(onnx_graph.model)
    graph = net.graph
    # Promote every intermediate tensor to a model output so ort returns it.
    for node in reversed(graph.node):
        for out_name in reversed(node.output):
            if not any(o.name == out_name for o in graph.output):
                graph.output.insert(0, onnx.ValueInfoProto(name=out_name))
    providers = [("CUDAExecutionProvider", {'device_id': args.local_rank})]
    session = ort.InferenceSession(net.SerializeToString(), providers=providers)
    if 'CUDAExecutionProvider' not in session.get_provider_options():
        logger.warning("CUDA may not used. Please check your ort/cuda/cudnn version.")

    def octav_scale(abs_x, unsigned):
        # Fixed-point iteration for the MSE-optimal clipping scalar (OCTAV).
        # 4 ** 8 = 2 ** 16 reflects the squared 8-bit quantization step;
        # `unsigned` folds in the extra effective bit for one-sided tensors.
        s = abs_x.sum() / abs_x[abs_x > 0].size
        for _ in range(20):
            clipped = abs_x[abs_x > s]
            kept = abs_x[abs_x <= s]
            s_next = clipped.sum() / \
                (1 / (4 ** 8) / 3 / unsigned * kept.size + clipped.size)
            if np.abs(s_next - s) < 1e-6:
                break
            s = s_next
        return s

    statistics = {}
    elapsed = 0.
    per_rank = args.data_num // args.world_size
    start = args.rank * per_rank
    end = min((args.rank + 1) * per_rank, args.data_num)
    for data in tqdm(input_data_generator(args.input_dir, onnx_graph.network_inputs, start, end),
                     desc='OCTAV update rank: {}'.format(args.rank)):
        feed = {}
        for name in onnx_graph.network_inputs:
            feed[name] = data[name][:].reshape(onnx_graph.get_tensor_shape(name))
        tic = time.time()
        out_names = [o.name for o in session.get_outputs()]
        results = session.run(out_names, feed)
        elapsed += time.time() - tic
        # Fold outputs into the same dict so inputs and outputs share one pass.
        feed.update(zip(out_names, results))
        for name, arr in feed.items():
            data_max = arr.max()
            data_min = arr.min()
            # One-sided (min == 0) tensors get one more effective bit when the
            # platform supports dynamic_sym quantization.
            if np.abs(data_min - 0) < 1e-6 and 'dynamic_sym' in platform_setting_table[args.deploy]['qi_params']:
                unsigned = 4
            else:
                unsigned = 1
            s_n = octav_scale(np.abs(arr), unsigned)
            slot = statistics.setdefault(name, {'optimal_s': [], 'min': [], 'max': []})
            slot['optimal_s'].append(s_n)
            slot['min'].append(data_min)
            slot['max'].append(data_max)
    logger.info("Forward time: {:.2f} seconds".format(elapsed))
    return statistics
time: {:.2f} seconds".format(ed - st)) 375 | return statistics 376 | 377 | 378 | def forward_get_hist_transformer(onnx_graph, stats_min_max, args): 379 | # Start hist activation quantization. 380 | statistics = {} 381 | rank_num = args.data_num // args.world_size 382 | data_st_idx = args.rank * rank_num 383 | data_ed_idx = min((args.rank + 1) * rank_num, args.data_num) 384 | fp_act_cache = ActivationCache(onnx_graph, args, data_st_idx, data_ed_idx) 385 | 386 | input_names = [inp.name for inp in onnx_graph.graph.input] 387 | output_names = [] 388 | for node in onnx_graph.graph.node: 389 | for out in node.output: 390 | output_names.append(out) 391 | tensor_names = input_names + output_names 392 | 393 | for name in tensor_names: 394 | if name == '': 395 | continue 396 | for i in range(data_ed_idx - data_st_idx): 397 | data_max = max(np.max(stats_min_max[name]['max']), 398 | -np.min(stats_min_max[name]['min'])) 399 | hist, _ = np.histogram(np.abs(fp_act_cache[name][i]), int(args.bins), (0, data_max)) 400 | if name in statistics: 401 | statistics[name].append(hist) 402 | else: 403 | statistics[name] = [hist] 404 | 405 | return statistics 406 | 407 | 408 | def forward_net_octav_transformer(onnx_graph, args): 409 | # Start mse activation quantization. 
410 | statistics = {} 411 | rank_num = args.data_num // args.world_size 412 | data_st_idx = args.rank * rank_num 413 | data_ed_idx = min((args.rank + 1) * rank_num, args.data_num) 414 | fp_act_cache = ActivationCache(onnx_graph, args, data_st_idx, data_ed_idx) 415 | 416 | input_names = [inp.name for inp in onnx_graph.graph.input] 417 | output_names = [] 418 | for node in onnx_graph.graph.node: 419 | for out in node.output: 420 | output_names.append(out) 421 | tensor_names = input_names + output_names 422 | 423 | st = time.time() 424 | for name in tensor_names: 425 | if name == '': 426 | continue 427 | for i in range(data_ed_idx - data_st_idx): 428 | data_max = fp_act_cache[name][i].max() 429 | data_min = fp_act_cache[name][i].min() 430 | # If dynamic_sym = True, Means one more bit. 431 | if np.abs(data_min - 0) < 1e-6 and 'dynamic_sym' in platform_setting_table[args.deploy]['qi_params']: 432 | unsigned = 4 433 | else: 434 | unsigned = 1 435 | abs_x = np.abs(fp_act_cache[name][i]) 436 | s_n = abs_x.sum() / abs_x[abs_x > 0].size 437 | for _ in range(20): 438 | s_n_plus_1 = abs_x[abs_x > s_n].sum() / \ 439 | (1 / (4 ** 8) / 3 / unsigned * abs_x[abs_x <= s_n].size + abs_x[abs_x > s_n].size) 440 | if np.abs(s_n_plus_1 - s_n) < 1e-6: 441 | break 442 | s_n = s_n_plus_1 443 | if name in statistics: 444 | statistics[name]['optimal_s'].append(s_n) 445 | statistics[name]['min'].append(data_min) 446 | statistics[name]['max'].append(data_max) 447 | else: 448 | statistics[name] = { 449 | 'optimal_s': [s_n], 450 | 'min': [data_min], 451 | 'max': [data_max] 452 | } 453 | ed = time.time() 454 | 455 | logger.info("Forward time: {:.2f} seconds".format(ed - st)) 456 | return statistics 457 | 458 | 459 | def input_data_generator(input_dir, input_name_list, data_st_idx, data_ed_idx): 460 | for idx in range(data_st_idx, data_ed_idx): 461 | data = {} 462 | for i in input_name_list: 463 | data[i] = np.fromfile(f'{input_dir}/{i}/{idx}.bin', 'float32') 464 | yield data 465 | 466 | 467 | def 
forward_get_tensor(graph, net, index, args):
    """Run calibration sample `index` through `net` and return all tensors.

    Every non-quant node output is first promoted to a graph output so a
    single onnxruntime run captures all intermediates. Returns an
    OrderedDict {output_name: np.ndarray} (deep-copied).
    """
    for node in graph.graph.node:
        if node.op_type in QUANT_NODE_NAME_LIST:
            continue
        for output_name in node.output:
            # Promote intermediate outputs to graph outputs (idempotent).
            if output_name not in [_o.name for _o in net.graph.output]:
                net.graph.output.insert(0, onnx.ValueInfoProto(name=output_name))
    # Pin each rank to a GPU round-robin.
    rank = dist.get_rank()
    device = rank % torch.cuda.device_count()
    providers = [("CUDAExecutionProvider", {'device_id': device})]
    ort_session = ort.InferenceSession(net.SerializeToString(), providers=providers)
    ort_inputs = {}
    # Generator yields exactly one sample: [index, index + 1).
    for data in input_data_generator(args.input_dir, graph.network_inputs, index, index + 1):
        for name in graph.network_inputs:
            ort_inputs[name] = data[name][:].reshape(graph.get_tensor_shape(name))
    outputs = [output.name for output in ort_session.get_outputs()]
    ort_outputs = ort_session.run(outputs, ort_inputs)
    ort_outs = OrderedDict(zip(outputs, ort_outputs))
    return copy.deepcopy(ort_outs)
-------------------------------------------------------------------------------- /dipoorlet/platform_settings.py: --------------------------------------------------------------------------------
# Per-platform quantization settings. Shared vocabulary:
#   quant_nodes: op types whose inputs get fake-quantized.
#   qw_params / qi_params: weight / input (activation) quantizer parameters.
LAYER_HAS_WEIGHT = ['Conv', 'Gemm', 'ConvTranspose', 'PRelu', 'BatchNormalization']
basic_quant_node = ['Relu', 'Eltwise', 'MaxPool', 'Conv', 'Gemm', 'ConvTranspose', 'PRelu',
                    'AveragePool', 'Concat', 'Split', 'Add', 'Mul', 'Abs', 'Reciprocal', 'Sigmoid']


# TensorRT: symmetric linear quant, per-channel weights.
trt_platform_settings = {
    'deploy_exclude_layers': [],
    'quant_nodes': ['Relu', 'MaxPool', 'Conv', 'Gemm', 'ConvTranspose', 'PRelu', 'AveragePool', 'Add', 'Sigmoid'],
    'qw_params': {
        'bit_width': 8,
        'type': 'Linear',
        'symmetric': True,
        'per_channel': True
    },
    'qi_params': {
        'bit_width': 8,
        'type': 'Linear',
        'symmetric': True
    },
    'quantize_network_output': False,
    'deploy_weight': False
}


# STPU: symmetric, per-layer weights; deploys quantized weights.
stpu_platform_settings = {
    'deploy_exclude_layers': [],
    'quant_nodes': basic_quant_node + ['Clip', 'HardSigmoid'],
    'qi_params': {
        'bit_width': 8,
        'type': 'Linear',
        'symmetric': True
    },
    'qw_params': {
        'bit_width': 8,
        'type': 'Linear',
        'symmetric': True,
        'per_channel': False
    },
    'quantize_network_output': False,
    'deploy_weight': True
}


# MagicMind: asymmetric, per-channel weights; only matmul-like ops quantized.
magicmind_platform_settings = {
    'deploy_exclude_layers': [],
    'quant_nodes': ['Gemm', 'Conv', 'ConvTranspose', 'MatMul'],
    'qw_params': {
        'bit_width': 8,
        'type': 'Linear',
        'symmetric': False,
        'log_scale': False,
        'per_channel': True
    },
    'qi_params': {
        'bit_width': 8,
        'type': 'Linear',
        'symmetric': False,
        'log_scale': False
    },
    'quantize_network_output': False,
    'deploy_weight': False
}


# RV: asymmetric, per-layer; also quantizes network outputs.
rv_platform_settings = {
    'deploy_exclude_layers': [],
    'quant_nodes': basic_quant_node,
    'qi_params': {
        'bit_width': 8,
        'type': 'Linear',
        'symmetric': False
    },
    'qw_params': {
        'bit_width': 8,
        'type': 'Linear',
        'per_channel': False,
        'symmetric': False
    },
    'quantize_network_output': True,
    'deploy_weight': True
}


# Set rely on Atlas manual.
# https://www.hiascend.com/document/detail/zh/canncommercial/601/inferapplicationdev/graphdevg/graphdevg_000029.html
# Todo: Pool cannot be global.
87 | atlas_platform_settings = { 88 | 'quant_nodes': ['Conv', 'Gemm', 'AveragePool'], 89 | 'qw_params': { 90 | 'bit_width': 8, 91 | 'type': 'Linear', 92 | 'symmetric': True, 93 | 'per_channel': True 94 | }, 95 | 'qi_params': { 96 | 'bit_width': 8, 97 | 'type': 'Linear', 98 | 'symmetric': False 99 | }, 100 | 'quantize_network_output': False, 101 | 'deploy_weight': False 102 | } 103 | 104 | 105 | # SNPE docs 106 | # https://developer.qualcomm.com/sites/default/files/docs/snpe/quantized_models.html 107 | snpe_platform_settings = { 108 | 'deploy_exclude_layers': [], 109 | 'quant_nodes': basic_quant_node + ['Sigmoid'], 110 | 'qw_params': { 111 | 'bit_width': 8, 112 | 'type': 'Linear', 113 | 'symmetric': False, 114 | 'per_channel': False 115 | }, 116 | 'qi_params': { 117 | 'bit_width': 8, 118 | 'type': 'Linear', 119 | 'symmetric': False 120 | }, 121 | 'quantize_network_output': True, 122 | 'deploy_weight': False 123 | } 124 | 125 | 126 | # TI docs https://software-dl.ti.com/jacinto7/esd/processor-sdk-rtos-jacinto7 \ 127 | # /07_03_00_07/exports/docs/tidl_j7_02_00_00_07/ti_dl/docs/ \ 128 | # user_guide_html/md_tidl_fsg_quantization.html 129 | # If calibrationOption = 13 130 | # dw conv perchannel + log2=True 131 | # odinary conv perlayer + log2=False 132 | # If calibrationOption = 16 133 | # dw conv weight bit_width = 16 134 | ti_platform_settings = { 135 | 'deploy_exclude_layers': [], 136 | 'quant_nodes': basic_quant_node, 137 | 'qw_params': { 138 | 'bit_width': 8, 139 | 'type': 'Linear', 140 | 'symmetric': True, 141 | 'per_channel': False, 142 | 'log_scale': False 143 | }, 144 | 'qi_params': { 145 | 'bit_width': 8, 146 | 'type': 'Linear', 147 | 'symmetric': True, 148 | 'dynamic_sym': True, 149 | 'log_scale': True 150 | }, 151 | 'quantize_network_output': False, 152 | 'deploy_weight': False 153 | } 154 | 155 | imx_platform_settings = { 156 | 'deploy_exclude_layers': [], 157 | 'quant_nodes': basic_quant_node, 158 | 'qw_params': { 159 | 'bit_width': 8, 160 | 'type': 
'Linear',
        'symmetric': True,
        'per_channel': True,
        'log_scale': True
    },
    'qi_params': {
        'bit_width': 8,
        'type': 'Linear',
        'symmetric': True,
        'log_scale': True
    },
    'quantize_network_output': True,
    'deploy_weight': True
}

# Dispatch table: --deploy CLI value -> settings dict.
platform_setting_table = {
    'trt': trt_platform_settings,
    'stpu': stpu_platform_settings,
    'magicmind': magicmind_platform_settings,
    'rv': rv_platform_settings,
    'atlas': atlas_platform_settings,
    'snpe': snpe_platform_settings,
    'ti': ti_platform_settings,
    'imx': imx_platform_settings
}
-------------------------------------------------------------------------------- /dipoorlet/profiling.py: --------------------------------------------------------------------------------
import heapq
import math
import os

import numpy as np
import torch.distributed as dist
from onnx import numpy_helper
from tqdm import tqdm

from .forward_net import forward_get_tensor, ActivationCache
from .platform_settings import platform_setting_table
from .quantize import DQTENSORSUFFIX, QUANT_NODE_NAME_LIST, quant_graph
from .utils import cos_similarity, logger


def update_node_quant_profiling(graph_q, node, fp_cache, q_cache, layer_cosine_dict, args):
    """Fold one node's fp-vs-quant cosine similarity into a running average.

    layer_cosine_dict maps tensor name -> (samples_so_far, mean_cos_so_far).
    """
    # Layer till now.
    for out_tensor in node.output:
        out_quant_node = graph_q.get_tensor_consumer(out_tensor)[0]
        q_out_tensor = out_tensor
        # If the consumer is a DequantizeLinear, the quantized graph renamed
        # the tensor with the dequant suffix.
        if not isinstance(out_quant_node, str) and out_quant_node.op_type == QUANT_NODE_NAME_LIST[-1]:
            q_out_tensor = out_tensor + DQTENSORSUFFIX
        cos_tol = 0.
        cur_batch_size = len(fp_cache[out_tensor])
        for i in range(cur_batch_size):
            cos_tol += cos_similarity(fp_cache[out_tensor][i], q_cache[q_out_tensor][i])
        if out_tensor not in layer_cosine_dict:
            layer_cosine_dict[out_tensor] = (0, 0.)
        # Weighted running mean over all samples seen so far.
        num_till_now, cos_till_now = layer_cosine_dict[out_tensor]
        layer_cosine_dict[out_tensor] = (num_till_now + cur_batch_size,
                                         (num_till_now * cos_till_now + cos_tol) / (num_till_now + cur_batch_size))


def quantize_profiling_multipass(graph_after_wt, graph_ori, act_clip_val, weight_clip_val, args):
    """Profile quantization error layer-by-layer and at the network outputs.

    Builds the fake-quant graph, runs every calibration sample through both
    the fp and quantized models, and returns
    (layer_cosine_dict, model_cosine_dict, quant_node_list).
    """
    clip_val = act_clip_val.copy()
    clip_val.update(weight_clip_val)
    graph_q, quant_node_list = quant_graph(graph_after_wt, clip_val, args)

    rank = dist.get_rank()
    if rank == 0:
        graph_q.save_onnx_model(name='quant_model')

    layer_cosine_dict = {}
    model_cosine_dict = {}
    # single[name] is True for tiny outputs (e.g. logits) that are profiled
    # by stacking all samples into one cosine instead of averaging.
    single = get_output_single_map(graph_after_wt)
    fp_net = graph_ori.model
    q_net = graph_q.model
    # Shard the calibration set across ranks; only rank 0 shows a bar.
    rank_data_size = math.ceil(args.data_num / args.world_size)
    rank_st = rank * rank_data_size
    rank_ed = min(rank * rank_data_size + rank_data_size, args.data_num)
    rank_data_size = rank_ed - rank_st
    if rank == 0:
        data_gen = tqdm(range(rank_st, rank_ed))
    else:
        data_gen = range(rank_st, rank_ed)
    for i in data_gen:
        fp_tensors = forward_get_tensor(graph_ori, fp_net, i, args)
        q_tensors = forward_get_tensor(graph_q, q_net, i, args)
        # Per-layer cosine, accumulated per sample (averaged after the loop).
        for node in quant_node_list:
            for tensor_name in node.output:
                if tensor_name not in layer_cosine_dict:
                    layer_cosine_dict[tensor_name] = cos_similarity(fp_tensors[tensor_name], q_tensors[tensor_name])
                else:
                    layer_cosine_dict[tensor_name] += cos_similarity(fp_tensors[tensor_name], q_tensors[tensor_name])
        for tensor_name in graph_after_wt.network_outputs:
            # Quantized outputs may be renamed with the dequant suffix.
            q_tensor_name = tensor_name
            if tensor_name + DQTENSORSUFFIX in q_tensors:
                q_tensor_name = tensor_name + DQTENSORSUFFIX
            if single[tensor_name]:
                if tensor_name not in model_cosine_dict:
                    model_cosine_dict[tensor_name] = {'res_tol': [q_tensors[q_tensor_name]], 'fp_tol': [fp_tensors[tensor_name]]}
                else:
                    model_cosine_dict[tensor_name]['res_tol'].append(q_tensors[q_tensor_name])
                    model_cosine_dict[tensor_name]['fp_tol'].append(fp_tensors[tensor_name])
            else:
                # Track running sum (for the mean) and the worst-case cosine.
                _cos = cos_similarity(fp_tensors[tensor_name], q_tensors[q_tensor_name])
                if tensor_name not in model_cosine_dict:
                    model_cosine_dict[tensor_name] = [_cos, _cos]
                else:
                    model_cosine_dict[tensor_name][0] += _cos
                    model_cosine_dict[tensor_name][1] = min(_cos, model_cosine_dict[tensor_name][1])
            if args.savefp and rank == 0:
                # Optionally dump the fp reference outputs for later diffing.
                save_path = os.path.join(args.output_dir, 'output', tensor_name)
                if not os.path.exists(save_path):
                    os.makedirs(save_path)
                fp_tensors[tensor_name].astype(np.float32).tofile(os.path.join(save_path, 'onnx-output-{}.bin'.format(i)))

    # Turn accumulated sums into per-sample means.
    for k, v in layer_cosine_dict.items():
        layer_cosine_dict[k] = v / rank_data_size

    for k, v in model_cosine_dict.items():
        if single[k]:
            # Tiny outputs: one cosine over the stacked samples.
            _cos = cos_similarity(np.stack(model_cosine_dict[k]['res_tol']),
                                  np.stack(model_cosine_dict[k]['fp_tol']))
            model_cosine_dict[k] = [_cos, _cos]
        else:
            model_cosine_dict[k] = [v[0] / rank_data_size, v[1]]

    return layer_cosine_dict, model_cosine_dict, quant_node_list


def quantize_profiling_transformer(graph_after_wt, graph_ori, act_clip_val, weight_clip_val, args):
    """ActivationCache-based variant of quantize_profiling_multipass.

    Only network-output cosines are produced; layer_cosine_dict stays empty.
    """
    clip_val = act_clip_val.copy()
    clip_val.update(weight_clip_val)
    graph_q, quant_node_list = quant_graph(graph_after_wt, clip_val, args)

    rank = dist.get_rank()
    if rank == 0:
        graph_q.save_onnx_model(name='quant_model')

    layer_cosine_dict = {}
    model_cosine_dict = {}
    single = get_output_single_map(graph_after_wt)
    rank_data_size = math.ceil(args.data_num / args.world_size)
    rank_st = rank * rank_data_size
    rank_ed = min(rank * rank_data_size + rank_data_size, args.data_num)
    rank_data_size = rank_ed - rank_st
    if rank == 0:
        data_gen = tqdm(range(0,
rank_ed - rank_st)) 120 | else: 121 | data_gen = range(0, rank_ed - rank_st) 122 | 123 | fp_act_cache = ActivationCache(graph_ori, args, rank_st, rank_ed) 124 | q_act_cache = ActivationCache(graph_q, args, rank_st, rank_ed) 125 | 126 | for i in data_gen: 127 | for tensor_name in graph_after_wt.network_outputs: 128 | q_tensor_name = tensor_name 129 | fp_tensors = fp_act_cache[tensor_name][i] 130 | q_tensors = q_act_cache[q_tensor_name][i] 131 | if tensor_name + DQTENSORSUFFIX in q_tensors: 132 | q_tensor_name = tensor_name + DQTENSORSUFFIX 133 | 134 | if single[tensor_name]: 135 | if tensor_name not in model_cosine_dict: 136 | model_cosine_dict[tensor_name] = {'res_tol': [q_tensors], 'fp_tol': [fp_tensors]} 137 | else: 138 | model_cosine_dict[tensor_name]['res_tol'].append(q_tensors) 139 | model_cosine_dict[tensor_name]['fp_tol'].append(fp_tensors) 140 | else: 141 | _cos = cos_similarity(fp_tensors, q_tensors) 142 | if tensor_name not in model_cosine_dict: 143 | model_cosine_dict[tensor_name] = [_cos, _cos] 144 | else: 145 | model_cosine_dict[tensor_name][0] += _cos 146 | model_cosine_dict[tensor_name][1] = min(_cos, model_cosine_dict[tensor_name][1]) 147 | 148 | for k, v in model_cosine_dict.items(): 149 | if single[k]: 150 | _cos = cos_similarity(np.stack(model_cosine_dict[k]['res_tol']), 151 | np.stack(model_cosine_dict[k]['fp_tol'])) 152 | model_cosine_dict[k] = [_cos, _cos] 153 | else: 154 | model_cosine_dict[k] = [v[0] / rank_data_size, v[1]] 155 | 156 | return layer_cosine_dict, model_cosine_dict, quant_node_list 157 | 158 | 159 | def update_quant_model_cosine(graph, fp_cache, q_cache, model_cosine_dict, single, args): 160 | for name in graph.network_outputs: 161 | cos_tol = 0. 162 | min_cos = 1.0 163 | res_tol = [] 164 | fp_tol = [] 165 | # Output name may not change. 
        # Quantized graphs may rename outputs with the dequant suffix.
        if name + DQTENSORSUFFIX in q_cache.graph.network_outputs:
            q_network_output_act = q_cache[name + DQTENSORSUFFIX]
        else:
            q_network_output_act = q_cache[name]
        fp_network_output_act = fp_cache[name]
        if args.savefp:
            save_path = os.path.join(args.output_dir, 'output', name)
            if not os.path.exists(save_path):
                os.makedirs(save_path)
        cur_batch_size = len(fp_network_output_act)
        for i in range(cur_batch_size):
            if not single[name]:
                # Large outputs: per-sample cosine, keep sum and minimum.
                _cos = cos_similarity(q_network_output_act[i], fp_network_output_act[i])
                min_cos = min(min_cos, _cos)
                cos_tol += _cos
            else:
                # Tiny outputs: defer to one cosine over stacked samples.
                res_tol.append(q_network_output_act[i])
                fp_tol.append(fp_network_output_act[i])
            if args.savefp:
                fp_network_output_act[i].astype(np.float32).tofile(
                    os.path.join(save_path, 'onnx-output-{}.bin'.format(i + fp_cache.st)))
        if name not in model_cosine_dict:
            if single[name]:
                model_cosine_dict[name] = {'res_tol': [], 'fp_tol': []}
            else:
                model_cosine_dict[name] = (0, 0., 1.0)
        if single[name]:
            model_cosine_dict[name]['res_tol'].append(res_tol)
            model_cosine_dict[name]['fp_tol'].append(fp_tol)
        else:
            # Weighted running mean plus running minimum across batches.
            num_till_now, cos_till_now, min_till_now = model_cosine_dict[name]
            model_cosine_dict[name] = (num_till_now + cur_batch_size,
                                       (num_till_now * cos_till_now + cos_tol) / (num_till_now + cur_batch_size),
                                       min(min_till_now, min_cos))


def get_output_single_map(graph):
    """Map output name -> True when the per-sample payload is tiny (<= 10
    elements, batch axis excluded); such outputs are profiled by stacking."""
    single = {}
    for out_tensor in graph.network_outputs:
        shape = graph.get_tensor_shape(out_tensor)
        single[out_tensor] = np.prod(shape[1:]) <= 10
    return single


def show_model_ranges(graph, act_clip_val, weight_clip_val, args):
    """Log the calibrated [min, max] range of every tensor."""
    logger.info("Model ranges:")
    ranges_all = act_clip_val.copy()
    ranges_all.update(weight_clip_val)
    # NOTE(review): loop variable `range` shadows the builtin for the body.
    for name, range in ranges_all.items():
        tensor_shape = graph.get_tensor_shape(name)
        if isinstance(range[0], np.ndarray):
            # Per-channel range: report the overall envelope.
            per_channel = ""
            if 'per_channel' in platform_setting_table[args.deploy]['qw_params'] and \
                    platform_setting_table[args.deploy]['qw_params']['per_channel']:
                per_channel = "per channel "
            logger.info("{:<30} Shape: {:<20} Range: {}[{:<10f} {:<10f}]".format(name, str(tensor_shape),
                                                                                per_channel, range[0].min(), range[1].max()))
        else:
            logger.info("{:<30} Shape: {:<20} Range: [{:<10f} {:<10f}]".format(name, str(tensor_shape), range[0], range[1]))


def weight_need_perchannel(graph, args):
    """Log Conv layers that would suffer most from per-layer weight quant.

    Ratio = mean per-channel range / per-layer range; small ratios mean the
    channels have very different dynamic ranges.
    """
    qw_params = platform_setting_table[args.deploy]['qw_params']
    if 'per_channel' in qw_params and qw_params['per_channel']:
        # Platform already quantizes per channel; nothing to report.
        return
    logger.info("Layer degradate by per layer: ")
    heap = []
    for node in graph.graph.node:
        if node.op_type == 'Conv':
            weight = numpy_helper.to_array(graph.initializer[node.input[1]][0])
            c_num = weight.shape[0]
            per_channel_min = np.min(weight.reshape((c_num, -1)), -1)
            per_channel_max = np.max(weight.reshape((c_num, -1)), -1)
            per_channel_range = per_channel_max - per_channel_min
            per_layer_range = np.max(weight) - np.min(weight)
            heapq.heappush(heap, (per_channel_range.mean() / per_layer_range, node.name))
    # Worst (smallest ratio) layers first.
    for tuple_cos_name in heapq.nsmallest(len(heap), heap):
        logger.info("{:40} ratio : {:<.5f}".format(tuple_cos_name[1], tuple_cos_name[0]))


def show_model_profiling_res(graph_after_wt, layer_cosine_dict, model_cosine_dict, quant_node_list, args):
    """Log per-layer and network-output cosine similarity results."""
    quant_heapq = []
    single = get_output_single_map(graph_after_wt)
    if not args.skip_prof_layer:
        for node in quant_node_list:
            logger.info(node.name)
            for out_tensor in node.output:
                logger.info("Layer cos: {:.5f}".format(layer_cosine_dict[out_tensor]))
                heapq.heappush(quant_heapq, (layer_cosine_dict[out_tensor], node.name + '-' + out_tensor))
        logger.info("The smallest cos value of 10 layers: ")
        for tuple_cos_name in heapq.nsmallest(10, quant_heapq):
            logger.info("{:40} cos : {:<.5f}".format(tuple_cos_name[1], tuple_cos_name[0]))
    logger.info("Quant model output cos: ")
    for name in graph_after_wt.network_outputs:
        if not single[name]:
            logger.info("{:40} avgcos : {:<.5f} mincos : {:<.5f}".format(name, model_cosine_dict[name][0],
                                                                         model_cosine_dict[name][1]))
        else:
            logger.info("{:40} tolcos : {:<.5f}".format(name, model_cosine_dict[name][0]))
-------------------------------------------------------------------------------- /dipoorlet/quantize.py: --------------------------------------------------------------------------------
import copy

import numpy as np
from onnx import TensorProto, helper

from .platform_settings import LAYER_HAS_WEIGHT, platform_setting_table
from .utils import ONNXGraph, logger

# Naming conventions for the inserted QuantizeLinear/DequantizeLinear pairs.
QTENSORSUFFIX = '_q'
DQTENSORSUFFIX = '_dq'
QNODESUFFIX = '_fake_quant'
DQNODESUFFIX = '_fake_dequant'
QUANT_NODE_NAME_LIST = ['QuantizeLinear', 'DequantizeLinear']
MERGE_RELU = ['Conv', 'Gemm', 'Eltwise', 'Add']
RELU_TYPE = ['Relu', 'PRelu', 'Mul']
WEIGHT_TRANSPOSE_SUFFIX = '_transpose'
CLIP_SUFFIX = '_clip'


def quant_graph(onnx_graph, clip_val, args):
    """Return a copy of `onnx_graph` with fake-quant node pairs inserted.

    clip_val maps tensor name -> [min, max] clipping range. Returns
    (quantized ONNXGraph, list of nodes that were quantized).
    """
    graph_q = ONNXGraph()
    graph_q.copy_from(onnx_graph)
    quant_node_list = []

    for node in graph_q.graph.node:
        if node.name in args.skip_layers:
            continue
        if node.op_type in platform_setting_table[args.deploy]['quant_nodes']:
            quant_node_list.append(node)

    # Track tensors already wrapped so shared inputs get one Q/DQ pair.
    act_quantized = []
    for node in quant_node_list:
        insert_fake_quant_node(graph_q, node, act_quantized, clip_val, args)
    if platform_setting_table[args.deploy]['quantize_network_output']:
        insert_fake_quant_node_output(graph_q, clip_val, args)
    graph_q.update_model()
    return graph_q,
quant_node_list


def insert_fake_quant_node(graph, node, act_quantized, data_range_list, args):
    """Insert Q/DQ pairs on `node`'s weight and activation inputs in place.

    act_quantized is shared across calls so a tensor feeding several nodes
    is only wrapped once; data_range_list maps tensor name -> clip range.
    """
    param = platform_setting_table[args.deploy]
    # We now quant input and weight to INT8 but leave output fp32.
    find_weight = False
    trt_merge_add = False
    for idx, in_tensor in enumerate(node.input):
        # ConvTranspose needs a transposed axis layout for its weight.
        need_transpose = False
        shape = graph.get_tensor_shape(in_tensor)
        # Merge ReLU: skip activation quant when this ReLU-like node will be
        # fused into its producer by the deploy backend.
        if node.op_type in RELU_TYPE:
            if isinstance(graph.get_tensor_producer(node.input[0]), str):
                continue
            _prev = graph.get_tensor_producer(node.input[0])
            if len(node.input) == 1 and not isinstance(_prev, str) and _prev.op_type in MERGE_RELU:
                continue

        q_nodes = None
        # Quantize weight.
        if in_tensor in graph.initializer and node.op_type in LAYER_HAS_WEIGHT:
            if not find_weight:
                # We find Weight here (first initializer input).
                find_weight = True
                if node.op_type == 'ConvTranspose':
                    need_transpose = True
                q_nodes, _, _ = get_qnode_by_param(param['qw_params'], in_tensor, shape, data_range_list[in_tensor],
                                                   need_transpose)

            elif 'qb_params' in param:
                # We find bias here (only quantized when the platform
                # defines qb_params).
                q_nodes, _, _ = get_qnode_by_param(param['qb_params'], in_tensor, shape, data_range_list[in_tensor],
                                                   need_transpose)

        # Quantize input (network input or intermediate activation).
        if in_tensor in graph.network_inputs or in_tensor not in graph.input:
            # Conv Conv Conv
            #  |    |    |
            # skip  Q    Q
            #   \   |   /
            #      Add    means the first add branch with conv should merge in TRT.
            if args.deploy == 'trt' and node.op_type == 'Add' and not trt_merge_add:
                _prev = graph.get_tensor_producer(in_tensor)
                if _prev.op_type == 'Conv':
                    trt_merge_add = True
                    continue
            q_nodes, _, _ = get_qnode_by_param(param['qi_params'], in_tensor, shape, data_range_list[in_tensor])

        if q_nodes is not None:
            # Rewire this input to the dequantized tensor.
            node.input[idx] = q_nodes.output[0].name
            # Output already quantized by an earlier node: reuse the pair.
            if in_tensor in act_quantized:
                continue
            graph.insert_qnodes_purely(q_nodes=q_nodes, node=node)
            act_quantized.append(in_tensor)

    graph.topologize_graph()


def insert_fake_quant_node_output(graph, clip_val, args):
    """Wrap every network output in a Q/DQ pair and re-point graph outputs."""
    param = platform_setting_table[args.deploy]
    # Copy: the loop mutates graph.network_outputs via del/add below.
    out_tensor_list = copy.deepcopy(graph.network_outputs)
    for out_tensor in out_tensor_list:
        q_nodes, _, _ = get_qnode_by_param(param['qi_params'], out_tensor, graph.get_tensor_shape(out_tensor),
                                           clip_val[out_tensor])
        graph.insert_qnodes_purely(q_nodes=q_nodes, idx=graph.index(graph.get_tensor_producer(out_tensor)) + 1)
        graph.del_network_output(out_tensor)
        graph.add_network_output(q_nodes.output[0])
    graph.topologize_graph()
    return


def get_qnode_by_param(param, in_tensor_name, tensor_shape, range, need_transpose=False):
    """Build a Q/DQ subgraph for one tensor from its quantizer params.

    `range` is the [min, max] clip value (scalars per-layer, np.ndarray
    per-channel). Returns (subgraph, q_min, q_max).
    NOTE(review): parameter `range` shadows the builtin.
    """
    bit_width = param['bit_width']
    zero_point = [0]
    per_channel = True
    if 'per_channel' not in param or not param['per_channel']:
        per_channel = False

    if param['type'] == "Linear":
        symmetric = param['symmetric']
        # Per-layer quant collapses any per-channel range to scalars.
        if 'per_channel' not in param or not param['per_channel']:
            range[0] = np.min(range[0])
            range[1] = np.max(range[1])
        # dynamic_sym only works on activations which could not
        # be perchannel.
        # A non-negative tensor under dynamic_sym switches to asymmetric
        # (unsigned) quantization to reclaim the sign bit.
        if 'dynamic_sym' in param and param['dynamic_sym']:
            if np.abs(range[0] - 0.0) < 1e-6:
                symmetric = False
        if symmetric:
            if not isinstance(range[0], np.ndarray):
                channel_num = 1
            else:
                channel_num = len(range[0])
            # 8bit -128-127 actually identical to -127-127
            q_min = [-2 ** (bit_width - 1) + 1] * channel_num
            q_max = [2 ** (bit_width - 1) - 1] * channel_num
            data_max = np.max(np.abs(range), axis=0)
            scale = np.array(data_max) / q_max
            if np.any(scale == 0):
                # force set scale to 1 for zero weight channel
                # print("Find {} channels all zero in {}, set scale to 1.".
                #       format(len(np.where(scale == 0)[0]), in_tensor_name))
                scale = np.where(scale == 0, 1., scale)
            scale = scale.tolist()

        else:
            if not isinstance(range[0], np.ndarray):
                # Per layer. The range must bracket zero so the zero point
                # is representable.
                data_min = min(0, range[0])
                data_max = max(0, range[1])
                scale = (data_max - data_min) / (2 ** bit_width - 1)
                # Align zero point.
                if scale == 0.0:
                    scale += 1.
                zero_point = np.round(-data_min / scale)
                # data_min = -zero_point * scale
                # data_max = scale * (2 ** bit_width - 1) + data_min
                # scale = (data_max - data_min) / (2 ** bit_width - 1)
                # q_min = [int(np.round(data_min / scale))]
                # q_max = [int(np.round(data_max / scale))]
                q_min = [int(-zero_point)]
                q_max = [int(2 ** (bit_width) - 1 - zero_point)]
                scale = [float(scale)]
            else:
                # Per channel. Clamp each channel's range to bracket zero.
                data_min = range[0]
                data_min[data_min > 0.] = 0.
                data_max = range[1]
                data_max[data_max < 0.] = 0.
                scale = (data_max - data_min) / (2 ** bit_width - 1)
                # Fix all zero channel.
                if np.any(scale == 0):
                    # force set scale to 1 for zero weight channel
                    logger.warning("Find {} channels all zero in {}, set scale to 1.".format(
                        len(np.where(scale == 0)[0]), in_tensor_name))
                    scale = np.where(scale == 0, 1., scale)
                # Align zero point.
                zero_point = (-data_min / scale).round()
                q_min = -zero_point
                q_max = (2 ** (bit_width) - 1 - zero_point).astype(np.int32).tolist()
                q_min = q_min.astype(np.int32).tolist()
                scale = scale.tolist()
        # Power-of-two scales for platforms that require log-domain shifts.
        if 'log_scale' in param and param['log_scale']:
            scale = 2 ** np.round(np.log2(scale))
        scale = np.array(scale, dtype=np.float32)
        # NOTE(review): zero_point is always int8 here even when asymmetric
        # quant emits a UINT8 tensor below — confirm intended.
        zero_point = np.full(scale.shape, zero_point, dtype=np.int8)
        q_nodes = make_quant_dequant(in_tensor_name,
                                     tensor_shape,
                                     scale,
                                     zero_point,
                                     need_transpose,
                                     per_channel,
                                     symmetric)

    return q_nodes, q_min, q_max


def make_quant_dequant(tensor_name, tensor_shape, scale_val, zero_point_val, need_transpose=False,
                       per_channel=False, symmetric=True):
    """Build a two-node QuantizeLinear -> DequantizeLinear ONNX subgraph.

    The pair maps `tensor_name` to `tensor_name + DQTENSORSUFFIX`; scale and
    zero point become initializers of the returned GraphProto.
    """
    in_tensor = helper.make_tensor_value_info(tensor_name, TensorProto.FLOAT, tensor_shape)
    # Scalar scale -> rank-0 initializer; per-channel keeps its shape.
    if len(scale_val) == 1:
        shape = []
    else:
        shape = list(scale_val.shape)
    scale = helper.make_tensor(tensor_name + '_scale', TensorProto.FLOAT, shape, scale_val)
    zero_point = helper.make_tensor(tensor_name + '_zero_point',
                                    TensorProto.INT8 if symmetric else TensorProto.UINT8, shape, zero_point_val)
    tensor_dequant = helper.make_tensor_value_info(tensor_name + DQTENSORSUFFIX, TensorProto.FLOAT, tensor_shape)
    if per_channel:
        # ConvTranspose weights carry channels on axis 1, not 0.
        q_node = helper.make_node(
            name=tensor_name + "_QuantizeLinear",
            op_type="QuantizeLinear",
            inputs=[tensor_name, tensor_name + '_scale', tensor_name + '_zero_point'],
            outputs=[tensor_name + QTENSORSUFFIX],
            axis=1 if need_transpose else 0)
        dq_node = helper.make_node(
            name=tensor_name +
"_DequantizeLinear", 217 | op_type="DequantizeLinear", 218 | inputs=[tensor_name + QTENSORSUFFIX, tensor_name + '_scale', tensor_name + '_zero_point'], 219 | outputs=[tensor_name + DQTENSORSUFFIX], 220 | axis=1 if need_transpose else 0) 221 | else: 222 | q_node = helper.make_node( 223 | name=tensor_name + "_QuantizeLinear", 224 | op_type="QuantizeLinear", 225 | inputs=[tensor_name, tensor_name + '_scale', tensor_name + '_zero_point'], 226 | outputs=[tensor_name + QTENSORSUFFIX]) 227 | dq_node = helper.make_node( 228 | name=tensor_name + "_DequantizeLinear", 229 | op_type="DequantizeLinear", 230 | inputs=[tensor_name + QTENSORSUFFIX, tensor_name + '_scale', tensor_name + '_zero_point'], 231 | outputs=[tensor_name + DQTENSORSUFFIX]) 232 | graph_quant = helper.make_graph( 233 | [q_node, dq_node], 234 | 'graph_quant', 235 | [in_tensor], 236 | [tensor_dequant], 237 | initializer=[scale, zero_point], 238 | ) 239 | return graph_quant 240 | -------------------------------------------------------------------------------- /dipoorlet/tensor_cali/__init__.py: -------------------------------------------------------------------------------- 1 | from .tensor_cali_base import tensor_calibration 2 | from .basic_algorithm import find_clip_val_minmax_weight -------------------------------------------------------------------------------- /dipoorlet/tensor_cali/basic_algorithm.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from ..forward_net import * 4 | from ..platform_settings import LAYER_HAS_WEIGHT 5 | from ..utils import dispatch_functool, logger 6 | 7 | 8 | @dispatch_functool 9 | def tensor_cali_dispatcher(*args, **kwargs): 10 | logger.info("Calibration Algorithm Not Found!") 11 | 12 | 13 | @tensor_cali_dispatcher.register('minmax') 14 | def find_clip_val_minmax(onnx_graph, args, **kwargs): 15 | if args.optim_transformer: 16 | stats_min_max = forward_get_minmax_transformer(onnx_graph, args) 17 | else: 18 | 
stats_min_max = forward_get_minmax(onnx_graph, args) 19 | clip_val = {} 20 | for name, tensor in stats_min_max.items(): 21 | clip_val[name] = [np.min(tensor['min']), np.max(tensor['max'])] 22 | return clip_val 23 | 24 | 25 | @tensor_cali_dispatcher.register('hist') 26 | def find_clip_val_hist(onnx_graph, args, store_stats=None, **kwargs): 27 | if store_stats: 28 | stats_min_max = store_stats['minmax'] 29 | act_stats_hist = store_stats['hist'] 30 | else: 31 | if args.optim_transformer: 32 | stats_min_max = forward_get_minmax_transformer(onnx_graph, args) 33 | act_stats_hist = forward_get_hist_transformer(onnx_graph, stats_min_max, args) 34 | else: 35 | stats_min_max = forward_get_minmax(onnx_graph, args) 36 | act_stats_hist = forward_get_hist(onnx_graph, stats_min_max, args) 37 | for name, hist in act_stats_hist.items(): 38 | act_stats_hist[name] = np.stack(hist).sum(0) 39 | clip_val = {} 40 | for name, hist in act_stats_hist.items(): 41 | hist = hist.astype(np.float32) / hist.sum() 42 | data_max = max(-np.min(stats_min_max[name]['min']), np.max(stats_min_max[name]['max'])) 43 | accum = 0 44 | for i in range(len(hist)): 45 | accum += hist[i] 46 | if accum >= args.threshold: 47 | clip_value = (i + 0.5) * (data_max / args.bins) 48 | clip_val[name] = [max(-clip_value, np.min(stats_min_max[name]['min'])), 49 | min(clip_value, np.max(stats_min_max[name]['max']))] 50 | break 51 | if name not in clip_val: 52 | clip_val[name] = [np.min(stats_min_max[name]['min']), 53 | np.max(stats_min_max[name]['max'])] 54 | return clip_val 55 | 56 | 57 | @tensor_cali_dispatcher.register('mse') 58 | def find_clip_val_octav(onnx_graph, args, **kwargs): 59 | if args.optim_transformer: 60 | optimal_s = forward_net_octav_transformer(onnx_graph, args) 61 | else: 62 | optimal_s = forward_net_octav(onnx_graph, args) 63 | clip_val = {} 64 | for k, v in optimal_s.items(): 65 | data_max = np.array(v['max']).max() 66 | data_min = np.array(v['min']).min() 67 | clip_val[k] = [max(data_min, 
-np.array(v['optimal_s']).mean()), 68 | min(data_max, np.array(v['optimal_s']).mean())] 69 | return clip_val 70 | 71 | 72 | def find_clip_val_minmax_weight(onnx_graph, args): 73 | weight_tensor = {} 74 | need_transpose = [] 75 | for node in onnx_graph.graph.node: 76 | if node.op_type in LAYER_HAS_WEIGHT: 77 | for in_tensor in node.input[1:]: 78 | weight_tensor[in_tensor] = onnx_graph.get_initializer(in_tensor) 79 | if node.op_type == 'ConvTranspose': 80 | need_transpose.append(node.input[1]) 81 | weight_clip_val = {} 82 | for name, tensor in weight_tensor.items(): 83 | # BN tracked param do not have shape. 84 | if len(tensor.shape) < 1: 85 | continue 86 | if name in need_transpose: 87 | tensor = tensor.transpose([1, 0, 2, 3]) 88 | c_num = tensor.shape[0] 89 | weight_clip_val[name] = [np.min(tensor.reshape((c_num, -1)), -1), 90 | np.max(tensor.reshape((c_num, -1)), -1)] 91 | return weight_clip_val 92 | -------------------------------------------------------------------------------- /dipoorlet/tensor_cali/tensor_cali_base.py: -------------------------------------------------------------------------------- 1 | from .basic_algorithm import * 2 | 3 | 4 | def tensor_calibration(onnx_graph, args): 5 | weight_clip_val = find_clip_val_minmax_weight(onnx_graph, args) 6 | act_clip_val = tensor_cali_dispatcher(args.act_quant, onnx_graph, args) 7 | return act_clip_val, weight_clip_val 8 | -------------------------------------------------------------------------------- /dipoorlet/utils.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import json 3 | import logging 4 | import os 5 | import time 6 | import sys 7 | 8 | import numpy as np 9 | import onnx 10 | import torch.distributed as dist 11 | from onnx import TensorProto, numpy_helper 12 | from onnx.external_data_helper import convert_model_to_external_data 13 | from onnxruntime.quantization.onnx_quantizer import ONNXQuantizer 14 | from onnxruntime.quantization.quant_utils 
import QuantizationMode, QuantType 15 | from termcolor import colored 16 | 17 | from .platform_settings import platform_setting_table 18 | 19 | logger = logging.getLogger("dipoorlet") 20 | 21 | 22 | class ONNXGraph(object): 23 | def __init__(self, model=None, output_dir="", deploy=None, model_type=None): 24 | self.model = model 25 | if self.model: 26 | self.graph = self.model.graph 27 | self.output_dir = output_dir 28 | self.deploy = deploy 29 | self.model_type = model_type 30 | self.initializer = {} 31 | self.input_map = {} 32 | self.output_map = {} 33 | self.network_inputs = [] 34 | self.network_outputs = [] 35 | self.tensor_name_shape_map = {} 36 | self.value_name_type_map = {} 37 | self.name_idx_map = {} 38 | self.input = [] 39 | self.output = [] 40 | if self.model: 41 | self.set_names() 42 | self.convert_constant_to_init() 43 | self.topologize_graph() 44 | self.prepare_initializer() 45 | self.set_index() 46 | self.get_inp_oup() 47 | self.get_shape_type() 48 | 49 | def set_names(self): 50 | for idx, node in enumerate(self.model.graph.node): 51 | if node.name == '': 52 | node.name = node.op_type + "_" + str(idx) 53 | 54 | def convert_constant_to_init(self): 55 | for node in self.model.graph.node: 56 | if node.op_type == 'Constant': 57 | tensor = onnx.numpy_helper.to_array(node.attribute[0].t) 58 | self.set_initializer(node.output[0], tensor, raw=True) 59 | 60 | def prepare_initializer(self): 61 | self.initializer.clear() 62 | for idx, init in enumerate(self.graph.initializer): 63 | self.initializer[init.name] = (init, idx) 64 | 65 | def get_inp_oup(self): 66 | self.network_inputs.clear() 67 | self.network_outputs.clear() 68 | self.tensor_name_shape_map.clear() 69 | self.input.clear() 70 | self.output.clear() 71 | for input in self.graph.input: 72 | if isinstance(self.get_tensor_producer(input.name), str) and \ 73 | input.name not in self.initializer: 74 | self.network_inputs.append(input.name) 75 | for output in self.graph.output: 76 | 
self.network_outputs.append(output.name) 77 | self.input = self.network_inputs.copy() 78 | self.output = self.network_outputs.copy() 79 | 80 | for node in self.model.graph.node: 81 | for inp in node.input: 82 | if inp in self.initializer and inp not in self.input: 83 | self.input.append(inp) 84 | for oup in node.output: 85 | if oup not in self.output: 86 | self.output.append(oup) 87 | 88 | def get_shape_type(self): 89 | for input in self.graph.input: 90 | if input.name in self.network_inputs: 91 | self.tensor_name_shape_map[input.name] = [x.dim_value for x in input.type.tensor_type.shape.dim] 92 | self.value_name_type_map[input.name] = input.type.tensor_type.elem_type 93 | 94 | for output in self.graph.output: 95 | self.tensor_name_shape_map[output.name] = [x.dim_value for x in output.type.tensor_type.shape.dim] 96 | self.value_name_type_map[output.name] = output.type.tensor_type.elem_type 97 | 98 | for init in self.initializer: 99 | self.tensor_name_shape_map[init] = list(self.get_initializer(init).shape) 100 | inferred_value_info = self.model.graph.value_info 101 | for info in inferred_value_info: 102 | shape = [x.dim_value for x in info.type.tensor_type.shape.dim] 103 | value_type = info.type.tensor_type.elem_type 104 | self.tensor_name_shape_map[info.name] = shape 105 | self.value_name_type_map[info.name] = value_type 106 | 107 | value_names = list(self.tensor_name_shape_map.keys()) 108 | for name in value_names: 109 | self.tensor_name_shape_map[name + "_q"] = self.tensor_name_shape_map[name] 110 | if self.deploy is not None: 111 | if name in self.initializer: 112 | symmetric = platform_setting_table[self.deploy]['qw_params']['symmetric'] 113 | else: 114 | symmetric = platform_setting_table[self.deploy]['qi_params']['symmetric'] 115 | self.value_name_type_map[name + "_q"] = TensorProto.INT8 if symmetric else TensorProto.UINT8 116 | self.tensor_name_shape_map[name + "_dq"] = self.tensor_name_shape_map[name] 117 | self.value_name_type_map[name + "_dq"] = 
TensorProto.FLOAT 118 | 119 | def get_tensor_shape(self, tensor_name): 120 | return self.tensor_name_shape_map[tensor_name] 121 | 122 | def get_value_type(self, value_name): 123 | return self.value_name_type_map[value_name] 124 | 125 | def get_constant(self, name): 126 | for node in self.model.graph.node: 127 | if node.op_type == 'Constant': 128 | if node.output[0] == name: 129 | return numpy_helper.to_array(node.attribute[0].t).tolist() 130 | 131 | def get_initializer(self, initializer_name): 132 | return numpy_helper.to_array(self.initializer[initializer_name][0]) 133 | 134 | def set_initializer(self, initializer_name, value_tensor, raw=True): 135 | idx = None 136 | data_type = None 137 | if initializer_name in self.initializer: 138 | idx = self.initializer[initializer_name][1] 139 | if raw: 140 | initializer = numpy_helper.from_array(value_tensor) 141 | else: 142 | if value_tensor.dtype == np.float32: 143 | data_type = TensorProto.FLOAT 144 | if value_tensor.dtype == np.uint8: 145 | data_type = TensorProto.UINT8 146 | if value_tensor.dtype == np.int8: 147 | data_type = TensorProto.INT8 148 | initializer = onnx.helper.make_tensor(name=initializer_name, 149 | data_type=data_type, 150 | dims=[] if value_tensor.size == 1 else list(value_tensor.shape), 151 | vals=value_tensor, 152 | raw=False) 153 | initializer.name = initializer_name 154 | if idx is not None: 155 | self.graph.initializer.remove(self.graph.initializer[idx]) 156 | self.graph.initializer.insert(idx, initializer) 157 | else: 158 | self.graph.initializer.append(initializer) 159 | self.prepare_initializer() 160 | 161 | def topologize_graph(self): 162 | self.input_map.clear() 163 | self.output_map.clear() 164 | for idx, node in enumerate(self.graph.node): 165 | for output_name in node.output: 166 | self.output_map[output_name] = node 167 | for input_name in node.input: 168 | if input_name not in self.input_map: 169 | self.input_map[input_name] = [] 170 | self.input_map[input_name].append(node) 171 | 172 | 
def get_tensor_producer(self, output_name): 173 | if output_name not in self.output_map: 174 | return 'INPUT_TOKEN' 175 | return self.output_map[output_name] 176 | 177 | def get_tensor_consumer(self, input_name): 178 | if input_name not in self.input_map: 179 | return ['OUTPUT_TOKEN'] 180 | return self.input_map[input_name] 181 | 182 | def save_onnx_model(self, name='tmp', size_threshold=2048): 183 | if self.model_type is not None: 184 | convert_model_to_external_data(self.model, all_tensors_to_one_file=True, 185 | location="{}.data".format(name), 186 | size_threshold=size_threshold, 187 | convert_attribute=False) 188 | 189 | model_path = os.path.join(self.output_dir, '{}.onnx'.format(name)) 190 | onnx.save(self.model, model_path) 191 | 192 | def remove_node_purely(self, node): 193 | self.graph.node.remove(node) 194 | 195 | def insert_node_purely(self, node, idx=0): 196 | self.graph.node.insert(idx, node) 197 | 198 | def insert_qnodes_purely(self, q_nodes, idx=0, node=None): 199 | node_list = reversed(q_nodes.node) 200 | if node: 201 | idx = self.index(node) 202 | for node in node_list: 203 | self.graph.node.insert(idx, node) 204 | for init in q_nodes.initializer: 205 | self.graph.initializer.append(init) 206 | self.set_index() 207 | 208 | def del_network_output(self, out_name): 209 | idx = self.network_outputs.index(out_name) 210 | self.graph.output.pop(idx) 211 | self.network_outputs.remove(out_name) 212 | 213 | def add_network_output(self, out_put): 214 | self.graph.output.append(out_put) 215 | self.network_outputs.append(out_put.name) 216 | 217 | def del_initializer(self, initializer_name): 218 | if initializer_name in self.initializer: 219 | del self.initializer[initializer_name] 220 | 221 | def set_index(self): 222 | for node_idx, node in enumerate(self.graph.node): 223 | self.name_idx_map[node.name] = node_idx 224 | 225 | def index(self, node): 226 | return self.name_idx_map[node.name] 227 | 228 | def update_model(self): 229 | self.set_index() 230 | 
self.model = onnx.helper.make_model(self.graph, 231 | producer_name='updated_model', 232 | opset_imports=self.model.opset_import) 233 | self.prepare_initializer() 234 | 235 | def copy_from(self, source_graph): 236 | self.model = copy.deepcopy(source_graph.model) 237 | self.graph = copy.deepcopy(source_graph.graph) 238 | self.initializer = copy.deepcopy(source_graph.initializer) 239 | self.input_map = copy.deepcopy(source_graph.input_map) 240 | self.output_map = copy.deepcopy(source_graph.output_map) 241 | self.network_inputs = copy.deepcopy(source_graph.network_inputs) 242 | self.network_outputs = copy.deepcopy(source_graph.network_outputs) 243 | self.tensor_name_shape_map = copy.deepcopy(source_graph.tensor_name_shape_map) 244 | self.value_name_type_map = copy.deepcopy(source_graph.value_name_type_map) 245 | self.input = copy.deepcopy(source_graph.input) 246 | self.output = copy.deepcopy(source_graph.output) 247 | self.name_idx_map = source_graph.name_idx_map.copy() 248 | self.output_dir = source_graph.output_dir 249 | self.deploy = source_graph.deploy 250 | self.model_type = source_graph.model_type 251 | 252 | 253 | def setup_logger(args): 254 | global logger 255 | fmt = '[%(asctime)s %(name)s] (%(filename)s %(lineno)d): %(levelname)s %(message)s' 256 | color_fmt = colored('[%(asctime)s %(name)s]', 'green') + \ 257 | colored('(%(filename)s %(lineno)d)', 'yellow') + ': %(levelname)s %(message)s' 258 | logger.setLevel(logging.INFO) 259 | logger_file = os.path.join(args.output_dir, 260 | 'log-{}.txt'.format(time.strftime("%Y-%m-%d-%H:%M:%S", time.localtime()))) 261 | with open(logger_file, 'w') as f: 262 | f.write(str(args) + '\n') 263 | file_handler = logging.FileHandler(logger_file) 264 | file_handler.setLevel(logging.INFO) 265 | file_handler.setFormatter(logging.Formatter(fmt=fmt, datefmt='%Y-%m-%d %H:%M:%S')) 266 | logger.addHandler(file_handler) 267 | console_handler = logging.StreamHandler(sys.stdout) 268 | console_handler.setLevel(logging.INFO) 269 | 
def cos_similarity(ta, tb):
    """Cosine similarity between two equally-shaped arrays.

    Returns 0.0 whenever the inner product is exactly zero — this also covers
    an all-zero input and thus avoids a 0/0 division.
    """
    assert ta.shape == tb.shape
    inner = np.sum(ta * tb)
    if inner == 0:
        return 0.
    norm_a = np.sqrt(np.square(ta).sum())
    norm_b = np.sqrt(np.square(tb).sum())
    return inner / norm_a / norm_b
def save_clip_val(act_clip_val, weight_clip_val, args, act_fname='act_clip_val.json', weight_fname='weight_clip_val.json'):
    """Serialize activation/weight clip ranges as JSON under args.output_dir.

    NOTE: converts the stored numpy values to plain Python lists *in place*,
    so the passed-in dicts are JSON-ready after the call.
    """
    for ranges in (act_clip_val, weight_clip_val):
        for key in ranges:
            ranges[key][0] = ranges[key][0].tolist()
            ranges[key][1] = ranges[key][1].tolist()
    for fname, ranges in ((act_fname, act_clip_val), (weight_fname, weight_clip_val)):
        with open(os.path.join(args.output_dir, fname), 'w') as f:
            json.dump(ranges, f, indent=4)
def reduce_profiling_res(rank_size, args, layer_res_fname='layer_res.json', model_res_fname='model_res.json'):
    '''Collect profiling res from each GPU and reduce.
    '''
    def _load(fname, rank):
        # Per-rank results were dumped as '<fname>.rank<N>' by save_profiling_res.
        with open(os.path.join(args.output_dir, '{}.rank{}'.format(fname, rank)), 'r') as f:
            return json.load(f)

    scale = float(rank_size)
    # Per-layer cosines only exist when no special model_type is set.
    layer_cosine_dict = _load(layer_res_fname, 0) if args.model_type is None else {}
    model_cosine_dict = _load(model_res_fname, 0)
    if args.model_type is None:
        # Average each layer's cosine across ranks.
        for key in layer_cosine_dict:
            layer_cosine_dict[key] /= scale
        for rank in range(1, rank_size):
            for key, val in _load(layer_res_fname, rank).items():
                layer_cosine_dict[key] += val / scale
    # Model entries hold [cosine, second-metric]: average the first,
    # keep the minimum (worst) of the second across ranks.
    for key, val in model_cosine_dict.items():
        model_cosine_dict[key][0] = val[0] / scale
    for rank in range(1, rank_size):
        for key, val in _load(model_res_fname, rank).items():
            model_cosine_dict[key][0] += val[0] / scale
            model_cosine_dict[key][1] = min(model_cosine_dict[key][1], val[1])
    return layer_cosine_dict, model_cosine_dict
def quant_weight(weight, round_mask, scale, q_min, q_max, per_channel, soft=True):
    """Fake-quantize *weight* with AdaRound's learned rounding decision.

    Args:
        weight: float tensor to quantize.
        round_mask: learnable rounding logits; ``>= 0`` rounds up in hard mode.
        scale: quantization step (scalar or per-channel broadcastable tensor).
        q_min / q_max: clamp bounds on the integer grid (tensors).
        per_channel: clamp element-wise against the bound tensors instead of
            their scalar ``.item()`` values.
        soft: use the rectified-sigmoid relaxation (training) instead of the
            hard 0/1 rounding decision (deployment).

    Returns:
        The de-quantized float tensor, i.e. ``clamp(floor(w / s) + h) * s``.
    """
    if soft:
        rounding = adaround_reg().rectified_sigmoid(round_mask)
    else:
        rounding = (round_mask >= 0).float()
    weight = (weight / scale).floor() + rounding
    if not per_channel:
        # BUG FIX: Tensor.clamp is out-of-place; the original discarded its
        # result, so per-tensor weights were never actually clamped.
        weight = weight.clamp(q_min.item(), q_max.item())
    else:
        weight = torch.max(weight, q_min)
        weight = torch.min(weight, q_max)
    return weight * scale
class TempDecay:
    """Cosine-annealed temperature schedule for the AdaRound regularizer.

    The temperature is 0.0 for the first ``rel_start_decay`` fraction of
    ``t_max`` iterations, then follows a half-cosine from ``start_b`` down to
    ``end_b``.
    """

    def __init__(self, t_max, rel_start_decay=0.2, start_b=20, end_b=2):
        self.t_max = t_max
        self.start_decay = rel_start_decay * t_max
        self.start_b = start_b
        self.end_b = end_b
        # NOTE(review): stores the *builtin* ``type`` — looks like a leftover;
        # kept as-is for attribute compatibility.
        self.type = type

    def __call__(self, t):
        if t < self.start_decay:
            return 0.0
        progress = (t - self.start_decay) / (self.t_max - self.start_decay)
        span = self.start_b - self.end_b
        return self.end_b + 0.5 * span * (1 + np.cos(progress * np.pi))
elif self.type == 'Gemm': 145 | self.layer = self.build_torch_linear(node, weight, bias) 146 | else: 147 | self.layer = self.build_torch_deconv(node, weight, bias) 148 | self.transposed = True 149 | self.relu_flag = relu_flag 150 | if relu_flag: 151 | self.relu = nn.ReLU() 152 | # Init alpha. 153 | rest = -torch.log((reg.zeta - reg.gamma) / (rest - reg.gamma) - 1) 154 | self.round_mask = torch.nn.Parameter(rest.cuda(), True) 155 | # Init drop ratio. 156 | self.drop_ratio = 0.5 157 | # Init activation quantization mode 158 | self.acti_quant = acti_quant 159 | 160 | def get_attr_name_map(self, node): 161 | for attr in node.attribute: 162 | self.attr_name_map[attr.name] = attr 163 | 164 | def build_torch_conv(self, node, weight, bias): 165 | dialiations = helper.get_attribute_value(self.attr_name_map['dilations']) 166 | groups = helper.get_attribute_value(self.attr_name_map['group']) 167 | kernel_size = helper.get_attribute_value(self.attr_name_map['kernel_shape']) 168 | padding = helper.get_attribute_value(self.attr_name_map['pads'])[:2] 169 | stride = helper.get_attribute_value(self.attr_name_map['strides']) 170 | o_c = weight.shape[0] 171 | i_c = weight.shape[1] * groups 172 | bias_flag = bias is not None 173 | conv = torch.nn.Conv2d(i_c, o_c, kernel_size, stride, padding, dialiations, groups, bias_flag) 174 | conv.weight.data = weight.data 175 | conv.weight.requires_grad = False 176 | if bias is not None: 177 | conv.bias.data = torch.from_numpy(bias).cuda().data 178 | conv.bias.requires_grad = False 179 | return conv 180 | 181 | def build_torch_linear(self, node, weight, bias): 182 | o_c = weight.shape[0] 183 | i_c = weight.shape[1] 184 | bias_flag = bias is not None 185 | linear = torch.nn.Linear(i_c, o_c, bias_flag) 186 | linear.weight.data = weight.data 187 | linear.weight.requires_grad = False 188 | if bias is not None: 189 | linear.bias.data = torch.from_numpy(bias).cuda().data 190 | linear.bias.requires_grad = False 191 | return linear 192 | 193 | def 
    def forward(self, x):
        """Run the frozen layer with a fake-quantized weight.

        Pipeline: quantize weight -> functional conv/linear/deconv ->
        optional ReLU -> optional activation fake-quantization.
        """
        # Weight fake-quantization: uniform grid ('Linear') or NNIE log2 scheme.
        if self.qw_tensor['type'] == 'Linear':
            q_weight = quant_weight(self.layer.weight, self.round_mask,
                                    self.qw_tensor['scale'], self.qw_tensor['q_min'], self.qw_tensor['q_max'],
                                    self.qw_tensor['per_channel'])
            if self.transposed:
                # ConvTranspose weights are quantized in transposed layout
                # (see __init__/build path); restore before the functional call.
                q_weight = q_weight.transpose(0, 1)
        else:
            q_weight = quant_weight_nnie(self.layer.weight, self.round_mask)
        # Re-run the op functionally: quantized weight, original frozen bias
        # and the hyper-parameters captured on self.layer.
        if self.type == 'Conv':
            x = F.conv2d(
                x,
                q_weight, self.layer.bias,
                self.layer.stride,
                self.layer.padding,
                self.layer.dilation,
                self.layer.groups)
        elif self.type == 'Gemm':
            x = F.linear(
                x,
                q_weight, self.layer.bias)
        else:
            x = F.conv_transpose2d(
                x,
                q_weight, self.layer.bias,
                self.layer.stride,
                self.layer.padding,
                self.layer.output_padding,
                self.layer.groups,
                self.layer.dilation)
        if self.relu_flag:
            x = F.relu(x)
        # Optional activation fake-quantization; drop_ratio is the probability
        # of keeping the quantized value per element (QDrop-style mixing).
        if self.acti_quant and self.qi_tensor['type'] == 'Linear':
            x = quant_acti(x, self.qi_tensor['scale'], self.qi_tensor['q_min'],
                           self.qi_tensor['q_max'], self.drop_ratio)
        elif self.acti_quant and self.qi_tensor['type'] == 'NNIE':
            x = quant_acti_nnie(x, self.qi_tensor['max_value'], self.drop_ratio)
        return x
35 | if args.we and node_has_equalized(graph, node): 36 | continue 37 | if dist.get_rank() == 0: 38 | logger.info("Adaround for: {}".format(node.name)) 39 | # Using graph_ada and restore act cache for incremental update. 40 | if not prev_act_cache: 41 | graph_q, quant_node_list = quant_graph(graph_ada, clip_val, args) 42 | q_act_cache = ActivationCache(graph_q, args, rank_st, rank_ed) 43 | else: 44 | q_act_cache.update_graph(graph_q) 45 | q_act_cache.activation_cache = prev_act_cache 46 | prev_node = graph_q.get_tensor_consumer(node.input[0])[0] 47 | prev_node = graph_q.get_tensor_consumer(prev_node.output[0])[0] 48 | in_tensor_name = node.input[0] 49 | if prev_node.op_type == QUANT_NODE_NAME_LIST[-1]: 50 | in_tensor_name = prev_node.output[0] 51 | 52 | q_in_tensor = np.stack(q_act_cache[in_tensor_name]) 53 | fp_out_tensor = np.stack(fp_act_cache[node.output[0]]) 54 | prev_act_cache = q_act_cache.activation_cache.copy() 55 | 56 | # Get weight and build torch conv. 57 | weight = numpy_helper.to_array(graph_ada.initializer[node.input[1]][0]) 58 | bias = None 59 | if len(node.input) == 3: 60 | bias = numpy_helper.to_array(graph_ada.initializer[node.input[2]][0]) 61 | 62 | weight = torch.from_numpy(weight).cuda() 63 | # Get quantization param. 64 | if args.deploy != 'nnie': 65 | weight_range = clip_val[node.input[1]] 66 | qw_param = platform_setting_table[args.deploy]['qw_params'] 67 | if node.op_type == 'ConvTranspose': 68 | weight = weight.transpose(0, 1) 69 | scale, q_min, q_max = get_quant_tensor(weight.shape, qw_param, weight_range) 70 | rest = (weight / scale) - (weight / scale).floor() 71 | qw_tensor = {'scale': scale, 72 | 'q_min': q_min, 73 | 'q_max': q_max, 74 | 'per_channel': qw_param['per_channel'], 75 | 'type': 'Linear'} 76 | else: 77 | qw_tensor = {'scale': None, 78 | 'q_min': None, 79 | 'q_max': None, 80 | 'per_channel': None, 81 | 'type': 'NNIE'} 82 | rest = nnie_rest_init(weight) 83 | # Learning. 
84 | relu_flag = follow_relu(graph, node) 85 | if relu_flag: 86 | fp_tensor = torch.nn.Parameter(F.relu(torch.from_numpy(fp_out_tensor)), False) 87 | else: 88 | fp_tensor = torch.nn.Parameter(torch.from_numpy(fp_out_tensor), False) 89 | # Learning round mask. 90 | total_iter = args.ada_epoch * np.ceil(num_per_rank / args.ada_bs) 91 | reg = adaround_reg(total_iter) 92 | ada_layer = AdaQLayer(node, weight, bias, rest, reg, qw_tensor, None, 93 | relu_flag, node.op_type, args.acti_quant) 94 | round_mask = learning_round_mask( 95 | torch.nn.Parameter(torch.from_numpy(q_in_tensor).cuda(), False), 96 | fp_tensor.cuda(), 97 | ada_layer, reg, args.ada_bs, args.ada_epoch) 98 | # Deploy new weight. 99 | if args.deploy != 'nnie': 100 | new_rounded_weight = quant_weight( 101 | weight, 102 | round_mask, scale, q_min, q_max, 103 | qw_param['per_channel'], soft=False) 104 | if node.op_type == 'ConvTranspose': 105 | new_rounded_weight = new_rounded_weight.transpose(0, 1) 106 | else: 107 | new_rounded_weight = quant_weight_nnie(weight, round_mask, soft=False) 108 | new_rounded_weight = new_rounded_weight.cpu().detach().numpy() 109 | update_weight(graph_ada, new_rounded_weight, node.input[1]) 110 | update_weight(graph_q, new_rounded_weight, node.input[1]) 111 | graph_ada.update_model() 112 | graph_q.update_model() 113 | if dist.get_rank() == 0: 114 | graph_ada.save_onnx_model('adaround') 115 | # We must use original ranges. 
def learning_round_mask(in_tensor, fp_out_tensor, ada_layer, reg, batch_size, max_epoch):
    """Optimize the AdaRound rounding mask of *ada_layer* by distillation.

    Args:
        in_tensor: quantized-path input activations stacked on dim 0.
        fp_out_tensor: float-path target outputs for the same samples.
        ada_layer: AdaQLayer whose ``round_mask`` is the only trainable tensor.
        reg: adaround_reg regularizer (temperature annealed by iteration).
        batch_size / max_epoch: training schedule.

    Returns:
        The learned round-mask parameter (taken from the DDP-wrapped module).
    """
    # Only the rounding logits are optimized; weights stay frozen.
    optimizer = torch.optim.Adam([ada_layer.round_mask])
    # DDP synchronizes mask gradients across the per-GPU data shards.
    ada_layer = DDP(ada_layer, [torch.cuda.current_device()])
    # New train procedure
    cur_iter = 0
    for epoch in range(max_epoch):
        for idx in range(np.ceil(len(in_tensor) / batch_size).astype(int)):
            st = idx * batch_size
            ed = st + batch_size
            # squeeze(1): assumes dim 1 is a singleton per-sample axis from
            # np.stack in the caller — TODO confirm.
            input = in_tensor[st:ed].squeeze(1)
            fp_output = fp_out_tensor[st:ed].squeeze(1)
            output = ada_layer(input)
            # Reconstruction loss + annealed push of the mask towards 0/1.
            loss = L2_norm(output, fp_output) + reg(ada_layer.module.round_mask, cur_iter)
            cur_iter += 1
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        if epoch % 50 == 0 and dist.get_rank() == 0:
            logger.info("Epoch: {:<4} L2 Loss: {:>10.3f} Beta: {:>3.3f}".format(epoch, loss, reg.beta))
    # Report how many mask entries converged to a hard 0/1 decision.
    res = adaround_reg().rectified_sigmoid(ada_layer.module.round_mask)
    if dist.get_rank() == 0:
        logger.info("Loss: {:>5.3f} Ceil: {:>5} Floor: {:>5} Total: {:>5} Ratio: {:>.3f}".format(
            loss,
            res[res + 1e-4 >= 1.0].numel(), res[res <= 1e-4].numel(), torch.numel(res),
            (res[res + 1e-4 >= 1.0].numel() + res[res <= 1e-4].numel()) / torch.numel(res)))
    return ada_layer.module.round_mask
def update_conv_node_bias(graph_bc, node, fp_activations, q_activations):
    """Shift `node`'s bias by the mean (FP32 - quantized) output difference.

    If the node already has a bias initializer it is corrected in place under the
    same name; otherwise a fresh bias input is created and wired into the node.
    """
    delta = np.stack(fp_activations, axis=0) \
        - np.stack(q_activations, axis=0)
    # Conv outputs are NCHW -> average over batch/H/W; Gemm only over batch.
    reduce_axes = (0, 2, 3) if node.op_type == 'Conv' else (0)
    delta = np.squeeze(delta, axis=1).mean(axis=reduce_axes)
    if len(node.input) > 2:
        ori_bias = numpy_helper.to_array(graph_bc.initializer[node.input[2]][0])
        bias_name = graph_bc.initializer[node.input[2]][0].name
        graph_bc.set_initializer(bias_name, ori_bias + delta)
        # Re-insert the shape entry so bookkeeping follows the refreshed initializer.
        graph_bc.tensor_name_shape_map[bias_name] = \
            graph_bc.tensor_name_shape_map.pop(graph_bc.initializer[node.input[2]][0].name)
        graph_bc.input.append(bias_name)
    else:
        # No bias yet: the correction itself becomes the new bias tensor.
        new_bias = delta
        new_name = node.name + '_bias'
        graph_bc.set_initializer(new_name, new_bias)
        graph_bc.tensor_name_shape_map[new_name] = list(new_bias.shape)
        graph_bc.input.append(new_name)
        for candidate in graph_bc.graph.node:
            if candidate.name == node.name:
                candidate.input.append(new_name)


def bias_correction(graph, act_clip_val, weight_clip_val, args):
    """Correct Conv/Gemm biases so quantized outputs match FP32 outputs on average.

    Walks the graph in topological order; after each corrected node the model is
    re-quantized so later corrections see earlier ones (incremental update).
    Saves the result as 'update_bias_model'.
    """
    target_types = ['Conv', 'Gemm']
    clip_val = act_clip_val.copy()
    clip_val.update(weight_clip_val)
    graph_bc = ONNXGraph()
    graph_bc.copy_from(graph)
    fp_cache = ActivationCache(graph, args)
    prev_act = None
    for node in graph.graph.node:
        if node.op_type not in target_types:
            continue
        logger.info("Update bias for node: {}".format(node.name))
        # Re-quantize incrementally so earlier bias fixes are reflected downstream.
        graph_q, _ = quant_graph(graph_bc, clip_val, args)
        q_cache = ActivationCache(graph_q, args)
        if prev_act is not None:
            q_cache.activation_cache = prev_act
        # Touch the input tensor to force the cache up to this node.
        _ = q_cache[node.input[0]]
        prev_act = q_cache.activation_cache.copy()
        update_conv_node_bias(graph_bc, node, fp_cache[node.output[0]], q_cache[node.output[0]])
        graph_bc.update_model()

    graph_bc.save_onnx_model('update_bias_model')
import copy

import numpy as np
import torch
import torch.distributed as dist
import torch.nn as nn
import torch.nn.functional as F
from onnx import numpy_helper
from torch.nn.parallel import DistributedDataParallel as DDP

from ..forward_net import ActivationCache
from ..platform_settings import platform_setting_table
from ..quantize import QUANT_NODE_NAME_LIST, quant_graph
from ..utils import logger
from .ada_quant_layer import *
from .utils import *
from .weight_equalization import node_has_equalized


def brecq(graph_ori, graph, act_clip_val, weight_clip_val, args):
    """Block-wise reconstruction (BRECQ / QDrop) weight fine-tuning.

    Groups up to three consecutive learnable layers into a block, learns AdaRound
    masks for the whole block jointly against the FP32 block output, then writes
    the hard-rounded weights back. Runs data-parallel: each rank optimizes its
    slice of the calibration set, gradients are averaged via DDP.
    """
    dist.barrier()
    rank = dist.get_rank()
    # Each rank works on a contiguous slice of the calibration data.
    num_per_rank = args.data_num // dist.get_world_size()
    rank_st = rank * num_per_rank
    rank_ed = rank_st + num_per_rank
    clip_val = act_clip_val.copy()
    clip_val.update(weight_clip_val)
    graph_brecq = copy.deepcopy(graph)
    fp_act_cache = ActivationCache(graph_ori, args, rank_st, rank_ed)
    prev_act_cache = None
    already = []
    _log_head = 'Qdrop' if args.drop is True else 'Brecq'
    for node in graph_ori.graph.node:
        if node.name in args.skip_layers:
            continue
        if node.op_type in LEARNABLE_LAYER_TYPES and node.name not in already:
            block_layer_list = get_block_from_first(graph, node, args)
            # If the last node has weight equalized, it cannot be the last.
            if args.we:
                if node_has_equalized(graph, block_layer_list[-1]):
                    block_layer_list.pop(-1)
            if dist.get_rank() == 0:
                logger.info("{} for: {}".format(_log_head, ' '.join([_node.name for _node in block_layer_list])))
            already.extend([_node.name for _node in block_layer_list])
            # Using graph_brecq and restore act cache for incremental update:
            # quantize once, then reuse the cached activations for later blocks.
            if not prev_act_cache:
                graph_q, quant_node_list = quant_graph(graph_brecq, clip_val, args)
                q_act_cache = ActivationCache(graph_q, args, rank_st, rank_ed)
            else:
                q_act_cache.update_graph(graph_q)
                q_act_cache.activation_cache = prev_act_cache
            # If the block input is fed through a fake-quant node, read the
            # quantized tensor after that node instead of the raw input.
            prev_node = graph_q.get_tensor_consumer(block_layer_list[0].input[0])[0]
            prev_node = graph_q.get_tensor_consumer(prev_node.output[0])[0]
            in_tensor_name = block_layer_list[0].input[0]
            if prev_node.op_type == QUANT_NODE_NAME_LIST[-1]:
                in_tensor_name = prev_node.output[0]
            q_in_tensor = np.stack(q_act_cache[in_tensor_name])
            fp_in_tensor = np.stack(fp_act_cache[block_layer_list[0].input[0]])
            fp_out_tensor = np.stack(fp_act_cache[block_layer_list[-1].output[0]])
            prev_act_cache = q_act_cache.activation_cache.copy()
            # Use one reg for seq.
            total_iter = args.ada_epoch * len(block_layer_list) * np.ceil(num_per_rank / args.ada_bs)
            reg = adaround_reg(total_iter)
            ada_layer_list = []
            # Get weight and build torch conv.
            for _node in block_layer_list:
                weight = numpy_helper.to_array(graph_brecq.initializer[_node.input[1]][0])
                weight = torch.from_numpy(weight).cuda()
                bias = None
                if len(_node.input) == 3:
                    bias = numpy_helper.to_array(graph_brecq.initializer[_node.input[2]][0])
                # Get quantization param.
                if args.deploy != 'nnie':
                    weight_range = clip_val[_node.input[1]]
                    qw_param = platform_setting_table[args.deploy]['qw_params']
                    if _node.op_type == 'ConvTranspose':
                        # ConvTranspose weights are (in, out, kH, kW); put out first.
                        weight = weight.transpose(0, 1)
                    scale, q_min, q_max = get_quant_tensor(weight.shape, qw_param, weight_range)
                    # Fractional part of the scaled weight — AdaRound's starting rest.
                    rest = (weight / scale) - (weight / scale).floor()
                    qw_tensor = {'scale': scale,
                                 'q_min': q_min,
                                 'q_max': q_max,
                                 'per_channel': qw_param['per_channel'],
                                 'type': 'Linear'}
                else:
                    qw_tensor = {'scale': None,
                                 'q_min': None,
                                 'q_max': None,
                                 'per_channel': None,
                                 'type': 'NNIE'}
                    rest = nnie_rest_init(weight)

                # Generate torch qlayer.
                relu_flag = follow_relu(graph, _node)
                # Activation ranges are taken after the fused ReLU when present.
                following_node = following_relu(graph, _node) if relu_flag else _node
                acti_range = clip_val[following_node.output[0]]
                if args.deploy != 'nnie':
                    acti_shape = graph.get_tensor_shape(following_node.output[0])
                    qi_param = platform_setting_table[args.deploy]['qi_params']
                    scale, q_min, q_max = get_quant_tensor(acti_shape, qi_param, acti_range)
                    qi_tensor = {'scale': scale,
                                 'q_min': q_min,
                                 'q_max': q_max,
                                 'type': 'Linear'}
                else:
                    max_value = max(abs(acti_range[0]), acti_range[1])
                    max_value = torch.from_numpy(np.array(max_value).astype(np.float32)).cuda()
                    qi_tensor = {'max_value': max_value,
                                 'type': 'NNIE'}
                ada_layer_list.append(
                    AdaQLayer(_node, weight, bias, rest, reg, qw_tensor, qi_tensor,
                              relu_flag, _node.op_type, args.acti_quant)
                )
            # Block output follow relu.
            relu_flag = follow_relu(graph, block_layer_list[-1])
            if relu_flag:
                fp_out_tensor = torch.nn.Parameter(F.relu(torch.from_numpy(fp_out_tensor)), False)
            else:
                fp_out_tensor = torch.nn.Parameter(torch.from_numpy(fp_out_tensor), False)
            # Learning.
            ada_block = nn.Sequential(*ada_layer_list)
            round_mask_list = learning_round_mask(
                torch.nn.Parameter(torch.from_numpy(q_in_tensor).cuda(), False),
                torch.nn.Parameter(torch.from_numpy(fp_in_tensor).cuda(), False),
                fp_out_tensor.cuda(),
                ada_block, reg, args.ada_bs, args.ada_epoch * len(block_layer_list), args.drop)
            # Deploy new weight: hard-round with the learned masks and write back
            # into both the working graph and the quantized graph.
            for idx, _node in enumerate(block_layer_list):
                weight = numpy_helper.to_array(graph_brecq.initializer[_node.input[1]][0])
                weight = torch.from_numpy(weight).cuda()
                round_mask = round_mask_list[idx]
                if args.deploy != 'nnie':
                    weight_range = clip_val[_node.input[1]]
                    qw_param = platform_setting_table[args.deploy]['qw_params']
                    if _node.op_type == 'ConvTranspose':
                        weight = weight.transpose(0, 1)
                    scale, q_min, q_max = get_quant_tensor(weight.shape, qw_param, weight_range)
                    new_rounded_weight = quant_weight(
                        weight,
                        round_mask, scale, q_min, q_max,
                        qw_param['per_channel'], soft=False)
                    if _node.op_type == 'ConvTranspose':
                        # Restore the original ONNX (in, out, kH, kW) layout.
                        new_rounded_weight = new_rounded_weight.transpose(0, 1)
                else:
                    new_rounded_weight = quant_weight_nnie(weight, round_mask, soft=False)
                new_rounded_weight = new_rounded_weight.cpu().detach().numpy()
                update_weight(graph_brecq, new_rounded_weight, _node.input[1])
                update_weight(graph_q, new_rounded_weight, _node.input[1])
                graph_brecq.update_model()
                graph_q.update_model()
    if dist.get_rank() == 0:
        graph_brecq.save_onnx_model('brecq')
    # We must use original ranges.
    return graph_brecq


def learning_round_mask(q_in_tensor, fp_in_tensor, fp_out_tensor, ada_block, reg, batch_size, max_epoch, drop):
    """Jointly learn the rounding masks of every AdaQLayer inside `ada_block`.

    With `drop` (QDrop) each epoch mixes quantized and FP32 inputs elementwise
    with ratio 0.5; otherwise the quantized input is used as-is. Returns the
    learned masks in block order.
    """
    opt_list = []
    for layer in ada_block:
        if isinstance(layer, AdaQLayer):
            opt_list.append(layer.round_mask)
    optimizer = torch.optim.Adam(opt_list)
    ada_block = DDP(ada_block, [torch.cuda.current_device()])
    # New train procedure.
    cur_iter = 0
    ratio = 0.5 if drop else 1.0
    for epoch in range(max_epoch):
        if ratio < 1.0:
            # QDrop: randomly replace quantized activations with FP32 ones.
            in_tensor = torch.where(torch.rand_like(q_in_tensor) < ratio, q_in_tensor, fp_in_tensor)
        else:
            in_tensor = q_in_tensor
        for idx in range(np.ceil(len(in_tensor) / batch_size).astype(int)):
            st = idx * batch_size
            ed = st + batch_size
            input = in_tensor[st:ed].squeeze(1)
            fp_output = fp_out_tensor[st:ed].squeeze(1)
            output = ada_block(input)
            loss = L2_norm(output, fp_output)
            # One shared regularizer, applied to every layer's mask.
            for layer in ada_block.module:
                if isinstance(layer, AdaQLayer):
                    loss += reg(layer.round_mask, cur_iter)
            cur_iter += 1
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        if epoch % 100 == 0 and dist.get_rank() == 0:
            logger.info("Epoch: {:<5} L2 Loss: {:>10.3f} Beta: {:>3.3f}".format(epoch, loss, reg.beta))
    # Report per-layer how many mask entries settled at ceil/floor.
    for layer in ada_block.module:
        if isinstance(layer, AdaQLayer):
            res = adaround_reg().rectified_sigmoid(layer.round_mask)
            if dist.get_rank() == 0:
                logger.info("Ceil: {:>5} Floor: {:>5} Total: {:>5} Ratio: {:>.3f}".format(
                    res[res + 1e-4 >= 1.0].numel(), res[res <= 1e-4].numel(), torch.numel(res),
                    (res[res + 1e-4 >= 1.0].numel() + res[res <= 1e-4].numel()) / torch.numel(res)))
    round_mask_list = []
    for layer in ada_block.module:
        if isinstance(layer, AdaQLayer):
            round_mask_list.append(layer.round_mask)
    return round_mask_list
import copy

import numpy as np
import torch
import torch.distributed as dist
import torch.nn.functional as F
from onnx import numpy_helper
from torch.nn.parallel import DistributedDataParallel as DDP

from ..forward_net import ActivationCache
from ..platform_settings import platform_setting_table
from ..quantize import QUANT_NODE_NAME_LIST, quant_graph
from ..utils import logger
from .sparse_quant_layer import *
from .utils import *
from .weight_equalization import node_has_equalized


def sparse_quant(graph_ori, graph, act_clip_val, weight_clip_val, args):
    """Layer-wise joint sparse + quantized weight fine-tuning.

    For every learnable layer, builds a SparseQLayer (prune-then-quantize in the
    forward pass) and trains its weights so the sparse-quantized output matches
    the FP32 output, then writes the pruned + quantized weights back.
    """
    dist.barrier()
    clip_val = act_clip_val.copy()
    clip_val.update(weight_clip_val)
    graph_ada = copy.deepcopy(graph)
    rank = dist.get_rank()
    # Each rank consumes a contiguous slice of the calibration set.
    num_per_rank = args.data_num // dist.get_world_size()
    rank_st = rank * num_per_rank
    rank_ed = rank_st + num_per_rank
    # "pattern" is 'unstruction' or 'nv24'; "rate" only applies to unstructured.
    sparse_info = {"sparse": True, "rate": args.sparse_rate, "pattern": args.pattern}
    fp_act_cache = ActivationCache(graph_ori, args, rank_st, rank_ed)
    prev_act_cache = None
    for node in graph_ori.graph.node:
        if node.name in args.skip_layers:
            continue
        if node.op_type in LEARNABLE_LAYER_TYPES:
            # We can not mimic when node has weight equalized.
            if args.we and node_has_equalized(graph, node):
                continue
            if dist.get_rank() == 0:
                logger.info("sparse_quant for: {}".format(node.name))
            # Using graph_ada and restore act cache for incremental update.
            if not prev_act_cache:
                graph_q, quant_node_list = quant_graph(graph_ada, clip_val, args)
                q_act_cache = ActivationCache(graph_q, args, rank_st, rank_ed)
            else:
                q_act_cache.update_graph(graph_q)
                q_act_cache.activation_cache = prev_act_cache
            # Read input after the fake-quant node when one precedes this layer.
            prev_node = graph_q.get_tensor_consumer(node.input[0])[0]
            prev_node = graph_q.get_tensor_consumer(prev_node.output[0])[0]
            in_tensor_name = node.input[0]
            if prev_node.op_type == QUANT_NODE_NAME_LIST[-1]:
                in_tensor_name = prev_node.output[0]

            q_in_tensor = np.stack(q_act_cache[in_tensor_name])
            fp_out_tensor = np.stack(fp_act_cache[node.output[0]])
            prev_act_cache = q_act_cache.activation_cache.copy()

            # Get weight and build torch conv.
            weight = numpy_helper.to_array(graph_ada.initializer[node.input[1]][0])
            bias = None
            if len(node.input) == 3:
                bias = numpy_helper.to_array(graph_ada.initializer[node.input[2]][0])

            weight = torch.from_numpy(weight).cuda()
            # Get quantization param.
            weight_range = clip_val[node.input[1]]
            qw_param = platform_setting_table[args.deploy]['qw_params']
            if node.op_type == 'ConvTranspose':
                # ConvTranspose weights are (in, out, kH, kW); put out first.
                weight = weight.transpose(0, 1)
            scale, q_min, q_max = get_quant_tensor(weight.shape, qw_param, weight_range)
            qw_tensor = {'scale': scale,
                         'q_min': q_min,
                         'q_max': q_max,
                         'per_channel': qw_param['per_channel'],
                         'type': 'Linear'}
            # Learning target: FP32 output, after the fused ReLU when present.
            relu_flag = follow_relu(graph, node)
            if relu_flag:
                fp_tensor = torch.nn.Parameter(F.relu(torch.from_numpy(fp_out_tensor)), False)
            else:
                fp_tensor = torch.nn.Parameter(torch.from_numpy(fp_out_tensor), False)
            # Learning sparse quant.
            sq_layer = SparseQLayer(node, weight, bias, qw_tensor, None,
                                    relu_flag, node.op_type, sparse_info)
            weight = learning_sparse_quant(
                torch.nn.Parameter(torch.from_numpy(q_in_tensor).cuda(), False),
                fp_tensor.cuda(),
                sq_layer, args.ada_bs, args.ada_epoch)
            # Deploy new weight: prune, then fake-quantize, then write back.
            new_weight = prune_weight(weight, sparse_info)
            new_weight = quant_weight_wo_roundmask(
                new_weight,
                scale, q_min, q_max,
                qw_param['per_channel'])
            if node.op_type == 'ConvTranspose':
                new_weight = new_weight.transpose(0, 1)
            new_weight = new_weight.cpu().detach().numpy()
            update_weight(graph_ada, new_weight, node.input[1])
            update_weight(graph_q, new_weight, node.input[1])
            graph_ada.update_model()
            graph_q.update_model()
    if dist.get_rank() == 0:
        graph_ada.save_onnx_model('sparse_quant')
    # We must use original ranges.
    return graph_ada


def learning_sparse_quant(in_tensor, fp_out_tensor, ada_layer, batch_size, max_epoch):
    """SGD-train the full weight tensor of a SparseQLayer against the FP32 output.

    Uses cosine LR annealing over `max_epoch` epochs; returns the trained weight.
    """
    optimizer = torch.optim.SGD([ada_layer.layer.weight], lr=0.001, momentum=0.9, weight_decay=1e-4)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer=optimizer, T_max=max_epoch)
    ada_layer = DDP(ada_layer, [torch.cuda.current_device()])
    # New train procedure.
    cur_iter = 0
    for epoch in range(max_epoch):
        for idx in range(np.ceil(len(in_tensor) / batch_size).astype(int)):
            st = idx * batch_size
            ed = st + batch_size
            # NOTE(review): assumes cached activations carry a singleton axis at dim 1.
            input = in_tensor[st:ed].squeeze(1)
            fp_output = fp_out_tensor[st:ed].squeeze(1)
            output = ada_layer(input)
            loss = L2_norm(output, fp_output)
            cur_iter += 1
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        scheduler.step()
        if epoch % 50 == 0 and dist.get_rank() == 0:
            # NOTE(review): scheduler.get_lr() is deprecated in newer torch in
            # favor of get_last_lr(); kept as-is to avoid a behavior change here.
            logger.info("Epoch: {:<4} L2 Loss: {:>10.6f}, LR: {:>10.6f}".format(epoch, loss, scheduler.get_lr()[0]))
    if dist.get_rank() == 0:
        logger.info("Loss: {:>10.6f}".format(loss))
    return ada_layer.module.layer.weight
class STE(torch.autograd.Function):
    """Round-to-nearest with a straight-through gradient estimator.

    Forward: element-wise round. Backward: identity (gradients pass through
    unchanged), so rounding stays differentiable for training.
    """

    @staticmethod
    def forward(self, input):
        input = input.round()
        return input

    @staticmethod
    def backward(self, grad_output):
        return grad_output


def quant_weight_wo_roundmask(weight, scale, q_min, q_max, per_channel):
    """Fake-quantize `weight` (no AdaRound mask): scale, STE-round, clamp, rescale.

    `q_min`/`q_max` are scalar tensors when `per_channel` is False, otherwise
    per-channel tensors broadcastable against `weight`. Returns the dequantized
    (float) tensor.
    """
    weight = STE.apply(weight / scale)
    if not per_channel:
        # BUG FIX: Tensor.clamp() is out-of-place; the previous code discarded
        # its result, so per-tensor weights were never clamped to [q_min, q_max].
        weight = weight.clamp(q_min.item(), q_max.item())
    else:
        weight = torch.max(weight, q_min)
        weight = torch.min(weight, q_max)
    weight = weight * scale
    return weight


def create_unstruction_mask(weight, sparsity):
    """Return a 0/1 mask keeping the largest-magnitude (1 - sparsity) of `weight`.

    The `sparsity` fraction of elements with the smallest absolute value are
    masked to 0; with sparsity == 0 the mask is all ones.
    """
    revised_weight = weight.abs()
    prune_num = int(sparsity * revised_weight.numel())
    if prune_num == 0:
        # Threshold below the minimum keeps every element.
        threshold = revised_weight.min() - 1
    else:
        threshold = torch.topk(revised_weight.view(-1), prune_num, largest=False)[0].max()
    mask = torch.gt(revised_weight, threshold).type_as(revised_weight)
    return mask
def create_nv24_mask(weight, N, M):
    """Return an N:M structured-sparsity mask (keep N largest of every M weights).

    Supports 4-D conv weights (grouped along the input-channel axis, NHWC order
    internally) and 2-D linear weights. The weight's last grouped dimension must
    be divisible by M.

    Raises:
        ValueError: for any other weight rank (previously this fell through and
            raised an opaque UnboundLocalError).
    """
    if len(weight.shape) == 4:
        # Group along channels: NCHW -> NHWC, then rows of M consecutive values.
        weight_temp = weight.detach().abs().permute(0, 2, 3, 1).reshape(-1, M)
        index = torch.argsort(weight_temp, dim=1)[:, :int(M - N)]
        mask = torch.ones(weight_temp.shape, device=weight_temp.device)
        mask = mask.scatter_(dim=1, index=index, value=0).reshape((weight.shape[0], weight.shape[2], weight.shape[3], weight.shape[1]))
        mask = mask.permute(0, 3, 1, 2)
    elif len(weight.shape) == 2:
        weight_temp = weight.detach().abs().reshape(-1, M)
        index = torch.argsort(weight_temp, dim=1)[:, :int(M - N)]
        mask = torch.ones(weight_temp.shape, device=weight_temp.device)
        mask = mask.scatter_(dim=1, index=index, value=0).reshape(weight.shape)
    else:
        raise ValueError(
            "create_nv24_mask only supports 2D or 4D weights, got shape {}".format(tuple(weight.shape)))
    return mask


def prune_weight(weight, sparse_info):
    """Zero out weights according to `sparse_info['pattern']`.

    Patterns: 'unstruction' (magnitude pruning at sparse_info['rate']) or
    'nv24' (2:4 structured sparsity).

    Raises:
        ValueError: on an unknown pattern (previously an UnboundLocalError).
    """
    if sparse_info["pattern"] == "unstruction":
        mask = create_unstruction_mask(weight, sparse_info["rate"])
    elif sparse_info["pattern"] == "nv24":
        mask = create_nv24_mask(weight, 2, 4)
    else:
        raise ValueError("Unknown sparse pattern: {}".format(sparse_info["pattern"]))
    return weight * mask


def L2_norm(pred, tgt):
    """Squared-L2 reconstruction loss: sum over dim 1, mean over the batch."""
    return (pred - tgt).pow(2.0).sum(1).mean()


class SparseQLayer(torch.nn.Module):
    """Torch mirror of an ONNX Conv/Gemm/ConvTranspose node whose forward pass
    prunes (per `sparse_info`) and fake-quantizes (per `qw_tensor`) the weight
    before applying the op, so both effects are present during training.
    """

    def __init__(self, node, weight, bias, qw_tensor, qi_tensor, relu_flag, type, sparse_info=None):
        super(SparseQLayer, self).__init__()
        self.qw_tensor = qw_tensor
        self.qi_tensor = qi_tensor
        self.type = type
        # ConvTranspose weights were passed in transposed; remember to undo it.
        self.transposed = False
        self.attr_name_map = {}
        self.get_attr_name_map(node)
        if self.type == 'Conv':
            self.layer = self.build_torch_conv(node, weight, bias)
        elif self.type == 'Gemm':
            self.layer = self.build_torch_linear(node, weight, bias)
        else:
            self.layer = self.build_torch_deconv(node, weight, bias)
            self.transposed = True
        self.relu_flag = relu_flag
        if relu_flag:
            self.relu = nn.ReLU()
        # Sparse config; NOTE(review): forward() assumes this is not None.
        self.sparse_info = sparse_info

    def get_attr_name_map(self, node):
        """Index the ONNX node's attributes by name for the builders below."""
        for attr in node.attribute:
            self.attr_name_map[attr.name] = attr

    def build_torch_conv(self, node, weight, bias):
        """Build an nn.Conv2d matching the ONNX Conv node's attributes."""
        dilations = helper.get_attribute_value(self.attr_name_map['dilations'])
        groups = helper.get_attribute_value(self.attr_name_map['group'])
        kernel_size = helper.get_attribute_value(self.attr_name_map['kernel_shape'])
        # ONNX pads are (x1_begin, x2_begin, x1_end, x2_end); torch takes one
        # pair — assumes symmetric padding. TODO(review): confirm for all models.
        padding = helper.get_attribute_value(self.attr_name_map['pads'])[:2]
        stride = helper.get_attribute_value(self.attr_name_map['strides'])
        o_c = weight.shape[0]
        i_c = weight.shape[1] * groups
        bias_flag = bias is not None
        conv = torch.nn.Conv2d(i_c, o_c, kernel_size, stride, padding, dilations, groups, bias_flag)
        conv.weight.data = weight.data
        conv.weight.requires_grad = True
        if bias is not None:
            conv.bias.data = torch.from_numpy(bias).cuda().data
            conv.bias.requires_grad = True
        return conv

    def build_torch_linear(self, node, weight, bias):
        """Build an nn.Linear matching the ONNX Gemm node."""
        o_c = weight.shape[0]
        i_c = weight.shape[1]
        bias_flag = bias is not None
        linear = torch.nn.Linear(i_c, o_c, bias_flag)
        linear.weight.data = weight.data
        linear.weight.requires_grad = True
        if bias is not None:
            linear.bias.data = torch.from_numpy(bias).cuda().data
            linear.bias.requires_grad = True
        return linear

    def build_torch_deconv(self, node, weight, bias):
        """Build an nn.ConvTranspose2d matching the ONNX ConvTranspose node."""
        dilations = helper.get_attribute_value(self.attr_name_map['dilations'])
        groups = helper.get_attribute_value(self.attr_name_map['group'])
        kernel_size = helper.get_attribute_value(self.attr_name_map['kernel_shape'])
        padding = helper.get_attribute_value(self.attr_name_map['pads'])[:2]
        stride = helper.get_attribute_value(self.attr_name_map['strides'])
        if "output_padding" in self.attr_name_map:
            output_padding = helper.get_attribute_value(self.attr_name_map['output_padding'])
        else:
            output_padding = 0
        o_c = weight.shape[0]
        i_c = weight.shape[1] * groups
        bias_flag = bias is not None
        deconv = torch.nn.ConvTranspose2d(
            i_c, o_c, kernel_size, stride, padding, output_padding, groups, bias_flag, dilations)
        deconv.weight.data = weight.data
        deconv.weight.requires_grad = True
        if bias is not None:
            deconv.bias.data = torch.from_numpy(bias).cuda().data
            deconv.bias.requires_grad = True
        return deconv

    def forward(self, x):
        """Prune -> fake-quantize the live weight, then run the op (+ReLU)."""
        s_weight = prune_weight(self.layer.weight, self.sparse_info)
        q_weight = quant_weight_wo_roundmask(s_weight,
                                             self.qw_tensor['scale'], self.qw_tensor['q_min'], self.qw_tensor['q_max'],
                                             self.qw_tensor['per_channel'])
        if self.transposed:
            # Restore (in, out, kH, kW) layout expected by conv_transpose2d.
            q_weight = q_weight.transpose(0, 1)
        if self.type == 'Conv':
            x = F.conv2d(
                x,
                q_weight, self.layer.bias,
                self.layer.stride,
                self.layer.padding,
                self.layer.dilation,
                self.layer.groups)
        elif self.type == 'Gemm':
            x = F.linear(
                x,
                q_weight, self.layer.bias)
        else:
            x = F.conv_transpose2d(
                x,
                q_weight, self.layer.bias,
                self.layer.stride,
                self.layer.padding,
                self.layer.output_padding,
                self.layer.groups,
                self.layer.dilation)
        if self.relu_flag:
            x = F.relu(x)
        return x
def update_bn_node(graph, node, in_tensor, momentum=0.9):
    """Refresh a BatchNormalization node's running mean/var from quantized activations.

    `in_tensor` is a list of NCHW activation batches feeding the BN node; the
    running statistics (node inputs 3 and 4) are updated with exponential decay
    `momentum` toward each batch's per-channel statistics and written back.
    """
    running_mean = numpy_helper.to_array(graph.initializer[node.input[3]][0])
    running_var = numpy_helper.to_array(graph.initializer[node.input[4]][0])
    for i in range(len(in_tensor)):
        running_mean = momentum * running_mean + (1.0 - momentum) * np.mean(in_tensor[i], axis=(0, 2, 3))
        # BUG FIX: BN input 4 stores the running *variance*, but the old code
        # accumulated np.std (the standard deviation) into it. Use np.var so the
        # tracked statistic matches what BatchNormalization actually consumes.
        running_var = momentum * running_var + (1.0 - momentum) * np.var(in_tensor[i], axis=(0, 2, 3))
    update_mean = running_mean
    update_var = running_var
    update_mean_name = graph.initializer[node.input[3]][0].name
    update_var_name = graph.initializer[node.input[4]][0].name
    graph.set_initializer(update_mean_name, update_mean)
    graph.set_initializer(update_var_name, update_var)


def update_bn_multipass(graph, act_clip_val, weight_clip_val, args):
    """Update every BN node one at a time, re-quantizing after each update.

    Incremental: each BN sees activations produced with all previously updated
    BN statistics in place. Saves 'update_bn_model' and returns the tuned graph
    plus freshly re-calibrated clip values.
    """
    clip_val = act_clip_val.copy()
    clip_val.update(weight_clip_val)
    graph_tuning_bn = ONNXGraph()
    graph_tuning_bn.copy_from(graph)
    bn_list = []
    for node in graph_tuning_bn.graph.node:
        if node.op_type == "BatchNormalization":
            bn_list.append(node)
    pre_act_cache = None
    for node in bn_list:
        logger.info("Update BN for node: {}".format(node.name))
        graph_q, quant_node_list = quant_graph(graph_tuning_bn, clip_val, args)
        q_cache = ActivationCache(graph_q, args)
        if pre_act_cache:
            # Reuse activations computed for earlier BN nodes.
            q_cache.activation_cache = pre_act_cache
        update_bn_node(graph_tuning_bn, node, q_cache[node.input[0]])
        pre_act_cache = copy.deepcopy(q_cache.activation_cache)
    graph_tuning_bn.update_model()
    graph_tuning_bn.save_onnx_model('update_bn_model')

    # Statistics changed, so activation ranges must be re-calibrated.
    act_clip_val, weight_clip_val = tensor_calibration(graph_tuning_bn, args)
    return graph_tuning_bn, act_clip_val, weight_clip_val


def update_bn_onepass(graph, act_clip_val, weight_clip_val, args):
    """Update all BN nodes in one pass from a single quantized forward run.

    Cheaper than the multipass variant but each BN sees activations produced
    with the *original* statistics. Note: returns nothing and does not
    re-calibrate; kept for parity with update_bn_multipass.
    """
    clip_val = act_clip_val.copy()
    clip_val.update(weight_clip_val)
    graph_tuning_bn = copy.deepcopy(graph)
    graph_q, quant_node_list = quant_graph(graph, clip_val, args)
    q_cache = ActivationCache(graph_q, args)
    # CONSISTENCY FIX: every other pass iterates `.graph.node`; the previous
    # `.nodes` attribute does not exist on ONNXGraph elsewhere in this package.
    for node in graph_tuning_bn.graph.node:
        if node.op_type == "BatchNormalization":
            logger.info("Update BN for node: {}".format(node.name))
            update_bn_node(graph_tuning_bn, node, q_cache[node.input[0]])

    graph_tuning_bn.update_model()
    graph_tuning_bn.save_onnx_model('update_bn_model')


# Public entry point: the incremental multipass variant is the default.
update_bn = update_bn_multipass
LEARNABLE_LAYER_TYPES = ['Conv', 'Gemm', 'ConvTranspose']
__all__ = ['LEARNABLE_LAYER_TYPES', 'follow_relu', 'following_relu', 'update_weight', 'get_quant_tensor', 'get_block_from_first']


def follow_relu(graph, node):
    """Return True iff `node`'s output feeds exactly one consumer and it is a Relu."""
    consumers = graph.get_tensor_consumer(node.output[0])
    if len(consumers) != 1:
        return False
    nxt = consumers[0]
    # Graph outputs appear as plain strings in the consumer list.
    return (not isinstance(nxt, str)) and nxt.op_type == 'Relu'


def following_relu(graph, node):
    """Return the Relu node consuming `node`'s output (asserts it is a Relu)."""
    consumers = graph.get_tensor_consumer(node.output[0])
    assert consumers[0].op_type == 'Relu'
    return consumers[0]


def update_weight(graph, weight_tensor, weight_name):
    """Overwrite the initializer registered under `weight_name` with `weight_tensor`."""
    graph.set_initializer(graph.initializer[weight_name][0].name, weight_tensor)


def get_quant_tensor(weight_shape, param, weight_range):
    """Build CUDA scale/q_min/q_max tensors for the given shape and clip range.

    Per-channel params are reshaped to broadcast along the first (channel) axis;
    per-tensor params stay scalar. All returned tensors have requires_grad=False.
    """
    q_nodes, q_min, q_max = get_qnode_by_param(param, 'tmp', weight_shape, weight_range)
    # Pull the generated scale initializer out of the temporary quant subgraph.
    scale = next(
        (numpy_helper.to_array(init) for init in q_nodes.initializer if init.name == 'tmp_scale'),
        None)

    if 'per_channel' in param and param['per_channel']:
        # Shape (C, 1, 1, ...) so the values broadcast across the weight tensor.
        view_shape = [weight_shape[0], *[1] * (len(weight_shape) - 1)]

        def _prep(value):
            return torch.from_numpy(np.array(value).astype(np.float32)).view(view_shape).cuda()
    else:
        def _prep(value):
            return torch.from_numpy(np.array(value).astype(np.float32)).cuda()

    scale, q_min, q_max = (_prep(v) for v in (scale, q_min, q_max))
    for tensor in (scale, q_min, q_max):
        tensor.requires_grad = False
    return scale, q_min, q_max


def get_block_from_first(graph, node, args):
    """Collect up to three chained learnable layers starting at `node`.

    Follows single-consumer Conv/Gemm/ConvTranspose/Relu chains; Relu nodes are
    traversed but not collected. Stops at any branch, graph output, or foreign op.
    """
    res = [node]
    cur = node
    while len(res) < 3:  # We set max len=3.
        nxt = graph.get_tensor_consumer(cur.output[0])
        if len(nxt) != 1 or isinstance(nxt[0], str) or nxt[0].op_type not in LEARNABLE_LAYER_TYPES + ['Relu']:
            break
        if nxt[0].op_type != 'Relu':
            res.append(nxt[0])
        cur = nxt[0]
    return res


def find_successor(cur_node, graph):
    """Return the Conv successors of `cur_node` through an optional (P)Relu.

    Only Conv -> Relu -> Conv and Conv -> Conv patterns are recognized; any
    branch to a graph output or a different op type yields [].
    """
    result = []
    for node in graph.get_tensor_consumer(cur_node.output[0]):
        if isinstance(node, str):
            return []
        if node.op_type == 'Conv':
            result.append(node)
        elif node.op_type in ['Relu', 'PRelu']:
            for _node in graph.get_tensor_consumer(node.output[0]):
                if not isinstance(_node, str) and _node.op_type == 'Conv':
                    result.append(_node)
        else:
            return []
    return result
def node_has_equalized(graph, node):
    # Helper function for other algos: True when this Conv was cross-layer
    # equalized with exactly one Conv successor.
    return len(find_successor(node, graph)) == 1


def weight_equalization(graph, args):
    """Cross-layer weight equalization for Conv -> (Relu ->) Conv pairs.

    Iteratively rescales each output channel of the first Conv (and its bias)
    and the matching input channel of the second Conv by s = sqrt(r1 / r2),
    equalizing per-channel dynamic ranges while leaving the composed function
    unchanged. Repeats until the weights stop moving, then saves
    'weight_equal_model'.
    """
    graph_we = ONNXGraph()
    graph_we.copy_from(graph)

    for node in graph_we.graph.node:
        if node.op_type == 'Conv':
            succ = find_successor(node, graph_we)
            # Only the unambiguous single-successor pattern is equalized.
            if len(succ) != 1:
                continue
            iter = 1
            while True:
                weight_first = numpy_helper.to_array(graph_we.initializer[node.input[1]][0])
                new_weight_first = copy.deepcopy(weight_first)
                if len(node.input) == 3:
                    bias_first = numpy_helper.to_array(graph_we.initializer[node.input[2]][0])
                    new_bias_first = copy.deepcopy(bias_first)
                next_node = succ[0]
                weight_second = numpy_helper.to_array(graph_we.initializer[next_node.input[1]][0])
                new_weight_second = copy.deepcopy(weight_second)
                # Grouped conv: equalize channel-by-channel within each group.
                num_group = weight_first.shape[0] // weight_second.shape[1]
                logger.info('Cross Layer WE: {} --- {} Groups: {} Iter: {}'.format(node.name, next_node.name, num_group, iter))
                group_channels_i = weight_first.shape[0] // num_group
                group_channels_o = weight_second.shape[0] // num_group
                for g in range(num_group):
                    c_start_i = g * group_channels_i
                    c_end_i = (g + 1) * group_channels_i
                    weight_first_group = weight_first[c_start_i:c_end_i]
                    c_start_o = g * group_channels_o
                    c_end_o = (g + 1) * group_channels_o
                    weight_second_group = weight_second[c_start_o:c_end_o]
                    for ii in range(weight_second_group.shape[1]):
                        # Per-channel dynamic ranges of producer and consumer.
                        range_1 = np.abs(weight_first_group)[ii].max()
                        range_2 = np.abs(weight_second_group)[:, ii].max()
                        # Treat near-zero ranges as exactly zero so the nan/inf
                        # guard below forces s = 1 (leave dead channels alone).
                        if range_1 < 1e-6:
                            range_1 = 0.
                        if range_2 < 1e-6:
                            range_2 = 0.
                        # s = r1 / sqrt(r1 * r2) == sqrt(r1 / r2).
                        s = range_1 / np.sqrt(range_1 * range_2)
                        if np.isinf(s) or np.isnan(s):
                            s = 1.0
                        new_weight_first[c_start_i + ii] /= s
                        new_weight_second[c_start_o:c_end_o, ii] *= s
                        if len(node.input) == 3:
                            new_bias_first[c_start_i + ii] /= s

                # NOTE(review): on the converging iteration the freshly computed
                # (negligible) update is deliberately discarded by breaking here.
                if converged([weight_first, weight_second], [new_weight_first, new_weight_second]):
                    break
                iter += 1
                # Update layer.
                update_weight(graph_we, new_weight_first, node.input[1])
                graph_we.update_model()
                update_weight(graph_we, new_weight_second, next_node.input[1])
                graph_we.update_model()
                if len(node.input) == 3:
                    update_weight(graph_we, new_bias_first, node.input[2])
                    graph_we.update_model()
    graph_we.save_onnx_model('weight_equal_model')


def converged(cur_weight, prev_weight, threshold=1e-4):
    """True when the Frobenius-norm change of both weight pairs is below threshold."""
    norm_sum = 0
    norm_sum += np.linalg.norm(cur_weight[0] - prev_weight[0])
    norm_sum += np.linalg.norm(cur_weight[1] - prev_weight[1])
    return norm_sum < threshold
18 | ''' 19 | graph_after_wt = ONNXGraph() 20 | graph_after_wt.copy_from(onnx_graph) 21 | if args.bc: 22 | if dist.get_rank() == 0: 23 | bias_correction(graph_after_wt, act_clip_val, weight_clip_val, args) 24 | dist.barrier() 25 | update_model_path('update_bias_model', args) 26 | model = onnx.load(args.model) 27 | graph_after_wt = ONNXGraph(model, args.output_dir) 28 | # Update bias range. 29 | weight_clip_val = find_clip_val_minmax_weight(graph_after_wt, args) 30 | 31 | if args.we: 32 | if dist.get_rank() == 0: 33 | weight_equalization(graph_after_wt, args) 34 | dist.barrier() 35 | update_model_path('weight_equal_model', args) 36 | model = onnx.load(args.model) 37 | graph_after_wt = ONNXGraph(model, args.output_dir) 38 | act_clip_val, weight_clip_val = tensor_calibration(graph_after_wt, args) 39 | 40 | if args.update_bn: 41 | if dist.get_rank() == 0: 42 | update_bn(graph_after_wt, act_clip_val, weight_clip_val, args) 43 | dist.barrier() 44 | update_model_path('update_bn_model', args) 45 | model = onnx.load(args.model) 46 | graph_after_wt = ONNXGraph(model, args.output_dir) 47 | if dist.get_rank() == 0: 48 | logger.info("Re calibration...") 49 | act_clip_val, weight_clip_val = tensor_calibration(graph_after_wt, args) 50 | save_clip_val(act_clip_val, weight_clip_val, args) 51 | dist.barrier() 52 | act_clip_val, weight_clip_val = load_clip_val(args) 53 | 54 | if not args.sparse: 55 | if args.adaround: 56 | args.acti_quant = False 57 | graph_after_wt = adaround(onnx_graph, graph_after_wt, act_clip_val, weight_clip_val, args) 58 | 59 | if args.brecq: 60 | if args.drop is True: 61 | args.acti_quant = True 62 | else: 63 | args.acti_quant = False 64 | graph_after_wt = brecq(onnx_graph, graph_after_wt, act_clip_val, weight_clip_val, args) 65 | else: 66 | graph_after_wt = sparse_quant(onnx_graph, graph_after_wt, act_clip_val, weight_clip_val, args) 67 | 68 | return graph_after_wt, onnx_graph, act_clip_val, weight_clip_val 69 | 
-------------------------------------------------------------------------------- /example/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/Dipoorlet/c89130744d45ae2e7cb77081b8799c2ff31ee08d/example/.gitkeep -------------------------------------------------------------------------------- /example/magicmind.md: -------------------------------------------------------------------------------- 1 | # Deploy on Magicmind 2 | 3 | We provide an example of writing the quantization parameters generated by the "adaround" algorithm into the magicmind platform model. 4 | 5 | For the ONNX model "model.onnx", quantification is first performed through Dipoorlet. The activation calibration method here uses "mse" and fine-tuning its weights using the "adaround" algorithm. 6 | 7 | ``` 8 | python -m torch.distributed.launch --use_env -m dipoorlet -M model.onnx -I workdir/ -N 100 -A mse -adaround -D magicmind 9 | ``` 10 | 11 | Dipoorlet will generate calibrated model "adaround.onnx" and quantitative configuration information "magicmind_quant_param.json": 12 | 13 | 14 | ``` 15 | magicmind_quant_param.json: 16 | { 17 | "blob_range": { 18 | "0": { 19 | "min": -2.1179039478302, 20 | "max": 2.4663430328313014 21 | }, 22 | "43": { 23 | "min": -2.0301631384284935, 24 | "max": 2.0301631384284935 25 | }, 26 | "44": { 27 | "min": 0.0, 28 | "max": 2.004604876945847 29 | }, 30 | "45": { 31 | "min": 0.0, 32 | "max": 2.3547358431970458 33 | }, 34 | ... 
    }
}
```

Subsequently, convert the calibrated model "adaround.onnx" to a magicmind type network and write "magicmind_quant_param.json" to the network

```
import magicmind.python.runtime as mm
import json

with open('magicmind_quant_param.json', 'r') as f:
    dipoorlet_range = json.load(f)['blob_range']

has_set_nodes = set()
assert isinstance(network, mm.Network), "invalid network"
for idx in range(network.get_input_count()):
    ipt = network.get_input(idx)
    if ipt.get_tensor_name() not in has_set_nodes:
        tblob_range = dipoorlet_range[ipt.get_tensor_name()]
        ipt_range = mm.Range(tblob_range["min"], tblob_range["max"])
        print(f"set input node dynamic range name {ipt.get_tensor_name()} to {tblob_range['min']}, {tblob_range['max']}")
        ipt.set_dynamic_range(ipt_range, False)
        has_set_nodes.add(ipt.get_tensor_name())

for layer in network.get_all_nodes_in_network():
    for i in range(layer.get_output_count()):
        output = layer.get_output(i)
        if output is None or output.get_tensor_name() in has_set_nodes:
            continue
        tblob_range = dipoorlet_range[output.get_tensor_name()]
        out_range = mm.Range(tblob_range['min'], tblob_range['max'])
        print("set output dynamic range of tensor `{0}` to [{1}, {2}]".format(
            output.get_tensor_name(), out_range.min, out_range.max))
        output.set_dynamic_range(out_range, False)
        has_set_nodes.add(output.get_tensor_name())
```
--------------------------------------------------------------------------------
/example/rv.md:
--------------------------------------------------------------------------------
# Deploy on Rv

We provide an example of writing the quantization parameters generated by the "adaround" algorithm into the rv platform model.

For the ONNX model "model.onnx", quantification is first performed through Dipoorlet.
The activation calibration method here uses "mse" and fine-tuning its weights using the "adaround" algorithm. 6 | 7 | ``` 8 | python -m torch.distributed.launch --use_env -m dipoorlet -M model.onnx -I workdir/ -N 100 -A mse -adaround -D rv 9 | ``` 10 | 11 | Dipoorlet will generate calibrated model "adaround.onnx" and quantitative configuration information "rk_quantized_param.json", "rv_quantized_param.json": 12 | 13 | 14 | ``` 15 | rk_quantized_param.json: 16 | { 17 | "custom_quantize_layers": {}, 18 | "quantize_parameters": { 19 | "0": { 20 | "max": [ 21 | 2.4663430328313014 22 | ], 23 | "min": [ 24 | -2.1179039478302 25 | ] 26 | }, 27 | "Conv0_W": { 28 | "max": [ 29 | 0.23317177593708038 30 | ], 31 | "min": [ 32 | -0.17200440168380737 33 | ] 34 | }, 35 | "Conv0_b": { 36 | "max": [ 37 | 0.5909613966941833 38 | ], 39 | "min": [ 40 | -0.5909613966941833 41 | ] 42 | }, 43 | ... 44 | } 45 | } 46 | 47 | rv_quantized_param.json: 48 | { 49 | "customized_quantize_layers": {}, 50 | "quantize_parameters": { 51 | "@0:out0": { 52 | "dtype": "asymmetric_affine", 53 | "method": "layer", 54 | "max_value": [ 55 | 2.4663430328313014 56 | ], 57 | "min_value": [ 58 | -2.1179039478302 59 | ], 60 | "qtype": "u8", 61 | "scale": [ 62 | 0.017977439139849026 63 | ], 64 | "zero_point": [ 65 | 118 66 | ] 67 | }, 68 | "@Conv0:weight": { 69 | "dtype": "asymmetric_affine", 70 | "method": "layer", 71 | "max_value": [ 72 | 0.23317177593708038 73 | ], 74 | "min_value": [ 75 | -0.17200440168380737 76 | ], 77 | "qtype": "u8", 78 | "scale": [ 79 | 0.0015889261867485795 80 | ], 81 | "zero_point": [ 82 | 108 83 | ] 84 | }, 85 | "@Conv0:bias": { 86 | "dtype": "asymmetric_affine", 87 | "method": "layer", 88 | "max_value": [], 89 | "min_value": [], 90 | "zero_point": [ 91 | 0 92 | ], 93 | "scale": [ 94 | 2.8564823819984977e-05 95 | ], 96 | "qtype": "i32" 97 | }, 98 | ... 
99 | } 100 | } 101 | ``` 102 | 103 | 104 | Subsequently, rewrite the quantize parameters: 105 | 106 | ``` 107 | prefix = ''.join(re.split(r'[^A-Za-z0-9_]', 'netdef')) 108 | model_quant_cfg = f'./{prefix}.quantization.cfg' 109 | if quant_type_str == 'u8': 110 | dipootlet_quant_cfg = "rv_quantized_param.json" 111 | elif quant_type_str == 'rknn2-8': 112 | dipootlet_quant_cfg = "rk_quantized_param.json" 113 | with open(dipootlet_quant_cfg, 'r') as file: 114 | # Rename layer idx in json file. 115 | qaunt_yaml = yaml.safe_load(file) 116 | quantize_param = qaunt_yaml['quantize_parameters'] 117 | with open(f'./{prefix}.quantization.cfg', 'r') as f: 118 | origin_yaml = yaml.safe_load(f) 119 | origin_quantize_param = origin_yaml['quantize_parameters'] 120 | if quant_type_str == 'u8': 121 | for layer in origin_quantize_param: 122 | layer_name = '_'.join(layer.split('_')[:-1]) 123 | layer_type = layer.split(':')[-1] 124 | layer_name_without_idx = '{}:{}'.format(layer_name, layer_type) 125 | if layer_name_without_idx in quantize_param: 126 | origin_quantize_param[layer] = quantize_param[layer_name_without_idx] 127 | print("use dipoorlet to rewrite quantize param {}".format(layer_name_without_idx)) 128 | else: 129 | for layer in origin_quantize_param: 130 | if layer in quantize_param: 131 | origin_quantize_param[layer].update(quantize_param[layer]) 132 | print("use dipoorlet to rewrite quantize param {}".format(layer)) 133 | ``` -------------------------------------------------------------------------------- /example/snpe.md: -------------------------------------------------------------------------------- 1 | # Deploy on Snpe 2 | 3 | We provide an example of writing the quantization parameters generated by the "adaround" algorithm into the snpe platform model. 4 | 5 | For the ONNX model "model.onnx", quantification is first performed through Dipoorlet. The activation calibration method here uses "mse" and fine-tuning its weights using the "adaround" algorithm. 

```
python -m torch.distributed.launch --use_env -m dipoorlet -M model.onnx -I workdir/ -N 100 -A mse -adaround -D snpe
```

Dipoorlet will generate calibrated model "adaround.onnx" and quantitative configuration information "snpe_encodings.json":


```
snpe_encodings.json:
{
    "activation_encodings": {
        "0": [
            {
                "bitwidth": 8,
                "min": -2.1179039478302,
                "max": 2.4663430328313014
            }
        ],
        "43": [
            {
                "bitwidth": 8,
                "min": -2.0301631384284935,
                "max": 2.0301631384284935
            }
        ],
        "44": [
            {
                "bitwidth": 8,
                "min": 0.0,
                "max": 2.004604876945847
            }
        ],
        ...
    },
    "param_encodings": {}
}
```

Subsequently, convert the calibrated model "adaround.onnx" to a snpe type network and write "snpe_encodings.json" to the network

```
from subprocess import PIPE, Popen
import json
import onnx

snpemodel_path = 'path-to-dlc_model/model.dlc'
q_overrides_path = 'path-to-snpe_encodings/snpe_encodings.json'
model_path = 'adaround.onnx'
cmd_args = ['snpe-onnx-to-dlc', '-i', model_path, '-o', snpemodel_path]
cmd_args.extend(['--quantization_overrides', q_overrides_path])
p = Popen(cmd_args, stdout=PIPE, stderr=PIPE)
log = p.communicate()
ret = p.returncode
if (ret != 0):
    print("call snpe-onnx-to-dlc failed.")
    exit(1)
quant_snpemodel_path = 'path-to-quant_model/quant_model.dlc'
cmd_args = ['snpe-dlc-quantize', '--input_dlc', snpemodel_path, '--input_list', 'raw_list.txt', '--output_dlc', quant_snpemodel_path]
cmd_args.append('--override_params')
p = Popen(cmd_args, stdout=PIPE, stderr=PIPE)
log = p.communicate()
ret = p.returncode
if (ret != 0):
    print("call snpe-dlc-quantize failed")
    exit(1)
```
--------------------------------------------------------------------------------
/example/tensorrt.md:
--------------------------------------------------------------------------------
# Deploy on TensorRT

We provide an example of writing the quantization parameters generated by the "adaround" algorithm into the TensorRT platform model.

For the ONNX model "model.onnx", quantification is first performed through Dipoorlet. The activation calibration method here uses "mse" and fine-tuning its weights using the "adaround" algorithm.

```
python -m torch.distributed.launch --use_env -m dipoorlet -M model.onnx -I workdir/ -N 100 -A mse -adaround -D trt
```

Dipoorlet will generate calibrated model "adaround.onnx" and quantitative configuration information "trt_clip_val.json":


```
trt_clip_val.json:
{
    "blob_range": {
        "0": 2.4663430328313014,
        "43": 2.0301631384284935,
        "44": 2.004604876945847,
        "45": 2.3547358431970458,
        "46": 3.7930725929523827,
        "47": 1.4081256442853067,
        "48": 3.1422231943456094,
        "49": 3.3541505757916714,
        "50": 2.9029659753242845,
        "51": 2.089258746225367,
        "52": 1.5408500571797488,
        "53": 2.8442123536180173,
        "54": 3.464566072806498,
        ...
    }
}
```

Subsequently, convert the calibrated model "adaround.onnx" to a TensorRT type network and write "trt_clip_val.json" to the network

```
import tensorrt.tensorrt as trt
import json

with open('trt_clip_val.json', 'r') as f:
    dipoorlet_range = json.load(f)['blob_range']

for layer in network:
    if layer.type != trt.LayerType.SHAPE and \
       layer.type != trt.LayerType.CONSTANT and \
       layer.type != trt.LayerType.CONCATENATION and \
       layer.type != trt.LayerType.GATHER:
        layer.precision = trt.DataType.INT8
    for i in range(layer.num_inputs):
        inp = layer.get_input(i)
        if inp is not None and inp.name in dipoorlet_range:
            dmax = dipoorlet_range[inp.name]
            if inp.dynamic_range is None:
                inp.set_dynamic_range(-dmax, dmax)
                print(f'set dynamic range of tensor "{inp.name}" to {dmax}.')
    for i in range(layer.num_outputs):
        output = layer.get_output(i)
        if output.name in dipoorlet_range:
            dmax = dipoorlet_range[output.name]
            if output.dynamic_range is None:
                output.set_dynamic_range(-dmax, dmax)
                print(f'set dynamic range of tensor "{output.name}" to {dmax}.')
```
--------------------------------------------------------------------------------
/example/ti.md:
--------------------------------------------------------------------------------
# Deploy on Ti

We provide an example of writing the quantization parameters generated by the "adaround" algorithm into the ti platform model.

For the ONNX model "model.onnx", quantification is first performed through Dipoorlet. The activation calibration method here uses "mse" and fine-tuning its weights using the "adaround" algorithm.
6 | 7 | ``` 8 | python -m torch.distributed.launch --use_env -m dipoorlet -M model.onnx -I workdir/ -N 100 -A mse -adaround -D ti 9 | ``` 10 | 11 | Dipoorlet will generate calibrated model "adaround.onnx" and quantitative configuration information "ti_blob_range.json": 12 | 13 | 14 | ``` 15 | ti_blob_range.json: 16 | { 17 | "blob_range": { 18 | "0": [ 19 | -2.1179039478302, 20 | 2.4663430328313014 21 | ], 22 | "43": [ 23 | -2.0301631384284935, 24 | 2.0301631384284935 25 | ], 26 | "44": [ 27 | 0.0, 28 | 2.256936187833351 29 | ], 30 | "45": [ 31 | 0.0, 32 | 2.552265446675717 33 | ], 34 | "46": [ 35 | -3.7930725929523827, 36 | 3.2160835834009767 37 | ], 38 | "47": [ 39 | 0.0, 40 | 1.5498370531397931 41 | ], 42 | ... 43 | } 44 | } 45 | ``` 46 | 47 | Subsequently, rewrite quantize parameters: 48 | 49 | ``` 50 | import json 51 | with open('ti_blob_range.json', 'r') as f: 52 | dipoorlet_range = json.load(f) 53 | 54 | with open(os.path.join(os.environ.get('TIDL_PATH'), 'utils/tidlModelImport', 'blob_range.txt'), 'w') as f: 55 | for k, v in dipoorlet_range.items(): 56 | f.write("{} {} {}\n".format(k, v[0], v[1])) 57 | ``` -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | onnx>=1.10.0 3 | onnxsim 4 | onnxruntime-gpu 5 | numpy 6 | tqdm 7 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import setuptools 4 | from dipoorlet import __version__ 5 | 6 | 7 | def read_requirements(): 8 | reqs = [] 9 | with open('requirements.txt', 'r') as fin: 10 | for line in fin.readlines(): 11 | reqs.append(line.strip()) 12 | return reqs 13 | 14 | 15 | def get_package_suffix(): 16 | package_suffix = __version__ 17 | if os.environ.get('CI_COMMIT_REF_SLUG', None): 18 | ci_commit_ref_slug = 
os.environ['CI_COMMIT_REF_SLUG'] 19 | if not ci_commit_ref_slug.startswith( 20 | 'release') and not ci_commit_ref_slug.startswith('master'): 21 | ci_commit_ref_slug = re.sub(r'[^A-Za-z0-9._-]', '_', 22 | ci_commit_ref_slug) 23 | package_suffix += f".{ci_commit_ref_slug}" 24 | if ci_commit_ref_slug == "dev" and os.environ.get('CI_COMMIT_SHORT_SHA', None): 25 | ci_commit_short_sha = os.environ['CI_COMMIT_SHORT_SHA'] 26 | package_suffix += f".{ci_commit_short_sha}" 27 | return package_suffix 28 | 29 | 30 | setuptools.setup( 31 | name="dipoorlet", 32 | version=get_package_suffix(), 33 | author="RD-MTC", 34 | description=("Offline quantization and profiling."), 35 | python_requires='>=3.6', 36 | packages=setuptools.find_packages(), 37 | classifiers=( 38 | 'Development Status :: 3 - Alpha', 39 | "Programming Language :: Python :: 3", 40 | "Operating System :: POSIX :: Linux"), 41 | install_requires=read_requirements() 42 | ) 43 | --------------------------------------------------------------------------------