├── README.md
├── inference.py
├── mobilenet.onnx
├── mobilenet_q.onnx
├── pkl_reader.py
├── quantize.py
└── test.py


/README.md:
--------------------------------------------------------------------------------
1 | ## onnx_quantization
2 | ONNX model quantization to int8
3 | ### Convert an fp32 ONNX model to an int8 model
4 | `python3.5 test.py`
5 | 
6 | ### Run inference to compare the accuracy and runtime of the fp32 and int8 models
7 | `python3.5 inference.py`
8 | 
--------------------------------------------------------------------------------
/inference.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | __author__ = 'taobiaoli'
3 | 
4 | import onnxruntime as rt
5 | import numpy as np
6 | import cv2
7 | import onnxruntime.backend as backend
8 | from onnx import load
9 | import onnx
10 | from pkl_reader import DataGenerator
11 | import timeit
12 | 
13 | 
14 | # Inference runs through an onnxruntime InferenceSession (tested with Python 3.5.2).
15 | 
16 | 
17 | def top5_acc(pred, k=5):
18 |     # Return the indices of the k largest scores.
19 |     scores = list(np.asarray(pred).flatten())
20 |     results = []
21 |     for _ in range(k):
22 |         idx = scores.index(max(scores))
23 |         results.append(idx)
24 |         scores[idx] = float('-inf')  # exclude this index from the next pass
25 |     return results
26 | 
27 | 
28 | def inference(model_path, data_path):
29 |     sess = rt.InferenceSession(model_path)
30 |     input_name = sess.get_inputs()[0].name
31 |     output_name = sess.get_outputs()[0].name
32 |     acc_top1 = 0
33 |     acc_top5 = 0
34 | 
35 |     # Single-image sanity check. The evaluation path below feeds NHWC batches
36 |     # from DataGenerator, so keep the image in HWC order and only add a batch axis.
37 |     img = cv2.imread('ILSVRC2012_val_00049517.JPEG')
38 |     img = cv2.resize(img, (224, 224))
39 |     img = img.astype('float32') / 255
40 |     img = img.reshape(1, 224, 224, 3)
41 |     print(img.shape)
42 |     print(img.dtype)
43 | 
44 |     starttime = timeit.default_timer()
45 |     res = sess.run([output_name], {input_name: img})
46 |     endtime = timeit.default_timer()
47 |     print('cost time: ', endtime - starttime)
48 |     print('result:', np.argmax(res))
49 | 
50 |     # Uncomment to measure top-1/top-5 accuracy on the ImageNet validation pickle.
51 |     '''
52 |     dg = DataGenerator(data_path, model='mobilenet', dtype='float32')
53 | 
54 |     for im, label in dg.generator():
55 |         res = sess.run([output_name], {input_name: im})
56 |         if np.argmax(res) == label:
57 |             acc_top1 = acc_top1 + 1
58 |         if label in top5_acc(res):
59 |             acc_top5 = acc_top5 + 1
60 |     print('top1 accuracy: {}'.format(acc_top1 / 50000))
61 |     print('top5 accuracy: {}'.format(acc_top5 / 50000))
62 |     '''
63 |     '''
64 |     input_name = sess.get_inputs()[0].name
65 |     print('input name', input_name)
66 |     #input_shape = sess.get_inputs()[0].shape
67 |     #print('input shape', input_shape)
68 | 
69 |     input_type = sess.get_inputs()[0].type
70 |     print('input type', input_type)
71 | 
72 |     output_name = sess.get_outputs()[0].name
73 |     print('output name', output_name)
74 | 
75 |     # The onnxruntime backend API also works (it needs Python 3.5.2):
76 |     #model = onnx.load('alex_cat_dog.onnx')
77 |     #rep = backend.prepare(model, 'CPU')
78 | 
79 |     # prepare the model input image
80 |     img = cv2.imread('0050.jpg')
81 |     img = cv2.resize(img, (224, 224))
82 |     img = img.astype('float32') / 255
83 |     img = img.reshape(1, 224, 224, 3)
84 |     print(img.shape)
85 |     print(img.dtype)
86 | 
87 |     # run the session on the prepared input
88 |     res = sess.run([output_name], {input_name: img})
89 |     print(res)
90 |     print(np.argmax(res))
91 |     '''
92 | 
93 | 
94 | if __name__ == '__main__':
95 |     inference('./mobilenet.onnx', './data/val224_compressed.pkl')
96 | 
--------------------------------------------------------------------------------
/mobilenet.onnx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/taobiaoli/onnx_quantization/9981df0775d9321d62ff9057538beaffc0ca9f31/mobilenet.onnx -------------------------------------------------------------------------------- /mobilenet_q.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taobiaoli/onnx_quantization/9981df0775d9321d62ff9057538beaffc0ca9f31/mobilenet_q.onnx -------------------------------------------------------------------------------- /pkl_reader.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pickle as pkl 3 | import cv2 4 | import tqdm 5 | 6 | 7 | class DataGenerator(object): 8 | def __init__(self, pkl_file, model='vgg', dtype='float32'): 9 | self.pkl_file = pkl_file 10 | self.model = model 11 | self.dtype = dtype 12 | 13 | def generator(self): 14 | data = self.load_pickle(self.pkl_file) 15 | assert len(data['data']) == 50000, len(data['data']) 16 | assert len(data['target']) == 50000, len(data['target']) 17 | for im, target in tqdm.tqdm(zip(data['data'], data['target']), total=50000): 18 | # for im, target in zip(data['data'], data['target']): 19 | im = self.str2img(im) 20 | if self.model not in ['inception', 'xception', 'mobilenet', 'inception_resnet']: 21 | im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB) 22 | if self.model == 'squeezenet': 23 | im = cv2.resize(im, (227, 227)) 24 | if self.model in ['inception', 'xception', 'inception_resnet']: 25 | im = cv2.resize(im, (299, 299)) 26 | im = self.preprocessing(im, model=self.model) 27 | label = int(target) 28 | yield im, label 29 | 30 | 31 | @staticmethod 32 | def load_pickle(path): 33 | with open(path, 'rb') as f: 34 | v = pkl.load(f) 35 | f.close() 36 | return v 37 | 38 | @staticmethod 39 | def str2img(str_im): 40 | return cv2.imdecode(np.fromstring(str_im, np.uint8), cv2.IMREAD_COLOR) 41 | 42 | @staticmethod 43 | def preprocessing(im, model='vgg', dtype='float32'): 44 | dtype = np.float16 if dtype == 'float16' else np.float32 45 | im = im.astype(dtype) 46 | im = np.expand_dims(im, axis=0) 47 | if model in ['vgg', 'resnet', 'squeezenet']: 48 | im[..., 0] -= 103.939 49 | im[..., 1] -= 116.779 50 | im[..., 2] -= 123.68 51 | elif model in ['inception', 'mobilenet', 'xception', 'inception_resnet']: 52 | im /= 255. 53 | im -= 0.5 54 | im *= 2. 55 | elif model == 'densenet': 56 | im[..., 0] -= 103.939 57 | im[..., 1] -= 116.779 58 | im[..., 2] -= 123.68 59 | im[..., 0] *= 0.017 60 | im[..., 1] *= 0.017 61 | im[..., 2] *= 0.017 62 | else: 63 | pass 64 | return im 65 | -------------------------------------------------------------------------------- /quantize.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. See License.txt in the project root for 4 | # license information. 
5 | # -------------------------------------------------------------------------- 6 | import os 7 | import onnx 8 | import onnx.numpy_helper 9 | import struct 10 | 11 | import numpy as np 12 | from onnx import onnx_pb as onnx_proto 13 | 14 | __producer__ = "onnx.quantize" 15 | __version__ = "0.1.0" 16 | onnx_domain = "ai.onnx" 17 | onnx_op_set_version = 11 18 | 19 | type_to_name = { 20 | 1: "FLOAT", 21 | 2: "UINT8", 22 | 3: "INT8", 23 | 4: "UINT16", 24 | 5: "INT16", 25 | 6: "INT32", 26 | 7: "INT64", 27 | 8: "STRING", 28 | 9: "BOOL", 29 | 10: "FLOAT16", 30 | 11: "DOUBLE", 31 | 12: "UINT32", 32 | 13: "UINT64", 33 | 14: "COMPLEX64", 34 | 15: "COMPLEX128", 35 | } 36 | 37 | # Quantization mode 38 | # IntegerOps: Use IntegerOps in quantized model. Only ConvInteger and MatMulInteger ops are supported now. 39 | # QLinearOps: Use QLinearOps in quantized model. Only QLinearConv and QLinearMatMul ops are supported now. 40 | class QuantizationMode(): 41 | IntegerOps = 0 42 | QLinearOps = 1 43 | 44 | quantization_modes = [getattr(QuantizationMode, attr) for attr in dir(QuantizationMode)\ 45 | if not callable(getattr(QuantizationMode, attr)) and not attr.startswith("__")] 46 | 47 | class QuantizedInitializer: 48 | ''' 49 | Represents a linearly quantized weight input from ONNX operators 50 | ''' 51 | def __init__(self, name, initializer, rmins, rmaxs, zero_points, scales, data=[], quantized_data=[], axis=None, 52 | qType=onnx_proto.TensorProto.UINT8): 53 | self.name = name 54 | self.initializer = initializer # TensorProto initializer in ONNX graph 55 | self.rmins = rmins # List of minimum range for each axis 56 | self.rmaxs = rmaxs # List of maximum range for each axis 57 | self.zero_points = zero_points # 1D tensor of zero points computed for each axis. scalar if axis is empty 58 | self.scales = scales # 1D tensor of scales computed for each axis. scalar if axis is empty 59 | self.data = data # original data from initializer TensorProto 60 | self.quantized_data = quantized_data # weight-packed data from data 61 | self.axis = axis # Scalar to specify which dimension in the initializer to weight pack. 62 | # If empty, single zero point and scales computed from a single rmin and rmax 63 | self.qType = qType # type of quantized data. 64 | 65 | class QuantizedValueType(): 66 | Input = 0 67 | Initializer = 1 68 | 69 | class QuantizedValue: 70 | ''' 71 | Represents a linearly quantized value (input\output\intializer) 72 | ''' 73 | def __init__(self, name, new_quantized_name, scale_name, zero_point_name, quantized_value_type, axis=None, 74 | qType=onnx_proto.TensorProto.UINT8): 75 | self.original_name = name 76 | self.q_name = new_quantized_name 77 | self.scale_name = scale_name 78 | self.zp_name = zero_point_name 79 | self.value_type = quantized_value_type 80 | self.axis = axis 81 | self.qType = qType 82 | 83 | def quantize_data(data, quantize_range, qType): 84 | ''' 85 | :parameter quantize_range: list of data to weight pack. 86 | :parameter qType: data type to quantize to. 
Supported types UINT8 and INT8 87 | :return: minimum, maximum, zero point, scale, and quantized weights 88 | To pack weights, we compute a linear transformation 89 | - when data type == uint8 mode, from [rmin, rmax] -> [0, 2^{b-1}] and 90 | - when data type == int8, from [-m , m] -> [-(2^{b-1}-1), 2^{b-1}-1] where 91 | m = max(abs(rmin), abs(rmax)) 92 | and add necessary intermediate nodes to trasnform quantized weight to full weight using the equation 93 | r = S(q-z), where 94 | r: real original value 95 | q: quantized value 96 | S: scale 97 | z: zero point 98 | ''' 99 | rmin = min(min(data), 0) 100 | rmax = max(max(data), 0) 101 | 102 | if qType == onnx_proto.TensorProto.INT8: 103 | max_range = max(abs(rmin), abs(rmax)) 104 | scale = (float(max_range)*2) / quantize_range 105 | zero_point = 0 106 | quantized_data = (np.asarray(data) / scale).round().astype('b') #signed byte type 107 | elif qType == onnx_proto.TensorProto.UINT8: 108 | scale = (float(rmax) - rmin) / quantize_range if rmin != rmax else 1 109 | zero_point = round((0 - rmin) / scale) # round to nearest integer 110 | quantized_data = ((np.asarray(data) / scale).round() + zero_point).astype('B') # unsigned byte type 111 | else: 112 | raise ValueError("Unexpected data type {} requested. Only INT8 and UINT8 are supported.") 113 | 114 | return rmin, rmax, zero_point, scale, quantized_data 115 | 116 | 117 | def _attribute_to_kwarg(attribute): 118 | ''' 119 | Convert attribute to kwarg format for use with onnx.helper.make_node. 120 | :parameter attribute: attribute in AttributeProto format. 121 | :return: attribute in {key: value} format. 122 | ''' 123 | if (attribute.type == 0): 124 | raise ValueError('attribute {} does not have type specified.'.format(attribute.name)) 125 | 126 | # Based on attribute type definitions from AttributeProto 127 | # definition in https://github.com/onnx/onnx/blob/master/onnx/onnx.proto 128 | if (attribute.type == 1): 129 | value = attribute.f 130 | elif (attribute.type == 2): 131 | value = attribute.i 132 | elif (attribute.type == 3): 133 | value = attribute.s 134 | elif (attribute.type == 4): 135 | value = attribute.t 136 | elif (attribute.type == 5): 137 | value = attribute.g 138 | elif (attribute.type == 6): 139 | value = attribute.floats 140 | elif (attribute.type == 7): 141 | value = attribute.ints 142 | elif (attribute.type == 8): 143 | value = attribute.strings 144 | elif (attribute.type == 9): 145 | value = attribute.tensors 146 | elif (attribute.type == 10): 147 | value = attribute.graphs 148 | else: 149 | raise ValueError('attribute {} has unsupported type {}.'.format(attribute.name, attribute.type)) 150 | 151 | return {attribute.name: value} 152 | 153 | def _find_by_name(item_name, item_list): 154 | ''' 155 | Helper function to find item by name in a list. 156 | parameter item_name: name of the item. 157 | parameter item_list: list of items. 158 | return: item if found. None otherwise. 159 | ''' 160 | items = [item for item in item_list if item.name == item_name] 161 | return items[0] if len(items) > 0 else None 162 | 163 | def _get_mul_node(inputs, output, name): 164 | ''' 165 | Helper function to create a Mul node. 166 | parameter inputs: list of input names. 167 | parameter output: output name. 168 | parameter name: name of the node. 169 | return: Mul node in NodeProto format. 
170 | ''' 171 | return onnx.helper.make_node("Mul", inputs, [output], name) 172 | 173 | def _find_node_by_name(node_name, graph, new_nodes_list): 174 | ''' 175 | Helper function to check if a node exists in a graph or 176 | new set of nodes created during quantization. 177 | parameter node_name: name of the node. 178 | parameter graph: GraphProto. 179 | parameter new_nodes_list: list of nodes added during quantization. 180 | return: NodeProto if found. None otherwise. 181 | ''' 182 | graph_nodes_list = list(graph.node) # deep copy 183 | graph_nodes_list.extend(new_nodes_list) 184 | node = _find_by_name(node_name, graph_nodes_list) 185 | return node 186 | 187 | def _add_initializer_if_not_present(graph, name, value, shape, type): 188 | ''' 189 | Helper function to add an initializer if it is not present in the graph. 190 | parameter graph: GraphProto. 191 | parameter name: Initializer's name. 192 | parameter value: Initializer's value. 193 | parameter shape: Initializer's shape. 194 | parameter type: Initializer's type. 195 | ''' 196 | if _find_by_name(name, graph.initializer) is None: 197 | initializer = onnx.helper.make_tensor(name, type, shape, value) 198 | value_info = onnx.helper.make_tensor_value_info(name, type, shape) 199 | graph.initializer.extend([initializer]) 200 | graph.input.extend([value_info]) 201 | 202 | def _get_qrange_for_qType(qType): 203 | ''' 204 | Helper function to get the quantization range for a type. 205 | parameter qType: quantization type. 206 | return: quantization range. 207 | ''' 208 | if qType == onnx_proto.TensorProto.UINT8: 209 | return 255 # 2^b - 1 210 | elif qType == onnx_proto.TensorProto.INT8: 211 | return 254 # [-(2^{b-1}-1), 2^{b-1}-1]: [-127, 127] for 8 bits. 212 | else: 213 | raise ValueError('unsupported quantization data type') 214 | 215 | def _find_nodes_using_initializer(graph, initializer): 216 | ''' 217 | Helper function to find all nodes with an initializer as a input. 218 | parameter graph: GraphProto. 219 | parameter initializer: Initializer in TensorProto format. 220 | return: List of nodes. 221 | ''' 222 | result = [] 223 | for node in graph.node: 224 | for node_input in node.input: 225 | if node_input == initializer.name: 226 | result.append(node) 227 | return result 228 | 229 | class ONNXQuantizer: 230 | def __init__(self, model, per_channel, mode, static, fuse_dynamic_quant, weight_qType, input_qType, 231 | quantization_params, nodes_to_quantize): 232 | self.model = model 233 | self.per_channel = per_channel # weight-pack per channel 234 | self.mode = mode # QuantizationMode.Value 235 | self.static = static # use static quantization for inputs. 236 | self.fuse_dynamic_quant = fuse_dynamic_quant 237 | self.input_qType = input_qType # quantize input type 238 | self.weight_qType = weight_qType # quantize data type 239 | self.quantization_params = quantization_params 240 | self.nodes_to_quantize = nodes_to_quantize # specific nodes to quantize 241 | 242 | if not self.mode in quantization_modes: 243 | raise ValueError('unsupported quantization mode {}'.format(self.mode)) 244 | 245 | # QuantizeRange tensor name and zero tensor name for scale and zero point calculation. 
246 | # Used when static is False 247 | self.fixed_qrange_uint8_name = "fixed_quantization_range_uint8" 248 | self.fixed_qrange_int8_name = "fixed_quantization_range_int8" 249 | # For uint8 data-type, to compute zero point, we subtract rmin from 0 (represented by fixed_zero_name tensor) 250 | self.fixed_zero_name = "fixed_zero" 251 | # For int8 data-type, zero point is always zero (respresented by fixed_zero_point_name tensor) 252 | self.fixed_zero_zp_name = "fixed_zero_zp" 253 | 254 | # List of quantized weights 255 | self._quantized_weights = [] 256 | # Map of all original value names to quantized value names 257 | self.quantized_value_map = {} 258 | 259 | def quantize_model(self): 260 | # Create a new topologically sorted list for quantizing a model 261 | new_list = [] 262 | for node in self.model.graph.node: 263 | if self.nodes_to_quantize is not None and node.name not in self.nodes_to_quantize: 264 | new_list +=self._handle_other_ops(node, new_list) 265 | else: 266 | if node.op_type == 'Conv': 267 | new_list += self._quantize_convolution(node, new_list) 268 | elif node.op_type == 'MatMul': 269 | new_list += self._quantize_matmul(node, new_list) 270 | elif node.op_type == 'Gather': 271 | new_list += self._quantize_gather_ops(node, new_list) 272 | elif node.op_type == 'Relu' or node.op_type == 'Clip': 273 | new_list +=self._handle_activation_ops(node, new_list) 274 | else: 275 | new_list +=self._handle_other_ops(node, new_list) 276 | 277 | # extend is used to append to the list for a protobuf fields 278 | # https://developers.google.com/protocol-buffers/docs/reference/python-generated?csw=1#fields 279 | self.model.graph.ClearField('node') 280 | self.model.graph.node.extend(new_list) 281 | 282 | # Remove weights which are already quantized from graph. 283 | self._remove_quantized_weights() 284 | 285 | # update opset. 286 | opset_info = next((opset for opset in self.model.opset_import if opset.domain == '' or opset.domain == onnx_domain), None) 287 | if opset_info is not None: 288 | self.model.opset_import.remove(opset_info) 289 | self.model.opset_import.extend([onnx.helper.make_opsetid(onnx_domain, onnx_op_set_version)]) 290 | 291 | return self.model 292 | 293 | def find_weight_data(self, initializer): 294 | ''' 295 | :param initializer: TensorProto initializer object from a graph 296 | :return: a list of initialized data in a given initializer object 297 | ''' 298 | if initializer.data_type == onnx_proto.TensorProto.FLOAT: 299 | weights = onnx.numpy_helper.to_array(initializer) 300 | else: 301 | raise ValueError('Model contains conv operator weights in {}. Only float type quantization is supported.'.format( 302 | type_to_name[initializer.data_type])) 303 | return weights 304 | 305 | def _remove_quantized_weights(self): 306 | ''' Remove the weights which are already quantized from graph initializer list. 307 | This function assumes that after quantization, all nodes that previously use a weight: 308 | - use output from DequantizeLinear as input if they do not support quantization. 309 | - use quantized weight if they support quantization. 
310 | ''' 311 | for weight in self._quantized_weights: 312 | # Remove existing weight initializer 313 | self.model.graph.initializer.remove(weight.initializer) 314 | 315 | # Removing input weight to a convolution 316 | try: 317 | weight_input = next(val for val in self.model.graph.input if val.name == weight.name) 318 | self.model.graph.input.remove(weight_input) 319 | except StopIteration: 320 | if self.model.ir_version < 4: 321 | raise ValueError('invalid weight name {} found in the graph (not a graph input) '.format(weight.name)) 322 | 323 | 324 | def _update_graph(self, weight): 325 | ''' 326 | Given a weight object, update the graph by doing the following: 327 | - remove old initializer, update new initializers for quantized weight, zero point, and scale 328 | - remove old weight input, update with new inputs for quantized weight, zero point, and scale 329 | This function does NOT update the nodes in the graph, just initializers and inputs 330 | ''' 331 | quantized_value = self.quantized_value_map[weight.name] 332 | assert(quantized_value is not None) 333 | packed_weight_name = quantized_value.q_name 334 | scale_name = quantized_value.scale_name 335 | zero_point_name = quantized_value.zp_name 336 | 337 | # Update packed weight, zero point, and scale initializers 338 | packed_weight_np_data = np.asarray(weight.quantized_data, 339 | dtype=onnx.mapping.TENSOR_TYPE_TO_NP_TYPE[weight.qType]).reshape(weight.initializer.dims) 340 | packed_weight_initializer = onnx.numpy_helper.from_array(packed_weight_np_data, packed_weight_name) 341 | 342 | if weight.axis is not None: 343 | zero_scale_shape = [weight.initializer.dims[weight.axis]] 344 | else: # scale and zero point must be scalar 345 | zero_scale_shape = [] 346 | zero_point_type = weight.qType 347 | scale_initializer = onnx.helper.make_tensor(scale_name, onnx_proto.TensorProto.FLOAT, zero_scale_shape, weight.scales) 348 | zero_initializer = onnx.helper.make_tensor(zero_point_name, zero_point_type, zero_scale_shape, weight.zero_points) 349 | 350 | self.model.graph.initializer.extend([packed_weight_initializer, scale_initializer, zero_initializer]) 351 | 352 | # Create input for initialized scale and zeros 353 | packed_weight_value_info = onnx.helper.make_tensor_value_info(packed_weight_name, weight.qType, 354 | weight.initializer.dims) 355 | scale_value_info = onnx.helper.make_tensor_value_info(scale_name, onnx_proto.TensorProto.FLOAT, zero_scale_shape) 356 | zero_point_value_info = onnx.helper.make_tensor_value_info(zero_point_name, 357 | zero_point_type, zero_scale_shape) # zero_point is int for dequantize operator 358 | 359 | self.model.graph.input.extend([packed_weight_value_info, scale_value_info, zero_point_value_info]) 360 | 361 | self._quantized_weights.append(weight) 362 | 363 | def _get_quantized_weight(self, initializer, qType): 364 | ''' 365 | :param initializer: TensorProto initializer 366 | :param qType: type to quantize to 367 | :return: Weight class with quantization information 368 | ''' 369 | weights_data = self.find_weight_data(initializer) 370 | rmin, rmax, zero_point, scale, quantized_weights_data = quantize_data(weights_data.flatten().tolist(), 371 | _get_qrange_for_qType(qType), qType) 372 | weight = QuantizedInitializer(initializer.name, initializer, [rmin], [rmax], [zero_point], [scale], 373 | weights_data, quantized_weights_data, axis=None, qType=qType) 374 | 375 | # Log entry for this quantized weight 376 | assert(weight.name not in self.quantized_value_map) 377 | quantized_value = QuantizedValue(weight.name, 
weight.name + "_quantized", weight.name + "_scale", weight.name + "_zero_point", QuantizedValueType.Initializer, None, qType) 378 | self.quantized_value_map[weight.name] = quantized_value 379 | 380 | return weight 381 | 382 | def _get_quantized_weight_convolution(self, initializer, qType): 383 | ''' 384 | :param initializer: initializer TypeProto to quantize 385 | :param qType: type to quantize to 386 | :return: Weight class object with quantization information for a given initializer 387 | ''' 388 | if not self.per_channel: 389 | return self._get_quantized_weight(initializer, qType) 390 | 391 | weights = self.find_weight_data(initializer) 392 | # Quantize per output channel 393 | # Assuming (M x C/group x kH x kW) format where M is number of output channels. 394 | channel_count = initializer.dims[0] 395 | np_data = np.reshape(weights, initializer.dims) 396 | rmin_list = [] 397 | rmax_list = [] 398 | zero_point_list = [] 399 | scale_list = [] 400 | quantized_per_channel_data_list = [] 401 | for i in range(channel_count): 402 | # for each channel, compute quantization data. Assuming (M x C/group x kH x kW) 403 | per_channel_data = np_data[i,:,:,:].flatten() 404 | rmin, rmax, zero_point, scale, quantized_per_channel_data = quantize_data(per_channel_data.flatten().tolist(), 405 | _get_qrange_for_qType(qType), qType) 406 | rmin_list.append(rmin) 407 | rmax_list.append(rmax) 408 | zero_point_list.append(zero_point) 409 | scale_list.append(scale) 410 | quantized_per_channel_data_list.append(quantized_per_channel_data) 411 | channel_index = 0 # (M x C/group x kH x kW) 412 | # combine per_channel_data into one 413 | reshape_dims = list(initializer.dims) # deep copy 414 | reshape_dims[channel_index] = 1 # only one per channel for reshape 415 | quantized_weights = np.asarray(quantized_per_channel_data_list[0]).reshape(reshape_dims) 416 | for i in range(1, len(quantized_per_channel_data_list)): 417 | channel_weights = np.asarray(quantized_per_channel_data_list[i]).reshape(reshape_dims) 418 | quantized_weights = np.concatenate((quantized_weights, channel_weights), axis=0) 419 | 420 | weight = QuantizedInitializer(initializer.name, initializer, rmin_list, rmax_list, zero_point_list, 421 | scale_list, weights, quantized_weights.flatten().tolist(), channel_index, qType) 422 | 423 | # Make entry for this quantized weight 424 | assert(weight.name not in self.quantized_value_map) 425 | quantized_value = QuantizedValue(weight.name, weight.name + "_quantized", weight.name + "_scale", weight.name + "_zero_point", QuantizedValueType.Initializer, None, qType) 426 | self.quantized_value_map[weight.name] = quantized_value 427 | 428 | return weight 429 | 430 | def _get_dynamic_input_quantization_params(self, input_name, nodes_list, qType): 431 | ''' 432 | Create nodes for dynamic quantization of input and add them to nodes_list. 433 | parameter input_name: Name of the input. 434 | parameter nodes_list: new nodes are appended to this list. 435 | parameter qType: type to quantize to. 436 | return: scale_name, zero_point_name, scale_shape, zero_point_shape. 
437 | ''' 438 | if qType == onnx_proto.TensorProto.INT8: 439 | return self._get_dynamic_input_quantization_params_int8(input_name, nodes_list) 440 | 441 | return self._get_dynamic_input_quantization_params_uint8(input_name, nodes_list) 442 | 443 | def _get_dynamic_input_quantization_params_int8(self, input_name, nodes_list): 444 | ''' 445 | Create nodes for dynamic quantization of input to nit8 and add them to nodes_list 446 | parameter input_name: Name of the input. 447 | parameter nodes_list: new nodes are appended to this list. 448 | return: scale_name, zero_point_name, scale_shape, zero_point_shape. 449 | ''' 450 | qType = onnx_proto.TensorProto.INT8 451 | 452 | # Reduce min and Reduce max 453 | input_scale_name = input_name + "_scale" 454 | 455 | reduce_min_name = input_name + "_ReduceMin" 456 | reduce_min_node = onnx.helper.make_node("ReduceMin", [input_name], 457 | [reduce_min_name + ":0"], reduce_min_name, keepdims=0) 458 | nodes_list.append(reduce_min_node) 459 | 460 | reduce_max_name = input_name + "_ReduceMax" 461 | reduce_max_node = onnx.helper.make_node("ReduceMax", [input_name], 462 | [reduce_max_name + ":0"], reduce_max_name, keepdims=0) 463 | nodes_list.append(reduce_max_node) 464 | 465 | # Compute scale 466 | # Find abs(rmin) 467 | reduce_min_abs_name = reduce_min_name + "_Abs" 468 | reduce_min_abs_node = onnx.helper.make_node("Abs", [reduce_min_node.output[0]], 469 | [reduce_min_abs_name + ":0"], reduce_min_abs_name) 470 | nodes_list.append(reduce_min_abs_node) 471 | # Find abs(rmax) 472 | reduce_max_abs_name = reduce_max_name + "_Abs" 473 | reduce_max_abs_node = onnx.helper.make_node("Abs", [reduce_max_node.output[0]], 474 | [reduce_max_abs_name + ":0"], reduce_max_abs_name) 475 | nodes_list.append(reduce_max_abs_node) 476 | # Compute max of abs(rmin) and abs(rmax) 477 | abs_max_name = input_name + "_Abs_Max" 478 | abs_max_node = onnx.helper.make_node("Max", [reduce_min_abs_node.output[0], reduce_max_abs_node.output[0]], 479 | [abs_max_name + ":0"], abs_max_name) 480 | nodes_list.append(abs_max_node) 481 | # and divide by (quantize_range/2.0) which will be equal to max(...)*2.0/quantize_range 482 | _add_initializer_if_not_present(self.model.graph, self.fixed_qrange_int8_name, 483 | [_get_qrange_for_qType(qType)/2.0], [], onnx_proto.TensorProto.FLOAT) 484 | scale_div_name = input_name + "scale_Div" 485 | scale_div_node = onnx.helper.make_node("Div", [abs_max_node.output[0], self.fixed_qrange_int8_name], 486 | [input_scale_name], scale_div_name) 487 | nodes_list.append(scale_div_node) 488 | 489 | # Zero point 490 | _add_initializer_if_not_present(self.model.graph, self.fixed_zero_zp_name, 491 | [0], [], qType) 492 | 493 | return input_scale_name, self.fixed_zero_zp_name, [], [] 494 | 495 | def _get_dynamic_input_quantization_params_uint8(self, input_name, nodes_list): 496 | ''' 497 | Create nodes for dynamic quantization of input to uint8 and add them to nodes_list 498 | parameter input_name: Name of the input. 499 | parameter nodes_list: new nodes are appended to this list. 500 | return: scale_name, zero_point_name, scale_shape, zero_point_shape. 
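In effect, the node graph built below computes
    scale = (ReduceMax(input) - ReduceMin(input)) / 255
    zero_point = Cast(Floor((0 - ReduceMin(input)) / scale), uint8)
which mirrors the UINT8 branch of quantize_data(), only expressed as ONNX operators.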
501 | ''' 502 | qType = onnx_proto.TensorProto.UINT8 503 | # Reduce min and Reduce max 504 | input_scale_name = input_name + "_scale" 505 | input_zp_name = input_name + "_zero_point" 506 | 507 | reduce_min_name = input_name + "_ReduceMin" 508 | reduce_min_node = onnx.helper.make_node("ReduceMin", [input_name], 509 | [reduce_min_name + ":0"], reduce_min_name, keepdims=0) 510 | nodes_list.append(reduce_min_node) 511 | 512 | reduce_max_name = input_name + "_ReduceMax" 513 | reduce_max_node = onnx.helper.make_node("ReduceMax", [input_name], 514 | [reduce_max_name + ":0"], reduce_max_name, keepdims=0) 515 | nodes_list.append(reduce_max_node) 516 | 517 | # Add tensors for quantize range and zero value. 518 | _add_initializer_if_not_present(self.model.graph, self.fixed_qrange_uint8_name, 519 | [_get_qrange_for_qType(qType)], [], onnx_proto.TensorProto.FLOAT) 520 | _add_initializer_if_not_present(self.model.graph, self.fixed_zero_name, 521 | [0.0], [], onnx_proto.TensorProto.FLOAT) 522 | 523 | # Compute Scale 524 | # Subtract rmax and rmin 525 | scale_sub_name = input_name + "_scale_Sub" 526 | scale_sub_node = onnx.helper.make_node("Sub", [reduce_max_node.output[0], reduce_min_node.output[0]], 527 | [scale_sub_name + ":0"], scale_sub_name) 528 | nodes_list.append(scale_sub_node) 529 | # and divide by quantize range 530 | scale_div_name = input_name + "_scale_Div" 531 | scale_div_node = onnx.helper.make_node("Div", [scale_sub_node.output[0], self.fixed_qrange_uint8_name], 532 | [input_scale_name], scale_div_name) 533 | nodes_list.append(scale_div_node) 534 | 535 | # Compute zero point 536 | # Subtract zero and rmin 537 | zp_sub_name = input_name + "_zero_point_Sub" 538 | zp_sub_node = onnx.helper.make_node("Sub", [self.fixed_zero_name, reduce_min_node.output[0]], 539 | [zp_sub_name + ":0"], zp_sub_name) 540 | nodes_list.append(zp_sub_node) 541 | # Divide by scale 542 | zp_div_name = input_name + "_zero_point_Div" 543 | zp_div_node = onnx.helper.make_node("Div", [zp_sub_node.output[0], input_scale_name], 544 | [zp_div_name + ":0"], zp_div_name) 545 | nodes_list.append(zp_div_node) 546 | # Compute floor 547 | zp_floor_name = input_name + "_zero_point_Floor" 548 | zp_floor_node = onnx.helper.make_node("Floor", zp_div_node.output, 549 | [zp_floor_name + ":0"], zp_floor_name) 550 | nodes_list.append(zp_floor_node) 551 | # Cast to integer 552 | zp_cast_name = input_name + "_zero_point_Cast" 553 | zp_cast_node = onnx.helper.make_node("Cast", zp_floor_node.output, 554 | [input_zp_name], zp_cast_name, to=qType) 555 | nodes_list.append(zp_cast_node) 556 | 557 | return input_scale_name, input_zp_name, [], [] 558 | 559 | def _get_quantization_params(self, param_name): 560 | ''' 561 | Create initializers and inputs in the graph for zero point and scale of output. 562 | Zero point and scale values are obtained from self.quantization_params if specified. 563 | parameter output_name: Name of the output. 564 | return: scale_name, zero_point_name, scale_shape, zero_point_shape. 565 | ''' 566 | if self.quantization_params is None or param_name not in self.quantization_params: 567 | return False, "", "", "", "" 568 | params = self.quantization_params[param_name] 569 | if params is None or len(params) != 2: 570 | raise ValueError("Quantization parameters should contain zero point and scale. " 571 | "Specified values for output {}: {}".format(output_name, params)) 572 | 573 | if not np.isscalar(params[0]): 574 | raise ValueError("Zero point for output {} should be a scalar value. 
Value specified: {}".format( 575 | output_name, params[0])) 576 | if not np.isscalar(params[1]): 577 | raise ValueError("Scale for output {} should be a scalar value. Value specified: {}".format( 578 | output_name, params[1])) 579 | 580 | zero_point_values = [params[0].item()] 581 | zero_point_shape = [] 582 | zero_point_name = param_name + "_zero_point" 583 | zero_point_type = onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[params[0].dtype] 584 | 585 | scale_values = [params[1].item()] 586 | scale_shape = [] 587 | scale_name = param_name + "_scale" 588 | 589 | # Add initializers 590 | _add_initializer_if_not_present(self.model.graph, zero_point_name, zero_point_values, zero_point_shape, 591 | zero_point_type) 592 | _add_initializer_if_not_present(self.model.graph, scale_name, scale_values, scale_shape, 593 | onnx_proto.TensorProto.FLOAT) 594 | 595 | return True, scale_name, zero_point_name, scale_shape, zero_point_shape 596 | 597 | def _get_quantize_input_nodes(self, node, input_index, qType): 598 | ''' 599 | Given a input for a node (which is not a initializer), this function 600 | - add elements to graph to compute zero point and scale for this input. 601 | - add new QuantizeLinear nodes to quantize the input. 602 | parameter node: node being quantized in NodeProto format. 603 | parameter input_index: index of input in node.input. 604 | parameter qType: type to quantize to. 605 | return: List of newly created nodes in NodeProto format. 606 | ''' 607 | input_name = node.input[input_index] 608 | output_name = input_name + "_quantized" 609 | 610 | data_found, scale_name, zp_name, scale_shape, zp_shape = \ 611 | self._get_quantization_params(input_name) 612 | 613 | if self.static: 614 | if data_found == False: 615 | raise ValueError("Quantization parameters are not specified for param {}." 616 | "In static mode quantization params for inputs and outputs of odes to be quantized are required.".format(input_name)) 617 | 618 | qlinear_node = onnx.helper.make_node("QuantizeLinear", [input_name, scale_name, zp_name], 619 | [output_name], input_name + "_QuantizeLinear") 620 | 621 | return [qlinear_node] 622 | 623 | else: 624 | if data_found == True: 625 | qlinear_node = onnx.helper.make_node("QuantizeLinear", [input_name, scale_name, zp_name], 626 | [output_name], input_name + "_QuantizeLinear") 627 | else: 628 | # Scale and Zero Points not available for this input. 
Add nodes to dynamically compute it 629 | if self.fuse_dynamic_quant and qType == onnx_proto.TensorProto.UINT8: 630 | scale_name = input_name + "_scale" 631 | zeropoint_name = input_name + "_zero_point" 632 | qlinear_node = onnx.helper.make_node("DynamicQuantizeLinear", [input_name], 633 | [output_name, scale_name, zeropoint_name], input_name + "_QuantizeLinear") 634 | return [qlinear_node] 635 | 636 | else: 637 | nodes = [] 638 | scale_name, zp_name, scale_shape, zp_shape = \ 639 | self._get_dynamic_input_quantization_params(input_name, nodes, qType) 640 | qlinear_node = onnx.helper.make_node("QuantizeLinear", [input_name, scale_name, zp_name], 641 | [output_name], input_name + "_QuantizeLinear") 642 | 643 | return nodes + [qlinear_node] 644 | 645 | def _get_bias_add_nodes(self, nodes, node, last_output, quantized_bias_name): 646 | ''' 647 | Given a node, this function handles bias add by adding a "reshape" node on bias and an "add" node 648 | parameter nodes: new nodes would be appended into nodes 649 | parameter node: current node (Conv) 650 | parameter last_output: output of previous node (input to bias add) 651 | return: the name of output 652 | ''' 653 | # Add an Add operation for bias 654 | # Add reshape for correct broadcase 655 | reshape_input = [quantized_bias_name] 656 | 657 | # Add tensors for the shape to be reshaped to 658 | _add_initializer_if_not_present(self.model.graph, "reshape_shape", 659 | [1,-1,1,1], [4], onnx_proto.TensorProto.INT64) 660 | reshape_input.append('reshape_shape') 661 | reshape_op_output = node.output[0] + "_reshape" 662 | reshape_node = onnx.helper.make_node("Reshape", reshape_input, [reshape_op_output], 663 | quantized_bias_name+"reshape") 664 | nodes.append(reshape_node) 665 | 666 | bias_add_input = [last_output] 667 | bias_add_input.append(reshape_op_output) 668 | add_node_output = node.output[0] + "_bias_add" 669 | add_node = onnx.helper.make_node("Add", bias_add_input, [add_node_output], 670 | quantized_bias_name + "bias_add") 671 | nodes.append(add_node) 672 | return add_node_output 673 | 674 | def _update_unsupported_nodes_using_weight(self, weight, new_nodes_list): 675 | '''Find all nodes using a weight that do not support quantization and 676 | add a DequantizeLinear node before those nodes. This includes all nodes except Conv, MatMul. 677 | parameter weight: Weight object 678 | parameter new_nodes_list: List of new nodes created before processing current node. 679 | return: List of new nodes created. 680 | ''' 681 | nodes_using_weight = _find_nodes_using_initializer(self.model.graph, weight.initializer) 682 | unsupported_nodes = [node for node in nodes_using_weight if node.op_type not in ["Conv", "MatMul", "Gather"]] 683 | 684 | nodes_list = [] 685 | dequantize_linear_name = weight.name + "_DequantizeLinear" 686 | output_name = weight.name + "_dequantized" 687 | 688 | # Check if DequantizeLinear node needs to be added to graph. 689 | if len(unsupported_nodes) != 0 and \ 690 | _find_node_by_name(dequantize_linear_name, self.model.graph, new_nodes_list) is None: 691 | inputs = [weight.name + "_quantized", weight.name + "_scale", weight.name + "_zero_point"] 692 | node = onnx.helper.make_node("DequantizeLinear", inputs, [output_name], 693 | dequantize_linear_name) 694 | nodes_list.append(node) 695 | 696 | # Update unsupported nodes to take dequantized weight as input. 
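# i.e. every consumer other than Conv/MatMul/Gather is rewired to read the
# "<weight name>_dequantized" output of DequantizeLinear, because the original
# float initializer is removed from the graph later by _remove_quantized_weights().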
697 | for node in unsupported_nodes: 698 | for i, node_input in enumerate(node.input): 699 | if node_input == weight.name: 700 | node.input[i] = output_name 701 | 702 | return nodes_list 703 | 704 | def _dynamic_quantize_bias(self, input_name, weight_scale_name, bias_name, quantized_bias_name, new_node_list): 705 | ''' 706 | Adds series of nodes required to quantize the bias dynamically. 707 | parameter input_name: Input name 708 | parameter weight_scale_name: Weight scale. 709 | parameter bias_scale_name: Bias to quantize. 710 | parameter quantied_bias_name: Output name to use for quantized bias. 711 | ''' 712 | qType = onnx_proto.TensorProto.INT32 713 | 714 | input_scale_name = input_name + "_scale" 715 | bias_scale_node = onnx.helper.make_node("Mul", [input_scale_name, weight_scale_name], [bias_name + "_scale"], bias_name + "_scale_node") 716 | new_node_list.append(bias_scale_node) 717 | 718 | quantize_bias_node = onnx.helper.make_node("Div", [bias_name, bias_scale_node.output[0]], 719 | [bias_name + "_tmp_quant:0"], bias_name + "_tmp_qaunt") 720 | new_node_list.append(quantize_bias_node) 721 | 722 | bias_rounded_node = onnx.helper.make_node("Floor", quantize_bias_node.output, 723 | [bias_name + "_quant_rounded:0"], bias_name + "_quant_rounded") 724 | new_node_list.append(bias_rounded_node) 725 | 726 | bias_cast_node = onnx.helper.make_node("Cast", bias_rounded_node.output, 727 | [quantized_bias_name], quantized_bias_name + "_node", to=qType) 728 | new_node_list.append(bias_cast_node) 729 | 730 | return 731 | 732 | 733 | def _quantize_bias(self, node, new_node_list): 734 | ''' 735 | Quantized the bias. Zero Point == 0 and Scale == Input_Scale * Weight_Scale 736 | ''' 737 | 738 | # get scale for weight 739 | weight_scale_name = self.quantized_value_map[node.input[1]].scale_name 740 | weight_initializer = _find_by_name(weight_scale_name, self.model.graph.initializer) 741 | weight_scale = self.find_weight_data(weight_initializer) 742 | 743 | # get bias 744 | bias_name = node.input[2] 745 | bias_initializer = _find_by_name(bias_name, self.model.graph.initializer) 746 | bias_data = self.find_weight_data(bias_initializer) 747 | quantized_bias_name = bias_name + "_quantized" 748 | 749 | # input scale is not provided and this input is dynamically quantized so it is not pre-computed at this point 750 | # so resort to dynamic quantization for bias 751 | if self.quantization_params is None or node.input[0] not in self.quantization_params and node.input[0] not in self.quantized_value_map: 752 | self._dynamic_quantize_bias(node.input[0], weight_scale_name, bias_name, quantized_bias_name, new_node_list) 753 | else: 754 | # get scale for input 755 | input_scale_name = self.quantized_value_map[node.input[0]].scale_name 756 | inputscale_initializer = _find_by_name(input_scale_name, self.model.graph.initializer) 757 | input_scale = self.find_weight_data(inputscale_initializer) 758 | 759 | # calcuate scale for bias 760 | bias_scale_name = node.input[2] + "_scale" 761 | bias_scale = input_scale * weight_scale 762 | print(bias_scale) 763 | 764 | # quantize bias 765 | quantized_data = (np.asarray(bias_data) / bias_scale).round().astype(np.int32) 766 | print(quantized_data) 767 | 768 | #update bias initializer 769 | bias_np_data = np.asarray(quantized_data, dtype=np.int32).reshape(bias_initializer.dims) 770 | packed_bias_initializer = onnx.numpy_helper.from_array(bias_np_data, quantized_bias_name) 771 | self.model.graph.initializer.extend([packed_bias_initializer]) 772 | 773 | bias_value_info = 
onnx.helper.make_tensor_value_info(quantized_bias_name, onnx_proto.TensorProto.INT32, bias_initializer.dims) 774 | self.model.graph.input.extend([bias_value_info]) 775 | 776 | # log entries for this quantized bias value 777 | quantized_bias_entry = QuantizedInitializer(bias_name, bias_initializer, [0], [0], [0], [bias_scale], 778 | bias_data, quantized_data, qType=onnx_proto.TensorProto.INT32) 779 | self._quantized_weights.append(quantized_bias_entry) 780 | 781 | assert(bias_name not in self.quantized_value_map) 782 | quantized_value = QuantizedValue(bias_name, quantized_bias_name, "", "", QuantizedValueType.Initializer, None, onnx_proto.TensorProto.INT32) 783 | self.quantized_value_map[bias_name] = quantized_value 784 | 785 | return quantized_bias_name 786 | 787 | 788 | def _quantize_inputs(self, node, indices, new_nodes_list): 789 | ''' 790 | Given a node, this function quantizes the inputs as follows: 791 | - If input is a initializer, quantize the initializer data, replace old initializer 792 | with new initializer 793 | - Else, add QuantizeLinear nodes to perform quantization 794 | parameter node: node being quantized in NodeProto format. 795 | parameter indices: input indices to quantize. 796 | parameter new_nodes_list: List of new nodes created before processing this node. This is used to 797 | check that two QuantizeLinear nodes are not being added for same input. 798 | return: (List of quantized input names, 799 | List of zero point names used for input quantization, 800 | List of scale names used for input quantization, 801 | List of new QuantizeLinear nodes created) 802 | ''' 803 | assert (node.op_type == "Conv" or node.op_type == "MatMul" or node.op_type == "Gather") 804 | 805 | quantized_input_names = [] 806 | zero_point_names = [] 807 | scale_names = [] 808 | nodes = [] 809 | 810 | for input_index in indices: 811 | node_input = node.input[input_index] 812 | 813 | # Find if this input is already quantized 814 | if node_input in self.quantized_value_map: 815 | quantized_value = self.quantized_value_map[node_input] 816 | qType = self.weight_qType if quantized_value.value_type == QuantizedValueType.Initializer else self.input_qType 817 | if quantized_value.qType != qType: 818 | raise ValueError("{} is being used by multiple nodes which are being quantized to different types. " 819 | "This is not suported.", node_input) 820 | 821 | quantized_input_names.append(quantized_value.q_name) 822 | scale_names.append(quantized_value.scale_name) 823 | zero_point_names.append(quantized_value.zp_name) 824 | continue 825 | 826 | # Quantize the input 827 | initializer = _find_by_name(node_input, self.model.graph.initializer) 828 | if initializer is not None: 829 | if node.op_type == "Conv": 830 | weight = self._get_quantized_weight_convolution(initializer, self.weight_qType) 831 | else: 832 | weight = self._get_quantized_weight(initializer, self.weight_qType) 833 | 834 | # Update graph 835 | nodes.extend(self._update_unsupported_nodes_using_weight(weight, new_nodes_list)) 836 | self._update_graph(weight) 837 | 838 | quantized_input_names.append(weight.name + "_quantized") 839 | zero_point_names.append(weight.name + "_zero_point") 840 | scale_names.append(weight.name + "_scale") 841 | else: 842 | # Add QuantizeLinear node. 
843 | qlinear_node = _find_node_by_name(node_input + "_QuantizeLinear", self.model.graph, new_nodes_list) 844 | if qlinear_node is None: 845 | quantize_input_nodes = self._get_quantize_input_nodes(node, input_index, self.input_qType) 846 | nodes.extend(quantize_input_nodes) 847 | qlinear_node = quantize_input_nodes[-1] 848 | 849 | if qlinear_node.op_type == "QuantizeLinear": 850 | quantized_input_names.extend(qlinear_node.output) 851 | scale_names.append(qlinear_node.input[1]) 852 | zero_point_names.append(qlinear_node.input[2]) 853 | else: 854 | quantized_input_names.append(qlinear_node.output[0]) 855 | scale_names.append(qlinear_node.output[1]) 856 | zero_point_names.append(qlinear_node.output[2]) 857 | 858 | 859 | return (quantized_input_names, zero_point_names, scale_names, nodes) 860 | 861 | def _handle_other_ops(self, node, new_nodes_list): 862 | ''' 863 | Given a node which does not support quantization(Conv, Matmul, Gather), this method 864 | checks whether the input to this node is quantized and adds a DequantizeLinear node 865 | to dequantize this input back to FP32 866 | parameter node: Current node 867 | parameter new_nodes_list: List of new nodes created before processing current node 868 | return: List of new nodes created 869 | ''' 870 | nodes = [] 871 | for index, node_input in enumerate(node.input): 872 | if node_input in self.quantized_value_map: 873 | node_input_altered = True 874 | input_name = node.input[index] 875 | quantized_value = self.quantized_value_map[input_name] 876 | # Add DequantizeLinear Node for this input 877 | dqlinear_name = input_name + "_DequantizeLinear" 878 | dqlinear_node = _find_node_by_name(dqlinear_name, self.model.graph, new_nodes_list) 879 | if dqlinear_node is None: 880 | dqlinear_inputs = [quantized_value.q_name, quantized_value.scale_name, quantized_value.zp_name] 881 | dequantize_node = onnx.helper.make_node("DequantizeLinear", dqlinear_inputs, [input_name], dqlinear_name) 882 | nodes.append(dequantize_node) 883 | else: 884 | # DQ op is already present, assert it's output matches the input of current node 885 | assert(input_name == dqlinear_node.output[0]) 886 | 887 | # Append the original node 888 | nodes.append(node) 889 | return nodes 890 | 891 | def _handle_activation_ops(self, node, new_node_list): 892 | ''' 893 | Checks whether the give activation op can be removed from the graph. When mode is QLinearOps, 894 | the output quatization params are calculated based on outputs from activation nodes, 895 | therefore these nodes can be removed from the graph if they follow a quantized op. 896 | 897 | parameter node: Current node 898 | parameter new_nodes_list: List of new nodes created before processing current node 899 | return: List of nodes 900 | ''' 901 | assert(node.op_type == "Relu" or node.op_type == 'Clip') 902 | if self.mode is not QuantizationMode.QLinearOps: 903 | return [node] 904 | # When mode is QLinearOps, the output quatization params are calculated based on outputs from 905 | # activation nodes, therefore these nodes can be removed from the graph if they follow a quantized op. 
906 | # If input to this node is not quantized then keep this node 907 | if node.input[0] not in self.quantized_value_map: 908 | return [node] 909 | 910 | # Prepare to remove this node 911 | quantized_value = self.quantized_value_map[node.input[0]] 912 | self.quantized_value_map[node.output[0]] = quantized_value 913 | 914 | return [] 915 | 916 | def _quantize_gather_ops(self, node, new_nodes_list): 917 | assert (node.op_type == "Gather") 918 | (quantized_input_names, zero_point_names, scale_names, nodes) = \ 919 | self._quantize_inputs(node, [0], new_nodes_list) 920 | 921 | gather_new_output = node.output[0] + "_quantized" 922 | 923 | # Create an entry for this quantized value 924 | q_output = QuantizedValue(node.output[0], gather_new_output, scale_names[0], zero_point_names[0], QuantizedValueType.Input) 925 | self.quantized_value_map[node.output[0]] = q_output 926 | 927 | gather_original_output = node.output[0] 928 | node.output[0] = gather_new_output 929 | node.input[0] = quantized_input_names[0] 930 | nodes.append(node) 931 | 932 | return nodes 933 | 934 | def _quantize_convolution_integer_ops(self, node, new_nodes_list): 935 | ''' 936 | Used when self.mode is QuantizationMode.IntegerOps. 937 | parameter node: Conv node. 938 | parameter new_nodes_list: List of new nodes created before processing this node. 939 | return: a list of nodes in topological order that represents quantized Conv node. 940 | ''' 941 | assert (node.op_type == "Conv") 942 | 943 | (quantized_input_names, zero_point_names, scale_names, nodes) = \ 944 | self._quantize_inputs(node, [0, 1], new_nodes_list) 945 | 946 | # quantize bias if exist 947 | quantized_bias_name = "" 948 | bias_present = False 949 | if len(node.input) == 3: 950 | quantized_bias_name = self._quantize_bias(node, nodes) 951 | bias_present = True 952 | 953 | conv_integer_output = node.output[0] + "_quantized" 954 | conv_integer_name = "" 955 | if node.name != "": 956 | conv_integer_name = node.name + "_quant" 957 | kwargs = {} 958 | for attribute in node.attribute: 959 | kwargs.update(_attribute_to_kwarg(attribute)) 960 | conv_integer_node = onnx.helper.make_node("ConvInteger", quantized_input_names + zero_point_names, 961 | [conv_integer_output], conv_integer_name, **kwargs) 962 | nodes.append(conv_integer_node) 963 | 964 | # Add bias add nodes 965 | if bias_present: 966 | conv_integer_output = self._get_bias_add_nodes(nodes, node, conv_integer_output, quantized_bias_name) 967 | 968 | # Add cast operation to cast convInteger output to float. 969 | cast_op_output = conv_integer_output + "_cast_output" 970 | cast_node = onnx.helper.make_node("Cast", [conv_integer_output], [cast_op_output], 971 | conv_integer_output + "_cast", to=onnx_proto.TensorProto.FLOAT) 972 | nodes.append(cast_node) 973 | 974 | # Add mul operation to multiply scales of two inputs. 975 | assert (len(scale_names) == 2) 976 | if conv_integer_name != "": 977 | scales_mul_op = conv_integer_name + "_scales_mul" 978 | else: 979 | scales_mul_op = scale_names[0] + "_" + scale_names[1] + "_mul" 980 | 981 | scales_mul_node = _find_node_by_name(scales_mul_op, self.model.graph, new_nodes_list) 982 | if scales_mul_node is None: 983 | scales_mul_node = _get_mul_node(scale_names, scales_mul_op + ":0", scales_mul_op) 984 | nodes.append(scales_mul_node) 985 | 986 | scales_mul_op_output = scales_mul_node.output[0] 987 | 988 | # Add mul operation to multiply mul_scales_op result with output of ConvInteger 989 | # and make the output of this node the same as output of original conv node. 
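# In real arithmetic this reproduces the float convolution: each quantized tensor
# satisfies r = scale * (q - zero_point), so
# Conv(x, w) ~= x_scale * w_scale * ConvInteger(x_q, w_q, x_zero_point, w_zero_point).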
990 | output_scale_mul_op = "" 991 | if conv_integer_name != "": 992 | output_scale_mul_op = conv_integer_name + "_output_scale_mul" 993 | nodes.append(_get_mul_node([cast_op_output, scales_mul_op_output], node.output[0], output_scale_mul_op)) 994 | 995 | return nodes 996 | 997 | def _quantize_matmul_integer_ops(self, node, new_nodes_list): 998 | ''' 999 | Used when self.mode is QuantizationMode.IntegerOps. 1000 | parameter node: MatMul node. 1001 | parameter new_nodes_list: List of new nodes created before processing this node. 1002 | return: a list of nodes in topological order that represents quantized MatMul node. 1003 | ''' 1004 | assert (node.op_type == "MatMul") 1005 | 1006 | (quantized_input_names, zero_point_names, scale_names, nodes) = \ 1007 | self._quantize_inputs(node, [0, 1], new_nodes_list) 1008 | 1009 | matmul_integer_output = node.output[0] + "_quantized" 1010 | matmul_integer_name = "" 1011 | if node.name != "": 1012 | matmul_integer_name = node.name + "_quant" 1013 | matmul_integer_node = onnx.helper.make_node("MatMulInteger", quantized_input_names + zero_point_names, 1014 | [matmul_integer_output], matmul_integer_name) 1015 | nodes.append(matmul_integer_node) 1016 | 1017 | # Add cast operation to cast matmulInteger output to float. 1018 | cast_op_output = matmul_integer_output + "_cast_output" 1019 | cast_node = onnx.helper.make_node("Cast", [matmul_integer_output], [cast_op_output], 1020 | matmul_integer_output + "_cast", to=onnx_proto.TensorProto.FLOAT) 1021 | nodes.append(cast_node) 1022 | 1023 | # Add mul operation to multiply scales of two inputs. 1024 | assert (len(scale_names) == 2) 1025 | if matmul_integer_name != "": 1026 | scales_mul_op = matmul_integer_name + "_scales_mul" 1027 | else: 1028 | scales_mul_op = scale_names[0] + "_" + scale_names[1] + "_mul" 1029 | 1030 | scales_mul_node = _find_node_by_name(scales_mul_op, self.model.graph, new_nodes_list) 1031 | if scales_mul_node is None: 1032 | scales_mul_node = _get_mul_node(scale_names, scales_mul_op + ":0", scales_mul_op) 1033 | nodes.append(scales_mul_node) 1034 | 1035 | scales_mul_op_output = scales_mul_node.output[0] 1036 | 1037 | # Add mul operation to multiply mul_scales_op result with output of MatMulInteger 1038 | # and make the output of this node the same as output of original matmul node. 1039 | output_scale_mul_op = "" 1040 | if matmul_integer_name != "": 1041 | output_scale_mul_op = matmul_integer_name + "_output_scale_mul" 1042 | nodes.append(_get_mul_node([cast_op_output, scales_mul_op_output], node.output[0], 1043 | output_scale_mul_op)) 1044 | return nodes 1045 | 1046 | def _quantize_convolution_qlinear_ops(self, node, new_nodes_list): 1047 | ''' 1048 | Used when self.mode is QuantizationMode.QLinearOps. 1049 | parameter node: Conv node. 1050 | parameter new_nodes_list: List of new nodes created before processing this node. 1051 | return: a list of nodes in topological order that represents quantized Conv node. 
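Note: this path needs static quantization parameters for the Conv output;
_get_quantization_params(node.output[0]) must find an entry in quantization_params
(see the assert on data_found below).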
1052 | ''' 1053 | assert (node.op_type == "Conv") 1054 | 1055 | (quantized_input_names, zero_point_names, scale_names, nodes) = \ 1056 | self._quantize_inputs(node, [0, 1], new_nodes_list) 1057 | 1058 | quantized_bias_name = "" 1059 | bias_present = False 1060 | if len(node.input) == 3: 1061 | quantized_bias_name = self._quantize_bias(node, nodes) 1062 | bias_present = True 1063 | data_found, output_scale_name, output_zp_name, output_scale_shape, output_zp_shape = \ 1064 | self._get_quantization_params(node.output[0]) 1065 | 1066 | assert(data_found) 1067 | 1068 | qlinear_conv_output = node.output[0] + "_quantized" 1069 | qlinear_conv_name = "" 1070 | if node.name != "": 1071 | qlinear_conv_name = node.name + "_quant" 1072 | kwargs = {} 1073 | for attribute in node.attribute: 1074 | kwargs.update(_attribute_to_kwarg(attribute)) 1075 | qlinear_conv_inputs = [] 1076 | # Input 0 1077 | qlinear_conv_inputs.append(quantized_input_names[0]) 1078 | qlinear_conv_inputs.append(scale_names[0]) 1079 | qlinear_conv_inputs.append(zero_point_names[0]) 1080 | # Input 1 1081 | qlinear_conv_inputs.append(quantized_input_names[1]) 1082 | qlinear_conv_inputs.append(scale_names[1]) 1083 | qlinear_conv_inputs.append(zero_point_names[1]) 1084 | 1085 | # Output 1086 | qlinear_conv_inputs.append(output_scale_name) 1087 | qlinear_conv_inputs.append(output_zp_name) 1088 | 1089 | if bias_present: 1090 | qlinear_conv_inputs.append(quantized_bias_name) 1091 | 1092 | qlinear_conv_node = onnx.helper.make_node("QLinearConv", qlinear_conv_inputs, 1093 | [qlinear_conv_output], qlinear_conv_name, **kwargs) 1094 | nodes.append(qlinear_conv_node) 1095 | 1096 | # Create an entry for this quantized value 1097 | q_output = QuantizedValue(node.output[0], qlinear_conv_output, output_scale_name, output_zp_name, QuantizedValueType.Input) 1098 | self.quantized_value_map[node.output[0]] = q_output 1099 | 1100 | return nodes 1101 | 1102 | def _quantize_matmul_qlinear_ops(self, node, new_nodes_list): 1103 | ''' 1104 | Used when self.mode is QuantizationMode.QLinearOps. 1105 | parameter node: MatMul node. 1106 | parameter new_nodes_list: List of new nodes created before processing this node. 1107 | return: a list of nodes in topological order that represents quantized Conv node. 
1108 | ''' 1109 | assert (node.op_type == "MatMul") 1110 | 1111 | (quantized_input_names, zero_point_names, scale_names, nodes) = \ 1112 | self._quantize_inputs(node, [0, 1], new_nodes_list) 1113 | 1114 | data_found, output_scale_name, output_zp_name, output_scale_shape, output_zp_shape = \ 1115 | self._get_quantization_params(node.output[0]) 1116 | 1117 | assert(data_found) 1118 | 1119 | qlinear_matmul_output = node.output[0] + "_quantized" 1120 | qlinear_matmul_name = "" 1121 | if node.name != "": 1122 | qlinear_matmul_name = node.name + "_quant" 1123 | 1124 | qlinear_matmul_inputs = [] 1125 | # Input 0 1126 | qlinear_matmul_inputs.append(quantized_input_names[0]) 1127 | qlinear_matmul_inputs.append(scale_names[0]) 1128 | qlinear_matmul_inputs.append(zero_point_names[0]) 1129 | # Input 1 1130 | qlinear_matmul_inputs.append(quantized_input_names[1]) 1131 | qlinear_matmul_inputs.append(scale_names[1]) 1132 | qlinear_matmul_inputs.append(zero_point_names[1]) 1133 | # Output 1134 | qlinear_matmul_inputs.append(output_scale_name) 1135 | qlinear_matmul_inputs.append(output_zp_name) 1136 | 1137 | qlinear_matmul_node = onnx.helper.make_node("QLinearMatMul", qlinear_matmul_inputs, 1138 | [qlinear_matmul_output], qlinear_matmul_name) 1139 | nodes.append(qlinear_matmul_node) 1140 | 1141 | # Create an entry for this quantized value 1142 | q_output = QuantizedValue(node.output[0], qlinear_matmul_output, output_scale_name, output_zp_name, QuantizedValueType.Input) 1143 | self.quantized_value_map[node.output[0]] = q_output 1144 | 1145 | return nodes 1146 | 1147 | def _quantize_convolution(self, node, new_nodes_list): 1148 | ''' 1149 | https://github.com/onnx/onnx/blob/master/docs/Operators.md#Conv 1150 | :param node: Conv node 1151 | :param new_nodes_list: List of new nodes created before processing this node. 1152 | :return: a list of nodes in topological order that represents quantized Conv node 1153 | ''' 1154 | assert (node.op_type == "Conv") 1155 | 1156 | if self.mode == QuantizationMode.IntegerOps: 1157 | return self._quantize_convolution_integer_ops(node, new_nodes_list) 1158 | 1159 | if self.mode == QuantizationMode.QLinearOps: 1160 | return self._quantize_convolution_qlinear_ops(node, new_nodes_list) 1161 | 1162 | return [node] 1163 | 1164 | def _quantize_matmul(self, node, new_nodes_list): 1165 | ''' 1166 | https://github.com/onnx/onnx/blob/master/docs/Operators.md#MatMul 1167 | :param node: MatMul node 1168 | :param new_nodes_list: List of new nodes created before processing this node. 1169 | :return: a list of nodes in topological order that represents quantized MatMul node 1170 | ''' 1171 | assert(node.op_type == 'MatMul') 1172 | 1173 | if self.mode == QuantizationMode.IntegerOps: 1174 | return self._quantize_matmul_integer_ops(node, new_nodes_list) 1175 | 1176 | if self.mode == QuantizationMode.QLinearOps: 1177 | return self._quantize_matmul_qlinear_ops(node, new_nodes_list) 1178 | 1179 | return [node] 1180 | 1181 | def check_opset_version(org_model, force_fusions): 1182 | ''' 1183 | Check opset version of original model and set opset version and fuse_dynamic_quant accordingly. 1184 | If opset version < 10, set quantized model opset version to 10. 1185 | If opset version == 10, do quantization without using dynamicQuantizeLinear operator. 1186 | If opset version == 11, do quantization using dynamicQuantizeLinear operator. 1187 | :return: fuse_dynamic_quant boolean value. 
def check_opset_version(org_model, force_fusions):
    '''
    Check the opset version of the original model and set the quantized model opset version
    and fuse_dynamic_quant accordingly.
    If opset version < 10, set the quantized model opset version to 10.
    If opset version == 10, quantize without using the DynamicQuantizeLinear operator.
    If opset version >= 11, quantize using the DynamicQuantizeLinear operator.
    :return: fuse_dynamic_quant boolean value.
    '''
    global onnx_op_set_version
    opset_version = org_model.opset_import[0].version
    fuse_dynamic_quant = False

    if opset_version < 11 and force_fusions:
        print("Warning: The original model opset version is {}, which does not support node fusions.\n\
            Forcing fusions can break other nodes in the model.".format(opset_version))
        fuse_dynamic_quant = True

    if opset_version < 10:
        print("Warning: The original model opset version is {}, which does not support quantized operators.\n\
            The opset version of the quantized model will be set to 10. Use the onnx model checker to verify the model after quantization.".format(opset_version))
        onnx_op_set_version = 10
    elif opset_version == 10:
        onnx_op_set_version = 10
    else:
        # opset >= 11: target opset 11 quantized ops and enable the DynamicQuantizeLinear path
        onnx_op_set_version = 11
        fuse_dynamic_quant = True
    return fuse_dynamic_quant
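
# A quick way to see which branch check_opset_version will take for a given model
# (illustrative sketch; the printed value depends on how the model was exported):
#
#   import onnx
#   m = onnx.load('mobilenet.onnx')
#   print(m.opset_import[0].version)  # e.g. 9 -> the quantized model is rewritten to opset 10
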
def quantize(model, per_channel=False, nbits=8, quantization_mode=QuantizationMode.IntegerOps,
             static=False, force_fusions=False, asymmetric_input_types=False,
             quantization_params=None, nodes_to_quantize=None):
    '''
    Given an onnx model, create and return a quantized onnx model.
    :param model: ModelProto to quantize
    :param per_channel: quantize weights per channel
    :param nbits: number of bits to represent quantized data. Currently only supporting 8-bit types
    :param quantization_mode: Can be one of the QuantizationMode types.
        IntegerOps:
            the function will use integer ops. Only ConvInteger and MatMulInteger ops are supported now.
        QLinearOps:
            the function will use QLinear ops. Only QLinearConv and QLinearMatMul ops are supported now.
    :param static:
        True: The inputs/activations are quantized using static scale and zero point values
              specified through quantization_params.
        False: The inputs/activations are quantized using dynamic scale and zero point values
               computed while running the model.
    :param force_fusions:
        True: Fuses nodes added for dynamic quantization.
        False: No fusion is applied for nodes which are added for dynamic quantization.
        Should only be used in cases where backends want to apply special fusion routines.
    :param asymmetric_input_types:
        True: Weights are quantized into signed integers and inputs/activations into unsigned integers.
        False: Weights and inputs/activations are quantized into unsigned integers.
    :param quantization_params:
        Dictionary to specify the zero point and scale values for inputs to conv and matmul nodes.
        Should be specified when static is set to True.
        The quantization_params should be specified in the following format:
            {
                "input_name": [zero_point, scale]
            }
        zero_point should be of type np.uint8 and scale should be of type np.float32.
        example:
            {
                'resnet_model/Relu_1:0': [np.uint8(0), np.float32(0.019539741799235344)],
                'resnet_model/Relu_2:0': [np.uint8(0), np.float32(0.011359662748873234)]
            }
    :param nodes_to_quantize:
        List of node names to quantize. When this list is not None, only the nodes in this list
        are quantized.
        example:
            [
                'Conv__224',
                'Conv__252'
            ]
    :return: ModelProto with quantization
    '''
    if nbits == 8:
        input_qType = onnx_proto.TensorProto.UINT8
        weight_qType = onnx_proto.TensorProto.INT8 if asymmetric_input_types else onnx_proto.TensorProto.UINT8
        mode = quantization_mode
        copy_model = onnx_proto.ModelProto()
        copy_model.CopyFrom(model)
        fuse_dynamic_quant = check_opset_version(copy_model, force_fusions)
        quantizer = ONNXQuantizer(copy_model, per_channel, mode, static, fuse_dynamic_quant, weight_qType, input_qType,
                                  quantization_params, nodes_to_quantize)
        quantizer.quantize_model()
        quantizer.model.producer_name = __producer__
        quantizer.model.producer_version = __version__
        return quantizer.model
    else:
        raise ValueError('Unknown value for nbits. Only 8-bit quantization is currently supported.')
--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
import onnx
from quantize import quantize, QuantizationMode

# Load the onnx model
model = onnx.load('mobilenet.onnx')
# Quantize the model (dynamic quantization with integer ops)
quantized_model = quantize(model, quantization_mode=QuantizationMode.IntegerOps)
# Save the quantized model
onnx.save(quantized_model, 'mobilenet_q.onnx')
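
# A minimal sketch of static quantization with QLinear ops, left commented out because
# quantization_params must map the activation tensor names of your own model to
# [zero_point, scale]; 'input_tensor' and 'conv_output' below are placeholder names,
# not real tensors from mobilenet.onnx.
#
# import numpy as np
# static_model = quantize(model,
#                         quantization_mode=QuantizationMode.QLinearOps,
#                         static=True,
#                         quantization_params={
#                             'input_tensor': [np.uint8(128), np.float32(0.02)],
#                             'conv_output': [np.uint8(0), np.float32(0.05)],
#                         })
# onnx.save(static_model, 'mobilenet_q_static.onnx')
--------------------------------------------------------------------------------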