├── README.md
├── inference.py
├── mobilenet.onnx
├── mobilenet_q.onnx
├── pkl_reader.py
├── quantize.py
└── test.py


/README.md:
--------------------------------------------------------------------------------
1 | ## onnx_quantization
2 | ONNX model quantization to int8
3 | ### Convert an fp32 ONNX model to an int8 model
4 | `python3.5 test.py`
5 | 
6 | ### Run inference to compare the accuracy and runtime of the fp32 and int8 models
7 | `python3.5 inference.py`
8 | 
--------------------------------------------------------------------------------
/inference.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | __author__ = 'taobiaoli'
3 | 
4 | import onnxruntime as rt
5 | import numpy as np
6 | import cv2
7 | import onnxruntime.backend as backend
8 | from onnx import load
9 | import onnx
10 | from pkl_reader import DataGenerator
11 | import timeit
12 | 
13 | 
14 | # Inference runs through an onnxruntime InferenceSession (tested with Python 3.5.2).
15 | 
16 | 
17 | def top5_acc(pred, k=5):
18 |     # Return the indices of the k largest scores.
19 |     scores = list(np.asarray(pred).flatten())
20 |     results = []
21 |     for _ in range(k):
22 |         idx = scores.index(max(scores))
23 |         results.append(idx)
24 |         scores[idx] = float('-inf')  # exclude this index from the next pass
25 |     return results
26 | 
27 | 
28 | def inference(model_path, data_path):
29 |     sess = rt.InferenceSession(model_path)
30 |     input_name = sess.get_inputs()[0].name
31 |     output_name = sess.get_outputs()[0].name
32 |     acc_top1 = 0
33 |     acc_top5 = 0
34 | 
35 |     # Single-image sanity check. The evaluation path below feeds NHWC batches
36 |     # from DataGenerator, so keep the image in HWC order and only add a batch axis.
37 |     img = cv2.imread('ILSVRC2012_val_00049517.JPEG')
38 |     img = cv2.resize(img, (224, 224))
39 |     img = img.astype('float32') / 255
40 |     img = img.reshape(1, 224, 224, 3)
41 |     print(img.shape)
42 |     print(img.dtype)
43 | 
44 |     starttime = timeit.default_timer()
45 |     res = sess.run([output_name], {input_name: img})
46 |     endtime = timeit.default_timer()
47 |     print('cost time: ', endtime - starttime)
48 |     print('result:', np.argmax(res))
49 | 
50 |     # Uncomment to measure top-1/top-5 accuracy on the ImageNet validation pickle.
51 |     '''
52 |     dg = DataGenerator(data_path, model='mobilenet', dtype='float32')
53 | 
54 |     for im, label in dg.generator():
55 |         res = sess.run([output_name], {input_name: im})
56 |         if np.argmax(res) == label:
57 |             acc_top1 = acc_top1 + 1
58 |         if label in top5_acc(res):
59 |             acc_top5 = acc_top5 + 1
60 |     print('top1 accuracy: {}'.format(acc_top1 / 50000))
61 |     print('top5 accuracy: {}'.format(acc_top5 / 50000))
62 |     '''
63 |     '''
64 |     input_name = sess.get_inputs()[0].name
65 |     print('input name', input_name)
66 |     #input_shape = sess.get_inputs()[0].shape
67 |     #print('input shape', input_shape)
68 | 
69 |     input_type = sess.get_inputs()[0].type
70 |     print('input type', input_type)
71 | 
72 |     output_name = sess.get_outputs()[0].name
73 |     print('output name', output_name)
74 | 
75 |     # The onnxruntime backend API also works (it needs Python 3.5.2):
76 |     #model = onnx.load('alex_cat_dog.onnx')
77 |     #rep = backend.prepare(model, 'CPU')
78 | 
79 |     # prepare the model input image
80 |     img = cv2.imread('0050.jpg')
81 |     img = cv2.resize(img, (224, 224))
82 |     img = img.astype('float32') / 255
83 |     img = img.reshape(1, 224, 224, 3)
84 |     print(img.shape)
85 |     print(img.dtype)
86 | 
87 |     # run the session on the prepared input
88 |     res = sess.run([output_name], {input_name: img})
89 |     print(res)
90 |     print(np.argmax(res))
91 |     '''
92 | 
93 | 
94 | if __name__ == '__main__':
95 |     inference('./mobilenet.onnx', './data/val224_compressed.pkl')
96 | 
--------------------------------------------------------------------------------
/mobilenet.onnx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/taobiaoli/onnx_quantization/9981df0775d9321d62ff9057538beaffc0ca9f31/mobilenet.onnx -------------------------------------------------------------------------------- /mobilenet_q.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taobiaoli/onnx_quantization/9981df0775d9321d62ff9057538beaffc0ca9f31/mobilenet_q.onnx -------------------------------------------------------------------------------- /pkl_reader.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pickle as pkl 3 | import cv2 4 | import tqdm 5 | 6 | 7 | class DataGenerator(object): 8 | def __init__(self, pkl_file, model='vgg', dtype='float32'): 9 | self.pkl_file = pkl_file 10 | self.model = model 11 | self.dtype = dtype 12 | 13 | def generator(self): 14 | data = self.load_pickle(self.pkl_file) 15 | assert len(data['data']) == 50000, len(data['data']) 16 | assert len(data['target']) == 50000, len(data['target']) 17 | for im, target in tqdm.tqdm(zip(data['data'], data['target']), total=50000): 18 | # for im, target in zip(data['data'], data['target']): 19 | im = self.str2img(im) 20 | if self.model not in ['inception', 'xception', 'mobilenet', 'inception_resnet']: 21 | im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB) 22 | if self.model == 'squeezenet': 23 | im = cv2.resize(im, (227, 227)) 24 | if self.model in ['inception', 'xception', 'inception_resnet']: 25 | im = cv2.resize(im, (299, 299)) 26 | im = self.preprocessing(im, model=self.model) 27 | label = int(target) 28 | yield im, label 29 | 30 | 31 | @staticmethod 32 | def load_pickle(path): 33 | with open(path, 'rb') as f: 34 | v = pkl.load(f) 35 | f.close() 36 | return v 37 | 38 | @staticmethod 39 | def str2img(str_im): 40 | return cv2.imdecode(np.fromstring(str_im, np.uint8), cv2.IMREAD_COLOR) 41 | 42 | @staticmethod 43 | def preprocessing(im, model='vgg', dtype='float32'): 44 | dtype = np.float16 if dtype == 'float16' else np.float32 45 | im = im.astype(dtype) 46 | im = np.expand_dims(im, axis=0) 47 | if model in ['vgg', 'resnet', 'squeezenet']: 48 | im[..., 0] -= 103.939 49 | im[..., 1] -= 116.779 50 | im[..., 2] -= 123.68 51 | elif model in ['inception', 'mobilenet', 'xception', 'inception_resnet']: 52 | im /= 255. 53 | im -= 0.5 54 | im *= 2. 55 | elif model == 'densenet': 56 | im[..., 0] -= 103.939 57 | im[..., 1] -= 116.779 58 | im[..., 2] -= 123.68 59 | im[..., 0] *= 0.017 60 | im[..., 1] *= 0.017 61 | im[..., 2] *= 0.017 62 | else: 63 | pass 64 | return im 65 | -------------------------------------------------------------------------------- /quantize.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. See License.txt in the project root for 4 | # license information. 
5 | # -------------------------------------------------------------------------- 6 | import os 7 | import onnx 8 | import onnx.numpy_helper 9 | import struct 10 | 11 | import numpy as np 12 | from onnx import onnx_pb as onnx_proto 13 | 14 | __producer__ = "onnx.quantize" 15 | __version__ = "0.1.0" 16 | onnx_domain = "ai.onnx" 17 | onnx_op_set_version = 11 18 | 19 | type_to_name = { 20 | 1: "FLOAT", 21 | 2: "UINT8", 22 | 3: "INT8", 23 | 4: "UINT16", 24 | 5: "INT16", 25 | 6: "INT32", 26 | 7: "INT64", 27 | 8: "STRING", 28 | 9: "BOOL", 29 | 10: "FLOAT16", 30 | 11: "DOUBLE", 31 | 12: "UINT32", 32 | 13: "UINT64", 33 | 14: "COMPLEX64", 34 | 15: "COMPLEX128", 35 | } 36 | 37 | # Quantization mode 38 | # IntegerOps: Use IntegerOps in quantized model. Only ConvInteger and MatMulInteger ops are supported now. 39 | # QLinearOps: Use QLinearOps in quantized model. Only QLinearConv and QLinearMatMul ops are supported now. 40 | class QuantizationMode(): 41 | IntegerOps = 0 42 | QLinearOps = 1 43 | 44 | quantization_modes = [getattr(QuantizationMode, attr) for attr in dir(QuantizationMode)\ 45 | if not callable(getattr(QuantizationMode, attr)) and not attr.startswith("__")] 46 | 47 | class QuantizedInitializer: 48 | ''' 49 | Represents a linearly quantized weight input from ONNX operators 50 | ''' 51 | def __init__(self, name, initializer, rmins, rmaxs, zero_points, scales, data=[], quantized_data=[], axis=None, 52 | qType=onnx_proto.TensorProto.UINT8): 53 | self.name = name 54 | self.initializer = initializer # TensorProto initializer in ONNX graph 55 | self.rmins = rmins # List of minimum range for each axis 56 | self.rmaxs = rmaxs # List of maximum range for each axis 57 | self.zero_points = zero_points # 1D tensor of zero points computed for each axis. scalar if axis is empty 58 | self.scales = scales # 1D tensor of scales computed for each axis. scalar if axis is empty 59 | self.data = data # original data from initializer TensorProto 60 | self.quantized_data = quantized_data # weight-packed data from data 61 | self.axis = axis # Scalar to specify which dimension in the initializer to weight pack. 62 | # If empty, single zero point and scales computed from a single rmin and rmax 63 | self.qType = qType # type of quantized data. 64 | 65 | class QuantizedValueType(): 66 | Input = 0 67 | Initializer = 1 68 | 69 | class QuantizedValue: 70 | ''' 71 | Represents a linearly quantized value (input\output\intializer) 72 | ''' 73 | def __init__(self, name, new_quantized_name, scale_name, zero_point_name, quantized_value_type, axis=None, 74 | qType=onnx_proto.TensorProto.UINT8): 75 | self.original_name = name 76 | self.q_name = new_quantized_name 77 | self.scale_name = scale_name 78 | self.zp_name = zero_point_name 79 | self.value_type = quantized_value_type 80 | self.axis = axis 81 | self.qType = qType 82 | 83 | def quantize_data(data, quantize_range, qType): 84 | ''' 85 | :parameter quantize_range: list of data to weight pack. 86 | :parameter qType: data type to quantize to. 
Supported types UINT8 and INT8 87 | :return: minimum, maximum, zero point, scale, and quantized weights 88 | To pack weights, we compute a linear transformation 89 | - when data type == uint8 mode, from [rmin, rmax] -> [0, 2^{b-1}] and 90 | - when data type == int8, from [-m , m] -> [-(2^{b-1}-1), 2^{b-1}-1] where 91 | m = max(abs(rmin), abs(rmax)) 92 | and add necessary intermediate nodes to trasnform quantized weight to full weight using the equation 93 | r = S(q-z), where 94 | r: real original value 95 | q: quantized value 96 | S: scale 97 | z: zero point 98 | ''' 99 | rmin = min(min(data), 0) 100 | rmax = max(max(data), 0) 101 | 102 | if qType == onnx_proto.TensorProto.INT8: 103 | max_range = max(abs(rmin), abs(rmax)) 104 | scale = (float(max_range)*2) / quantize_range 105 | zero_point = 0 106 | quantized_data = (np.asarray(data) / scale).round().astype('b') #signed byte type 107 | elif qType == onnx_proto.TensorProto.UINT8: 108 | scale = (float(rmax) - rmin) / quantize_range if rmin != rmax else 1 109 | zero_point = round((0 - rmin) / scale) # round to nearest integer 110 | quantized_data = ((np.asarray(data) / scale).round() + zero_point).astype('B') # unsigned byte type 111 | else: 112 | raise ValueError("Unexpected data type {} requested. Only INT8 and UINT8 are supported.") 113 | 114 | return rmin, rmax, zero_point, scale, quantized_data 115 | 116 | 117 | def _attribute_to_kwarg(attribute): 118 | ''' 119 | Convert attribute to kwarg format for use with onnx.helper.make_node. 120 | :parameter attribute: attribute in AttributeProto format. 121 | :return: attribute in {key: value} format. 122 | ''' 123 | if (attribute.type == 0): 124 | raise ValueError('attribute {} does not have type specified.'.format(attribute.name)) 125 | 126 | # Based on attribute type definitions from AttributeProto 127 | # definition in https://github.com/onnx/onnx/blob/master/onnx/onnx.proto 128 | if (attribute.type == 1): 129 | value = attribute.f 130 | elif (attribute.type == 2): 131 | value = attribute.i 132 | elif (attribute.type == 3): 133 | value = attribute.s 134 | elif (attribute.type == 4): 135 | value = attribute.t 136 | elif (attribute.type == 5): 137 | value = attribute.g 138 | elif (attribute.type == 6): 139 | value = attribute.floats 140 | elif (attribute.type == 7): 141 | value = attribute.ints 142 | elif (attribute.type == 8): 143 | value = attribute.strings 144 | elif (attribute.type == 9): 145 | value = attribute.tensors 146 | elif (attribute.type == 10): 147 | value = attribute.graphs 148 | else: 149 | raise ValueError('attribute {} has unsupported type {}.'.format(attribute.name, attribute.type)) 150 | 151 | return {attribute.name: value} 152 | 153 | def _find_by_name(item_name, item_list): 154 | ''' 155 | Helper function to find item by name in a list. 156 | parameter item_name: name of the item. 157 | parameter item_list: list of items. 158 | return: item if found. None otherwise. 159 | ''' 160 | items = [item for item in item_list if item.name == item_name] 161 | return items[0] if len(items) > 0 else None 162 | 163 | def _get_mul_node(inputs, output, name): 164 | ''' 165 | Helper function to create a Mul node. 166 | parameter inputs: list of input names. 167 | parameter output: output name. 168 | parameter name: name of the node. 169 | return: Mul node in NodeProto format. 
170 | ''' 171 | return onnx.helper.make_node("Mul", inputs, [output], name) 172 | 173 | def _find_node_by_name(node_name, graph, new_nodes_list): 174 | ''' 175 | Helper function to check if a node exists in a graph or 176 | new set of nodes created during quantization. 177 | parameter node_name: name of the node. 178 | parameter graph: GraphProto. 179 | parameter new_nodes_list: list of nodes added during quantization. 180 | return: NodeProto if found. None otherwise. 181 | ''' 182 | graph_nodes_list = list(graph.node) # deep copy 183 | graph_nodes_list.extend(new_nodes_list) 184 | node = _find_by_name(node_name, graph_nodes_list) 185 | return node 186 | 187 | def _add_initializer_if_not_present(graph, name, value, shape, type): 188 | ''' 189 | Helper function to add an initializer if it is not present in the graph. 190 | parameter graph: GraphProto. 191 | parameter name: Initializer's name. 192 | parameter value: Initializer's value. 193 | parameter shape: Initializer's shape. 194 | parameter type: Initializer's type. 195 | ''' 196 | if _find_by_name(name, graph.initializer) is None: 197 | initializer = onnx.helper.make_tensor(name, type, shape, value) 198 | value_info = onnx.helper.make_tensor_value_info(name, type, shape) 199 | graph.initializer.extend([initializer]) 200 | graph.input.extend([value_info]) 201 | 202 | def _get_qrange_for_qType(qType): 203 | ''' 204 | Helper function to get the quantization range for a type. 205 | parameter qType: quantization type. 206 | return: quantization range. 207 | ''' 208 | if qType == onnx_proto.TensorProto.UINT8: 209 | return 255 # 2^b - 1 210 | elif qType == onnx_proto.TensorProto.INT8: 211 | return 254 # [-(2^{b-1}-1), 2^{b-1}-1]: [-127, 127] for 8 bits. 212 | else: 213 | raise ValueError('unsupported quantization data type') 214 | 215 | def _find_nodes_using_initializer(graph, initializer): 216 | ''' 217 | Helper function to find all nodes with an initializer as a input. 218 | parameter graph: GraphProto. 219 | parameter initializer: Initializer in TensorProto format. 220 | return: List of nodes. 221 | ''' 222 | result = [] 223 | for node in graph.node: 224 | for node_input in node.input: 225 | if node_input == initializer.name: 226 | result.append(node) 227 | return result 228 | 229 | class ONNXQuantizer: 230 | def __init__(self, model, per_channel, mode, static, fuse_dynamic_quant, weight_qType, input_qType, 231 | quantization_params, nodes_to_quantize): 232 | self.model = model 233 | self.per_channel = per_channel # weight-pack per channel 234 | self.mode = mode # QuantizationMode.Value 235 | self.static = static # use static quantization for inputs. 236 | self.fuse_dynamic_quant = fuse_dynamic_quant 237 | self.input_qType = input_qType # quantize input type 238 | self.weight_qType = weight_qType # quantize data type 239 | self.quantization_params = quantization_params 240 | self.nodes_to_quantize = nodes_to_quantize # specific nodes to quantize 241 | 242 | if not self.mode in quantization_modes: 243 | raise ValueError('unsupported quantization mode {}'.format(self.mode)) 244 | 245 | # QuantizeRange tensor name and zero tensor name for scale and zero point calculation. 
246 | # Used when static is False 247 | self.fixed_qrange_uint8_name = "fixed_quantization_range_uint8" 248 | self.fixed_qrange_int8_name = "fixed_quantization_range_int8" 249 | # For uint8 data-type, to compute zero point, we subtract rmin from 0 (represented by fixed_zero_name tensor) 250 | self.fixed_zero_name = "fixed_zero" 251 | # For int8 data-type, zero point is always zero (respresented by fixed_zero_point_name tensor) 252 | self.fixed_zero_zp_name = "fixed_zero_zp" 253 | 254 | # List of quantized weights 255 | self._quantized_weights = [] 256 | # Map of all original value names to quantized value names 257 | self.quantized_value_map = {} 258 | 259 | def quantize_model(self): 260 | # Create a new topologically sorted list for quantizing a model 261 | new_list = [] 262 | for node in self.model.graph.node: 263 | if self.nodes_to_quantize is not None and node.name not in self.nodes_to_quantize: 264 | new_list +=self._handle_other_ops(node, new_list) 265 | else: 266 | if node.op_type == 'Conv': 267 | new_list += self._quantize_convolution(node, new_list) 268 | elif node.op_type == 'MatMul': 269 | new_list += self._quantize_matmul(node, new_list) 270 | elif node.op_type == 'Gather': 271 | new_list += self._quantize_gather_ops(node, new_list) 272 | elif node.op_type == 'Relu' or node.op_type == 'Clip': 273 | new_list +=self._handle_activation_ops(node, new_list) 274 | else: 275 | new_list +=self._handle_other_ops(node, new_list) 276 | 277 | # extend is used to append to the list for a protobuf fields 278 | # https://developers.google.com/protocol-buffers/docs/reference/python-generated?csw=1#fields 279 | self.model.graph.ClearField('node') 280 | self.model.graph.node.extend(new_list) 281 | 282 | # Remove weights which are already quantized from graph. 283 | self._remove_quantized_weights() 284 | 285 | # update opset. 286 | opset_info = next((opset for opset in self.model.opset_import if opset.domain == '' or opset.domain == onnx_domain), None) 287 | if opset_info is not None: 288 | self.model.opset_import.remove(opset_info) 289 | self.model.opset_import.extend([onnx.helper.make_opsetid(onnx_domain, onnx_op_set_version)]) 290 | 291 | return self.model 292 | 293 | def find_weight_data(self, initializer): 294 | ''' 295 | :param initializer: TensorProto initializer object from a graph 296 | :return: a list of initialized data in a given initializer object 297 | ''' 298 | if initializer.data_type == onnx_proto.TensorProto.FLOAT: 299 | weights = onnx.numpy_helper.to_array(initializer) 300 | else: 301 | raise ValueError('Model contains conv operator weights in {}. Only float type quantization is supported.'.format( 302 | type_to_name[initializer.data_type])) 303 | return weights 304 | 305 | def _remove_quantized_weights(self): 306 | ''' Remove the weights which are already quantized from graph initializer list. 307 | This function assumes that after quantization, all nodes that previously use a weight: 308 | - use output from DequantizeLinear as input if they do not support quantization. 309 | - use quantized weight if they support quantization. 
310 | ''' 311 | for weight in self._quantized_weights: 312 | # Remove existing weight initializer 313 | self.model.graph.initializer.remove(weight.initializer) 314 | 315 | # Removing input weight to a convolution 316 | try: 317 | weight_input = next(val for val in self.model.graph.input if val.name == weight.name) 318 | self.model.graph.input.remove(weight_input) 319 | except StopIteration: 320 | if self.model.ir_version < 4: 321 | raise ValueError('invalid weight name {} found in the graph (not a graph input) '.format(weight.name)) 322 | 323 | 324 | def _update_graph(self, weight): 325 | ''' 326 | Given a weight object, update the graph by doing the following: 327 | - remove old initializer, update new initializers for quantized weight, zero point, and scale 328 | - remove old weight input, update with new inputs for quantized weight, zero point, and scale 329 | This function does NOT update the nodes in the graph, just initializers and inputs 330 | ''' 331 | quantized_value = self.quantized_value_map[weight.name] 332 | assert(quantized_value is not None) 333 | packed_weight_name = quantized_value.q_name 334 | scale_name = quantized_value.scale_name 335 | zero_point_name = quantized_value.zp_name 336 | 337 | # Update packed weight, zero point, and scale initializers 338 | packed_weight_np_data = np.asarray(weight.quantized_data, 339 | dtype=onnx.mapping.TENSOR_TYPE_TO_NP_TYPE[weight.qType]).reshape(weight.initializer.dims) 340 | packed_weight_initializer = onnx.numpy_helper.from_array(packed_weight_np_data, packed_weight_name) 341 | 342 | if weight.axis is not None: 343 | zero_scale_shape = [weight.initializer.dims[weight.axis]] 344 | else: # scale and zero point must be scalar 345 | zero_scale_shape = [] 346 | zero_point_type = weight.qType 347 | scale_initializer = onnx.helper.make_tensor(scale_name, onnx_proto.TensorProto.FLOAT, zero_scale_shape, weight.scales) 348 | zero_initializer = onnx.helper.make_tensor(zero_point_name, zero_point_type, zero_scale_shape, weight.zero_points) 349 | 350 | self.model.graph.initializer.extend([packed_weight_initializer, scale_initializer, zero_initializer]) 351 | 352 | # Create input for initialized scale and zeros 353 | packed_weight_value_info = onnx.helper.make_tensor_value_info(packed_weight_name, weight.qType, 354 | weight.initializer.dims) 355 | scale_value_info = onnx.helper.make_tensor_value_info(scale_name, onnx_proto.TensorProto.FLOAT, zero_scale_shape) 356 | zero_point_value_info = onnx.helper.make_tensor_value_info(zero_point_name, 357 | zero_point_type, zero_scale_shape) # zero_point is int for dequantize operator 358 | 359 | self.model.graph.input.extend([packed_weight_value_info, scale_value_info, zero_point_value_info]) 360 | 361 | self._quantized_weights.append(weight) 362 | 363 | def _get_quantized_weight(self, initializer, qType): 364 | ''' 365 | :param initializer: TensorProto initializer 366 | :param qType: type to quantize to 367 | :return: Weight class with quantization information 368 | ''' 369 | weights_data = self.find_weight_data(initializer) 370 | rmin, rmax, zero_point, scale, quantized_weights_data = quantize_data(weights_data.flatten().tolist(), 371 | _get_qrange_for_qType(qType), qType) 372 | weight = QuantizedInitializer(initializer.name, initializer, [rmin], [rmax], [zero_point], [scale], 373 | weights_data, quantized_weights_data, axis=None, qType=qType) 374 | 375 | # Log entry for this quantized weight 376 | assert(weight.name not in self.quantized_value_map) 377 | quantized_value = QuantizedValue(weight.name, 
weight.name + "_quantized", weight.name + "_scale", weight.name + "_zero_point", QuantizedValueType.Initializer, None, qType) 378 | self.quantized_value_map[weight.name] = quantized_value 379 | 380 | return weight 381 | 382 | def _get_quantized_weight_convolution(self, initializer, qType): 383 | ''' 384 | :param initializer: initializer TypeProto to quantize 385 | :param qType: type to quantize to 386 | :return: Weight class object with quantization information for a given initializer 387 | ''' 388 | if not self.per_channel: 389 | return self._get_quantized_weight(initializer, qType) 390 | 391 | weights = self.find_weight_data(initializer) 392 | # Quantize per output channel 393 | # Assuming (M x C/group x kH x kW) format where M is number of output channels. 394 | channel_count = initializer.dims[0] 395 | np_data = np.reshape(weights, initializer.dims) 396 | rmin_list = [] 397 | rmax_list = [] 398 | zero_point_list = [] 399 | scale_list = [] 400 | quantized_per_channel_data_list = [] 401 | for i in range(channel_count): 402 | # for each channel, compute quantization data. Assuming (M x C/group x kH x kW) 403 | per_channel_data = np_data[i,:,:,:].flatten() 404 | rmin, rmax, zero_point, scale, quantized_per_channel_data = quantize_data(per_channel_data.flatten().tolist(), 405 | _get_qrange_for_qType(qType), qType) 406 | rmin_list.append(rmin) 407 | rmax_list.append(rmax) 408 | zero_point_list.append(zero_point) 409 | scale_list.append(scale) 410 | quantized_per_channel_data_list.append(quantized_per_channel_data) 411 | channel_index = 0 # (M x C/group x kH x kW) 412 | # combine per_channel_data into one 413 | reshape_dims = list(initializer.dims) # deep copy 414 | reshape_dims[channel_index] = 1 # only one per channel for reshape 415 | quantized_weights = np.asarray(quantized_per_channel_data_list[0]).reshape(reshape_dims) 416 | for i in range(1, len(quantized_per_channel_data_list)): 417 | channel_weights = np.asarray(quantized_per_channel_data_list[i]).reshape(reshape_dims) 418 | quantized_weights = np.concatenate((quantized_weights, channel_weights), axis=0) 419 | 420 | weight = QuantizedInitializer(initializer.name, initializer, rmin_list, rmax_list, zero_point_list, 421 | scale_list, weights, quantized_weights.flatten().tolist(), channel_index, qType) 422 | 423 | # Make entry for this quantized weight 424 | assert(weight.name not in self.quantized_value_map) 425 | quantized_value = QuantizedValue(weight.name, weight.name + "_quantized", weight.name + "_scale", weight.name + "_zero_point", QuantizedValueType.Initializer, None, qType) 426 | self.quantized_value_map[weight.name] = quantized_value 427 | 428 | return weight 429 | 430 | def _get_dynamic_input_quantization_params(self, input_name, nodes_list, qType): 431 | ''' 432 | Create nodes for dynamic quantization of input and add them to nodes_list. 433 | parameter input_name: Name of the input. 434 | parameter nodes_list: new nodes are appended to this list. 435 | parameter qType: type to quantize to. 436 | return: scale_name, zero_point_name, scale_shape, zero_point_shape. 
437 | ''' 438 | if qType == onnx_proto.TensorProto.INT8: 439 | return self._get_dynamic_input_quantization_params_int8(input_name, nodes_list) 440 | 441 | return self._get_dynamic_input_quantization_params_uint8(input_name, nodes_list) 442 | 443 | def _get_dynamic_input_quantization_params_int8(self, input_name, nodes_list): 444 | ''' 445 | Create nodes for dynamic quantization of input to nit8 and add them to nodes_list 446 | parameter input_name: Name of the input. 447 | parameter nodes_list: new nodes are appended to this list. 448 | return: scale_name, zero_point_name, scale_shape, zero_point_shape. 449 | ''' 450 | qType = onnx_proto.TensorProto.INT8 451 | 452 | # Reduce min and Reduce max 453 | input_scale_name = input_name + "_scale" 454 | 455 | reduce_min_name = input_name + "_ReduceMin" 456 | reduce_min_node = onnx.helper.make_node("ReduceMin", [input_name], 457 | [reduce_min_name + ":0"], reduce_min_name, keepdims=0) 458 | nodes_list.append(reduce_min_node) 459 | 460 | reduce_max_name = input_name + "_ReduceMax" 461 | reduce_max_node = onnx.helper.make_node("ReduceMax", [input_name], 462 | [reduce_max_name + ":0"], reduce_max_name, keepdims=0) 463 | nodes_list.append(reduce_max_node) 464 | 465 | # Compute scale 466 | # Find abs(rmin) 467 | reduce_min_abs_name = reduce_min_name + "_Abs" 468 | reduce_min_abs_node = onnx.helper.make_node("Abs", [reduce_min_node.output[0]], 469 | [reduce_min_abs_name + ":0"], reduce_min_abs_name) 470 | nodes_list.append(reduce_min_abs_node) 471 | # Find abs(rmax) 472 | reduce_max_abs_name = reduce_max_name + "_Abs" 473 | reduce_max_abs_node = onnx.helper.make_node("Abs", [reduce_max_node.output[0]], 474 | [reduce_max_abs_name + ":0"], reduce_max_abs_name) 475 | nodes_list.append(reduce_max_abs_node) 476 | # Compute max of abs(rmin) and abs(rmax) 477 | abs_max_name = input_name + "_Abs_Max" 478 | abs_max_node = onnx.helper.make_node("Max", [reduce_min_abs_node.output[0], reduce_max_abs_node.output[0]], 479 | [abs_max_name + ":0"], abs_max_name) 480 | nodes_list.append(abs_max_node) 481 | # and divide by (quantize_range/2.0) which will be equal to max(...)*2.0/quantize_range 482 | _add_initializer_if_not_present(self.model.graph, self.fixed_qrange_int8_name, 483 | [_get_qrange_for_qType(qType)/2.0], [], onnx_proto.TensorProto.FLOAT) 484 | scale_div_name = input_name + "scale_Div" 485 | scale_div_node = onnx.helper.make_node("Div", [abs_max_node.output[0], self.fixed_qrange_int8_name], 486 | [input_scale_name], scale_div_name) 487 | nodes_list.append(scale_div_node) 488 | 489 | # Zero point 490 | _add_initializer_if_not_present(self.model.graph, self.fixed_zero_zp_name, 491 | [0], [], qType) 492 | 493 | return input_scale_name, self.fixed_zero_zp_name, [], [] 494 | 495 | def _get_dynamic_input_quantization_params_uint8(self, input_name, nodes_list): 496 | ''' 497 | Create nodes for dynamic quantization of input to uint8 and add them to nodes_list 498 | parameter input_name: Name of the input. 499 | parameter nodes_list: new nodes are appended to this list. 500 | return: scale_name, zero_point_name, scale_shape, zero_point_shape. 
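In effect, the node graph built below computes
    scale = (ReduceMax(input) - ReduceMin(input)) / 255
    zero_point = Cast(Floor((0 - ReduceMin(input)) / scale), uint8)
which mirrors the UINT8 branch of quantize_data(), only expressed as ONNX operators.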
501 | ''' 502 | qType = onnx_proto.TensorProto.UINT8 503 | # Reduce min and Reduce max 504 | input_scale_name = input_name + "_scale" 505 | input_zp_name = input_name + "_zero_point" 506 | 507 | reduce_min_name = input_name + "_ReduceMin" 508 | reduce_min_node = onnx.helper.make_node("ReduceMin", [input_name], 509 | [reduce_min_name + ":0"], reduce_min_name, keepdims=0) 510 | nodes_list.append(reduce_min_node) 511 | 512 | reduce_max_name = input_name + "_ReduceMax" 513 | reduce_max_node = onnx.helper.make_node("ReduceMax", [input_name], 514 | [reduce_max_name + ":0"], reduce_max_name, keepdims=0) 515 | nodes_list.append(reduce_max_node) 516 | 517 | # Add tensors for quantize range and zero value. 518 | _add_initializer_if_not_present(self.model.graph, self.fixed_qrange_uint8_name, 519 | [_get_qrange_for_qType(qType)], [], onnx_proto.TensorProto.FLOAT) 520 | _add_initializer_if_not_present(self.model.graph, self.fixed_zero_name, 521 | [0.0], [], onnx_proto.TensorProto.FLOAT) 522 | 523 | # Compute Scale 524 | # Subtract rmax and rmin 525 | scale_sub_name = input_name + "_scale_Sub" 526 | scale_sub_node = onnx.helper.make_node("Sub", [reduce_max_node.output[0], reduce_min_node.output[0]], 527 | [scale_sub_name + ":0"], scale_sub_name) 528 | nodes_list.append(scale_sub_node) 529 | # and divide by quantize range 530 | scale_div_name = input_name + "_scale_Div" 531 | scale_div_node = onnx.helper.make_node("Div", [scale_sub_node.output[0], self.fixed_qrange_uint8_name], 532 | [input_scale_name], scale_div_name) 533 | nodes_list.append(scale_div_node) 534 | 535 | # Compute zero point 536 | # Subtract zero and rmin 537 | zp_sub_name = input_name + "_zero_point_Sub" 538 | zp_sub_node = onnx.helper.make_node("Sub", [self.fixed_zero_name, reduce_min_node.output[0]], 539 | [zp_sub_name + ":0"], zp_sub_name) 540 | nodes_list.append(zp_sub_node) 541 | # Divide by scale 542 | zp_div_name = input_name + "_zero_point_Div" 543 | zp_div_node = onnx.helper.make_node("Div", [zp_sub_node.output[0], input_scale_name], 544 | [zp_div_name + ":0"], zp_div_name) 545 | nodes_list.append(zp_div_node) 546 | # Compute floor 547 | zp_floor_name = input_name + "_zero_point_Floor" 548 | zp_floor_node = onnx.helper.make_node("Floor", zp_div_node.output, 549 | [zp_floor_name + ":0"], zp_floor_name) 550 | nodes_list.append(zp_floor_node) 551 | # Cast to integer 552 | zp_cast_name = input_name + "_zero_point_Cast" 553 | zp_cast_node = onnx.helper.make_node("Cast", zp_floor_node.output, 554 | [input_zp_name], zp_cast_name, to=qType) 555 | nodes_list.append(zp_cast_node) 556 | 557 | return input_scale_name, input_zp_name, [], [] 558 | 559 | def _get_quantization_params(self, param_name): 560 | ''' 561 | Create initializers and inputs in the graph for zero point and scale of output. 562 | Zero point and scale values are obtained from self.quantization_params if specified. 563 | parameter output_name: Name of the output. 564 | return: scale_name, zero_point_name, scale_shape, zero_point_shape. 565 | ''' 566 | if self.quantization_params is None or param_name not in self.quantization_params: 567 | return False, "", "", "", "" 568 | params = self.quantization_params[param_name] 569 | if params is None or len(params) != 2: 570 | raise ValueError("Quantization parameters should contain zero point and scale. " 571 | "Specified values for output {}: {}".format(output_name, params)) 572 | 573 | if not np.isscalar(params[0]): 574 | raise ValueError("Zero point for output {} should be a scalar value. 
Value specified: {}".format( 575 | output_name, params[0])) 576 | if not np.isscalar(params[1]): 577 | raise ValueError("Scale for output {} should be a scalar value. Value specified: {}".format( 578 | output_name, params[1])) 579 | 580 | zero_point_values = [params[0].item()] 581 | zero_point_shape = [] 582 | zero_point_name = param_name + "_zero_point" 583 | zero_point_type = onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[params[0].dtype] 584 | 585 | scale_values = [params[1].item()] 586 | scale_shape = [] 587 | scale_name = param_name + "_scale" 588 | 589 | # Add initializers 590 | _add_initializer_if_not_present(self.model.graph, zero_point_name, zero_point_values, zero_point_shape, 591 | zero_point_type) 592 | _add_initializer_if_not_present(self.model.graph, scale_name, scale_values, scale_shape, 593 | onnx_proto.TensorProto.FLOAT) 594 | 595 | return True, scale_name, zero_point_name, scale_shape, zero_point_shape 596 | 597 | def _get_quantize_input_nodes(self, node, input_index, qType): 598 | ''' 599 | Given a input for a node (which is not a initializer), this function 600 | - add elements to graph to compute zero point and scale for this input. 601 | - add new QuantizeLinear nodes to quantize the input. 602 | parameter node: node being quantized in NodeProto format. 603 | parameter input_index: index of input in node.input. 604 | parameter qType: type to quantize to. 605 | return: List of newly created nodes in NodeProto format. 606 | ''' 607 | input_name = node.input[input_index] 608 | output_name = input_name + "_quantized" 609 | 610 | data_found, scale_name, zp_name, scale_shape, zp_shape = \ 611 | self._get_quantization_params(input_name) 612 | 613 | if self.static: 614 | if data_found == False: 615 | raise ValueError("Quantization parameters are not specified for param {}." 616 | "In static mode quantization params for inputs and outputs of odes to be quantized are required.".format(input_name)) 617 | 618 | qlinear_node = onnx.helper.make_node("QuantizeLinear", [input_name, scale_name, zp_name], 619 | [output_name], input_name + "_QuantizeLinear") 620 | 621 | return [qlinear_node] 622 | 623 | else: 624 | if data_found == True: 625 | qlinear_node = onnx.helper.make_node("QuantizeLinear", [input_name, scale_name, zp_name], 626 | [output_name], input_name + "_QuantizeLinear") 627 | else: 628 | # Scale and Zero Points not available for this input. 
Add nodes to dynamically compute it 629 | if self.fuse_dynamic_quant and qType == onnx_proto.TensorProto.UINT8: 630 | scale_name = input_name + "_scale" 631 | zeropoint_name = input_name + "_zero_point" 632 | qlinear_node = onnx.helper.make_node("DynamicQuantizeLinear", [input_name], 633 | [output_name, scale_name, zeropoint_name], input_name + "_QuantizeLinear") 634 | return [qlinear_node] 635 | 636 | else: 637 | nodes = [] 638 | scale_name, zp_name, scale_shape, zp_shape = \ 639 | self._get_dynamic_input_quantization_params(input_name, nodes, qType) 640 | qlinear_node = onnx.helper.make_node("QuantizeLinear", [input_name, scale_name, zp_name], 641 | [output_name], input_name + "_QuantizeLinear") 642 | 643 | return nodes + [qlinear_node] 644 | 645 | def _get_bias_add_nodes(self, nodes, node, last_output, quantized_bias_name): 646 | ''' 647 | Given a node, this function handles bias add by adding a "reshape" node on bias and an "add" node 648 | parameter nodes: new nodes would be appended into nodes 649 | parameter node: current node (Conv) 650 | parameter last_output: output of previous node (input to bias add) 651 | return: the name of output 652 | ''' 653 | # Add an Add operation for bias 654 | # Add reshape for correct broadcase 655 | reshape_input = [quantized_bias_name] 656 | 657 | # Add tensors for the shape to be reshaped to 658 | _add_initializer_if_not_present(self.model.graph, "reshape_shape", 659 | [1,-1,1,1], [4], onnx_proto.TensorProto.INT64) 660 | reshape_input.append('reshape_shape') 661 | reshape_op_output = node.output[0] + "_reshape" 662 | reshape_node = onnx.helper.make_node("Reshape", reshape_input, [reshape_op_output], 663 | quantized_bias_name+"reshape") 664 | nodes.append(reshape_node) 665 | 666 | bias_add_input = [last_output] 667 | bias_add_input.append(reshape_op_output) 668 | add_node_output = node.output[0] + "_bias_add" 669 | add_node = onnx.helper.make_node("Add", bias_add_input, [add_node_output], 670 | quantized_bias_name + "bias_add") 671 | nodes.append(add_node) 672 | return add_node_output 673 | 674 | def _update_unsupported_nodes_using_weight(self, weight, new_nodes_list): 675 | '''Find all nodes using a weight that do not support quantization and 676 | add a DequantizeLinear node before those nodes. This includes all nodes except Conv, MatMul. 677 | parameter weight: Weight object 678 | parameter new_nodes_list: List of new nodes created before processing current node. 679 | return: List of new nodes created. 680 | ''' 681 | nodes_using_weight = _find_nodes_using_initializer(self.model.graph, weight.initializer) 682 | unsupported_nodes = [node for node in nodes_using_weight if node.op_type not in ["Conv", "MatMul", "Gather"]] 683 | 684 | nodes_list = [] 685 | dequantize_linear_name = weight.name + "_DequantizeLinear" 686 | output_name = weight.name + "_dequantized" 687 | 688 | # Check if DequantizeLinear node needs to be added to graph. 689 | if len(unsupported_nodes) != 0 and \ 690 | _find_node_by_name(dequantize_linear_name, self.model.graph, new_nodes_list) is None: 691 | inputs = [weight.name + "_quantized", weight.name + "_scale", weight.name + "_zero_point"] 692 | node = onnx.helper.make_node("DequantizeLinear", inputs, [output_name], 693 | dequantize_linear_name) 694 | nodes_list.append(node) 695 | 696 | # Update unsupported nodes to take dequantized weight as input. 
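# i.e. every consumer other than Conv/MatMul/Gather is rewired to read the
# "<weight name>_dequantized" output of DequantizeLinear, because the original
# float initializer is removed from the graph later by _remove_quantized_weights().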
697 | for node in unsupported_nodes: 698 | for i, node_input in enumerate(node.input): 699 | if node_input == weight.name: 700 | node.input[i] = output_name 701 | 702 | return nodes_list 703 | 704 | def _dynamic_quantize_bias(self, input_name, weight_scale_name, bias_name, quantized_bias_name, new_node_list): 705 | ''' 706 | Adds series of nodes required to quantize the bias dynamically. 707 | parameter input_name: Input name 708 | parameter weight_scale_name: Weight scale. 709 | parameter bias_scale_name: Bias to quantize. 710 | parameter quantied_bias_name: Output name to use for quantized bias. 711 | ''' 712 | qType = onnx_proto.TensorProto.INT32 713 | 714 | input_scale_name = input_name + "_scale" 715 | bias_scale_node = onnx.helper.make_node("Mul", [input_scale_name, weight_scale_name], [bias_name + "_scale"], bias_name + "_scale_node") 716 | new_node_list.append(bias_scale_node) 717 | 718 | quantize_bias_node = onnx.helper.make_node("Div", [bias_name, bias_scale_node.output[0]], 719 | [bias_name + "_tmp_quant:0"], bias_name + "_tmp_qaunt") 720 | new_node_list.append(quantize_bias_node) 721 | 722 | bias_rounded_node = onnx.helper.make_node("Floor", quantize_bias_node.output, 723 | [bias_name + "_quant_rounded:0"], bias_name + "_quant_rounded") 724 | new_node_list.append(bias_rounded_node) 725 | 726 | bias_cast_node = onnx.helper.make_node("Cast", bias_rounded_node.output, 727 | [quantized_bias_name], quantized_bias_name + "_node", to=qType) 728 | new_node_list.append(bias_cast_node) 729 | 730 | return 731 | 732 | 733 | def _quantize_bias(self, node, new_node_list): 734 | ''' 735 | Quantized the bias. Zero Point == 0 and Scale == Input_Scale * Weight_Scale 736 | ''' 737 | 738 | # get scale for weight 739 | weight_scale_name = self.quantized_value_map[node.input[1]].scale_name 740 | weight_initializer = _find_by_name(weight_scale_name, self.model.graph.initializer) 741 | weight_scale = self.find_weight_data(weight_initializer) 742 | 743 | # get bias 744 | bias_name = node.input[2] 745 | bias_initializer = _find_by_name(bias_name, self.model.graph.initializer) 746 | bias_data = self.find_weight_data(bias_initializer) 747 | quantized_bias_name = bias_name + "_quantized" 748 | 749 | # input scale is not provided and this input is dynamically quantized so it is not pre-computed at this point 750 | # so resort to dynamic quantization for bias 751 | if self.quantization_params is None or node.input[0] not in self.quantization_params and node.input[0] not in self.quantized_value_map: 752 | self._dynamic_quantize_bias(node.input[0], weight_scale_name, bias_name, quantized_bias_name, new_node_list) 753 | else: 754 | # get scale for input 755 | input_scale_name = self.quantized_value_map[node.input[0]].scale_name 756 | inputscale_initializer = _find_by_name(input_scale_name, self.model.graph.initializer) 757 | input_scale = self.find_weight_data(inputscale_initializer) 758 | 759 | # calcuate scale for bias 760 | bias_scale_name = node.input[2] + "_scale" 761 | bias_scale = input_scale * weight_scale 762 | print(bias_scale) 763 | 764 | # quantize bias 765 | quantized_data = (np.asarray(bias_data) / bias_scale).round().astype(np.int32) 766 | print(quantized_data) 767 | 768 | #update bias initializer 769 | bias_np_data = np.asarray(quantized_data, dtype=np.int32).reshape(bias_initializer.dims) 770 | packed_bias_initializer = onnx.numpy_helper.from_array(bias_np_data, quantized_bias_name) 771 | self.model.graph.initializer.extend([packed_bias_initializer]) 772 | 773 | bias_value_info = 
onnx.helper.make_tensor_value_info(quantized_bias_name, onnx_proto.TensorProto.INT32, bias_initializer.dims) 774 | self.model.graph.input.extend([bias_value_info]) 775 | 776 | # log entries for this quantized bias value 777 | quantized_bias_entry = QuantizedInitializer(bias_name, bias_initializer, [0], [0], [0], [bias_scale], 778 | bias_data, quantized_data, qType=onnx_proto.TensorProto.INT32) 779 | self._quantized_weights.append(quantized_bias_entry) 780 | 781 | assert(bias_name not in self.quantized_value_map) 782 | quantized_value = QuantizedValue(bias_name, quantized_bias_name, "", "", QuantizedValueType.Initializer, None, onnx_proto.TensorProto.INT32) 783 | self.quantized_value_map[bias_name] = quantized_value 784 | 785 | return quantized_bias_name 786 | 787 | 788 | def _quantize_inputs(self, node, indices, new_nodes_list): 789 | ''' 790 | Given a node, this function quantizes the inputs as follows: 791 | - If input is a initializer, quantize the initializer data, replace old initializer 792 | with new initializer 793 | - Else, add QuantizeLinear nodes to perform quantization 794 | parameter node: node being quantized in NodeProto format. 795 | parameter indices: input indices to quantize. 796 | parameter new_nodes_list: List of new nodes created before processing this node. This is used to 797 | check that two QuantizeLinear nodes are not being added for same input. 798 | return: (List of quantized input names, 799 | List of zero point names used for input quantization, 800 | List of scale names used for input quantization, 801 | List of new QuantizeLinear nodes created) 802 | ''' 803 | assert (node.op_type == "Conv" or node.op_type == "MatMul" or node.op_type == "Gather") 804 | 805 | quantized_input_names = [] 806 | zero_point_names = [] 807 | scale_names = [] 808 | nodes = [] 809 | 810 | for input_index in indices: 811 | node_input = node.input[input_index] 812 | 813 | # Find if this input is already quantized 814 | if node_input in self.quantized_value_map: 815 | quantized_value = self.quantized_value_map[node_input] 816 | qType = self.weight_qType if quantized_value.value_type == QuantizedValueType.Initializer else self.input_qType 817 | if quantized_value.qType != qType: 818 | raise ValueError("{} is being used by multiple nodes which are being quantized to different types. " 819 | "This is not suported.", node_input) 820 | 821 | quantized_input_names.append(quantized_value.q_name) 822 | scale_names.append(quantized_value.scale_name) 823 | zero_point_names.append(quantized_value.zp_name) 824 | continue 825 | 826 | # Quantize the input 827 | initializer = _find_by_name(node_input, self.model.graph.initializer) 828 | if initializer is not None: 829 | if node.op_type == "Conv": 830 | weight = self._get_quantized_weight_convolution(initializer, self.weight_qType) 831 | else: 832 | weight = self._get_quantized_weight(initializer, self.weight_qType) 833 | 834 | # Update graph 835 | nodes.extend(self._update_unsupported_nodes_using_weight(weight, new_nodes_list)) 836 | self._update_graph(weight) 837 | 838 | quantized_input_names.append(weight.name + "_quantized") 839 | zero_point_names.append(weight.name + "_zero_point") 840 | scale_names.append(weight.name + "_scale") 841 | else: 842 | # Add QuantizeLinear node. 
843 | qlinear_node = _find_node_by_name(node_input + "_QuantizeLinear", self.model.graph, new_nodes_list) 844 | if qlinear_node is None: 845 | quantize_input_nodes = self._get_quantize_input_nodes(node, input_index, self.input_qType) 846 | nodes.extend(quantize_input_nodes) 847 | qlinear_node = quantize_input_nodes[-1] 848 | 849 | if qlinear_node.op_type == "QuantizeLinear": 850 | quantized_input_names.extend(qlinear_node.output) 851 | scale_names.append(qlinear_node.input[1]) 852 | zero_point_names.append(qlinear_node.input[2]) 853 | else: 854 | quantized_input_names.append(qlinear_node.output[0]) 855 | scale_names.append(qlinear_node.output[1]) 856 | zero_point_names.append(qlinear_node.output[2]) 857 | 858 | 859 | return (quantized_input_names, zero_point_names, scale_names, nodes) 860 | 861 | def _handle_other_ops(self, node, new_nodes_list): 862 | ''' 863 | Given a node which does not support quantization(Conv, Matmul, Gather), this method 864 | checks whether the input to this node is quantized and adds a DequantizeLinear node 865 | to dequantize this input back to FP32 866 | parameter node: Current node 867 | parameter new_nodes_list: List of new nodes created before processing current node 868 | return: List of new nodes created 869 | ''' 870 | nodes = [] 871 | for index, node_input in enumerate(node.input): 872 | if node_input in self.quantized_value_map: 873 | node_input_altered = True 874 | input_name = node.input[index] 875 | quantized_value = self.quantized_value_map[input_name] 876 | # Add DequantizeLinear Node for this input 877 | dqlinear_name = input_name + "_DequantizeLinear" 878 | dqlinear_node = _find_node_by_name(dqlinear_name, self.model.graph, new_nodes_list) 879 | if dqlinear_node is None: 880 | dqlinear_inputs = [quantized_value.q_name, quantized_value.scale_name, quantized_value.zp_name] 881 | dequantize_node = onnx.helper.make_node("DequantizeLinear", dqlinear_inputs, [input_name], dqlinear_name) 882 | nodes.append(dequantize_node) 883 | else: 884 | # DQ op is already present, assert it's output matches the input of current node 885 | assert(input_name == dqlinear_node.output[0]) 886 | 887 | # Append the original node 888 | nodes.append(node) 889 | return nodes 890 | 891 | def _handle_activation_ops(self, node, new_node_list): 892 | ''' 893 | Checks whether the give activation op can be removed from the graph. When mode is QLinearOps, 894 | the output quatization params are calculated based on outputs from activation nodes, 895 | therefore these nodes can be removed from the graph if they follow a quantized op. 896 | 897 | parameter node: Current node 898 | parameter new_nodes_list: List of new nodes created before processing current node 899 | return: List of nodes 900 | ''' 901 | assert(node.op_type == "Relu" or node.op_type == 'Clip') 902 | if self.mode is not QuantizationMode.QLinearOps: 903 | return [node] 904 | # When mode is QLinearOps, the output quatization params are calculated based on outputs from 905 | # activation nodes, therefore these nodes can be removed from the graph if they follow a quantized op. 
906 | # If input to this node is not quantized then keep this node 907 | if node.input[0] not in self.quantized_value_map: 908 | return [node] 909 | 910 | # Prepare to remove this node 911 | quantized_value = self.quantized_value_map[node.input[0]] 912 | self.quantized_value_map[node.output[0]] = quantized_value 913 | 914 | return [] 915 | 916 | def _quantize_gather_ops(self, node, new_nodes_list): 917 | assert (node.op_type == "Gather") 918 | (quantized_input_names, zero_point_names, scale_names, nodes) = \ 919 | self._quantize_inputs(node, [0], new_nodes_list) 920 | 921 | gather_new_output = node.output[0] + "_quantized" 922 | 923 | # Create an entry for this quantized value 924 | q_output = QuantizedValue(node.output[0], gather_new_output, scale_names[0], zero_point_names[0], QuantizedValueType.Input) 925 | self.quantized_value_map[node.output[0]] = q_output 926 | 927 | gather_original_output = node.output[0] 928 | node.output[0] = gather_new_output 929 | node.input[0] = quantized_input_names[0] 930 | nodes.append(node) 931 | 932 | return nodes 933 | 934 | def _quantize_convolution_integer_ops(self, node, new_nodes_list): 935 | ''' 936 | Used when self.mode is QuantizationMode.IntegerOps. 937 | parameter node: Conv node. 938 | parameter new_nodes_list: List of new nodes created before processing this node. 939 | return: a list of nodes in topological order that represents quantized Conv node. 940 | ''' 941 | assert (node.op_type == "Conv") 942 | 943 | (quantized_input_names, zero_point_names, scale_names, nodes) = \ 944 | self._quantize_inputs(node, [0, 1], new_nodes_list) 945 | 946 | # quantize bias if exist 947 | quantized_bias_name = "" 948 | bias_present = False 949 | if len(node.input) == 3: 950 | quantized_bias_name = self._quantize_bias(node, nodes) 951 | bias_present = True 952 | 953 | conv_integer_output = node.output[0] + "_quantized" 954 | conv_integer_name = "" 955 | if node.name != "": 956 | conv_integer_name = node.name + "_quant" 957 | kwargs = {} 958 | for attribute in node.attribute: 959 | kwargs.update(_attribute_to_kwarg(attribute)) 960 | conv_integer_node = onnx.helper.make_node("ConvInteger", quantized_input_names + zero_point_names, 961 | [conv_integer_output], conv_integer_name, **kwargs) 962 | nodes.append(conv_integer_node) 963 | 964 | # Add bias add nodes 965 | if bias_present: 966 | conv_integer_output = self._get_bias_add_nodes(nodes, node, conv_integer_output, quantized_bias_name) 967 | 968 | # Add cast operation to cast convInteger output to float. 969 | cast_op_output = conv_integer_output + "_cast_output" 970 | cast_node = onnx.helper.make_node("Cast", [conv_integer_output], [cast_op_output], 971 | conv_integer_output + "_cast", to=onnx_proto.TensorProto.FLOAT) 972 | nodes.append(cast_node) 973 | 974 | # Add mul operation to multiply scales of two inputs. 975 | assert (len(scale_names) == 2) 976 | if conv_integer_name != "": 977 | scales_mul_op = conv_integer_name + "_scales_mul" 978 | else: 979 | scales_mul_op = scale_names[0] + "_" + scale_names[1] + "_mul" 980 | 981 | scales_mul_node = _find_node_by_name(scales_mul_op, self.model.graph, new_nodes_list) 982 | if scales_mul_node is None: 983 | scales_mul_node = _get_mul_node(scale_names, scales_mul_op + ":0", scales_mul_op) 984 | nodes.append(scales_mul_node) 985 | 986 | scales_mul_op_output = scales_mul_node.output[0] 987 | 988 | # Add mul operation to multiply mul_scales_op result with output of ConvInteger 989 | # and make the output of this node the same as output of original conv node. 
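# In real arithmetic this reproduces the float convolution: each quantized tensor
# satisfies r = scale * (q - zero_point), so
# Conv(x, w) ~= x_scale * w_scale * ConvInteger(x_q, w_q, x_zero_point, w_zero_point).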
990 | output_scale_mul_op = "" 991 | if conv_integer_name != "": 992 | output_scale_mul_op = conv_integer_name + "_output_scale_mul" 993 | nodes.append(_get_mul_node([cast_op_output, scales_mul_op_output], node.output[0], output_scale_mul_op)) 994 | 995 | return nodes 996 | 997 | def _quantize_matmul_integer_ops(self, node, new_nodes_list): 998 | ''' 999 | Used when self.mode is QuantizationMode.IntegerOps. 1000 | parameter node: MatMul node. 1001 | parameter new_nodes_list: List of new nodes created before processing this node. 1002 | return: a list of nodes in topological order that represents quantized MatMul node. 1003 | ''' 1004 | assert (node.op_type == "MatMul") 1005 | 1006 | (quantized_input_names, zero_point_names, scale_names, nodes) = \ 1007 | self._quantize_inputs(node, [0, 1], new_nodes_list) 1008 | 1009 | matmul_integer_output = node.output[0] + "_quantized" 1010 | matmul_integer_name = "" 1011 | if node.name != "": 1012 | matmul_integer_name = node.name + "_quant" 1013 | matmul_integer_node = onnx.helper.make_node("MatMulInteger", quantized_input_names + zero_point_names, 1014 | [matmul_integer_output], matmul_integer_name) 1015 | nodes.append(matmul_integer_node) 1016 | 1017 | # Add cast operation to cast matmulInteger output to float. 1018 | cast_op_output = matmul_integer_output + "_cast_output" 1019 | cast_node = onnx.helper.make_node("Cast", [matmul_integer_output], [cast_op_output], 1020 | matmul_integer_output + "_cast", to=onnx_proto.TensorProto.FLOAT) 1021 | nodes.append(cast_node) 1022 | 1023 | # Add mul operation to multiply scales of two inputs. 1024 | assert (len(scale_names) == 2) 1025 | if matmul_integer_name != "": 1026 | scales_mul_op = matmul_integer_name + "_scales_mul" 1027 | else: 1028 | scales_mul_op = scale_names[0] + "_" + scale_names[1] + "_mul" 1029 | 1030 | scales_mul_node = _find_node_by_name(scales_mul_op, self.model.graph, new_nodes_list) 1031 | if scales_mul_node is None: 1032 | scales_mul_node = _get_mul_node(scale_names, scales_mul_op + ":0", scales_mul_op) 1033 | nodes.append(scales_mul_node) 1034 | 1035 | scales_mul_op_output = scales_mul_node.output[0] 1036 | 1037 | # Add mul operation to multiply mul_scales_op result with output of MatMulInteger 1038 | # and make the output of this node the same as output of original matmul node. 1039 | output_scale_mul_op = "" 1040 | if matmul_integer_name != "": 1041 | output_scale_mul_op = matmul_integer_name + "_output_scale_mul" 1042 | nodes.append(_get_mul_node([cast_op_output, scales_mul_op_output], node.output[0], 1043 | output_scale_mul_op)) 1044 | return nodes 1045 | 1046 | def _quantize_convolution_qlinear_ops(self, node, new_nodes_list): 1047 | ''' 1048 | Used when self.mode is QuantizationMode.QLinearOps. 1049 | parameter node: Conv node. 1050 | parameter new_nodes_list: List of new nodes created before processing this node. 1051 | return: a list of nodes in topological order that represents quantized Conv node. 
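Note: this path needs static quantization parameters for the Conv output;
_get_quantization_params(node.output[0]) must find an entry in quantization_params
(see the assert on data_found below).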
1052 | ''' 1053 | assert (node.op_type == "Conv") 1054 | 1055 | (quantized_input_names, zero_point_names, scale_names, nodes) = \ 1056 | self._quantize_inputs(node, [0, 1], new_nodes_list) 1057 | 1058 | quantized_bias_name = "" 1059 | bias_present = False 1060 | if len(node.input) == 3: 1061 | quantized_bias_name = self._quantize_bias(node, nodes) 1062 | bias_present = True 1063 | data_found, output_scale_name, output_zp_name, output_scale_shape, output_zp_shape = \ 1064 | self._get_quantization_params(node.output[0]) 1065 | 1066 | assert(data_found) 1067 | 1068 | qlinear_conv_output = node.output[0] + "_quantized" 1069 | qlinear_conv_name = "" 1070 | if node.name != "": 1071 | qlinear_conv_name = node.name + "_quant" 1072 | kwargs = {} 1073 | for attribute in node.attribute: 1074 | kwargs.update(_attribute_to_kwarg(attribute)) 1075 | qlinear_conv_inputs = [] 1076 | # Input 0 1077 | qlinear_conv_inputs.append(quantized_input_names[0]) 1078 | qlinear_conv_inputs.append(scale_names[0]) 1079 | qlinear_conv_inputs.append(zero_point_names[0]) 1080 | # Input 1 1081 | qlinear_conv_inputs.append(quantized_input_names[1]) 1082 | qlinear_conv_inputs.append(scale_names[1]) 1083 | qlinear_conv_inputs.append(zero_point_names[1]) 1084 | 1085 | # Output 1086 | qlinear_conv_inputs.append(output_scale_name) 1087 | qlinear_conv_inputs.append(output_zp_name) 1088 | 1089 | if bias_present: 1090 | qlinear_conv_inputs.append(quantized_bias_name) 1091 | 1092 | qlinear_conv_node = onnx.helper.make_node("QLinearConv", qlinear_conv_inputs, 1093 | [qlinear_conv_output], qlinear_conv_name, **kwargs) 1094 | nodes.append(qlinear_conv_node) 1095 | 1096 | # Create an entry for this quantized value 1097 | q_output = QuantizedValue(node.output[0], qlinear_conv_output, output_scale_name, output_zp_name, QuantizedValueType.Input) 1098 | self.quantized_value_map[node.output[0]] = q_output 1099 | 1100 | return nodes 1101 | 1102 | def _quantize_matmul_qlinear_ops(self, node, new_nodes_list): 1103 | ''' 1104 | Used when self.mode is QuantizationMode.QLinearOps. 1105 | parameter node: MatMul node. 1106 | parameter new_nodes_list: List of new nodes created before processing this node. 1107 | return: a list of nodes in topological order that represents quantized Conv node. 
1108 | ''' 1109 | assert (node.op_type == "MatMul") 1110 | 1111 | (quantized_input_names, zero_point_names, scale_names, nodes) = \ 1112 | self._quantize_inputs(node, [0, 1], new_nodes_list) 1113 | 1114 | data_found, output_scale_name, output_zp_name, output_scale_shape, output_zp_shape = \ 1115 | self._get_quantization_params(node.output[0]) 1116 | 1117 | assert(data_found) 1118 | 1119 | qlinear_matmul_output = node.output[0] + "_quantized" 1120 | qlinear_matmul_name = "" 1121 | if node.name != "": 1122 | qlinear_matmul_name = node.name + "_quant" 1123 | 1124 | qlinear_matmul_inputs = [] 1125 | # Input 0 1126 | qlinear_matmul_inputs.append(quantized_input_names[0]) 1127 | qlinear_matmul_inputs.append(scale_names[0]) 1128 | qlinear_matmul_inputs.append(zero_point_names[0]) 1129 | # Input 1 1130 | qlinear_matmul_inputs.append(quantized_input_names[1]) 1131 | qlinear_matmul_inputs.append(scale_names[1]) 1132 | qlinear_matmul_inputs.append(zero_point_names[1]) 1133 | # Output 1134 | qlinear_matmul_inputs.append(output_scale_name) 1135 | qlinear_matmul_inputs.append(output_zp_name) 1136 | 1137 | qlinear_matmul_node = onnx.helper.make_node("QLinearMatMul", qlinear_matmul_inputs, 1138 | [qlinear_matmul_output], qlinear_matmul_name) 1139 | nodes.append(qlinear_matmul_node) 1140 | 1141 | # Create an entry for this quantized value 1142 | q_output = QuantizedValue(node.output[0], qlinear_matmul_output, output_scale_name, output_zp_name, QuantizedValueType.Input) 1143 | self.quantized_value_map[node.output[0]] = q_output 1144 | 1145 | return nodes 1146 | 1147 | def _quantize_convolution(self, node, new_nodes_list): 1148 | ''' 1149 | https://github.com/onnx/onnx/blob/master/docs/Operators.md#Conv 1150 | :param node: Conv node 1151 | :param new_nodes_list: List of new nodes created before processing this node. 1152 | :return: a list of nodes in topological order that represents quantized Conv node 1153 | ''' 1154 | assert (node.op_type == "Conv") 1155 | 1156 | if self.mode == QuantizationMode.IntegerOps: 1157 | return self._quantize_convolution_integer_ops(node, new_nodes_list) 1158 | 1159 | if self.mode == QuantizationMode.QLinearOps: 1160 | return self._quantize_convolution_qlinear_ops(node, new_nodes_list) 1161 | 1162 | return [node] 1163 | 1164 | def _quantize_matmul(self, node, new_nodes_list): 1165 | ''' 1166 | https://github.com/onnx/onnx/blob/master/docs/Operators.md#MatMul 1167 | :param node: MatMul node 1168 | :param new_nodes_list: List of new nodes created before processing this node. 1169 | :return: a list of nodes in topological order that represents quantized MatMul node 1170 | ''' 1171 | assert(node.op_type == 'MatMul') 1172 | 1173 | if self.mode == QuantizationMode.IntegerOps: 1174 | return self._quantize_matmul_integer_ops(node, new_nodes_list) 1175 | 1176 | if self.mode == QuantizationMode.QLinearOps: 1177 | return self._quantize_matmul_qlinear_ops(node, new_nodes_list) 1178 | 1179 | return [node] 1180 | 1181 | def check_opset_version(org_model, force_fusions): 1182 | ''' 1183 | Check opset version of original model and set opset version and fuse_dynamic_quant accordingly. 1184 | If opset version < 10, set quantized model opset version to 10. 1185 | If opset version == 10, do quantization without using dynamicQuantizeLinear operator. 1186 | If opset version == 11, do quantization using dynamicQuantizeLinear operator. 1187 | :return: fuse_dynamic_quant boolean value. 
def check_opset_version(org_model, force_fusions):
    '''
    Check the opset version of the original model and set the quantized model opset version
    and fuse_dynamic_quant accordingly.
    If opset version < 10, set the quantized model opset version to 10.
    If opset version == 10, quantize without using the DynamicQuantizeLinear operator.
    If opset version >= 11, quantize using the DynamicQuantizeLinear operator.
    :return: fuse_dynamic_quant boolean value.
    '''
    global onnx_op_set_version
    opset_version = org_model.opset_import[0].version
    fuse_dynamic_quant = False

    if opset_version < 11 and force_fusions:
        print("Warning: The original model opset version is {}, which does not support node fusions.\n\
            Forcing fusions can break other nodes in the model.".format(opset_version))
        fuse_dynamic_quant = True

    if opset_version < 10:
        print("Warning: The original model opset version is {}, which does not support quantized operators.\n\
            The opset version of the quantized model will be set to 10. Use the onnx model checker to verify the model after quantization.".format(opset_version))
        onnx_op_set_version = 10
    elif opset_version == 10:
        onnx_op_set_version = 10
    else:
        # opset >= 11: target opset 11 quantized ops and enable the DynamicQuantizeLinear path
        onnx_op_set_version = 11
        fuse_dynamic_quant = True
    return fuse_dynamic_quant
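
# A quick way to see which branch check_opset_version will take for a given model
# (illustrative sketch; the printed value depends on how the model was exported):
#
#   import onnx
#   m = onnx.load('mobilenet.onnx')
#   print(m.opset_import[0].version)  # e.g. 9 -> the quantized model is rewritten to opset 10
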
def quantize(model, per_channel=False, nbits=8, quantization_mode=QuantizationMode.IntegerOps,
             static=False, force_fusions=False, asymmetric_input_types=False,
             quantization_params=None, nodes_to_quantize=None):
    '''
    Given an onnx model, create and return a quantized onnx model.
    :param model: ModelProto to quantize
    :param per_channel: quantize weights per channel
    :param nbits: number of bits to represent quantized data. Currently only supporting 8-bit types
    :param quantization_mode: Can be one of the QuantizationMode types.
        IntegerOps:
            the function will use integer ops. Only ConvInteger and MatMulInteger ops are supported now.
        QLinearOps:
            the function will use QLinear ops. Only QLinearConv and QLinearMatMul ops are supported now.
    :param static:
        True: The inputs/activations are quantized using static scale and zero point values
              specified through quantization_params.
        False: The inputs/activations are quantized using dynamic scale and zero point values
               computed while running the model.
    :param force_fusions:
        True: Fuses nodes added for dynamic quantization.
        False: No fusion is applied for nodes which are added for dynamic quantization.
        Should only be used in cases where backends want to apply special fusion routines.
    :param asymmetric_input_types:
        True: Weights are quantized into signed integers and inputs/activations into unsigned integers.
        False: Weights and inputs/activations are quantized into unsigned integers.
    :param quantization_params:
        Dictionary to specify the zero point and scale values for inputs to conv and matmul nodes.
        Should be specified when static is set to True.
        The quantization_params should be specified in the following format:
            {
                "input_name": [zero_point, scale]
            }
        zero_point should be of type np.uint8 and scale should be of type np.float32.
        example:
            {
                'resnet_model/Relu_1:0': [np.uint8(0), np.float32(0.019539741799235344)],
                'resnet_model/Relu_2:0': [np.uint8(0), np.float32(0.011359662748873234)]
            }
    :param nodes_to_quantize:
        List of node names to quantize. When this list is not None, only the nodes in this list
        are quantized.
        example:
            [
                'Conv__224',
                'Conv__252'
            ]
    :return: ModelProto with quantization
    '''
    if nbits == 8:
        input_qType = onnx_proto.TensorProto.UINT8
        weight_qType = onnx_proto.TensorProto.INT8 if asymmetric_input_types else onnx_proto.TensorProto.UINT8
        mode = quantization_mode
        copy_model = onnx_proto.ModelProto()
        copy_model.CopyFrom(model)
        fuse_dynamic_quant = check_opset_version(copy_model, force_fusions)
        quantizer = ONNXQuantizer(copy_model, per_channel, mode, static, fuse_dynamic_quant, weight_qType, input_qType,
                                  quantization_params, nodes_to_quantize)
        quantizer.quantize_model()
        quantizer.model.producer_name = __producer__
        quantizer.model.producer_version = __version__
        return quantizer.model
    else:
        raise ValueError('Unknown value for nbits. Only 8-bit quantization is currently supported.')
--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
import onnx
from quantize import quantize, QuantizationMode

# Load the onnx model
model = onnx.load('mobilenet.onnx')
# Quantize the model (dynamic quantization with integer ops)
quantized_model = quantize(model, quantization_mode=QuantizationMode.IntegerOps)
# Save the quantized model
onnx.save(quantized_model, 'mobilenet_q.onnx')
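
# A minimal sketch of static quantization with QLinear ops, left commented out because
# quantization_params must map the activation tensor names of your own model to
# [zero_point, scale]; 'input_tensor' and 'conv_output' below are placeholder names,
# not real tensors from mobilenet.onnx.
#
# import numpy as np
# static_model = quantize(model,
#                         quantization_mode=QuantizationMode.QLinearOps,
#                         static=True,
#                         quantization_params={
#                             'input_tensor': [np.uint8(128), np.float32(0.02)],
#                             'conv_output': [np.uint8(0), np.float32(0.05)],
#                         })
# onnx.save(static_model, 'mobilenet_q_static.onnx')
--------------------------------------------------------------------------------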