├── .gitignore ├── .gitmodules ├── LICENSE ├── README.md ├── affinity_helper.py ├── benchmark.py ├── benchmark_gpt2.py ├── benchmark_helper.py ├── fusion_attention.py ├── fusion_base.py ├── fusion_biasgelu.py ├── fusion_embedlayer.py ├── fusion_fastgelu.py ├── fusion_gelu.py ├── fusion_gelu_approximation.py ├── fusion_gpt_attention.py ├── fusion_gpt_attention_megatron.py ├── fusion_gpt_attention_no_past.py ├── fusion_layernorm.py ├── fusion_options.py ├── fusion_reshape.py ├── fusion_shape.py ├── fusion_skiplayernorm.py ├── fusion_utils.py ├── gpt2_beamsearch_helper.py ├── gpt2_beamsearch_tester.py ├── gpt2_helper.py ├── gpt2_parity.py ├── gpt2_tester.py ├── hf.co_1ms ├── README.md ├── benchmark.log ├── detail.csv ├── fusion.csv ├── images │ ├── cpu_16_2_5ms.png │ ├── cpu_9_7ms.png │ ├── gpu_128_2_6ms.png │ ├── gpu_16_1_7ms.png │ ├── infinity_model.png │ └── model_dir.png ├── onnx.diff ├── onnx_with_eigen.diff ├── output.txt ├── requirements.txt ├── result.csv ├── run_benchmark.sh ├── summary_detail.csv ├── summary_fusion.csv └── summary_result.csv ├── hf_co_models.py ├── huggingface_MiniLM_loadsave.py ├── huggingface_models.py ├── nightly_job.sh ├── onnx_exporter.py ├── onnx_model.py ├── onnx_model_bart.py ├── onnx_model_bert.py ├── onnx_model_bert_keras.py ├── onnx_model_bert_tf.py ├── onnx_model_gpt2.py ├── optimizer.py ├── perf-ci.sh ├── quantize_helper.py ├── resnet50.py ├── run_benchmark.sh ├── shape_infer_helper.py ├── shape_optimizer.py ├── symbolic_shape_infer.py └── update_submodules.sh /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | cache_models 3 | onnx_models 4 | benchmark.log 5 | model.mlir 6 | detail.csv 7 | fusion.csv 8 | result.csv 9 | summary_detail.csv 10 | summary_result.csv 11 | summary_fusion.csv 12 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "mmperf"] 2 | path = mmperf 3 | url = https://github.com/mmperf/mmperf 4 | [submodule "thirdparty/SHARK"] 5 | path = thirdparty/SHARK 6 | url = https://github.com/nod-ai/SHARK.git 7 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2021, powderluv 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | 3. Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # transformer-benchmarks 2 | benchmarking some transformers 3 | 4 | ## Quickstart 5 | 6 | ``` 7 | git clone https://github.com/powderluv/transformer-benchmarks 8 | 9 | cd transformer-benchmarks 10 | 11 | python -m venv myenv 12 | 13 | source myenv/bin/activate 14 | 15 | ./run_benchmark.sh  # edit the variables in the script to choose cpu/gpu and which backends to use: org, torchscript, tf, mlir 16 | 17 | ``` 18 | 19 | ![Measuring up Transformers](https://i0.wp.com/cdnssl.ubergizmo.com/wp-content/uploads/2021/04/optimus-prime-toy.jpg) 20 | -------------------------------------------------------------------------------- /affinity_helper.py: -------------------------------------------------------------------------------- 1 | #------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | #-------------------------------------------------------------------------- 5 | 6 | # Get/Set cpu affinity. Currently only supports some Unix systems 7 | import logging 8 | import os 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | class AffinitySetting(): 14 | def __init__(self): 15 | self.pid = os.getpid() 16 | self.affinity = None 17 | self.is_os_supported = hasattr(os, 'sched_getaffinity') and hasattr(os, 'sched_setaffinity') 18 | if not self.is_os_supported: 19 | logger.warning("Current OS does not support os.sched_getaffinity() and os.sched_setaffinity()") 20 | 21 | def get_affinity(self): 22 | if self.is_os_supported: 23 | self.affinity = os.sched_getaffinity(self.pid) 24 | 25 | def set_affinity(self): 26 | if self.is_os_supported: 27 | current_affinity = os.sched_getaffinity(self.pid) 28 | if (self.affinity != current_affinity): 29 | logger.warning("Replacing affinity setting %s with %s", str(current_affinity), str(self.affinity)) 30 | os.sched_setaffinity(self.pid, self.affinity) 31 | 32 | 33 | if __name__ == '__main__': 34 | affi_helper = AffinitySetting() 35 | affi_helper.get_affinity() 36 | affi_helper.set_affinity() 37 | -------------------------------------------------------------------------------- /benchmark_helper.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. See License.txt in the project root for 4 | # license information.
5 | # -------------------------------------------------------------------------- 6 | 7 | import os 8 | import sys 9 | import csv 10 | import numpy 11 | import time 12 | import timeit 13 | from datetime import datetime 14 | import argparse 15 | import logging 16 | import coloredlogs 17 | import torch 18 | import onnx 19 | from enum import Enum 20 | from packaging import version 21 | 22 | logger = logging.getLogger(__name__) 23 | 24 | 25 | class Precision(Enum): 26 | FLOAT32 = 'fp32' 27 | FLOAT16 = 'fp16' 28 | INT8 = 'int8' 29 | 30 | def __str__(self): 31 | return self.value 32 | 33 | 34 | IO_BINDING_DATA_TYPE_MAP = { 35 | "float32": numpy.float32, 36 | # TODO: Add more. 37 | } 38 | 39 | 40 | def create_onnxruntime_session(onnx_model_path, 41 | use_gpu, 42 | enable_all_optimization=True, 43 | num_threads=-1, 44 | enable_profiling=False, 45 | verbose=False): 46 | session = None 47 | try: 48 | from onnxruntime import SessionOptions, InferenceSession, GraphOptimizationLevel, __version__ as onnxruntime_version 49 | sess_options = SessionOptions() 50 | 51 | if enable_all_optimization: 52 | sess_options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL 53 | else: 54 | sess_options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_BASIC 55 | 56 | if enable_profiling: 57 | sess_options.enable_profiling = True 58 | 59 | if num_threads > 0: 60 | sess_options.intra_op_num_threads = num_threads 61 | logger.debug(f"Session option: intra_op_num_threads={sess_options.intra_op_num_threads}") 62 | 63 | if verbose: 64 | sess_options.log_severity_level = 0 65 | else: 66 | sess_options.log_severity_level = 4 67 | 68 | logger.debug(f"Create session for onnx model: {onnx_model_path}") 69 | execution_providers = ['CPUExecutionProvider' 70 | ] if not use_gpu else ['CUDAExecutionProvider', 'CPUExecutionProvider'] 71 | session = InferenceSession(onnx_model_path, sess_options, providers=execution_providers) 72 | except: 73 | logger.error(f"Exception", exc_info=True) 74 | 75 | return session 76 | 77 | 78 | def setup_logger(verbose=True): 79 | if verbose: 80 | coloredlogs.install(level='DEBUG', fmt='[%(filename)s:%(lineno)s - %(funcName)20s()] %(message)s') 81 | else: 82 | coloredlogs.install(fmt='%(message)s') 83 | logging.getLogger("transformers").setLevel(logging.WARNING) 84 | 85 | 86 | def prepare_environment(cache_dir, output_dir, use_gpu): 87 | if cache_dir and not os.path.exists(cache_dir): 88 | os.makedirs(cache_dir) 89 | 90 | if output_dir and not os.path.exists(output_dir): 91 | os.makedirs(output_dir) 92 | 93 | import onnxruntime 94 | if use_gpu: 95 | assert 'CUDAExecutionProvider' in onnxruntime.get_available_providers( 96 | ), "Please install onnxruntime-gpu package to test GPU inference." 97 | 98 | import transformers 99 | logger.info(f'PyTorch Version:{torch.__version__}') 100 | logger.info(f'Transformers Version:{transformers.__version__}') 101 | logger.info(f'Onnxruntime Version:{onnxruntime.__version__}') 102 | 103 | # Support three major versions of PyTorch and OnnxRuntime, and up to 6 months of transformers. 
104 | from packaging import version 105 | assert version.parse(torch.__version__) >= version.parse('1.5.0') 106 | assert version.parse(transformers.__version__) >= version.parse('3.0.0') 107 | assert version.parse(onnxruntime.__version__) >= version.parse('1.4.0') 108 | 109 | 110 | def get_latency_result(runtimes, batch_size): 111 | latency_ms = sum(runtimes) / float(len(runtimes)) * 1000.0 112 | latency_variance = numpy.var(runtimes, dtype=numpy.float64) * 1000.0 113 | throughput = batch_size * (1000.0 / latency_ms) 114 | 115 | return { 116 | "test_times": len(runtimes), 117 | "latency_variance": "{:.2f}".format(latency_variance), 118 | "latency_90_percentile": "{:.2f}".format(numpy.percentile(runtimes, 90) * 1000.0), 119 | "latency_95_percentile": "{:.2f}".format(numpy.percentile(runtimes, 95) * 1000.0), 120 | "latency_99_percentile": "{:.2f}".format(numpy.percentile(runtimes, 99) * 1000.0), 121 | "average_latency_ms": "{:.2f}".format(latency_ms), 122 | "QPS": "{:.2f}".format(throughput), 123 | } 124 | 125 | 126 | def output_details(results, csv_filename): 127 | with open(csv_filename, mode="a", newline='') as csv_file: 128 | column_names = [ 129 | "engine", "version", "device", "precision", "optimizer", "io_binding", "model_name", "inputs", "threads", 130 | "batch_size", "sequence_length", "datetime", "test_times", "QPS", "average_latency_ms", "latency_variance", 131 | "latency_90_percentile", "latency_95_percentile", "latency_99_percentile" 132 | ] 133 | 134 | csv_writer = csv.DictWriter(csv_file, fieldnames=column_names) 135 | csv_writer.writeheader() 136 | for result in results: 137 | csv_writer.writerow(result) 138 | 139 | logger.info(f"Detail results are saved to csv file: {csv_filename}") 140 | 141 | 142 | def output_summary(results, csv_filename, args): 143 | with open(csv_filename, mode="a", newline='') as csv_file: 144 | header_names = [ 145 | "model_name", "inputs", "engine", "version", "device", "precision", "optimizer", "io_binding", "threads" 146 | ] 147 | data_names = [] 148 | for batch_size in args.batch_sizes: 149 | for sequence_length in args.sequence_lengths: 150 | data_names.append(f"b{batch_size}_s{sequence_length}") 151 | 152 | csv_writer = csv.DictWriter(csv_file, fieldnames=header_names + data_names) 153 | csv_writer.writeheader() 154 | for model_name in args.models: 155 | for input_count in [1, 2, 3]: 156 | for engine_name in args.engines: 157 | for io_binding in [True, False, ""]: 158 | for threads in args.num_threads: 159 | row = {} 160 | for result in results: 161 | if result["model_name"] == model_name and result["inputs"] == input_count and result[ 162 | "engine"] == engine_name and result["io_binding"] == io_binding and result[ 163 | "threads"] == threads: 164 | headers = {k: v for k, v in result.items() if k in header_names} 165 | if not row: 166 | row.update(headers) 167 | row.update({k: "" for k in data_names}) 168 | else: 169 | for k in header_names: 170 | assert row[k] == headers[k] 171 | b = result["batch_size"] 172 | s = result["sequence_length"] 173 | row[f"b{b}_s{s}"] = result["average_latency_ms"] 174 | if row: 175 | csv_writer.writerow(row) 176 | 177 | logger.info(f"Summary results are saved to csv file: {csv_filename}") 178 | 179 | 180 | def output_fusion_statistics(model_fusion_statistics, csv_filename): 181 | from transformers import __version__ as transformers_version 182 | with open(csv_filename, mode="a", newline='') as csv_file: 183 | column_names = ["model_filename", "datetime", "transformers", "torch"] + list( 184 | 
next(iter(model_fusion_statistics.values())).keys()) 185 | csv_writer = csv.DictWriter(csv_file, fieldnames=column_names) 186 | csv_writer.writeheader() 187 | for key in model_fusion_statistics.keys(): 188 | model_fusion_statistics[key]["datetime"] = str(datetime.now()) 189 | model_fusion_statistics[key]["transformers"] = transformers_version 190 | model_fusion_statistics[key]["torch"] = torch.__version__ 191 | model_fusion_statistics[key]["model_filename"] = key 192 | csv_writer.writerow(model_fusion_statistics[key]) 193 | logger.info(f"Fusion statistics is saved to csv file: {csv_filename}") 194 | 195 | 196 | def inference_ort(ort_session, ort_inputs, result_template, repeat_times, batch_size): 197 | result = {} 198 | runtimes = timeit.repeat(lambda: ort_session.run(None, ort_inputs), number=1, repeat=repeat_times) 199 | result.update(result_template) 200 | result.update({"io_binding": False}) 201 | result.update(get_latency_result(runtimes, batch_size)) 202 | return result 203 | 204 | 205 | def inference_ort_with_io_binding(ort_session, 206 | ort_inputs, 207 | result_template, 208 | repeat_times, 209 | ort_output_names, 210 | ort_outputs, 211 | output_buffers, 212 | output_buffer_max_sizes, 213 | batch_size, 214 | device, 215 | data_type=numpy.longlong): 216 | result = {} 217 | 218 | # Bind inputs and outputs to onnxruntime session 219 | io_binding = ort_session.io_binding() 220 | # Bind inputs to device 221 | for name in ort_inputs.keys(): 222 | np_input = torch.from_numpy(ort_inputs[name]).to(device) 223 | input_type = IO_BINDING_DATA_TYPE_MAP[str(ort_inputs[name].dtype)] if str( 224 | ort_inputs[name].dtype) in IO_BINDING_DATA_TYPE_MAP else data_type 225 | io_binding.bind_input(name, np_input.device.type, 0, input_type, np_input.shape, np_input.data_ptr()) 226 | # Bind outputs buffers with the sizes needed if not allocated already 227 | if len(output_buffers) == 0: 228 | allocateOutputBuffers(output_buffers, output_buffer_max_sizes, device) 229 | 230 | for i in range(len(ort_output_names)): 231 | io_binding.bind_output(ort_output_names[i], output_buffers[i].device.type, 0, numpy.float32, 232 | ort_outputs[i].shape, output_buffers[i].data_ptr()) 233 | runtimes = timeit.repeat(lambda: ort_session.run_with_iobinding(io_binding), number=1, repeat=repeat_times) 234 | result.update(result_template) 235 | result.update({"io_binding": True}) 236 | result.update(get_latency_result(runtimes, batch_size)) 237 | return result 238 | 239 | 240 | def allocateOutputBuffers(output_buffers, output_buffer_max_sizes, device): 241 | # Allocate output tensors with the largest test size needed. So the allocated memory can be reused 242 | # for each test run. 
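    # Illustrative example (hypothetical sizes): if the largest test configuration produces a
    # last_hidden_state of shape [8, 128, 768], then output_buffer_max_sizes would contain
    # 8*128*768 = 786432, and a single flat float32 buffer of that length is allocated once;
    # bind_output() later binds its data_ptr together with the actual output shape for each run.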
243 | 244 | for i in output_buffer_max_sizes: 245 | output_buffers.append(torch.empty(i, dtype=torch.float32, device=device)) 246 | 247 | 248 | def set_random_seed(seed=123): 249 | """Set random seed manully to get deterministic results""" 250 | import random 251 | random.seed(seed) 252 | numpy.random.seed(seed) 253 | torch.manual_seed(seed) 254 | torch.cuda.manual_seed(seed) 255 | torch.cuda.manual_seed_all(seed) 256 | #torch.backends.cudnn.enabled = False 257 | #torch.backends.cudnn.benchmark = False 258 | #torch.backends.cudnn.deterministic = True 259 | 260 | 261 | def measure_memory(is_gpu, func): 262 | import os 263 | import psutil 264 | from time import sleep 265 | 266 | class MemoryMonitor: 267 | def __init__(self, keep_measuring=True): 268 | self.keep_measuring = keep_measuring 269 | 270 | def measure_cpu_usage(self): 271 | max_usage = 0 272 | while True: 273 | max_usage = max(max_usage, psutil.Process(os.getpid()).memory_info().rss / 1024**2) 274 | sleep(0.005) # 5ms 275 | if not self.keep_measuring: 276 | break 277 | return max_usage 278 | 279 | def measure_gpu_usage(self): 280 | from py3nvml.py3nvml import nvmlInit, nvmlDeviceGetCount, nvmlDeviceGetHandleByIndex, \ 281 | nvmlDeviceGetMemoryInfo, nvmlDeviceGetName, nvmlShutdown, NVMLError 282 | max_gpu_usage = [] 283 | gpu_name = [] 284 | try: 285 | nvmlInit() 286 | deviceCount = nvmlDeviceGetCount() 287 | max_gpu_usage = [0 for i in range(deviceCount)] 288 | gpu_name = [nvmlDeviceGetName(nvmlDeviceGetHandleByIndex(i)) for i in range(deviceCount)] 289 | while True: 290 | for i in range(deviceCount): 291 | info = nvmlDeviceGetMemoryInfo(nvmlDeviceGetHandleByIndex(i)) 292 | max_gpu_usage[i] = max(max_gpu_usage[i], info.used / 1024**2) 293 | sleep(0.005) # 5ms 294 | if not self.keep_measuring: 295 | break 296 | nvmlShutdown() 297 | return [{ 298 | "device_id": i, 299 | "name": gpu_name[i], 300 | "max_used_MB": max_gpu_usage[i] 301 | } for i in range(deviceCount)] 302 | except NVMLError as error: 303 | if not self.silent: 304 | self.logger.error("Error fetching GPU information using nvml: %s", error) 305 | return None 306 | 307 | monitor = MemoryMonitor(False) 308 | 309 | memory_before_test = monitor.measure_gpu_usage() if is_gpu else monitor.measure_cpu_usage() 310 | 311 | from concurrent.futures import ThreadPoolExecutor 312 | with ThreadPoolExecutor() as executor: 313 | monitor = MemoryMonitor() 314 | mem_thread = executor.submit(monitor.measure_gpu_usage if is_gpu else monitor.measure_cpu_usage) 315 | try: 316 | fn_thread = executor.submit(func) 317 | result = fn_thread.result() 318 | finally: 319 | monitor.keep_measuring = False 320 | max_usage = mem_thread.result() 321 | 322 | if is_gpu: 323 | print(f"GPU memory usage: before={memory_before_test} peak={max_usage}") 324 | if len(memory_before_test) >= 1 and len(max_usage) >= 1: 325 | before = memory_before_test[0]["max_used_MB"] 326 | after = max_usage[0]["max_used_MB"] 327 | return after - before 328 | else: 329 | return None 330 | else: 331 | print(f"CPU memory usage: before={memory_before_test:.1f} MB, peak={max_usage:.1f} MB") 332 | return max_usage - memory_before_test 333 | -------------------------------------------------------------------------------- /fusion_base.py: -------------------------------------------------------------------------------- 1 | #------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
4 | #-------------------------------------------------------------------------- 5 | from logging import getLogger 6 | from onnx_model import OnnxModel 7 | from typing import Union, List 8 | from onnx import GraphProto 9 | 10 | logger = getLogger(__name__) 11 | 12 | 13 | class Fusion: 14 | def __init__(self, 15 | model: OnnxModel, 16 | fused_op_type: str, 17 | search_op_types: Union[str, List[str]], 18 | description: str = None): 19 | self.search_op_types: List[str] = [search_op_types] if isinstance(search_op_types, str) else search_op_types 20 | self.fused_op_type: str = fused_op_type 21 | self.description: str = f"{fused_op_type}({description})" if description else fused_op_type 22 | self.model: OnnxModel = model 23 | self.nodes_to_remove: List = [] 24 | self.nodes_to_add: List = [] 25 | self.prune_graph: bool = False 26 | self.node_name_to_graph_name: dict = {} 27 | self.this_graph_name: str = None 28 | # It is optional that subclass updates fused_count since we will also check nodes_to_add to get counter. 29 | self.fused_count: int = 0 30 | 31 | def apply(self): 32 | logger.debug(f"start {self.description} fusion...") 33 | input_name_to_nodes = self.model.input_name_to_nodes() 34 | output_name_to_node = self.model.output_name_to_node() 35 | 36 | # This assumes that two search ops will not be fused at same time! 37 | for search_op_type in self.search_op_types: 38 | for node in self.model.get_nodes_by_op_type(search_op_type): 39 | graph = self.model.get_graph_by_node(node) 40 | if graph is None: 41 | raise Exception("Can not find node in any graphs") 42 | self.this_graph_name = graph.name 43 | self.fuse(node, input_name_to_nodes, output_name_to_node) 44 | 45 | op_list = [node.op_type for node in self.nodes_to_add] 46 | count = max(self.fused_count, op_list.count(self.fused_op_type)) 47 | if count > 0: 48 | logger.info(f"Fused {self.description} count: {count}") 49 | 50 | self.model.remove_nodes(self.nodes_to_remove) 51 | self.model.add_nodes(self.nodes_to_add, self.node_name_to_graph_name) 52 | 53 | if self.prune_graph: 54 | self.model.prune_graph() 55 | elif self.nodes_to_remove or self.nodes_to_add: 56 | self.model.update_graph() 57 | -------------------------------------------------------------------------------- /fusion_biasgelu.py: -------------------------------------------------------------------------------- 1 | #------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
4 | #-------------------------------------------------------------------------- 5 | 6 | from logging import getLogger 7 | from onnx import helper 8 | from onnx_model import OnnxModel 9 | from fusion_base import Fusion 10 | from fusion_utils import NumpyHelper 11 | 12 | logger = getLogger(__name__) 13 | 14 | 15 | class FusionBiasGelu(Fusion): 16 | def __init__(self, model: OnnxModel, is_fastgelu): 17 | if is_fastgelu: 18 | super().__init__(model, 'FastGelu', 'FastGelu', 'add bias') 19 | else: 20 | super().__init__(model, 'BiasGelu', 'Gelu') 21 | 22 | def fuse(self, node, input_name_to_nodes, output_name_to_node): 23 | gelu_op_type = node.op_type 24 | fuse_op_type = 'BiasGelu' if gelu_op_type == 'Gelu' else 'FastGelu' 25 | 26 | if len(node.input) != 1: 27 | return 28 | 29 | nodes = self.model.match_parent_path(node, ['Add', 'MatMul'], [0, None]) 30 | if nodes is None: 31 | return 32 | (add, matmul) = nodes 33 | 34 | bias_weight = None 35 | # bias should be one dimension 36 | bias_index = -1 37 | for i, input in enumerate(add.input): 38 | initializer = self.model.get_initializer(input) 39 | if initializer is None: 40 | continue 41 | bias_index = i 42 | bias_weight = NumpyHelper.to_array(initializer) 43 | break 44 | if bias_weight is None: 45 | return 46 | if len(bias_weight.shape) != 1: 47 | return 48 | 49 | subgraph_nodes = [node, add] 50 | if not self.model.is_safe_to_fuse_nodes(subgraph_nodes, [node.output[0]], input_name_to_nodes, 51 | output_name_to_node): 52 | return 53 | 54 | self.nodes_to_remove.extend(subgraph_nodes) 55 | 56 | fused_node = helper.make_node(fuse_op_type, 57 | inputs=[matmul.output[0], add.input[bias_index]], 58 | outputs=node.output, 59 | name=self.model.create_node_name(fuse_op_type, gelu_op_type + "_AddBias_")) 60 | fused_node.domain = "com.microsoft" 61 | self.nodes_to_add.append(fused_node) 62 | self.node_name_to_graph_name[fused_node.name] = self.this_graph_name 63 | -------------------------------------------------------------------------------- /fusion_fastgelu.py: -------------------------------------------------------------------------------- 1 | #------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | #-------------------------------------------------------------------------- 5 | from typing import Dict, Optional 6 | from logging import getLogger 7 | from onnx import helper 8 | from onnx_model import OnnxModel 9 | from fusion_base import Fusion 10 | 11 | logger = getLogger(__name__) 12 | 13 | 14 | class FusionFastGelu(Fusion): 15 | def __init__(self, model: OnnxModel): 16 | super().__init__(model, "FastGelu", "Tanh") 17 | 18 | def fuse(self, tanh_node, input_name_to_nodes: Dict, output_name_to_node: Dict): 19 | if self.fuse_1(tanh_node, input_name_to_nodes, output_name_to_node): 20 | return 21 | 22 | if self.fuse_2(tanh_node, input_name_to_nodes, output_name_to_node): 23 | return 24 | 25 | if self.fuse_3(tanh_node, input_name_to_nodes, output_name_to_node): 26 | return 27 | 28 | def fuse_1(self, tanh_node, input_name_to_nodes, output_name_to_node) -> Optional[bool]: 29 | """ 30 | Fuse Gelu with tanh into one node: 31 | +---------------------------+ 32 | | | 33 | | v 34 | [root] --> Pow --> Mul -----> Add --> Mul --> Tanh --> Add --> Mul 35 | | (Y=3) (B=0.0447...) (B=0.7978...) 
(B=1) ^ 36 | | | 37 | +------> Mul(B=0.5)--------------------------------------------+ 38 | Note that constant input for Add and Mul could be first or second input: like either A=0.5 or B=0.5 is fine. 39 | """ 40 | if tanh_node.output[0] not in input_name_to_nodes: 41 | return 42 | children = input_name_to_nodes[tanh_node.output[0]] 43 | if len(children) != 1 or children[0].op_type != 'Add': 44 | return 45 | add_after_tanh = children[0] 46 | 47 | if not self.model.has_constant_input(add_after_tanh, 1.0): 48 | return 49 | 50 | if add_after_tanh.output[0] not in input_name_to_nodes: 51 | return 52 | children = input_name_to_nodes[add_after_tanh.output[0]] 53 | if len(children) != 1 or children[0].op_type != 'Mul': 54 | return 55 | mul_after_tanh = children[0] 56 | 57 | mul_half = self.model.match_parent(mul_after_tanh, 'Mul', None, output_name_to_node) 58 | if mul_half is None: 59 | return 60 | 61 | i = self.model.find_constant_input(mul_half, 0.5) 62 | if i < 0: 63 | return 64 | 65 | root_input = mul_half.input[0 if i == 1 else 1] 66 | 67 | #root_node could be None when root_input is graph input 68 | root_node = self.model.get_parent(mul_half, 0 if i == 1 else 1, output_name_to_node) 69 | 70 | mul_before_tanh = self.model.match_parent(tanh_node, 'Mul', 0, output_name_to_node) 71 | if mul_before_tanh is None: 72 | return 73 | 74 | i = self.model.find_constant_input(mul_before_tanh, 0.7978, delta=0.0001) 75 | if i < 0: 76 | return 77 | 78 | add_before_tanh = self.model.match_parent(mul_before_tanh, 'Add', 0 if i == 1 else 1, output_name_to_node) 79 | if add_before_tanh is None: 80 | return 81 | 82 | mul_after_pow = self.model.match_parent(add_before_tanh, 83 | 'Mul', 84 | None, 85 | output_name_to_node, 86 | exclude=[root_node] if root_node else []) 87 | if mul_after_pow is None: 88 | return 89 | 90 | i = self.model.find_constant_input(mul_after_pow, 0.0447, delta=0.0001) 91 | if i < 0: 92 | return 93 | 94 | pow = self.model.match_parent(mul_after_pow, 'Pow', 0 if i == 1 else 1, output_name_to_node) 95 | if pow is None: 96 | return 97 | 98 | if not self.model.has_constant_input(pow, 3.0): 99 | return 100 | 101 | if pow.input[0] != root_input: 102 | return 103 | 104 | subgraph_nodes = [ 105 | mul_after_tanh, mul_half, add_after_tanh, tanh_node, mul_before_tanh, add_before_tanh, mul_after_pow, pow 106 | ] 107 | if not self.model.is_safe_to_fuse_nodes(subgraph_nodes, [mul_after_tanh.output[0]], input_name_to_nodes, 108 | output_name_to_node): 109 | return 110 | 111 | self.nodes_to_remove.extend(subgraph_nodes) 112 | fused_node = helper.make_node('FastGelu', 113 | inputs=[root_input], 114 | outputs=mul_after_tanh.output, 115 | name=self.model.create_node_name('FastGelu')) 116 | fused_node.domain = "com.microsoft" 117 | self.nodes_to_add.append(fused_node) 118 | self.node_name_to_graph_name[fused_node.name] = self.this_graph_name 119 | return True 120 | 121 | def fuse_2(self, tanh_node, input_name_to_nodes: Dict, output_name_to_node: Dict) -> Optional[bool]: 122 | """ 123 | This pattern is from Tensorflow model. 124 | Fuse Gelu with tanh into one node: 125 | +---------------------------+ 126 | | | 127 | | v 128 | [root] --> Pow --> Mul -----> Add --> Mul --> Tanh --> Add --> Mul(B=0.5)-->Mul--> 129 | | (Y=3) (B=0.0447...) (B=0.7978...) (B=1) ^ 130 | | | 131 | +---------------------------------------------------------------------------+ 132 | Note that constant input for Add and Mul could be first or second input: like either A=0.5 or B=0.5 is fine. 
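        For reference, this subgraph computes the tanh-based Gelu approximation: Gelu(x) ≈ 0.5 * x * (1 + tanh(0.7978845608 * (x + 0.044715 * x^3))), with 0.7978845608 ≈ sqrt(2/pi); the fusion matches these constants approximately (delta=0.0001).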
133 | """ 134 | if tanh_node.output[0] not in input_name_to_nodes: 135 | return 136 | children = input_name_to_nodes[tanh_node.output[0]] 137 | if len(children) != 1 or children[0].op_type != 'Add': 138 | return 139 | add_after_tanh = children[0] 140 | 141 | if not self.model.has_constant_input(add_after_tanh, 1.0): 142 | return 143 | 144 | if add_after_tanh.output[0] not in input_name_to_nodes: 145 | return 146 | children = input_name_to_nodes[add_after_tanh.output[0]] 147 | if len(children) != 1 or children[0].op_type != 'Mul': 148 | return 149 | mul_half = children[0] 150 | 151 | i = self.model.find_constant_input(mul_half, 0.5) 152 | if i < 0: 153 | return 154 | 155 | if mul_half.output[0] not in input_name_to_nodes: 156 | return 157 | children = input_name_to_nodes[mul_half.output[0]] 158 | if len(children) != 1 or children[0].op_type != 'Mul': 159 | return 160 | mul_after_mul_half = children[0] 161 | 162 | root_node = self.model.get_parent(mul_after_mul_half, 163 | 0 if mul_after_mul_half.input[1] == mul_half.output[0] else 1, 164 | output_name_to_node) 165 | if root_node is None: 166 | return 167 | 168 | mul_before_tanh = self.model.match_parent(tanh_node, 'Mul', 0, output_name_to_node) 169 | if mul_before_tanh is None: 170 | return 171 | 172 | i = self.model.find_constant_input(mul_before_tanh, 0.7978, delta=0.0001) 173 | if i < 0: 174 | return 175 | 176 | add_before_tanh = self.model.match_parent(mul_before_tanh, 'Add', 0 if i == 1 else 1, output_name_to_node) 177 | if add_before_tanh is None: 178 | return 179 | 180 | mul_after_pow = self.model.match_parent(add_before_tanh, 'Mul', None, output_name_to_node, exclude=[root_node]) 181 | if mul_after_pow is None: 182 | return 183 | 184 | i = self.model.find_constant_input(mul_after_pow, 0.0447, delta=0.0001) 185 | if i < 0: 186 | return 187 | 188 | pow = self.model.match_parent(mul_after_pow, 'Pow', 0 if i == 1 else 1, output_name_to_node) 189 | if pow is None: 190 | return 191 | 192 | if not self.model.has_constant_input(pow, 3.0): 193 | return 194 | 195 | if pow.input[0] != root_node.output[0]: 196 | return 197 | 198 | subgraph_nodes = [ 199 | mul_after_mul_half, mul_half, add_after_tanh, tanh_node, mul_before_tanh, add_before_tanh, mul_after_pow, 200 | pow 201 | ] 202 | if not self.model.is_safe_to_fuse_nodes(subgraph_nodes, [mul_after_mul_half.output[0]], input_name_to_nodes, 203 | output_name_to_node): 204 | return 205 | 206 | self.nodes_to_remove.extend(subgraph_nodes) 207 | fused_node = helper.make_node('FastGelu', 208 | inputs=[root_node.output[0]], 209 | outputs=mul_after_mul_half.output, 210 | name=self.model.create_node_name('FastGelu')) 211 | fused_node.domain = "com.microsoft" 212 | self.nodes_to_add.append(fused_node) 213 | self.node_name_to_graph_name[fused_node.name] = self.this_graph_name 214 | return True 215 | 216 | def fuse_3(self, tanh_node, input_name_to_nodes: Dict, output_name_to_node: Dict) -> Optional[bool]: 217 | """ 218 | OpenAI's gelu implementation, also used in Megatron: 219 | Gelu(x) = x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1.0 + 0.044715 * x * x))) 220 | 221 | Fuse subgraph into a FastGelu node: 222 | +------------ Mul (B=0.79788456) -------------------+ 223 | | | 224 | +-------------------------------+ | 225 | | | | 226 | | v v 227 | [root] --> Mul (B=0.044715) --> Mul --> Add(B=1) --> Mul --> Tanh --> Add(B=1) --> Mul--> 228 | | ^ 229 | | | 230 | +-----------> Mul (B=0.5) --------------------------------------------------------+ 231 | """ 232 | if tanh_node.output[0] not in 
input_name_to_nodes: 233 | return 234 | 235 | children = input_name_to_nodes[tanh_node.output[0]] 236 | if len(children) != 1 or children[0].op_type != 'Add': 237 | return 238 | add_after_tanh = children[0] 239 | 240 | if not self.model.has_constant_input(add_after_tanh, 1.0): 241 | return 242 | 243 | if add_after_tanh.output[0] not in input_name_to_nodes: 244 | return 245 | children = input_name_to_nodes[add_after_tanh.output[0]] 246 | if len(children) != 1 or children[0].op_type != 'Mul': 247 | return 248 | mul_last = children[0] 249 | 250 | mul_half = self.model.match_parent(mul_last, 'Mul', None, output_name_to_node) 251 | if mul_half is None: 252 | return 253 | 254 | i = self.model.find_constant_input(mul_half, 0.5) 255 | if i < 0: 256 | return 257 | 258 | root_input = mul_half.input[0 if i == 1 else 1] 259 | 260 | mul_before_tanh = self.model.match_parent(tanh_node, 'Mul', 0, output_name_to_node) 261 | if mul_before_tanh is None: 262 | return 263 | 264 | add_1 = self.model.match_parent(mul_before_tanh, 'Add', None, output_name_to_node) 265 | if add_1 is None: 266 | return 267 | j = self.model.find_constant_input(add_1, 1.0) 268 | if j < 0: 269 | return 270 | 271 | mul_7978 = self.model.match_parent(mul_before_tanh, 'Mul', None, output_name_to_node) 272 | if mul_7978 is None: 273 | return 274 | k = self.model.find_constant_input(mul_7978, 0.7978, delta=0.0001) 275 | if k < 0: 276 | return 277 | if mul_7978.input[0 if k == 1 else 1] != root_input: 278 | return 279 | 280 | mul_before_add_1 = self.model.match_parent(add_1, 'Mul', 0 if j == 1 else 1, output_name_to_node) 281 | if mul_before_add_1 is None: 282 | return 283 | 284 | if mul_before_add_1.input[0] == root_input: 285 | another = 1 286 | elif mul_before_add_1.input[1] == root_input: 287 | another = 0 288 | else: 289 | return 290 | 291 | mul_0447 = self.model.match_parent(mul_before_add_1, 'Mul', another, output_name_to_node) 292 | if mul_0447 is None: 293 | return 294 | m = self.model.find_constant_input(mul_0447, 0.0447, delta=0.0001) 295 | if m < 0: 296 | return 297 | 298 | if mul_0447.input[0 if m == 1 else 1] != root_input: 299 | return 300 | 301 | subgraph_nodes = [ 302 | mul_0447, mul_before_add_1, add_1, mul_before_tanh, tanh_node, add_after_tanh, mul_7978, mul_half, mul_last 303 | ] 304 | if not self.model.is_safe_to_fuse_nodes(subgraph_nodes, [mul_last.output[0]], input_name_to_nodes, 305 | output_name_to_node): 306 | return 307 | 308 | self.nodes_to_remove.extend(subgraph_nodes) 309 | fused_node = helper.make_node('FastGelu', 310 | inputs=[root_input], 311 | outputs=mul_last.output, 312 | name=self.model.create_node_name('FastGelu')) 313 | fused_node.domain = "com.microsoft" 314 | self.nodes_to_add.append(fused_node) 315 | self.node_name_to_graph_name[fused_node.name] = self.this_graph_name 316 | return True 317 | -------------------------------------------------------------------------------- /fusion_gelu.py: -------------------------------------------------------------------------------- 1 | #------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
4 | #-------------------------------------------------------------------------- 5 | from typing import Dict, Optional 6 | from logging import getLogger 7 | from onnx import helper 8 | from onnx_model import OnnxModel 9 | from fusion_base import Fusion 10 | 11 | logger = getLogger(__name__) 12 | 13 | 14 | class FusionGelu(Fusion): 15 | def __init__(self, model: OnnxModel): 16 | super().__init__(model, "Gelu", "Erf") 17 | 18 | def fuse(self, erf_node, input_name_to_nodes: Dict, output_name_to_node: Dict): 19 | if self.fuse_1(erf_node, input_name_to_nodes, output_name_to_node): 20 | return 21 | if self.fuse_2(erf_node, input_name_to_nodes, output_name_to_node): 22 | return 23 | self.fuse_3(erf_node, input_name_to_nodes, output_name_to_node) 24 | 25 | def fuse_1(self, erf_node, input_name_to_nodes: Dict, output_name_to_node: Dict) -> Optional[bool]: 26 | """ 27 | This pattern is from PyTorch model 28 | Fuse Gelu with Erf into one node: 29 | Pattern 1: 30 | +-------Mul(0.5)---------------------+ 31 | | | 32 | | v 33 | [root] --> Div -----> Erf --> Add --> Mul --> 34 | (B=1.4142...) (1) 35 | 36 | Pattern 2: 37 | +------------------------------------+ 38 | | | 39 | | v 40 | [root] --> Div -----> Erf --> Add --> Mul -->Mul --> 41 | (B=1.4142...) (1) (0.5) 42 | 43 | Note that constant input for Add and Mul could be first or second input: like either A=0.5 or B=0.5 is fine. 44 | """ 45 | if erf_node.output[0] not in input_name_to_nodes: 46 | return 47 | children = input_name_to_nodes[erf_node.output[0]] 48 | if len(children) != 1 or children[0].op_type != 'Add': 49 | return 50 | add_after_erf = children[0] 51 | 52 | if not self.model.has_constant_input(add_after_erf, 1): 53 | return 54 | 55 | if add_after_erf.output[0] not in input_name_to_nodes: 56 | return 57 | children = input_name_to_nodes[add_after_erf.output[0]] 58 | if len(children) != 1 or children[0].op_type != 'Mul': 59 | return 60 | mul_after_erf = children[0] 61 | 62 | div = self.model.match_parent(erf_node, 'Div', 0, output_name_to_node) 63 | if div is None: 64 | return 65 | 66 | if self.model.find_constant_input(div, 1.4142, delta=0.001) != 1: 67 | return 68 | 69 | subgraph_input = div.input[0] 70 | 71 | another = 1 if mul_after_erf.input[0] == add_after_erf.output[0] else 0 72 | if subgraph_input == mul_after_erf.input[another]: # pattern 2 73 | children = input_name_to_nodes[mul_after_erf.output[0]] 74 | if len(children) != 1 or children[0].op_type != 'Mul': 75 | return 76 | mul_half = children[0] 77 | if not self.model.has_constant_input(mul_half, 0.5): 78 | return 79 | subgraph_output = mul_half.output[0] 80 | else: # pattern 1 81 | mul_half = self.model.match_parent(mul_after_erf, 'Mul', another, output_name_to_node) 82 | if mul_half is None: 83 | return 84 | 85 | if not self.model.has_constant_input(mul_half, 0.5): 86 | return 87 | 88 | if subgraph_input not in mul_half.input: 89 | return 90 | 91 | subgraph_output = mul_after_erf.output[0] 92 | 93 | subgraph_nodes = [div, erf_node, add_after_erf, mul_after_erf, mul_half] 94 | if not self.model.is_safe_to_fuse_nodes(subgraph_nodes, [subgraph_output], input_name_to_nodes, 95 | output_name_to_node): 96 | return 97 | 98 | self.nodes_to_remove.extend(subgraph_nodes) 99 | fused_node = helper.make_node('Gelu', inputs=[subgraph_input], outputs=[subgraph_output]) 100 | fused_node.domain = "com.microsoft" 101 | self.nodes_to_add.append(fused_node) 102 | self.node_name_to_graph_name[fused_node.name] = self.this_graph_name 103 | return True 104 | 105 | def fuse_2(self, erf_node, 
input_name_to_nodes: Dict, output_name_to_node: Dict) -> Optional[bool]: 106 | """ 107 | This pattern is from Keras model 108 | Fuse Gelu with Erf into one node: 109 | +------------------------------------------+ 110 | | | 111 | | v 112 | [root] --> Div -----> Erf --> Add --> Mul -->Mul 113 | (B=1.4142...) (A=1) (A=0.5) 114 | 115 | Note that constant input for Add and Mul could be first or second input: like either A=0.5 or B=0.5 is fine. 116 | """ 117 | if erf_node.output[0] not in input_name_to_nodes: 118 | return 119 | children = input_name_to_nodes[erf_node.output[0]] 120 | if len(children) != 1 or children[0].op_type != 'Add': 121 | return 122 | add_after_erf = children[0] 123 | 124 | if not self.model.has_constant_input(add_after_erf, 1): 125 | return 126 | 127 | if add_after_erf.output[0] not in input_name_to_nodes: 128 | return 129 | children = input_name_to_nodes[add_after_erf.output[0]] 130 | if len(children) != 1 or children[0].op_type != 'Mul': 131 | return 132 | mul_after_erf = children[0] 133 | 134 | if not self.model.has_constant_input(mul_after_erf, 0.5): 135 | return 136 | 137 | if mul_after_erf.output[0] not in input_name_to_nodes: 138 | return 139 | children = input_name_to_nodes[mul_after_erf.output[0]] 140 | if len(children) != 1 or children[0].op_type != 'Mul': 141 | return 142 | mul = children[0] 143 | 144 | div = self.model.match_parent(erf_node, 'Div', 0, output_name_to_node) 145 | if div is None: 146 | return 147 | 148 | sqrt_node = None 149 | if self.model.find_constant_input(div, 1.4142, delta=0.001) != 1: 150 | sqrt_node = self.model.match_parent(div, 'Sqrt', 1, output_name_to_node) 151 | if sqrt_node is None: 152 | return 153 | if not self.model.has_constant_input(sqrt_node, 2.0): 154 | return 155 | 156 | root_node = self.model.get_parent(div, 0, output_name_to_node) 157 | if root_node is None: 158 | return 159 | 160 | if root_node.output[0] not in mul.input: 161 | return 162 | 163 | subgraph_nodes = [div, erf_node, add_after_erf, mul_after_erf, mul] 164 | if sqrt_node: 165 | subgraph_nodes.append(sqrt_node) 166 | 167 | if not self.model.is_safe_to_fuse_nodes(subgraph_nodes, [mul.output[0]], input_name_to_nodes, 168 | output_name_to_node): 169 | return 170 | 171 | self.nodes_to_remove.extend(subgraph_nodes) 172 | fused_node = helper.make_node('Gelu', inputs=[root_node.output[0]], outputs=[mul.output[0]]) 173 | fused_node.domain = "com.microsoft" 174 | self.nodes_to_add.append(fused_node) 175 | self.node_name_to_graph_name[fused_node.name] = self.this_graph_name 176 | return True 177 | 178 | def fuse_3(self, erf_node, input_name_to_nodes: Dict, output_name_to_node: Dict) -> Optional[bool]: 179 | """ 180 | This pattern is from TensorFlow model 181 | Fuse Gelu with Erf into one node: 182 | +----------------------------------------------+ 183 | | | 184 | | v 185 | [root] --> Mul -----> Erf --> Add --> Mul -->Mul 186 | (A=0.7071067690849304) (B=1) (B=0.5) 187 | 188 | Note that constant input for Add and Mul could be first or second input: like either A=0.5 or B=0.5 is fine. 
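        For reference, 0.7071067690849304 ≈ 1/sqrt(2), so this subgraph computes the exact (erf-based) Gelu: Gelu(x) = 0.5 * x * (1 + erf(x / sqrt(2))).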
189 | """ 190 | 191 | if erf_node.output[0] not in input_name_to_nodes: 192 | return 193 | children = input_name_to_nodes[erf_node.output[0]] 194 | if len(children) != 1 or children[0].op_type != 'Add': 195 | return 196 | add_after_erf = children[0] 197 | 198 | if not self.model.has_constant_input(add_after_erf, 1): 199 | return 200 | 201 | if add_after_erf.output[0] not in input_name_to_nodes: 202 | return 203 | children = input_name_to_nodes[add_after_erf.output[0]] 204 | if len(children) != 1 or children[0].op_type != 'Mul': 205 | return 206 | mul_half = children[0] 207 | 208 | if not self.model.has_constant_input(mul_half, 0.5): 209 | return 210 | 211 | first_mul = self.model.match_parent(erf_node, 'Mul', 0, output_name_to_node) 212 | if first_mul is None: 213 | return 214 | 215 | i = self.model.find_constant_input(first_mul, 0.7071067690849304, delta=0.001) 216 | if i < 0: 217 | return 218 | 219 | root_node = self.model.get_parent(first_mul, 0 if i == 1 else 1, output_name_to_node) 220 | if root_node is None: 221 | return 222 | 223 | if mul_half.output[0] not in input_name_to_nodes: 224 | return 225 | children = input_name_to_nodes[mul_half.output[0]] 226 | if len(children) != 1 or children[0].op_type != 'Mul': 227 | return 228 | last_mul = children[0] 229 | 230 | if not (last_mul.input[0] == root_node.output[0] or last_mul.input[1] == root_node.output[0]): 231 | return 232 | 233 | subgraph_nodes = [first_mul, erf_node, add_after_erf, mul_half, last_mul] 234 | if not self.model.is_safe_to_fuse_nodes(subgraph_nodes, [last_mul.output[0]], input_name_to_nodes, 235 | output_name_to_node): 236 | return 237 | 238 | self.nodes_to_remove.extend(subgraph_nodes) 239 | fused_node = helper.make_node('Gelu', inputs=[root_node.output[0]], outputs=[last_mul.output[0]]) 240 | fused_node.domain = "com.microsoft" 241 | self.nodes_to_add.append(fused_node) 242 | self.node_name_to_graph_name[fused_node.name] = self.this_graph_name 243 | return True 244 | -------------------------------------------------------------------------------- /fusion_gelu_approximation.py: -------------------------------------------------------------------------------- 1 | #------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | #-------------------------------------------------------------------------- 5 | 6 | from logging import getLogger 7 | from onnx import helper 8 | from onnx_model import OnnxModel 9 | from fusion_base import Fusion 10 | 11 | 12 | class FusionGeluApproximation(Fusion): 13 | def __init__(self, model: OnnxModel): 14 | super().__init__(model, 'FastGelu', ['Gelu', 'BiasGelu'], 'GeluApproximation') 15 | 16 | def fuse(self, node, input_name_to_nodes, output_name_to_node): 17 | new_node = helper.make_node("FastGelu", 18 | inputs=node.input, 19 | outputs=node.output, 20 | name=self.model.create_node_name("FastGelu", node.op_type + "_Approximation")) 21 | new_node.domain = "com.microsoft" 22 | self.nodes_to_remove.append(node) 23 | self.nodes_to_add.append(new_node) 24 | self.node_name_to_graph_name[new_node.name] = self.this_graph_name 25 | -------------------------------------------------------------------------------- /fusion_gpt_attention_megatron.py: -------------------------------------------------------------------------------- 1 | #------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 
3 | # Licensed under the MIT License. 4 | #-------------------------------------------------------------------------- 5 | import numpy as np 6 | from logging import getLogger 7 | from onnx import helper, numpy_helper, TensorProto 8 | from onnx_model import OnnxModel 9 | from fusion_base import Fusion 10 | from fusion_utils import FusionUtils 11 | from fusion_gpt_attention import FusionGptAttentionPastBase 12 | 13 | logger = getLogger(__name__) 14 | 15 | 16 | def is_close(value, expected_value): 17 | return abs(value - expected_value) <= 1e-6 18 | 19 | 20 | class FusionGptAttentionMegatron(FusionGptAttentionPastBase): 21 | """ 22 | Fuse GPT-2 Attention with past state subgraph from Megatron into one Attention node. 23 | """ 24 | def __init__(self, model: OnnxModel, num_heads: int): 25 | super().__init__(model, num_heads) 26 | 27 | def fuse_attention_node(self, matmul_before_split, add_before_split, past, present, input, reshape_qkv, mask): 28 | attention_node_name = self.model.create_node_name('GptAttention') 29 | int32_mask = self.cast_attention_mask(mask) 30 | output = reshape_qkv.output[0] 31 | i = 1 if (add_before_split.input[0] == matmul_before_split.output[0]) else 0 32 | attention_node = helper.make_node( 33 | 'Attention', 34 | inputs=[input, matmul_before_split.input[1], add_before_split.input[i], int32_mask, past], 35 | outputs=[output, present], 36 | name=attention_node_name) 37 | attention_node.domain = "com.microsoft" 38 | attention_node.attribute.extend([ 39 | helper.make_attribute("num_heads", self.num_heads), 40 | helper.make_attribute("unidirectional", 0) # unidirectional shall not be ON for 4D attention mask 41 | ]) 42 | 43 | nodes_to_add = [attention_node] 44 | self.nodes_to_add.extend(nodes_to_add) 45 | 46 | for node in nodes_to_add: 47 | self.node_name_to_graph_name[node.name] = self.this_graph_name 48 | 49 | self.nodes_to_remove.append(reshape_qkv) 50 | 51 | # we rely on prune_graph() to clean old subgraph nodes 52 | self.prune_graph = True 53 | 54 | def match_mask(self, sub_qk, mul_qk, matmul_qk, layernorm_before_attention): 55 | mask_nodes = self.model.match_parent_path( 56 | sub_qk, 57 | ['Mul', 'Sub', 'Slice', 'Slice'], 58 | [1, 0, 1, 0]) # yapf: disable 59 | if mask_nodes is None: 60 | logger.debug("fuse_attention: failed to match unidirectional mask path") 61 | return None 62 | (mul_mask, sub_mask, last_slice_mask, slice_mask) = mask_nodes 63 | 64 | if mul_qk.input[1] != last_slice_mask.output[0]: 65 | logger.debug("fuse_attention failed: mul_qk.input[1] != last_slice_mask.output[0]") 66 | return None 67 | 68 | if not self.utils.check_node_input_value(mul_mask, 1, 10000.0): 69 | logger.debug("fuse_attention failed: mul_mask input 1 is not constant 10000.0") 70 | return None 71 | 72 | if not self.utils.check_node_input_value(sub_mask, 0, 1.0): 73 | logger.debug("fuse_attention failed: sub_mask input 0 is not constant 1.0") 74 | return None 75 | 76 | if not self.model.find_graph_input(slice_mask.input[0]): 77 | logger.info("expect slick_mask input 0 to be graph input") 78 | return None 79 | 80 | if not self.utils.check_node_input_value(last_slice_mask, 1, [0]): 81 | logger.debug("fuse_attention failed: last_slice_mask input 1 (starts) is not constant [0]") 82 | return None 83 | 84 | if not self.utils.check_node_input_value(last_slice_mask, 3, [3]): 85 | logger.debug("fuse_attention failed: last_slice_mask input 3 (axes) is not constant [3]") 86 | return False 87 | 88 | if not self.utils.check_node_input_value(last_slice_mask, 4, [1]): 89 | 
logger.debug("fuse_attention failed: last_slice_mask input 4 (steps) is not constant [1]") 90 | return False 91 | 92 | if not self.utils.check_node_input_value(slice_mask, 3, [2]): 93 | logger.debug("fuse_attention failed: slice_mask input 3 (axes) is not constant [2]") 94 | return None 95 | 96 | if not self.utils.check_node_input_value(slice_mask, 4, [1]): 97 | logger.debug("fuse_attention failed: slice_mask input 4 (steps) is not constant [1]") 98 | return None 99 | 100 | last_slice_path = self.model.match_parent_path(last_slice_mask, ['Unsqueeze', 'Gather', 'Shape', 'MatMul'], 101 | [2, 0, 0, 0]) 102 | if last_slice_path is None or last_slice_path[-1] != matmul_qk: 103 | logger.debug("fuse_attention: failed to match last slice path") 104 | return None 105 | 106 | first_slice_path = self.model.match_parent_path(slice_mask, ['Unsqueeze', 'Gather', 'Shape', 'MatMul'], 107 | [2, 0, 0, 0]) 108 | if first_slice_path is None or first_slice_path[-1] != matmul_qk: 109 | logger.debug("fuse_attention: failed to match first slice path") 110 | return None 111 | 112 | first_slice_sub = self.model.match_parent_path(slice_mask, ['Unsqueeze', 'Sub', 'Gather', 'Shape', 'MatMul'], 113 | [1, 0, 0, 0, 0]) 114 | if first_slice_sub is None or first_slice_sub[-1] != matmul_qk: 115 | logger.debug("fuse_attention: failed to match last slice sub path") 116 | return None 117 | 118 | first_slice_sub_1 = self.model.match_parent_path(slice_mask, 119 | ['Unsqueeze', 'Sub', 'Gather', 'Shape', 'LayerNormalization'], 120 | [1, 0, 1, 0, 0]) 121 | if first_slice_sub_1 is None or first_slice_sub_1[-1] != layernorm_before_attention: 122 | logger.debug("fuse_attention: failed to match last slice sub path 1") 123 | return None 124 | 125 | return slice_mask.input[0] 126 | 127 | def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): 128 | past = None 129 | present = None 130 | 131 | qkv_nodes = self.model.match_parent_path( 132 | normalize_node, 133 | ['Add', 'Add', 'MatMul', 'Reshape', 'Transpose', 'MatMul'], 134 | [ 0, 1, None, 0, 0, 0], 135 | output_name_to_node=output_name_to_node, 136 | ) # yapf: disable 137 | if qkv_nodes is None: 138 | return 139 | (add_skip, add_after_attention, matmul_after_attention, reshape_qkv, transpose_qkv, matmul_qkv) = qkv_nodes 140 | 141 | skip_input = add_skip.input[0] 142 | 143 | v_nodes = self.model.match_parent_path( 144 | matmul_qkv, 145 | ['Concat', 'Transpose', 'Reshape', 'Split', 'Add', 'MatMul', 'LayerNormalization'], 146 | [1, 1, 0, 0, 0, None, 0]) # yapf: disable 147 | if v_nodes is None: 148 | logger.debug("fuse_attention: failed to match v path") 149 | return 150 | (concat_v, transpose_v, reshape_v, split_v, add_before_split, matmul_before_split, 151 | layernorm_before_attention) = v_nodes 152 | if skip_input != layernorm_before_attention.input[0]: 153 | logger.debug("fuse_attention: skip_input != layernorm_before_attention.input[0]") 154 | return 155 | 156 | qk_nodes = self.model.match_parent_path(matmul_qkv, ['Softmax', 'Sub', 'Mul', 'MatMul'], [0, 0, 0, 0]) 157 | if qk_nodes is None: 158 | logger.debug("fuse_attention: failed to match qk path") 159 | return None 160 | (softmax_qk, sub_qk, mul_qk, matmul_qk) = qk_nodes 161 | if self.model.get_node_attribute(softmax_qk, "axis") != 3: 162 | logger.debug("fuse_attention failed: softmax_qk axis != 3") 163 | return None 164 | 165 | attention_mask = self.match_mask(sub_qk, mul_qk, matmul_qk, layernorm_before_attention) 166 | 167 | q_nodes = self.model.match_parent_path(matmul_qk, ['Div', 'Transpose', 'Reshape', 
'Split'], [0, 0, 0, 0]) 168 | if q_nodes is None: 169 | logger.debug("fuse_attention: failed to match q path") 170 | return 171 | (div_q, transpose_q, reshape_q, split_q) = q_nodes 172 | if split_v != split_q: 173 | logger.debug("fuse_attention: skip since split_v != split_q") 174 | return 175 | 176 | k_nodes = self.model.match_parent_path(matmul_qk, 177 | ['Div', 'Transpose', 'Concat', 'Transpose', 'Reshape', 'Split'], 178 | [1, 0, 0, 1, 0, 0]) 179 | if k_nodes is None: 180 | logger.debug("fuse_attention: failed to match k path") 181 | return 182 | (div_k, _, concat_k, transpose_k, reshape_k, split_k) = k_nodes 183 | if split_v != split_k: 184 | logger.debug("fuse_attention: skip since split_v != split_k") 185 | return 186 | 187 | i, value = self.model.get_constant_input(reshape_k) 188 | if not (isinstance(value, np.ndarray) and list(value.shape) == [4] and value[0] == 0 and value[1] == 0 189 | and value[2] > 0 and value[3] > 0): 190 | logger.debug("fuse_attention: reshape constant input is not [0, 0, N, H]") 191 | return 192 | 193 | num_heads = value[2] 194 | if num_heads != self.num_heads: 195 | logger.info(f"Detected num_heads={num_heads}. Ignore user specified value {self.num_heads}") 196 | self.num_heads = num_heads 197 | 198 | hidden_size_per_head = value[3] 199 | i, value = self.model.get_constant_input(div_k) 200 | expected_value = float(np.sqrt(np.sqrt(hidden_size_per_head))) 201 | if not is_close(value, expected_value): 202 | logger.debug(f"fuse_attention: div_k value={value} expected={expected_value}") 203 | return 204 | 205 | i, value = self.model.get_constant_input(div_q) 206 | if not is_close(value, expected_value): 207 | logger.debug(f"fuse_attention: div_q value={value} expected={expected_value}") 208 | return 209 | 210 | # Match past and present paths 211 | past = self.match_past_pattern_2(concat_k, concat_v, output_name_to_node) 212 | if past is None: 213 | logger.debug("fuse_attention: match past failed") 214 | return 215 | if not self.model.find_graph_input(past): 216 | logger.debug("fuse_attention: past is not graph input.") 217 | # For GPT2LMHeadModel_BeamSearchStep, there is an extra Gather node to select beam index so it is not graph input. 218 | 219 | present = self.match_present(concat_v, input_name_to_nodes) 220 | if present is None: 221 | logger.debug("fuse_attention: match present failed") 222 | return 223 | if not self.model.find_graph_output(present): 224 | logger.info("fuse_attention: expect present to be graph output") 225 | return 226 | 227 | self.fuse_attention_node(matmul_before_split, add_before_split, past, present, 228 | layernorm_before_attention.output[0], reshape_qkv, attention_mask) 229 | -------------------------------------------------------------------------------- /fusion_gpt_attention_no_past.py: -------------------------------------------------------------------------------- 1 | #------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | #-------------------------------------------------------------------------- 5 | import numpy as np 6 | from logging import getLogger 7 | from onnx import helper, numpy_helper, TensorProto 8 | from onnx_model import OnnxModel 9 | from fusion_base import Fusion 10 | from fusion_utils import FusionUtils 11 | 12 | logger = getLogger(__name__) 13 | 14 | 15 | class FusionGptAttentionNoPast(Fusion): 16 | """ 17 | Fuse GPT-2 Attention without past state into one Attention node. 
18 | This does not support attention_mask graph input right now. 19 | """ 20 | def __init__(self, model: OnnxModel, num_heads: int): 21 | super().__init__(model, "Attention", "LayerNormalization", "without past") 22 | # TODO: detect num_heads from graph like FusionAttention 23 | self.num_heads = num_heads 24 | 25 | def create_attention_node(self, gemm, gemm_qkv, input, output): 26 | attention_node_name = self.model.create_node_name('Attention') 27 | attention_node = helper.make_node('Attention', 28 | inputs=[input, gemm.input[1], gemm.input[2]], 29 | outputs=[attention_node_name + "_output"], 30 | name=attention_node_name) 31 | attention_node.domain = "com.microsoft" 32 | attention_node.attribute.extend( 33 | [helper.make_attribute("num_heads", self.num_heads), 34 | helper.make_attribute("unidirectional", 1)]) 35 | 36 | matmul_node = helper.make_node('MatMul', 37 | inputs=[attention_node_name + "_output", gemm_qkv.input[1]], 38 | outputs=[attention_node_name + "_matmul_output"], 39 | name=attention_node_name + "_matmul") 40 | 41 | add_node = helper.make_node('Add', 42 | inputs=[attention_node_name + "_matmul_output", gemm_qkv.input[2]], 43 | outputs=[output], 44 | name=attention_node_name + "_add") 45 | 46 | self.nodes_to_add.extend([attention_node, matmul_node, add_node]) 47 | self.node_name_to_graph_name[attention_node.name] = self.this_graph_name 48 | self.node_name_to_graph_name[matmul_node.name] = self.this_graph_name 49 | self.node_name_to_graph_name[add_node.name] = self.this_graph_name 50 | 51 | def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): 52 | return_indice = [] 53 | qkv_nodes = self.model.match_parent_path( 54 | normalize_node, 55 | ['Add', 'Reshape', 'Gemm', 'Reshape', 'Reshape', 'Transpose', 'MatMul'], 56 | [0, None, 0, 0, 0, 0, 0], 57 | output_name_to_node=output_name_to_node, 58 | return_indice=return_indice 59 | ) # yapf: disable 60 | if qkv_nodes is None: 61 | return 62 | (add_qkv, reshape_qkv, gemm_qkv, reshape_1, reshape_2, transpose_qkv, matmul_qkv) = qkv_nodes 63 | 64 | another_input = add_qkv.input[1 - return_indice[0]] 65 | 66 | v_nodes = self.model.match_parent_path( 67 | matmul_qkv, 68 | ['Transpose', 'Reshape', 'Split', 'Reshape', 'Gemm', 'Reshape'], 69 | [1, 0, 0, 0, 0, 0]) # yapf: disable 70 | if v_nodes is None: 71 | logger.debug("fuse_attention: failed to match v path") 72 | return 73 | (transpose_v, reshape_v, split_v, reshape_after_gemm, gemm, reshape_before_gemm) = v_nodes 74 | 75 | layernorm_before_attention = self.model.get_parent(reshape_before_gemm, 0, output_name_to_node) 76 | if layernorm_before_attention is None or layernorm_before_attention.op_type != 'LayerNormalization': 77 | if layernorm_before_attention.op_type != 'Add': 78 | logger.debug(f"failed to get layernorm before gemm. 
Got {layernorm_before_attention.op_type}") 79 | return 80 | 81 | if not another_input in layernorm_before_attention.input: 82 | # match openai-gpt 83 | if not another_input in layernorm_before_attention.output: 84 | logger.debug("Add and LayerNormalization shall have one same input") 85 | return 86 | 87 | qk_nodes = self.model.match_parent_path(matmul_qkv, ['Softmax', 'Sub', 'Mul', 'Div', 'MatMul'], [0, 0, 0, 0, 0]) 88 | if qk_nodes is not None: 89 | (softmax_qk, sub_qk, mul_qk, div_qk, matmul_qk) = qk_nodes 90 | mask_nodes = self.model.match_parent_path( 91 | sub_qk, 92 | ['Mul', 'Sub', 'Slice', 'Slice', 'Unsqueeze', 'Sub', 'Squeeze', 'Slice', 'Shape', 'Div'], 93 | [1, 0, 1, 0, 1, 0, 0, 0, 0, 0]) # yapf: disable 94 | if mask_nodes is None: 95 | logger.debug("fuse_attention: failed to match mask path") 96 | return 97 | div_mask = mask_nodes[-1] 98 | 99 | if div_qk != div_mask: 100 | logger.debug("fuse_attention: skip since div_qk != div_mask") 101 | return 102 | else: 103 | # New pattern for gpt2 from PyTorch 1.5.0 and Transformers 2.9.0. 104 | qk_nodes = self.model.match_parent_path(matmul_qkv, ['Softmax', 'Where', 'Div', 'MatMul'], [0, 0, 1, 0]) 105 | if qk_nodes is not None: 106 | (softmax_qk, where_qk, div_qk, matmul_qk) = qk_nodes 107 | mask_nodes = self.model.match_parent_path( 108 | where_qk, 109 | ['Cast', 'Slice', 'Slice', 'Unsqueeze', 'Sub', 'Squeeze', 'Slice', 'Shape', 'Div'], 110 | [ 0, 0, 0, 1, 0, 0, 0, 0, 0]) # yapf: disable 111 | if mask_nodes is None: 112 | logger.debug("fuse_attention: failed to match mask path") 113 | return 114 | div_mask = mask_nodes[-1] 115 | 116 | if div_qk != div_mask: 117 | logger.debug("fuse_attention: skip since div_qk != div_mask") 118 | return 119 | else: 120 | # match openai-gpt 121 | qk_nodes = self.model.match_parent_path(matmul_qkv, ['Softmax', 'Add', 'Mul', 'Div', 'MatMul'], 122 | [0, 0, 0, 0, 0]) 123 | if qk_nodes is None: 124 | logger.debug("fuse_attention: failed to match qk path") 125 | return 126 | (softmax_qk, add_qk, mul_qk, div_qk, matmul_qk) = qk_nodes 127 | mask_nodes = self.model.match_parent_path( 128 | mul_qk, 129 | ['Slice', 'Slice', 'Unsqueeze', 'Squeeze', 'Slice', 'Shape', 'Div'], 130 | [ 1, 0, 2, 0, 0, 0, 0]) # yapf: disable 131 | if mask_nodes is None: 132 | logger.debug("fuse_attention: failed to match mask path") 133 | return 134 | div_mask = mask_nodes[-1] 135 | 136 | if div_qk != div_mask: 137 | logger.debug("fuse_attention: skip since div_qk != div_mask") 138 | return 139 | 140 | q_nodes = self.model.match_parent_path(matmul_qk, ['Transpose', 'Reshape', 'Split'], [0, 0, 0]) 141 | if q_nodes is None: 142 | logger.debug("fuse_attention: failed to match q path") 143 | return 144 | (transpose_q, reshape_q, split_q) = q_nodes 145 | if split_v != split_q: 146 | logger.debug("fuse_attention: skip since split_v != split_q") 147 | return 148 | 149 | k_nodes = self.model.match_parent_path(matmul_qk, ['Transpose', 'Reshape', 'Split'], [1, 0, 0]) 150 | if k_nodes is None: 151 | logger.debug("fuse_attention: failed to match k path") 152 | return 153 | (transpose_k, reshape_k, split_k) = k_nodes 154 | if split_v != split_k: 155 | logger.debug("fuse_attention: skip since split_v != split_k") 156 | return 157 | 158 | self.create_attention_node(gemm, gemm_qkv, layernorm_before_attention.output[0], reshape_qkv.output[0]) 159 | 160 | # we rely on prune_graph() to clean old subgraph nodes: 161 | # qk_nodes + q_nodes + k_nodes + v_nodes + mask_nodes + [reshape_qkv, transpose_qkv, matmul_qkv] 162 | self.prune_graph = True 163 | 
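# Editor's sketch (not part of the original file): a minimal driver showing how this
# fusion could be invoked on its own, mirroring the __main__ convention used in
# affinity_helper.py. It assumes, based on the rest of this repo, that the Fusion base
# class (fusion_base.py) provides an apply() entry point that visits every matching
# LayerNormalization node and calls fuse() on it, and that OnnxModel exposes
# prune_graph() and keeps the loaded ModelProto on `.model`; the file paths and the
# num_heads value below are hypothetical.
#
# if __name__ == '__main__':
#     import onnx
#     model = OnnxModel(onnx.load("gpt2_no_past.onnx"))
#     fusion = FusionGptAttentionNoPast(model, num_heads=12)
#     fusion.apply()           # assumed entry point inherited from Fusion
#     model.prune_graph()      # drops the matched subgraph nodes (see self.prune_graph above)
#     onnx.save(model.model, "gpt2_no_past_fused.onnx")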
-------------------------------------------------------------------------------- /fusion_layernorm.py: -------------------------------------------------------------------------------- 1 | #------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | #-------------------------------------------------------------------------- 5 | from typing import Dict 6 | from logging import getLogger 7 | from onnx import helper 8 | from onnx_model import OnnxModel 9 | from fusion_base import Fusion 10 | 11 | logger = getLogger(__name__) 12 | 13 | 14 | class FusionLayerNormalization(Fusion): 15 | def __init__(self, model: OnnxModel): 16 | super().__init__(model, "LayerNormalization", "ReduceMean") 17 | 18 | def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict): 19 | """ 20 | Fuse Layer Normalization subgraph into one node LayerNormalization: 21 | +----------------------+ 22 | | | 23 | | v 24 | [Root] --> ReduceMean --> Sub --> Pow --> ReduceMean --> Add --> Sqrt --> Div --> Mul --> Add 25 | (axis=2 or -1) | (Y=2) (axis=2 or -1) (E-6 or E-12 or 0) ^ 26 | | | 27 | +-----------------------------------------------+ 28 | 29 | It also handles cases of duplicated sub nodes exported from older version of PyTorch: 30 | +----------------------+ 31 | | v 32 | | +-------> Sub-----------------------------------------------+ 33 | | | | 34 | | | v 35 | [Root] --> ReduceMean --> Sub --> Pow --> ReduceMean --> Add --> Sqrt --> Div --> Mul --> Add 36 | | ^ 37 | | | 38 | +----------------------+ 39 | """ 40 | children = self.model.get_children(node, input_name_to_nodes) 41 | if len(children) == 0 or len(children) > 2: 42 | return 43 | 44 | root_input = node.input[0] 45 | 46 | if children[0].op_type != 'Sub' or children[0].input[0] != root_input: 47 | return 48 | 49 | if len(children) == 2: 50 | if children[1].op_type != 'Sub' or children[1].input[0] != root_input: 51 | return 52 | 53 | div_node = None 54 | for child in children: 55 | div_node = self.model.find_first_child_by_type(child, 'Div', input_name_to_nodes, recursive=False) 56 | if div_node is not None: 57 | break 58 | if div_node is None: 59 | return 60 | 61 | path_id, parent_nodes, _ = self.model.match_parent_paths( 62 | div_node, [(['Sqrt', 'Add', 'ReduceMean', 'Pow', 'Sub'], [1, 0, 0, 0, 0]), 63 | (['Sqrt', 'Add', 'ReduceMean', 'Pow', 'Cast', 'Sub'], [1, 0, 0, 0, 0, 0])], output_name_to_node) 64 | if path_id < 0: 65 | return 66 | 67 | sub_node = parent_nodes[-1] 68 | if sub_node not in children: 69 | return 70 | 71 | second_add_node = parent_nodes[1] 72 | i, add_weight = self.model.get_constant_input(second_add_node) 73 | if add_weight is None or add_weight <= 0 or add_weight > 1.0E-4: 74 | logger.warning(f"epsilon value is not expeced: {add_weight}") 75 | return 76 | 77 | pow_node = parent_nodes[3] 78 | if not self.model.find_constant_input(pow_node, 2.0) == 1: 79 | return 80 | 81 | mul_node = input_name_to_nodes[div_node.output[0]][0] 82 | if mul_node.op_type != 'Mul': 83 | return 84 | 85 | last_add_node = input_name_to_nodes[mul_node.output[0]][0] 86 | if last_add_node.op_type != 'Add': 87 | return 88 | 89 | subgraph_nodes = [node] 90 | subgraph_nodes.extend(children) 91 | subgraph_nodes.extend(parent_nodes[:-1]) 92 | 93 | subgraph_nodes.extend([last_add_node, mul_node, div_node]) 94 | if not self.model.is_safe_to_fuse_nodes(subgraph_nodes, last_add_node.output, input_name_to_nodes, 95 | output_name_to_node): 96 | 
logger.debug(f"It is not safe to fuse LayerNormalization node. Skip") 97 | return 98 | 99 | weight_input = mul_node.input[1 - self.model.input_index(div_node.output[0], mul_node)] 100 | if not self.model.is_constant_with_specified_dimension(weight_input, 1, "layernorm weight"): 101 | return 102 | 103 | bias_input = last_add_node.input[1 - self.model.input_index(mul_node.output[0], last_add_node)] 104 | if not self.model.is_constant_with_specified_dimension(bias_input, 1, "layernorm bias"): 105 | return 106 | 107 | self.nodes_to_remove.extend(subgraph_nodes) 108 | 109 | normalize_node = helper.make_node('LayerNormalization', 110 | inputs=[node.input[0], weight_input, bias_input], 111 | outputs=[last_add_node.output[0]], 112 | name=self.model.create_node_name("LayerNormalization", 113 | name_prefix="LayerNorm")) 114 | normalize_node.attribute.extend([helper.make_attribute("epsilon", float(add_weight))]) 115 | self.nodes_to_add.append(normalize_node) 116 | self.node_name_to_graph_name[normalize_node.name] = self.this_graph_name 117 | 118 | 119 | class FusionLayerNormalizationTF(Fusion): 120 | def __init__(self, model: OnnxModel): 121 | super().__init__(model, "LayerNormalization", "Add", "TF") 122 | 123 | def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict): 124 | """ 125 | Layer Norm from Tensorflow model(using keras2onnx or tf2onnx): 126 | +------------------------------------+ 127 | | | 128 | | | 129 | (Cast_1) | 130 | | | 131 | | v (B) (B) (A) 132 | Add --> (Cast_1) --> ReduceMean --> Sub --> Mul --> ReduceMean --> (Cast_3) --> Add --> Sqrt --> Reciprocol --> Mul --> Mul --> Sub --> Add 133 | | | | ^ ^ 134 | | | | | | 135 | | +--------------------------------------------------(Cast_2)-------------------------------|-------+ | 136 | | v | 137 | +---------------------------------------------------------------------------------------------------------------> Mul--------------------+ 138 | """ 139 | return_indice = [] 140 | _, parent_nodes, return_indice = self.model.match_parent_paths( 141 | node, 142 | [(['Sub', 'Mul', 'Mul', 'Reciprocal', 'Sqrt', 'Add', 'ReduceMean', 'Mul', 'Sub', 'ReduceMean'], 143 | [ 1, 1, None, 0, 0, 0, None, 0, 0, None]), 144 | (['Sub', 'Mul', 'Mul', 'Reciprocal', 'Sqrt', 'Add', 'Cast', 'ReduceMean', 'Mul', 'Sub', 'ReduceMean'], 145 | [ 1, 1, None, 0, 0, 0, 0, None, 0, 0, None])], 146 | output_name_to_node) # yapf: disable 147 | 148 | if parent_nodes is None: 149 | return 150 | 151 | assert len(return_indice) == 3 152 | if not (return_indice[0] in [0, 1] and return_indice[1] in [0, 1] and return_indice[2] in [0, 1]): 153 | logger.debug("return indice is exepected in [0, 1], but got {return_indice}") 154 | return 155 | 156 | sub_node_0, mul_node_0, mul_node_1, reciprocol_node, sqrt_node, add_node_0 = parent_nodes[:6] 157 | reduce_mean_node_0, mul_node_2, sub_node_1, reduce_mean_node_1 = parent_nodes[-4:] 158 | 159 | cast_node_3 = None 160 | if len(parent_nodes) == 11: 161 | cast_node_3 = parent_nodes[6] 162 | assert (cast_node_3.op_type == 'Cast') 163 | 164 | mul_node_3 = self.model.match_parent(node, 'Mul', 0, output_name_to_node) 165 | if mul_node_3 is None: 166 | logger.debug("mul_node_3 not found") 167 | return 168 | 169 | node_before_reduce = self.model.get_parent(reduce_mean_node_1, 0, output_name_to_node) 170 | root_node = node_before_reduce if cast_node_3 is None else self.model.get_parent( 171 | node_before_reduce, 0, output_name_to_node) 172 | if root_node is None: 173 | logger.debug("root node is none") 174 | return 175 | 176 | i, 
epsilon = self.model.get_constant_input(add_node_0) 177 | if epsilon is None or epsilon <= 0 or (epsilon > 1.0E-5 and cast_node_3 is None): 178 | logger.debug("epsilon is not matched") 179 | return 180 | 181 | if cast_node_3 is None and (reduce_mean_node_1.input[0] not in mul_node_3.input 182 | or reduce_mean_node_1.input[0] not in sub_node_1.input): 183 | logger.debug("reduce_mean_node_1 and mul_node_3 shall link from root node") 184 | return 185 | 186 | if cast_node_3 is not None and (node_before_reduce.input[0] not in mul_node_3.input 187 | or reduce_mean_node_1.input[0] not in sub_node_1.input): 188 | logger.debug("reduce_mean_node_1 and mul_node_3 shall link from root node") 189 | return 190 | 191 | if mul_node_2.input[0] != mul_node_2.input[1]: 192 | logger.debug("mul_node_2 shall have two same inputs") 193 | return 194 | 195 | subgraph_nodes = [ 196 | node, sub_node_0, mul_node_0, mul_node_1, reciprocol_node, sqrt_node, add_node_0, reduce_mean_node_0, 197 | mul_node_2, sub_node_1, reduce_mean_node_1, mul_node_3 198 | ] 199 | 200 | if cast_node_3 is not None: 201 | cast_node_2 = self.model.match_parent(mul_node_0, 'Cast', 0, output_name_to_node) 202 | if cast_node_2 is None: 203 | logger.debug("cast_node_2 not found") 204 | return 205 | subgraph_nodes.extend([node_before_reduce, cast_node_2, cast_node_3]) 206 | 207 | if not self.model.is_safe_to_fuse_nodes(subgraph_nodes, node.output, self.model.input_name_to_nodes(), 208 | self.model.output_name_to_node()): 209 | logger.debug("not safe to fuse layer normalization") 210 | return 211 | 212 | self.nodes_to_remove.extend(subgraph_nodes) 213 | 214 | weight_input = mul_node_1.input[1] 215 | bias_input = sub_node_0.input[0] 216 | 217 | #TODO: add epsilon attribute 218 | fused_node = helper.make_node('LayerNormalization', 219 | inputs=[mul_node_3.input[0], weight_input, bias_input], 220 | outputs=[node.output[0]], 221 | name=self.model.create_node_name("LayerNormalization", name_prefix="LayerNorm")) 222 | fused_node.attribute.extend([helper.make_attribute("epsilon", float(epsilon))]) 223 | self.nodes_to_add.append(fused_node) 224 | self.node_name_to_graph_name[fused_node.name] = self.this_graph_name 225 | -------------------------------------------------------------------------------- /fusion_options.py: -------------------------------------------------------------------------------- 1 | #------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
4 | #-------------------------------------------------------------------------- 5 | from argparse import ArgumentParser 6 | 7 | 8 | class AttentionMaskFormat: 9 | MaskIndexEnd = 0 10 | MaskIndexEndAndStart = 1 11 | AttentionMask = 2 12 | NoMask = 3 13 | 14 | 15 | class FusionOptions: 16 | """ Options of fusion in graph optimization 17 | """ 18 | def __init__(self, model_type): 19 | self.enable_gelu = True 20 | self.enable_layer_norm = True 21 | self.enable_attention = True 22 | self.enable_skip_layer_norm = True 23 | self.enable_embed_layer_norm = True 24 | self.enable_bias_skip_layer_norm = True 25 | self.enable_bias_gelu = True 26 | self.enable_gelu_approximation = False 27 | self.attention_mask_format = AttentionMaskFormat.AttentionMask 28 | 29 | if model_type == 'gpt2': 30 | self.enable_skip_layer_norm = False 31 | 32 | def use_raw_attention_mask(self, use_raw_mask=True): 33 | if use_raw_mask: 34 | self.attention_mask_format = AttentionMaskFormat.AttentionMask 35 | else: 36 | self.attention_mask_format = AttentionMaskFormat.MaskIndexEnd 37 | 38 | def disable_attention_mask(self): 39 | self.attention_mask_format = AttentionMaskFormat.NoMask 40 | 41 | @staticmethod 42 | def parse(args): 43 | options = FusionOptions(args.model_type) 44 | if args.disable_gelu: 45 | options.enable_gelu = False 46 | if args.disable_layer_norm: 47 | options.enable_layer_norm = False 48 | if args.disable_attention: 49 | options.enable_attention = False 50 | if args.disable_skip_layer_norm: 51 | options.enable_skip_layer_norm = False 52 | if args.disable_embed_layer_norm: 53 | options.enable_embed_layer_norm = False 54 | if args.disable_bias_skip_layer_norm: 55 | options.enable_bias_skip_layer_norm = False 56 | if args.disable_bias_gelu: 57 | options.enable_bias_gelu = False 58 | if args.enable_gelu_approximation: 59 | options.enable_gelu_approximation = True 60 | if args.use_mask_index: 61 | options.use_raw_attention_mask(False) 62 | if args.no_attention_mask: 63 | options.disable_attention_mask() 64 | return options 65 | 66 | @staticmethod 67 | def add_arguments(parser: ArgumentParser): 68 | parser.add_argument('--disable_attention', required=False, action='store_true', help="disable Attention fusion") 69 | parser.set_defaults(disable_attention=False) 70 | 71 | parser.add_argument('--disable_skip_layer_norm', 72 | required=False, 73 | action='store_true', 74 | help="disable SkipLayerNormalization fusion") 75 | parser.set_defaults(disable_skip_layer_norm=False) 76 | 77 | parser.add_argument('--disable_embed_layer_norm', 78 | required=False, 79 | action='store_true', 80 | help="disable EmbedLayerNormalization fusion") 81 | parser.set_defaults(disable_embed_layer_norm=False) 82 | 83 | parser.add_argument('--disable_bias_skip_layer_norm', 84 | required=False, 85 | action='store_true', 86 | help="disable Add Bias and SkipLayerNormalization fusion") 87 | parser.set_defaults(disable_bias_skip_layer_norm=False) 88 | 89 | parser.add_argument('--disable_bias_gelu', 90 | required=False, 91 | action='store_true', 92 | help="disable Add Bias and Gelu/FastGelu fusion") 93 | parser.set_defaults(disable_bias_gelu=False) 94 | 95 | parser.add_argument('--disable_layer_norm', 96 | required=False, 97 | action='store_true', 98 | help="disable LayerNormalization fusion") 99 | parser.set_defaults(disable_layer_norm=False) 100 | 101 | parser.add_argument('--disable_gelu', required=False, action='store_true', help="disable Gelu fusion") 102 | parser.set_defaults(disable_gelu=False) 103 | 104 | 
parser.add_argument('--enable_gelu_approximation', 105 | required=False, 106 | action='store_true', 107 | help="enable Gelu/BiasGelu to FastGelu conversion") 108 | parser.set_defaults(enable_gelu_approximation=False) 109 | 110 | parser.add_argument('--use_mask_index', 111 | required=False, 112 | action='store_true', 113 | help="use mask index instead of raw attention mask in attention operator") 114 | parser.set_defaults(use_mask_index=False) 115 | 116 | parser.add_argument('--no_attention_mask', 117 | required=False, 118 | action='store_true', 119 | help="no attention mask. Only works for model_type=bert") 120 | parser.set_defaults(no_attention_mask=False) 121 | -------------------------------------------------------------------------------- /fusion_reshape.py: -------------------------------------------------------------------------------- 1 | #------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | #-------------------------------------------------------------------------- 5 | 6 | from fusion_base import Fusion 7 | from logging import getLogger 8 | import numpy as np 9 | from onnx import helper, numpy_helper, TensorProto 10 | from onnx_model import OnnxModel 11 | 12 | logger = getLogger(__name__) 13 | 14 | 15 | class FusionReshape(Fusion): 16 | def __init__(self, model: OnnxModel): 17 | super().__init__(model, "Reshape", "Reshape") 18 | 19 | def replace_reshape_node(self, shape, reshape_node, concat_node): 20 | shape_value = np.asarray(shape, dtype=np.int64) 21 | constant_shape_name = self.model.create_node_name('Constant', 'constant_shape') 22 | new_node = helper.make_node('Constant', 23 | inputs=[], 24 | outputs=[constant_shape_name], 25 | value=helper.make_tensor(name='const_tensor', 26 | data_type=TensorProto.INT64, 27 | dims=shape_value.shape, 28 | vals=bytes(shape_value), 29 | raw=True)) 30 | reshape_node.input[1] = constant_shape_name 31 | reshape_node.name = self.model.create_node_name('Reshape', 'Reshape_Fuse') 32 | self.nodes_to_remove.extend([concat_node]) 33 | self.nodes_to_add.append(new_node) 34 | self.node_name_to_graph_name[new_node.name] = self.this_graph_name 35 | 36 | def fuse(self, reshape_node, input_name_to_nodes, output_name_to_node): 37 | if reshape_node.input[1] not in output_name_to_node: 38 | return 39 | 40 | concat_node = output_name_to_node[reshape_node.input[1]] 41 | if concat_node.op_type != 'Concat' or len(concat_node.input) < 3 or len(concat_node.input) > 4: 42 | return 43 | 44 | path0 = self.model.match_parent_path(concat_node, ['Unsqueeze', 'Gather', 'Shape'], [0, 0, 0], 45 | output_name_to_node) 46 | if path0 is None: 47 | return 48 | 49 | (unsqueeze_0, gather_0, shape_0) = path0 50 | 51 | path1 = self.model.match_parent_path(concat_node, ['Unsqueeze', 'Gather', 'Shape'], [1, 0, 0], 52 | output_name_to_node) 53 | if path1 is None: 54 | return 55 | (unsqueeze_1, gather_1, shape_1) = path1 56 | 57 | shape = [] 58 | gather_value = self.model.get_constant_value(gather_0.input[1]) 59 | if gather_value == 0: 60 | shape.append(0) 61 | 62 | gather_value = self.model.get_constant_value(gather_1.input[1]) 63 | if gather_value == 1: 64 | shape.append(0) 65 | 66 | if len(shape) != 2: 67 | return 68 | 69 | path2 = [] 70 | path3 = [] 71 | shape_nodes = [shape_0, shape_1] 72 | if len(concat_node.input) == 3 and self.model.get_initializer(concat_node.input[2]) is None: 73 | path2 = self.model.match_parent_path(concat_node, ['Unsqueeze', 'Mul', 
'Gather', 'Shape'], [2, 0, 0, 0], 74 | output_name_to_node) 75 | if path2 is None: 76 | path2 = self.model.match_parent_path( 77 | concat_node, ['Unsqueeze', 'Mul', 'Squeeze', 'Slice', 'Shape'], [2, 0, 0, 0, 0], 78 | output_name_to_node) # GPT2 exported by PyTorch 1.4 with opset_version=11 79 | if path2 is None: 80 | return 81 | 82 | path3 = self.model.match_parent_path(concat_node, ['Unsqueeze', 'Mul', 'Gather', 'Shape'], [2, 0, 1, 0], 83 | output_name_to_node) 84 | if path3 is None: 85 | path3 = self.model.match_parent_path( 86 | concat_node, ['Unsqueeze', 'Mul', 'Squeeze', 'Slice', 'Shape'], [2, 0, 1, 0, 0], 87 | output_name_to_node) # GPT2 exported by PyTorch 1.4 with opset_version=11 88 | if path3 is None: 89 | return 90 | 91 | shape_nodes.extend([path2[-1], path3[-1]]) 92 | shape.append(-1) 93 | elif (len(concat_node.input) > 2): 94 | concat_2 = self.model.get_initializer(concat_node.input[2]) 95 | if concat_2 is None: 96 | return 97 | concat_value = numpy_helper.to_array(concat_2) 98 | if isinstance(concat_value, list): 99 | shape.extend(concat_value) 100 | else: 101 | shape.append(concat_value) 102 | 103 | if len(concat_node.input) == 4 and self.model.get_initializer(concat_node.input[3]) is None: 104 | if -1 in shape: 105 | return 106 | 107 | path2 = self.model.match_parent_path(concat_node, ['Unsqueeze', 'Div', 'Gather', 'Shape'], [3, 0, 0, 0], 108 | output_name_to_node) 109 | if path2 is None: 110 | path2 = self.model.match_parent_path( 111 | concat_node, ['Unsqueeze', 'Div', 'Squeeze', 'Slice', 'Shape'], [3, 0, 0, 0, 0], 112 | output_name_to_node) # GPT2 exported by PyTorch 1.4 with opset_version=11 113 | if path2 is None: 114 | return 115 | shape_nodes.extend([path2[-1]]) 116 | shape.append(-1) 117 | elif (len(concat_node.input) > 3): 118 | concat_3 = self.model.get_initializer(concat_node.input[3]) 119 | if concat_3 is None: 120 | return 121 | 122 | concat_value = numpy_helper.to_array(concat_3) 123 | if isinstance(concat_value, list): 124 | shape.extend(concat_value) 125 | else: 126 | shape.append(concat_value) 127 | 128 | root_input = reshape_node.input[0] 129 | same_shape_input = True 130 | for shape_node in shape_nodes: 131 | if shape_node.input[0] != root_input: 132 | same_shape_input = False 133 | 134 | if not same_shape_input: 135 | return 136 | 137 | self.replace_reshape_node(shape, reshape_node, concat_node) 138 | 139 | self.nodes_to_remove.extend(path0) 140 | self.nodes_to_remove.extend(path1) 141 | self.nodes_to_remove.extend(path2) 142 | self.nodes_to_remove.extend(path3) 143 | -------------------------------------------------------------------------------- /fusion_shape.py: -------------------------------------------------------------------------------- 1 | #------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
4 | #-------------------------------------------------------------------------- 5 | 6 | from fusion_base import Fusion 7 | from logging import getLogger 8 | from onnx import TensorProto, NodeProto 9 | from onnx_model import OnnxModel 10 | from fusion_utils import FusionUtils 11 | from typing import Union, Dict, List 12 | 13 | logger = getLogger(__name__) 14 | 15 | 16 | class FusionShape(Fusion): 17 | def __init__(self, model: OnnxModel): 18 | super().__init__(model, "Shape", "Concat") 19 | self.utils = FusionUtils(model) 20 | self.shape_infer = None 21 | self.shape_infer_done = False 22 | 23 | def get_dimensions_from_tensor_proto(self, tensor_proto: TensorProto) -> Union[int, None]: 24 | if tensor_proto.type.tensor_type.HasField('shape'): 25 | return len(tensor_proto.type.tensor_type.shape.dim) 26 | else: 27 | return None 28 | 29 | def get_dimensions(self, input_name: str) -> Union[int, None]: 30 | graph_input = self.model.find_graph_input(input_name) 31 | if graph_input: 32 | return self.get_dimensions_from_tensor_proto(graph_input) 33 | 34 | if not self.shape_infer_done: 35 | self.shape_infer = self.model.infer_runtime_shape({}, update=True) 36 | self.shape_infer_done = True 37 | 38 | if self.shape_infer is not None: 39 | return self.get_dimensions_from_tensor_proto(self.shape_infer.known_vi_[input_name]) 40 | 41 | return None 42 | 43 | def fuse(self, concat_node: NodeProto, input_name_to_nodes: Dict[str, List[NodeProto]], 44 | output_name_to_node: Dict[str, NodeProto]): 45 | """ 46 | Smplify subgraph like 47 | 48 | (2d_input) 49 | / \ 50 | Shape shape 51 | / \ 52 | Gather(indices=0) Gather(indices=1) 53 | | | 54 | Unsqueeze(axes=0) Unsqueeze(axes=0) 55 | \ / 56 | Concat 57 | | 58 | 59 | into (2d_input) --> Shape --> 60 | """ 61 | opset_version = self.model.get_opset_version() 62 | 63 | inputs = len(concat_node.input) 64 | root = None 65 | shape_output = None 66 | for i in range(inputs): 67 | path = self.model.match_parent_path(concat_node, ['Unsqueeze', 'Gather', 'Shape'], [i, 0, 0], 68 | output_name_to_node) 69 | if path is None: 70 | return 71 | 72 | unsqueeze, gather, shape = path 73 | if i == 0: 74 | shape_output = shape.output[0] 75 | if root is None: 76 | root = shape.input[0] 77 | if self.get_dimensions(root) != inputs: 78 | return 79 | elif shape.input[0] != root: 80 | return 81 | 82 | if not FusionUtils.check_node_attribute(unsqueeze, 'axis', 0, default_value=0): 83 | return 84 | 85 | if opset_version < 13: 86 | if not FusionUtils.check_node_attribute(unsqueeze, 'axes', [0]): 87 | return 88 | else: 89 | if not self.utils.check_node_input_value(unsqueeze, 1, [0]): 90 | return 91 | 92 | value = self.model.get_constant_value(gather.input[1]) 93 | from numpy import ndarray, array_equal 94 | if not (isinstance(value, ndarray) and value.size == 1 and value.item() == i): 95 | return 96 | 97 | if self.model.find_graph_output(concat_node.output[0]) is None: 98 | self.model.replace_input_of_all_nodes(concat_node.output[0], shape_output) 99 | self.fused_count += 1 100 | self.prune_graph = True 101 | -------------------------------------------------------------------------------- /fusion_skiplayernorm.py: -------------------------------------------------------------------------------- 1 | #------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
4 | #-------------------------------------------------------------------------- 5 | 6 | from logging import getLogger 7 | from onnx import helper 8 | from onnx_model import OnnxModel 9 | from fusion_base import Fusion 10 | from fusion_utils import NumpyHelper 11 | 12 | logger = getLogger(__name__) 13 | 14 | 15 | class FusionSkipLayerNormalization(Fusion): 16 | """ 17 | Fuse Add + LayerNormalization into one node: SkipLayerNormalization 18 | Note: This fusion does not check the input shape of Add and LayerNormalization. 19 | """ 20 | def __init__(self, model: OnnxModel): 21 | super().__init__(model, "SkipLayerNormalization", "LayerNormalization") 22 | # Update shape inference is needed since other fusions might add new edge which does not have shape info yet. 23 | self.shape_infer_helper = self.model.infer_runtime_shape({"batch_size": 4, "seq_len": 7}, update=True) 24 | 25 | def fuse(self, node, input_name_to_nodes, output_name_to_node): 26 | add = self.model.get_parent(node, 0, output_name_to_node) 27 | 28 | # In some models there is input_ids->gather->add->LayerNorm and one of input of the 29 | # add node is initializer with fixed shape which should not be fused into SkipLayerNorm 30 | if add is None: 31 | return 32 | 33 | for add_input in add.input: 34 | if self.model.get_initializer(add_input) != None: 35 | return 36 | 37 | # The number of input node of add should be 2 38 | if len(self.model.get_parents(add)) != 2: 39 | return 40 | 41 | if self.shape_infer_helper is not None: 42 | if not self.shape_infer_helper.compare_shape(add.input[0], add.input[1]): 43 | logger.debug( 44 | f"skip skiplayernorm fusion since shape of inputs ({add.input[0]}, {add.input[1]}) are not same") 45 | return 46 | else: 47 | # shape_infer_helper can not handle subgraphs. Current work around is to disable skiplayernorm fusion 48 | # longterm todo: support subgraph in symbolic_shape_infer or support add broadcasting in skiplayernorm op 49 | logger.warning( 50 | "symbolic shape infer failed. 
it's safe to ignore this message if there is no issue with optimized model" 51 | ) 52 | 53 | gather_path = self.model.match_parent_path(add, ['Gather'], [None]) 54 | if gather_path is not None and self.model.find_graph_input(gather_path[0].input[1]) is None: 55 | if self.model.match_parent_path(gather_path[0], ['ConstantOfShape'], [1]) is None: 56 | return 57 | 58 | if add is not None and add.op_type == 'Add' and self.model.is_safe_to_fuse_nodes( 59 | [add, node], node.output, input_name_to_nodes, output_name_to_node): 60 | self.nodes_to_remove.extend([add, node]) 61 | 62 | inputs = [add.input[0], add.input[1], node.input[1], node.input[2]] 63 | normalize_node = helper.make_node("SkipLayerNormalization", 64 | inputs=inputs, 65 | outputs=[node.output[0]], 66 | name=self.model.create_node_name("SkipLayerNormalization", 67 | name_prefix="SkipLayerNorm")) 68 | normalize_node.domain = "com.microsoft" 69 | 70 | # Pass attribute "epsilon" from layernorm node to SkipLayerNormalization 71 | for att in node.attribute: 72 | if att.name == 'epsilon': 73 | normalize_node.attribute.extend([att]) 74 | 75 | # Set default epsilon if no epsilon exists from layernorm 76 | if len(normalize_node.attribute) == 0: 77 | normalize_node.attribute.extend([helper.make_attribute("epsilon", 1.0E-12)]) 78 | 79 | self.nodes_to_add.append(normalize_node) 80 | self.node_name_to_graph_name[normalize_node.name] = self.this_graph_name 81 | 82 | 83 | class FusionBiasSkipLayerNormalization(Fusion): 84 | def __init__(self, model: OnnxModel): 85 | super().__init__(model, "SkipLayerNormalization", "SkipLayerNormalization", "add bias") 86 | 87 | def fuse(self, node, input_name_to_nodes, output_name_to_node): 88 | if len(node.input) != 4: 89 | return 90 | 91 | return_indice = [] 92 | nodes = self.model.match_parent_path(node, ['Add', 'MatMul'], [None, None], None, return_indice) 93 | if nodes is None: 94 | return 95 | assert len(return_indice) == 2 96 | add_input_index = return_indice[0] 97 | if add_input_index >= 2: 98 | return 99 | 100 | (add, matmul) = nodes 101 | 102 | # bias should be one dimension 103 | bias_index = -1 104 | for i, input in enumerate(add.input): 105 | initializer = self.model.get_initializer(input) 106 | if initializer is None: 107 | continue 108 | bias_index = i 109 | bias_weight = NumpyHelper.to_array(initializer) 110 | break 111 | if bias_weight is None: 112 | logger.debug(f"Bias weight not found") 113 | return 114 | if len(bias_weight.shape) != 1: 115 | logger.debug(f"Bias weight is not 1D") 116 | return 117 | 118 | subgraph_nodes = [node, add] 119 | if not self.model.is_safe_to_fuse_nodes(subgraph_nodes, [node.output[0]], input_name_to_nodes, 120 | output_name_to_node): 121 | logger.debug(f"Skip fusing SkipLayerNormalization with Bias since it is not safe") 122 | return 123 | 124 | self.nodes_to_remove.extend(subgraph_nodes) 125 | inputs = [ 126 | node.input[1 - add_input_index], matmul.output[0], node.input[2], node.input[3], add.input[bias_index] 127 | ] 128 | new_node = helper.make_node("SkipLayerNormalization", 129 | inputs=inputs, 130 | outputs=node.output, 131 | name=self.model.create_node_name("SkipLayerNormalization", 132 | "SkipLayerNorm_AddBias_")) 133 | new_node.domain = "com.microsoft" 134 | 135 | # Pass attribute "epsilon" from skiplayernorm node to skiplayernorm(add bias) 136 | for att in node.attribute: 137 | if att.name == 'epsilon': 138 | new_node.attribute.extend([att]) 139 | 140 | # Set default epsilon if no epsilon exists from skiplayernorm 141 | if len(new_node.attribute) == 0: 142 | 
new_node.attribute.extend([helper.make_attribute("epsilon", 1.0E-12)]) 143 | 144 | self.nodes_to_add.append(new_node) 145 | self.node_name_to_graph_name[new_node.name] = self.this_graph_name 146 | -------------------------------------------------------------------------------- /fusion_utils.py: -------------------------------------------------------------------------------- 1 | #------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | #-------------------------------------------------------------------------- 5 | from logging import getLogger 6 | from typing import Tuple 7 | from onnx import helper, numpy_helper, TensorProto 8 | from numpy import ndarray, array_equal 9 | from onnx_model import OnnxModel 10 | 11 | logger = getLogger(__name__) 12 | 13 | 14 | class FusionUtils: 15 | def __init__(self, model: OnnxModel): 16 | self.model: OnnxModel = model 17 | 18 | def cast_graph_input_to_int32(self, input_name: str) -> Tuple[bool, str]: 19 | graph_input = self.model.find_graph_input(input_name) 20 | if graph_input is not None and graph_input.type.tensor_type.elem_type != TensorProto.INT32: 21 | cast_output, cast_node = self.cast_input_to_int32(input_name) 22 | logger.debug(f"Casted graph input {input_name} to int32") 23 | return True, cast_output 24 | 25 | logger.debug(f"Did not cast graph input {input_name} to int32: found {graph_input is not None}") 26 | return False, input_name 27 | 28 | def cast_input_to_int32(self, input_name: str): 29 | cast_output = input_name + '_int32' 30 | 31 | # Avoid consequent Cast nodes. 32 | inputs = [input_name] 33 | output_name_to_node = self.model.output_name_to_node() 34 | if input_name in output_name_to_node: 35 | parent_node = output_name_to_node[input_name] 36 | if parent_node and parent_node.op_type == 'Cast': 37 | inputs = [parent_node.input[0]] 38 | 39 | cast_node = helper.make_node('Cast', inputs=inputs, outputs=[cast_output]) 40 | cast_node.attribute.extend([helper.make_attribute("to", int(TensorProto.INT32))]) 41 | self.model.add_node(cast_node) 42 | 43 | return cast_output, cast_node 44 | 45 | def remove_cast_int32(self, input_name: str): 46 | input_name_to_nodes = self.model.input_name_to_nodes() 47 | nodes = input_name_to_nodes[input_name] 48 | for node in nodes: 49 | if node.op_type == "Cast": 50 | is_int32 = False 51 | for att in node.attribute: 52 | if att.name == 'to' and att.i == int(TensorProto.INT32): 53 | is_int32 = True 54 | break 55 | if is_int32: 56 | output_name = node.output[0] 57 | self.model.remove_node(node) 58 | self.model.replace_input_of_all_nodes(output_name, input_name) 59 | 60 | @staticmethod 61 | def check_node_attribute(node, attribute_name: str, expected_value, default_value=None): 62 | """Verify that a node has expected value for an attribute. 63 | 64 | Args: 65 | node (NodeProto): a node to check 66 | attribute_name (str): name of attribute 67 | expected_value (Any): expected value of the attribute 68 | default_value (Any, optional): default value if the attribute does not exist. Defaults to None. 
69 | 70 | Returns: 71 | bool: whether the check is passed or not 72 | """ 73 | value = default_value 74 | for attr in node.attribute: 75 | if attr.name == attribute_name: 76 | value = helper.get_attribute_value(attr) 77 | 78 | if isinstance(expected_value, list): 79 | return (isinstance(value, ndarray) or isinstance(value, list)) and array_equal( 80 | expected_value, value, equal_nan=False) 81 | else: 82 | return value == expected_value 83 | 84 | def check_node_input_value(self, node, input_index: int, expected_value): 85 | """Verify that a node has expected input value 86 | 87 | Args: 88 | node (NodeProto): a node to check 89 | input_index (int): index of its input to be verified 90 | expected_value (Any): expected value of the input 91 | 92 | Returns: 93 | bool: whether the check is passed or not 94 | """ 95 | assert len(node.input) > input_index 96 | 97 | value = self.model.get_constant_value(node.input[input_index]) 98 | 99 | if isinstance(expected_value, list): 100 | return (isinstance(value, ndarray) or isinstance(value, list)) and array_equal( 101 | expected_value, value, equal_nan=False) 102 | else: 103 | return value == expected_value 104 | 105 | def get_dtype(self, shape_infer_helper, input_or_output_name: str) -> int: 106 | """Get data type of an input or output. 107 | 108 | Args: 109 | shape_infer_helper (SymbolicShapeInferenceHelper): object of symbolic shape inference 110 | input_or_output_name (str): name of input or output 111 | 112 | Returns: 113 | int: tensor data type 114 | """ 115 | dtype = self.model.get_dtype(input_or_output_name) 116 | if dtype is not None: 117 | return dtype 118 | 119 | if shape_infer_helper: 120 | tensor_proto = shape_infer_helper.known_vi_[input_or_output_name] 121 | if tensor_proto.type.tensor_type.HasField('elem_type'): 122 | return tensor_proto.type.tensor_type.elem_type 123 | 124 | return None 125 | 126 | def remove_useless_cast_nodes(self): 127 | """Remove cast nodes that are not needed: input and output has same data type. 
128 | """ 129 | shape_infer = self.model.infer_runtime_shape(update=True) 130 | if shape_infer is None: 131 | return 132 | 133 | nodes_to_remove = [] 134 | for node in self.model.nodes(): 135 | if node.op_type == 'Cast': 136 | input_dtype = self.get_dtype(shape_infer, node.input[0]) 137 | output_dtype = self.get_dtype(shape_infer, node.output[0]) 138 | if input_dtype and input_dtype == output_dtype: 139 | nodes_to_remove.append(node) 140 | 141 | if nodes_to_remove: 142 | graph_input_names = set(self.model.get_graphs_input_names()) 143 | graph_output_names = set(self.model.get_graphs_output_names()) 144 | for node in nodes_to_remove: 145 | if bool(set(node.output) & graph_output_names): 146 | if not bool(set(node.input) & graph_input_names): 147 | self.model.replace_output_of_all_nodes(node.input[0], node.output[0]) 148 | else: 149 | continue 150 | else: 151 | self.model.replace_input_of_all_nodes(node.output[0], node.input[0]) 152 | self.model.remove_node(node) 153 | logger.info(f"Removed {len(nodes_to_remove)} Cast nodes with output type same as input") 154 | 155 | def remove_useless_reshape_nodes(self): 156 | """Remove reshape node that is not needed based on symbolic shape inference: input and output has same shape 157 | """ 158 | shape_infer = self.model.infer_runtime_shape(update=True) 159 | if shape_infer is None: 160 | return 161 | 162 | nodes_to_remove = [] 163 | for node in self.model.nodes(): 164 | if node.op_type == 'Reshape': 165 | input_shape = shape_infer.get_edge_shape(node.input[0]) 166 | output_shape = shape_infer.get_edge_shape(node.output[0]) 167 | if input_shape and output_shape and input_shape == output_shape: 168 | logger.info( 169 | f"Remove reshape node {node.name} since its input shape is same as output: {input_shape}") 170 | nodes_to_remove.append(node) 171 | 172 | if nodes_to_remove: 173 | graph_input_names = set(self.model.get_graphs_input_names()) 174 | graph_output_names = set(self.model.get_graphs_output_names()) 175 | for node in nodes_to_remove: 176 | if bool(set(node.output) & graph_output_names): 177 | if not bool(set(node.input) & graph_input_names): 178 | self.model.replace_output_of_all_nodes(node.input[0], node.output[0]) 179 | else: 180 | continue 181 | else: 182 | self.model.replace_input_of_all_nodes(node.output[0], node.input[0]) 183 | self.model.remove_node(node) 184 | 185 | 186 | class NumpyHelper: 187 | @staticmethod 188 | def to_array(tensor: TensorProto, fill_zeros: bool = False) -> ndarray: 189 | # When weights are in external data format but not presented, we can still test the optimizer with two changes: 190 | # (1) set fill_zeros = True (2) change load_external_data=False in optimizer.py 191 | if fill_zeros: 192 | from onnx import mapping 193 | return ndarray(shape=tensor.dims, dtype=mapping.TENSOR_TYPE_TO_NP_TYPE[tensor.data_type]) 194 | 195 | return numpy_helper.to_array(tensor) 196 | -------------------------------------------------------------------------------- /hf.co_1ms/README.md: -------------------------------------------------------------------------------- 1 | 2 | ### Quick Summary - Use vendor supplied Pytorch and you will get the same performance as Infinity (as of 10/3/2021) 3 | tl;dr: Repackage OneDNN/DNNL on CPU and CUDNN for TensorRT/Tensorcore and you have Infinity without $20k/cpu/yr 4 | 5 | 6 | Reconstructed Demos from launch Video here: https://www.youtube.com/watch?v=jiftCAhOYQA 7 | 8 | Infinity CPU Inference Dual-core Cascade lake VM: 9 | Seq length 16: 2.6ms 10 | ![cpu 16](images/cpu_16_2_5ms.png) 11 | Seq length 
128: 9.7ms
12 | ![cpu 128](images/cpu_9_7ms.png)
13 | 
14 | Infinity GPU Inference Quad-core Cascade lake VM + 1 T4 GPU:
15 | Seq length 16: 1.7ms
16 | ![cpu 16](images/gpu_16_1_7ms.png)
17 | Seq length 128: 2.6ms
18 | ![gpu 128](images/gpu_128_2_6ms.png)
19 | 
20 | 
21 | The original model used in the video is available here: https://huggingface.co/philschmid/MiniLM-L6-H384-uncased-sst2
22 | 
23 | The optimized "Infinity Model" switch is basically the quantized ONNX model, available here:
24 | https://huggingface.co/philschmid/Infinity_cpu_MiniLM_L6_H384_uncased_sst2
25 | 
26 | # To Infinity and Beyond
27 | For our experiments we want to start from the original model to see if we can reach the demo'ed metrics.
28 | 
29 | Set up your Python ENV
30 | ```
31 | python3.9 -m venv ~/1msenv
32 | source ~/1msenv/bin/activate
33 | pip install --upgrade pip
34 | pip install --upgrade onnx coloredlogs packaging psutil py3nvml onnxconverter_common numpy transformers sympy wheel
35 | 
36 | # This is to compare regular PyTorch / Torchscript performance
37 | # To install Intel's PyTorch enhancements -- you will recreate the "1ms" demos with this at 9.8ms
38 | # uninstall with pip uninstall torch torch-ipex
39 | pip install torch_ipex==1.9.0 -f https://software.intel.com/ipex-whl-stable
40 | 
41 | # To install the stock PyTorch nightly -- you will run a couple ms slower at 11ms
42 | pip3 install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html
43 | 
44 | ```
45 | 
46 | ## Build ONNX Runtime with OneDNN and CUDNN
47 | 
48 | ```
49 | ./build.sh --config Release --build_wheel --parallel --use_openmp --use_dnnl --skip_tests --use_cuda --cuda_home /usr/local/cuda --cudnn_home /usr/local/cuda
50 | #find . -name *.whl
51 | ./build/Linux/Release/dist/onnxruntime_gpu-1.10.0-cp39-cp39-linux_x86_64.whl
52 | ./build/Linux/Release/dist/onnxruntime_dnnl-1.10.0-cp39-cp39-linux_x86_64.whl
53 | 
54 | pip install ./build/Linux/Release/dist/onnxruntime_dnnl-1.10.0-cp39-cp39-linux_x86_64.whl ./build/Linux/Release/dist/onnxruntime_gpu-1.10.0-cp39-cp39-linux_x86_64.whl
55 | ```
56 | 
57 | ## Approaching Infinity
58 | Run the benchmark script in this folder. Change the parameters to GPU if you are doing a GPU run.
59 | 
60 | ```
61 | ./hf.co_1ms/run_benchmark.sh
62 | ```
63 | 
64 | ## Are we there yet?
65 | 
66 | ## CPU Benchmark Results
67 | 
68 | | Seq.Len | 1.11-dev Torchscript (FP32) | 1.11-dev Torchscript (INT8) | Intel 1.9.0 Torchscript (FP32) | Intel 1.9.0 Torchscript (Int8) | ONNX (FP32) | ONNX (Int8) |
69 | |---------| ----------- | ----------- | ----------- | ----------- | ----------- | ----------- |
70 | | 16 |6.14|2.49|5.86|1.96|2.76|1.24|
71 | | 128 |17.39|11.67|16.65|9.59|13.63|7.48|
72 | 
73 | ## GPU Benchmark Results on A100 *NOT* T4 demo'ed in Infinity Video
74 | 
75 | | Seq.Len | 1.11.0.dev20211003+cu111 Torchscript (FP32) | 1.11.0.dev20211003+cu111 Torchscript (FP16) | ONNX (FP32) | ONNX (FP16) |
76 | |---------| ----------- | ----------- | ----------- | ----------- |
77 | | 16 |3.10|2.77|0.81|0.83|
78 | | 128 |3.55|2.96|0.74|0.97|
79 | 
80 | Detailed results [here](result.csv)
81 | 
82 | 
83 | ### Sample CPU run
84 | 
85 | # 'average_latency_ms': '9.59' vs Infinity's '9.7ms'
86 | 
87 | ```
88 | ./hf.co_1ms/run_benchmark.sh
89 | ...
90 | Run PyTorch on philschmid/MiniLM-L6-H384-uncased-sst2 with input shape [1, 128] 91 | {'engine': 'torchscript', 'version': '1.9.0+cpu', 'device': 'cpu', 'optimizer': '', 'precision': , 'io_binding': '', 'model_name': 'philschmid/MiniLM-L6-H384-uncased-sst2', 'inputs': 1, 'threads': 2, 'batch_size': 1, 'sequence_length': 128, 'datetime': '2021-10-04 03:50:52.568732', 'test_times': 100, 'latency_variance': '0.00', 'latency_90_percentile': '9.79', 'latency_95_percentile': '9.84', 'latency_99_percentile': '9.94', 'average_latency_ms': '9.59', 'QPS': '104.32'} 92 | ``` 93 | 94 | 95 | -------------------------------------------------------------------------------- /hf.co_1ms/fusion.csv: -------------------------------------------------------------------------------- 1 | model_filename,datetime,transformers,torch,EmbedLayerNormalization,Attention,Gelu,FastGelu,BiasGelu,LayerNormalization,SkipLayerNormalization 2 | ./onnx_models/philschmid_MiniLM_L6_H384_uncased_sst2_1_fp32_cpu.onnx,2021-10-04 03:50:24.257021,4.11.2,1.9.0+cpu,1,6,0,0,6,0,12 3 | model_filename,datetime,transformers,torch,EmbedLayerNormalization,Attention,Gelu,FastGelu,BiasGelu,LayerNormalization,SkipLayerNormalization 4 | ./onnx_models/philschmid_MiniLM_L6_H384_uncased_sst2_1_int8_cpu.onnx,2021-10-04 03:50:43.020711,4.11.2,1.9.0+cpu,0,6,0,0,6,1,12 5 | model_filename,datetime,transformers,torch,EmbedLayerNormalization,Attention,Gelu,FastGelu,BiasGelu,LayerNormalization,SkipLayerNormalization 6 | ./onnx_models/philschmid_MiniLM_L6_H384_uncased_sst2_1_fp32_cpu.onnx,2021-10-04 04:34:02.831263,4.11.2,1.11.0.dev20211003+cpu,1,6,0,0,6,0,12 7 | model_filename,datetime,transformers,torch,EmbedLayerNormalization,Attention,Gelu,FastGelu,BiasGelu,LayerNormalization,SkipLayerNormalization 8 | ./onnx_models/philschmid_MiniLM_L6_H384_uncased_sst2_1_int8_cpu.onnx,2021-10-04 04:34:22.689454,4.11.2,1.11.0.dev20211003+cpu,0,6,0,0,6,1,12 9 | model_filename,datetime,transformers,torch,EmbedLayerNormalization,Attention,Gelu,FastGelu,BiasGelu,LayerNormalization,SkipLayerNormalization 10 | ./onnx_models/philschmid_MiniLM_L6_H384_uncased_sst2_1_fp32_gpu.onnx,2021-10-04 05:22:26.404124,4.11.2,1.11.0.dev20211003+cpu,1,6,0,0,6,0,12 11 | model_filename,datetime,transformers,torch,EmbedLayerNormalization,Attention,Gelu,FastGelu,BiasGelu,LayerNormalization,SkipLayerNormalization 12 | ./onnx_models/philschmid_MiniLM_L6_H384_uncased_sst2_1_fp16_gpu.onnx,2021-10-04 05:22:49.040808,4.11.2,1.11.0.dev20211003+cpu,1,6,0,6,0,0,12 13 | model_filename,datetime,transformers,torch,EmbedLayerNormalization,Attention,Gelu,FastGelu,BiasGelu,LayerNormalization,SkipLayerNormalization 14 | ./onnx_models/philschmid_MiniLM_L6_H384_uncased_sst2_1_fp32_gpu.onnx,2021-10-04 05:29:54.038759,4.11.2,1.11.0.dev20211003+cu111,1,6,0,0,6,0,12 15 | model_filename,datetime,transformers,torch,EmbedLayerNormalization,Attention,Gelu,FastGelu,BiasGelu,LayerNormalization,SkipLayerNormalization 16 | ./onnx_models/philschmid_MiniLM_L6_H384_uncased_sst2_1_fp16_gpu.onnx,2021-10-04 05:30:44.705058,4.11.2,1.11.0.dev20211003+cu111,1,6,0,6,0,0,12 17 | model_filename,datetime,transformers,torch,EmbedLayerNormalization,Attention,Gelu,FastGelu,BiasGelu,LayerNormalization,SkipLayerNormalization 18 | ./onnx_models/microsoft_MiniLM_L12_H384_uncased_1_fp32_gpu.onnx,2021-10-05 06:12:25.602878,4.11.2,1.11.0.dev20211003+cu111,1,12,0,0,12,0,24 19 | model_filename,datetime,transformers,torch,EmbedLayerNormalization,Attention,Gelu,FastGelu,BiasGelu,LayerNormalization,SkipLayerNormalization 20 | 
./onnx_models/microsoft_MiniLM_L12_H384_uncased_1_fp32_cpu.onnx,2021-10-05 06:13:58.767952,4.11.2,1.11.0.dev20211003+cu111,1,12,0,0,12,0,24 21 | model_filename,datetime,transformers,torch,EmbedLayerNormalization,Attention,Gelu,FastGelu,BiasGelu,LayerNormalization,SkipLayerNormalization 22 | ./onnx_models/microsoft_MiniLM_L12_H384_uncased_1_int8_cpu.onnx,2021-10-05 06:14:37.706671,4.11.2,1.11.0.dev20211003+cu111,0,12,0,0,12,1,24 23 | model_filename,datetime,transformers,torch,EmbedLayerNormalization,Attention,Gelu,FastGelu,BiasGelu,LayerNormalization,SkipLayerNormalization 24 | ./onnx_models/bert_large_uncased_1_fp32_cpu.onnx,2021-10-05 06:18:24.971957,4.11.2,1.11.0.dev20211003+cu111,1,24,0,0,24,0,48 25 | model_filename,datetime,transformers,torch,EmbedLayerNormalization,Attention,Gelu,FastGelu,BiasGelu,LayerNormalization,SkipLayerNormalization 26 | ./onnx_models/microsoft_MiniLM_L12_H384_uncased_1_fp32_cpu.onnx,2021-10-05 06:21:34.252637,4.11.2,1.11.0.dev20211003+cu111,1,12,0,0,12,0,24 27 | model_filename,datetime,transformers,torch,EmbedLayerNormalization,Attention,Gelu,FastGelu,BiasGelu,LayerNormalization,SkipLayerNormalization 28 | ./onnx_models/bert_large_uncased_1_int8_cpu.onnx,2021-10-05 06:23:53.876515,4.11.2,1.11.0.dev20211003+cu111,0,24,0,0,24,1,48 29 | model_filename,datetime,transformers,torch,EmbedLayerNormalization,Attention,Gelu,FastGelu,BiasGelu,LayerNormalization,SkipLayerNormalization 30 | ./onnx_models/microsoft_MiniLM_L12_H384_uncased_1_int8_cpu.onnx,2021-10-05 06:26:36.232346,4.11.2,1.11.0.dev20211003+cu111,0,12,0,0,12,1,24 31 | model_filename,datetime,transformers,torch,EmbedLayerNormalization,Attention,Gelu,FastGelu,BiasGelu,LayerNormalization,SkipLayerNormalization 32 | ./onnx_models/microsoft_MiniLM_L12_H384_uncased_1_fp32_cpu.onnx,2021-10-05 23:05:17.742694,4.11.2,1.11.0.dev20211003+cu111,1,12,0,0,12,0,24 33 | model_filename,datetime,transformers,torch,EmbedLayerNormalization,Attention,Gelu,FastGelu,BiasGelu,LayerNormalization,SkipLayerNormalization 34 | ./onnx_models/microsoft_MiniLM_L12_H384_uncased_1_fp32_cpu.onnx,2021-10-05 23:09:44.412182,4.11.2,1.11.0.dev20211003+cu111,1,12,0,0,12,0,24 35 | model_filename,datetime,transformers,torch,EmbedLayerNormalization,Attention,Gelu,FastGelu,BiasGelu,LayerNormalization,SkipLayerNormalization 36 | ./onnx_models/microsoft_MiniLM_L12_H384_uncased_1_fp32_cpu.onnx,2021-10-05 23:12:46.956941,4.11.2,1.11.0.dev20211003+cu111,1,12,0,0,12,0,24 37 | model_filename,datetime,transformers,torch,EmbedLayerNormalization,Attention,Gelu,FastGelu,BiasGelu,LayerNormalization,SkipLayerNormalization 38 | ./onnx_models/microsoft_MiniLM_L12_H384_uncased_1_fp32_gpu.onnx,2021-10-05 23:15:17.325840,4.11.2,1.11.0.dev20211003+cu111,1,12,0,0,12,0,24 39 | model_filename,datetime,transformers,torch,EmbedLayerNormalization,Attention,Gelu,FastGelu,BiasGelu,LayerNormalization,SkipLayerNormalization 40 | ./onnx_models/gpt2_1_fp32_gpu.onnx,2021-10-05 23:22:20.170049,4.11.2,1.11.0.dev20211003+cu111,0,0,0,12,0,25,0 41 | -------------------------------------------------------------------------------- /hf.co_1ms/images/cpu_16_2_5ms.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nod-ai/transformer-benchmarks/c9cc725effa7b8070b69487c54dee5ba28cffa9b/hf.co_1ms/images/cpu_16_2_5ms.png -------------------------------------------------------------------------------- /hf.co_1ms/images/cpu_9_7ms.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/nod-ai/transformer-benchmarks/c9cc725effa7b8070b69487c54dee5ba28cffa9b/hf.co_1ms/images/cpu_9_7ms.png -------------------------------------------------------------------------------- /hf.co_1ms/images/gpu_128_2_6ms.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nod-ai/transformer-benchmarks/c9cc725effa7b8070b69487c54dee5ba28cffa9b/hf.co_1ms/images/gpu_128_2_6ms.png -------------------------------------------------------------------------------- /hf.co_1ms/images/gpu_16_1_7ms.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nod-ai/transformer-benchmarks/c9cc725effa7b8070b69487c54dee5ba28cffa9b/hf.co_1ms/images/gpu_16_1_7ms.png -------------------------------------------------------------------------------- /hf.co_1ms/images/infinity_model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nod-ai/transformer-benchmarks/c9cc725effa7b8070b69487c54dee5ba28cffa9b/hf.co_1ms/images/infinity_model.png -------------------------------------------------------------------------------- /hf.co_1ms/images/model_dir.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nod-ai/transformer-benchmarks/c9cc725effa7b8070b69487c54dee5ba28cffa9b/hf.co_1ms/images/model_dir.png -------------------------------------------------------------------------------- /hf.co_1ms/onnx.diff: -------------------------------------------------------------------------------- 1 | diff --git a/onnxruntime/python/tools/transformers/benchmark.py b/onnxruntime/python/tools/transformers/benchmark.py 2 | index 6e5d5b98e..a9f0e3a93 100644 3 | --- a/onnxruntime/python/tools/transformers/benchmark.py 4 | +++ b/onnxruntime/python/tools/transformers/benchmark.py 5 | @@ -483,7 +483,7 @@ def parse_arguments(): 6 | help='Disable running ONNX Runtime with binded inputs and outputs. 
') 7 | parser.set_defaults(disable_ort_io_binding=False) 8 | 9 | - parser.add_argument("-n", "--num_threads", required=False, nargs="+", type=int, default=[0], help="Threads to use") 10 | + parser.add_argument("-n", "--num_threads", required=False, nargs="+", type=int, default=[2], help="Threads to use") 11 | 12 | args = parser.parse_args() 13 | return args 14 | diff --git a/onnxruntime/python/tools/transformers/huggingface_models.py b/onnxruntime/python/tools/transformers/huggingface_models.py 15 | index 051480ebb..31bd05b87 100644 16 | --- a/onnxruntime/python/tools/transformers/huggingface_models.py 17 | +++ b/onnxruntime/python/tools/transformers/huggingface_models.py 18 | @@ -16,6 +16,7 @@ MODELS = { 19 | "bert-base-uncased": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 20 | "bert-large-uncased": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 21 | "bert-base-cased": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 22 | + "philschmid/MiniLM-L6-H384-uncased-sst2": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 23 | # "bert-large-cased": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 24 | # "bert-base-multilingual-uncased": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 25 | # "bert-base-multilingual-cased": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 26 | -------------------------------------------------------------------------------- /hf.co_1ms/onnx_with_eigen.diff: -------------------------------------------------------------------------------- 1 | diff --git a/.gitmodules b/.gitmodules 2 | index 5c2838373..cae75f4b7 100644 3 | --- a/.gitmodules 4 | +++ b/.gitmodules 5 | @@ -24,7 +24,7 @@ 6 | url = https://github.com/google/re2.git 7 | [submodule "cmake/external/eigen"] 8 | path = cmake/external/eigen 9 | - url = https://gitlab.com/libeigen/eigen.git 10 | + url = https://gitlab.com/cantonios/eigen.git 11 | [submodule "cmake/external/cxxopts"] 12 | path = cmake/external/cxxopts 13 | url = https://github.com/jarro2783/cxxopts.git 14 | diff --git a/cgmanifests/submodules/cgmanifest.json b/cgmanifests/submodules/cgmanifest.json 15 | index 41c43a6ff..1388141ca 100644 16 | --- a/cgmanifests/submodules/cgmanifest.json 17 | +++ b/cgmanifests/submodules/cgmanifest.json 18 | @@ -115,7 +115,7 @@ 19 | "type": "git", 20 | "git": { 21 | "commitHash": "efd9867ff0e8df23016ac6c9828d0d7bf8bec1b1", 22 | - "repositoryUrl": "https://gitlab.com/libeigen/eigen.git" 23 | + "repositoryUrl": "https://gitlab.com/cantonios/eigen.git" 24 | }, 25 | "comments": "git submodule at cmake/external/FeaturizersLibrary/src/3rdParty/eigen" 26 | } 27 | @@ -195,7 +195,7 @@ 28 | "type": "git", 29 | "git": { 30 | "commitHash": "d10b27fe37736d2944630ecd7557cefa95cf87c9", 31 | - "repositoryUrl": "https://gitlab.com/libeigen/eigen.git" 32 | + "repositoryUrl": "https://gitlab.com/cantonios/eigen.git" 33 | }, 34 | "comments": "git submodule at cmake/external/eigen" 35 | } 36 | diff --git a/cmake/external/FeaturizersLibrary b/cmake/external/FeaturizersLibrary 37 | --- a/cmake/external/FeaturizersLibrary 38 | +++ b/cmake/external/FeaturizersLibrary 39 | @@ -1 +1 @@ 40 | -Subproject commit fd5fe3de507d4a19f5923c5d4c267e3d730500a9 41 | +Subproject commit fd5fe3de507d4a19f5923c5d4c267e3d730500a9-dirty 42 | diff --git a/cmake/external/eigen b/cmake/external/eigen 43 | --- a/cmake/external/eigen 44 | +++ b/cmake/external/eigen 45 | @@ -1 +1 @@ 46 | -Subproject 
commit d10b27fe37736d2944630ecd7557cefa95cf87c9 47 | +Subproject commit d10b27fe37736d2944630ecd7557cefa95cf87c9-dirty 48 | diff --git a/cmake/external/onnx b/cmake/external/onnx 49 | --- a/cmake/external/onnx 50 | +++ b/cmake/external/onnx 51 | @@ -1 +1 @@ 52 | -Subproject commit 1f63dcb7fcc3a8bf5c3c8e326867ecd6f5c43f35 53 | +Subproject commit 1f63dcb7fcc3a8bf5c3c8e326867ecd6f5c43f35-dirty 54 | diff --git a/onnxruntime/python/tools/transformers/benchmark.py b/onnxruntime/python/tools/transformers/benchmark.py 55 | index 6e5d5b98e..a9f0e3a93 100644 56 | --- a/onnxruntime/python/tools/transformers/benchmark.py 57 | +++ b/onnxruntime/python/tools/transformers/benchmark.py 58 | @@ -483,7 +483,7 @@ def parse_arguments(): 59 | help='Disable running ONNX Runtime with binded inputs and outputs. ') 60 | parser.set_defaults(disable_ort_io_binding=False) 61 | 62 | - parser.add_argument("-n", "--num_threads", required=False, nargs="+", type=int, default=[0], help="Threads to use") 63 | + parser.add_argument("-n", "--num_threads", required=False, nargs="+", type=int, default=[2], help="Threads to use") 64 | 65 | args = parser.parse_args() 66 | return args 67 | diff --git a/onnxruntime/python/tools/transformers/huggingface_models.py b/onnxruntime/python/tools/transformers/huggingface_models.py 68 | index 051480ebb..31bd05b87 100644 69 | --- a/onnxruntime/python/tools/transformers/huggingface_models.py 70 | +++ b/onnxruntime/python/tools/transformers/huggingface_models.py 71 | @@ -16,6 +16,7 @@ MODELS = { 72 | "bert-base-uncased": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 73 | "bert-large-uncased": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 74 | "bert-base-cased": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 75 | + "philschmid/MiniLM-L6-H384-uncased-sst2": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 76 | # "bert-large-cased": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 77 | # "bert-base-multilingual-uncased": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 78 | # "bert-base-multilingual-cased": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 79 | -------------------------------------------------------------------------------- /hf.co_1ms/requirements.txt: -------------------------------------------------------------------------------- 1 | 2 | sympy 3 | wheel 4 | psutill 5 | -------------------------------------------------------------------------------- /hf.co_1ms/result.csv: -------------------------------------------------------------------------------- 1 | model_name,inputs,engine,version,device,precision,optimizer,io_binding,threads,b1_s16,b1_s128 2 | philschmid/MiniLM-L6-H384-uncased-sst2,1,onnxruntime,1.10.0,cpu,fp32,True,True,2,2.76,13.63 3 | model_name,inputs,engine,version,device,precision,optimizer,io_binding,threads,b1_s16,b1_s128 4 | philschmid/MiniLM-L6-H384-uncased-sst2,1,torchscript,1.9.0+cpu,cpu,fp32,,,2,5.86,16.65 5 | model_name,inputs,engine,version,device,precision,optimizer,io_binding,threads,b1_s16,b1_s128 6 | philschmid/MiniLM-L6-H384-uncased-sst2,1,onnxruntime,1.10.0,cpu,int8,True,True,2,1.24,7.48 7 | model_name,inputs,engine,version,device,precision,optimizer,io_binding,threads,b1_s16,b1_s128 8 | philschmid/MiniLM-L6-H384-uncased-sst2,1,torchscript,1.9.0+cpu,cpu,int8,,,2,1.96,9.59 9 | model_name,inputs,engine,version,device,precision,optimizer,io_binding,threads,b1_s16,b1_s128 10 | 
philschmid/MiniLM-L6-H384-uncased-sst2,1,onnxruntime,1.10.0,cpu,fp32,True,True,2,2.72,13.33 11 | model_name,inputs,engine,version,device,precision,optimizer,io_binding,threads,b1_s16,b1_s128 12 | philschmid/MiniLM-L6-H384-uncased-sst2,1,torchscript,1.11.0.dev20211003+cpu,cpu,fp32,,,2,6.14,17.39 13 | model_name,inputs,engine,version,device,precision,optimizer,io_binding,threads,b1_s16,b1_s128 14 | philschmid/MiniLM-L6-H384-uncased-sst2,1,onnxruntime,1.10.0,cpu,int8,True,True,2,1.24,7.68 15 | model_name,inputs,engine,version,device,precision,optimizer,io_binding,threads,b1_s16,b1_s128 16 | philschmid/MiniLM-L6-H384-uncased-sst2,1,torchscript,1.11.0.dev20211003+cpu,cpu,int8,,,2,2.49,11.67 17 | model_name,inputs,engine,version,device,precision,optimizer,io_binding,threads,b1_s16,b1_s128 18 | philschmid/MiniLM-L6-H384-uncased-sst2,1,onnxruntime,1.10.0,cuda,fp32,True,True,2,0.81,0.83 19 | model_name,inputs,engine,version,device,precision,optimizer,io_binding,threads,b1_s16,b1_s128 20 | philschmid/MiniLM-L6-H384-uncased-sst2,1,torchscript,1.11.0.dev20211003+cu111,cuda,fp32,,,2,3.10,3.55 21 | model_name,inputs,engine,version,device,precision,optimizer,io_binding,threads,b1_s16,b1_s128 22 | philschmid/MiniLM-L6-H384-uncased-sst2,1,onnxruntime,1.10.0,cuda,fp16,True,True,2,0.74,0.97 23 | model_name,inputs,engine,version,device,precision,optimizer,io_binding,threads,b1_s16,b1_s128 24 | philschmid/MiniLM-L6-H384-uncased-sst2,1,torchscript,1.11.0.dev20211003+cu111,cuda,fp16,,,2,2.77,2.96 25 | model_name,inputs,engine,version,device,precision,optimizer,io_binding,threads,b1_s16,b1_s128 26 | microsoft/MiniLM-L12-H384-uncased,1,onnxruntime,1.10.0,cuda,fp32,True,True,2,1.18,1.62 27 | model_name,inputs,engine,version,device,precision,optimizer,io_binding,threads,b1_s16,b1_s128 28 | microsoft/MiniLM-L12-H384-uncased,1,onnxruntime,1.10.0,cpu,fp32,True,True,2,7.99,44.87 29 | model_name,inputs,engine,version,device,precision,optimizer,io_binding,threads,b1_s16,b1_s128 30 | microsoft/MiniLM-L12-H384-uncased,1,torchscript,1.11.0.dev20211003+cu111,cpu,fp32,,,2,13.17,51.22 31 | model_name,inputs,engine,version,device,precision,optimizer,io_binding,threads,b1_s16,b1_s128 32 | microsoft/MiniLM-L12-H384-uncased,1,onnxruntime,1.10.0,cpu,int8,True,True,2,5.49,41.44 33 | model_name,inputs,engine,version,device,precision,optimizer,io_binding,threads,b1_s16,b1_s128 34 | microsoft/MiniLM-L12-H384-uncased,1,torchscript,1.11.0.dev20211003+cu111,cpu,int8,,,2,6.80,41.23 35 | model_name,inputs,engine,version,device,precision,optimizer,io_binding,threads,b1_s16,b1_s128 36 | bert-large-uncased,1,onnxruntime,1.10.0,cpu,fp32,True,True,2,105.78,547.92 37 | model_name,inputs,engine,version,device,precision,optimizer,io_binding,threads,b1_s16,b1_s128 38 | bert-large-uncased,1,torchscript,1.11.0.dev20211003+cu111,cpu,fp32,,,2,176.39,633.71 39 | model_name,inputs,engine,version,device,precision,optimizer,io_binding,threads,b1_s16,b1_s128 40 | microsoft/MiniLM-L12-H384-uncased,1,onnxruntime,1.10.0,cpu,fp32,True,True,2,8.01,44.48 41 | model_name,inputs,engine,version,device,precision,optimizer,io_binding,threads,b1_s16,b1_s128 42 | microsoft/MiniLM-L12-H384-uncased,1,torchscript,1.11.0.dev20211003+cu111,cpu,fp32,,,2,12.99,51.68 43 | model_name,inputs,engine,version,device,precision,optimizer,io_binding,threads,b1_s16,b1_s128 44 | bert-large-uncased,1,onnxruntime,1.10.0,cpu,int8,True,True,2,68.26,472.31 45 | model_name,inputs,engine,version,device,precision,optimizer,io_binding,threads,b1_s16,b1_s128 46 | 
bert-large-uncased,1,torchscript,1.11.0.dev20211003+cu111,cpu,int8,,,2,54.54,414.33 47 | model_name,inputs,engine,version,device,precision,optimizer,io_binding,threads,b1_s16,b1_s128 48 | microsoft/MiniLM-L12-H384-uncased,1,onnxruntime,1.10.0,cpu,int8,True,True,2,5.92,43.16 49 | model_name,inputs,engine,version,device,precision,optimizer,io_binding,threads,b1_s16,b1_s128 50 | microsoft/MiniLM-L12-H384-uncased,1,torchscript,1.11.0.dev20211003+cu111,cpu,int8,,,2,7.85,41.07 51 | model_name,inputs,engine,version,device,precision,optimizer,io_binding,threads,b1_s16,b1_s128 52 | microsoft/MiniLM-L12-H384-uncased,1,onnxruntime,1.10.0,cpu,fp32,True,True,2,8.76,49.58 53 | model_name,inputs,engine,version,device,precision,optimizer,io_binding,threads,b1_s16,b1_s128 54 | microsoft/MiniLM-L12-H384-uncased,1,torchscript,1.11.0.dev20211003+cu111,cpu,fp32,,,2,14.09,58.02 55 | model_name,inputs,engine,version,device,precision,optimizer,io_binding,threads,b1_s16,b1_s128 56 | microsoft/MiniLM-L12-H384-uncased,1,onnxruntime,1.10.0,cpu,fp32,True,True,2,8.14,48.34 57 | model_name,inputs,engine,version,device,precision,optimizer,io_binding,threads,b1_s16,b1_s128 58 | microsoft/MiniLM-L12-H384-uncased,1,torchscript,1.11.0.dev20211003+cu111,cpu,fp32,,,2,14.05,58.36 59 | model_name,inputs,engine,version,device,precision,optimizer,io_binding,threads,b1_s16,b1_s128 60 | microsoft/MiniLM-L12-H384-uncased,1,tensorflow,2.8.0-dev20211005,cpu,fp32,,,2,22.49,91.46 61 | model_name,inputs,engine,version,device,precision,optimizer,io_binding,threads,b1_s128 62 | microsoft/MiniLM-L12-H384-uncased,1,onnxruntime,1.10.0,cpu,fp32,True,True,2,49.27 63 | model_name,inputs,engine,version,device,precision,optimizer,io_binding,threads,b1_s128 64 | microsoft/MiniLM-L12-H384-uncased,1,torchscript,1.11.0.dev20211003+cu111,cpu,fp32,,,2,56.76 65 | model_name,inputs,engine,version,device,precision,optimizer,io_binding,threads,b1_s128 66 | microsoft/MiniLM-L12-H384-uncased,1,tensorflow,2.8.0-dev20211005,cpu,fp32,,,2,93.29 67 | model_name,inputs,engine,version,device,precision,optimizer,io_binding,threads,b1_s128 68 | microsoft/MiniLM-L12-H384-uncased,1,onnxruntime,1.10.0,cuda,fp32,True,True,2,1.64 69 | model_name,inputs,engine,version,device,precision,optimizer,io_binding,threads,b1_s128 70 | microsoft/MiniLM-L12-H384-uncased,1,torchscript,1.11.0.dev20211003+cu111,cuda,fp32,,,2,5.69 71 | model_name,inputs,engine,version,device,precision,optimizer,io_binding,threads,b1_s128 72 | microsoft/MiniLM-L12-H384-uncased,1,tensorflow,2.8.0-dev20211005,cuda,fp32,,,2,7.82 73 | model_name,inputs,engine,version,device,precision,optimizer,io_binding,threads,b1_s128 74 | gpt2,1,onnxruntime,1.10.0,cuda,fp32,True,True,2,3.80 75 | model_name,inputs,engine,version,device,precision,optimizer,io_binding,threads,b1_s128 76 | gpt2,1,torchscript,1.11.0.dev20211003+cu111,cuda,fp32,,,2,5.03 77 | model_name,inputs,engine,version,device,precision,optimizer,io_binding,threads,b1_s128 78 | gpt2,1,tensorflow,2.8.0-dev20211005,cuda,fp32,,,2,6.58 79 | -------------------------------------------------------------------------------- /hf.co_1ms/run_benchmark.sh: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. See License.txt in the project root for 4 | # license information. 
5 | # -------------------------------------------------------------------------- 6 | # This measures the performance of OnnxRuntime, PyTorch and TorchScript on transformer models. 7 | # Please install PyTorch (see https://pytorch.org/) before running this benchmark, for example: 8 | # GPU: conda install pytorch torchvision cudatoolkit=11.0 -c pytorch 9 | # CPU: conda install pytorch torchvision cpuonly -c pytorch 10 | 11 | # When use_package=true, this sh file is the only file you need to copy to run benchmarks. 12 | # Otherwise, it will use the python script (*.py) files in this directory. 13 | use_package=true 14 | 15 | # Only needed once. 16 | run_install=false 17 | 18 | # Engines to test. 19 | run_ort=true 20 | run_torch=false 21 | run_torchscript=true 22 | run_tensorflow=false 23 | 24 | # ONNX model source (default is PyTorch; set export_onnx_from_tf=true to convert from a TensorFlow model). 25 | export_onnx_from_tf=false 26 | 27 | # Devices to test (you can run either CPU or GPU, but not both: GPU needs onnxruntime-gpu, CPU needs onnxruntime). 28 | run_gpu_fp32=false 29 | run_gpu_fp16=false 30 | run_cpu_fp32=true 31 | run_cpu_int8=true 32 | 33 | average_over=1000 34 | # CPU runs take longer, so only run 100 inferences to get the average latency. 35 | if [ "$run_cpu_fp32" = true ] || [ "$run_cpu_int8" = true ]; then 36 | average_over=100 37 | fi 38 | 39 | # Enable optimizer (use the optimizer script instead of OnnxRuntime for graph optimization). 40 | use_optimizer=true 41 | 42 | # Batch Sizes and Sequence Lengths 43 | batch_sizes="1" 44 | sequence_lengths="16 128" 45 | 46 | # Number of inputs (input_ids, token_type_ids, attention_mask) for the ONNX model. 47 | # Note that a different input count might lead to different performance. 48 | # Here we only test one input (input_ids) for a fair comparison with PyTorch. 49 | input_counts=1 50 | 51 | # Pretrained transformers models can be a subset of: bert-base-cased roberta-base gpt2 distilgpt2 distilbert-base-uncased 52 | #models_to_test="bert-base-cased roberta-base distilbert-base-uncased" 53 | models_to_test="philschmid/MiniLM-L6-H384-uncased-sst2" 54 | 55 | # If you have multiple GPUs, you can choose one GPU for the test. For example, to use the second GPU: 56 | # export CUDA_VISIBLE_DEVICES=1 57 | 58 | # This script will generate a log file with the list of commands used in tests. 59 | echo echo "ort=$run_ort torch=$run_torch torchscript=$run_torchscript tensorflow=$run_tensorflow gpu_fp32=$run_gpu_fp32 gpu_fp16=$run_gpu_fp16 cpu_fp32=$run_cpu_fp32 cpu_int8=$run_cpu_int8 optimizer=$use_optimizer batch=$batch_sizes sequence=$sequence_lengths models=$models_to_test" >> benchmark.log 60 | 61 | # Set run_tests to false to skip testing; you can use this to dry-run the script and only generate the log file. 62 | run_tests=true 63 | 64 | # Directory for downloading pretrained models. 
65 | cache_dir="./cache_models" 66 | 67 | # Directory for ONNX models 68 | onnx_dir="./onnx_models" 69 | 70 | # ------------------------------------------- 71 | if [ "$run_cpu_fp32" = true ] || [ "$run_cpu_int8" = true ]; then 72 | if [ "$run_gpu_fp32" = true ] ; then 73 | echo "cannot test cpu and gpu at same time" 74 | exit 1 75 | fi 76 | if [ "$run_gpu_fp16" = true ] ; then 77 | echo "cannot test cpu and gpu at same time" 78 | exit 1 79 | fi 80 | fi 81 | 82 | 83 | if [ "$run_install" = true ] ; then 84 | pip uninstall --yes ort-nightly ort-gpu-nightly 85 | pip uninstall --yes onnxruntime 86 | pip uninstall --yes onnxruntime-gpu 87 | if [ "$run_cpu_fp32" = true ] || [ "$run_cpu_int8" = true ]; then 88 | pip install onnxruntime 89 | else 90 | pip install onnxruntime-gpu 91 | fi 92 | pip install --upgrade onnx coloredlogs packaging psutil py3nvml onnxconverter_common numpy transformers 93 | fi 94 | 95 | if [ "$use_package" = true ] ; then 96 | echo "Use onnxruntime.transformers.benchmark" 97 | benchmark_script="-m onnxruntime.transformers.benchmark" 98 | else 99 | benchmark_script="benchmark.py" 100 | fi 101 | 102 | onnx_export_options="-i $input_counts -v -b 0 --overwrite -f fusion.csv -c $cache_dir --onnx_dir $onnx_dir" 103 | benchmark_options="-b $batch_sizes -s $sequence_lengths -t $average_over -f fusion.csv -r result.csv -d detail.csv -c $cache_dir --onnx_dir $onnx_dir" 104 | 105 | if [ "$export_onnx_from_tf" = true ] ; then 106 | onnx_export_options="$onnx_export_options --model_source tf" 107 | benchmark_options="$benchmark_options --model_source tf" 108 | fi 109 | 110 | if [ "$use_optimizer" = true ] ; then 111 | onnx_export_options="$onnx_export_options -o" 112 | benchmark_options="$benchmark_options -o" 113 | fi 114 | 115 | # ------------------------------------------- 116 | run_one_test() { 117 | if [ "$run_ort" = true ] ; then 118 | echo python $benchmark_script -m $1 $onnx_export_options $2 $3 $4 >> benchmark.log 119 | echo python $benchmark_script -m $1 $benchmark_options $2 $3 $4 -i $input_counts >> benchmark.log 120 | if [ "$run_tests" = true ] ; then 121 | python $benchmark_script -m $1 $onnx_export_options $2 $3 $4 122 | python $benchmark_script -m $1 $benchmark_options $2 $3 $4 -i $input_counts 123 | fi 124 | fi 125 | 126 | if [ "$run_torch" = true ] ; then 127 | echo python $benchmark_script -e torch -m $1 $benchmark_options $2 $3 $4 >> benchmark.log 128 | if [ "$run_tests" = true ] ; then 129 | python $benchmark_script -e torch -m $1 $benchmark_options $2 $3 $4 130 | fi 131 | fi 132 | 133 | if [ "$run_torchscript" = true ] ; then 134 | echo python $benchmark_script -e torchscript -m $1 $benchmark_options $2 $3 $4 >> benchmark.log 135 | if [ "$run_tests" = true ] ; then 136 | python $benchmark_script -e torchscript -m $1 $benchmark_options $2 $3 $4 137 | fi 138 | fi 139 | 140 | if [ "$run_tensorflow" = true ] ; then 141 | echo python $benchmark_script -e tensorflow -m $1 $benchmark_options $2 $3 $4 >> benchmark.log 142 | if [ "$run_tests" = true ] ; then 143 | python $benchmark_script -e tensorflow -m $1 $benchmark_options $2 $3 $4 144 | fi 145 | fi 146 | } 147 | 148 | # ------------------------------------------- 149 | if [ "$run_gpu_fp32" = true ] ; then 150 | for m in $models_to_test 151 | do 152 | echo Run GPU FP32 Benchmark on model ${m} 153 | run_one_test "${m}" -g 154 | done 155 | fi 156 | 157 | if [ "$run_gpu_fp16" = true ] ; then 158 | for m in $models_to_test 159 | do 160 | echo Run GPU FP16 Benchmark on model ${m} 161 | run_one_test "${m}" -g -p fp16 162 | 
done 163 | fi 164 | 165 | if [ "$run_cpu_fp32" = true ] ; then 166 | for m in $models_to_test 167 | do 168 | echo Run CPU Benchmark on model ${m} 169 | run_one_test "${m}" 170 | done 171 | fi 172 | 173 | if [ "$run_cpu_int8" = true ] ; then 174 | for m in $models_to_test 175 | do 176 | echo Run CPU Benchmark on model ${m} 177 | run_one_test "${m}" -p int8 178 | done 179 | fi 180 | 181 | if [ "run_tests" = false ] ; then 182 | more $log_file 183 | fi 184 | 185 | # Remove duplicated lines 186 | awk '!x[$0]++' ./result.csv > summary_result.csv 187 | awk '!x[$0]++' ./fusion.csv > summary_fusion.csv 188 | awk '!x[$0]++' ./detail.csv > summary_detail.csv 189 | -------------------------------------------------------------------------------- /hf.co_1ms/summary_detail.csv: -------------------------------------------------------------------------------- 1 | engine,version,device,precision,optimizer,io_binding,model_name,inputs,threads,batch_size,sequence_length,datetime,test_times,QPS,average_latency_ms,latency_variance,latency_90_percentile,latency_95_percentile,latency_99_percentile 2 | onnxruntime,1.10.0,cpu,fp32,True,True,philschmid/MiniLM-L6-H384-uncased-sst2,1,2,1,16,2021-10-04 03:50:26.867866,100,362.77,2.76,0.00,2.90,2.92,2.98 3 | onnxruntime,1.10.0,cpu,fp32,True,True,philschmid/MiniLM-L6-H384-uncased-sst2,1,2,1,128,2021-10-04 03:50:27.149803,100,73.39,13.63,0.00,13.69,13.83,14.11 4 | torchscript,1.9.0+cpu,cpu,fp32,,,philschmid/MiniLM-L6-H384-uncased-sst2,1,2,1,16,2021-10-04 03:50:32.062478,100,170.78,5.86,0.00,5.96,6.00,6.16 5 | torchscript,1.9.0+cpu,cpu,fp32,,,philschmid/MiniLM-L6-H384-uncased-sst2,1,2,1,128,2021-10-04 03:50:34.446485,100,60.07,16.65,0.00,17.08,17.15,17.43 6 | onnxruntime,1.10.0,cpu,int8,True,True,philschmid/MiniLM-L6-H384-uncased-sst2,1,2,1,16,2021-10-04 03:50:45.708862,100,805.29,1.24,0.00,1.36,1.41,1.77 7 | onnxruntime,1.10.0,cpu,int8,True,True,philschmid/MiniLM-L6-H384-uncased-sst2,1,2,1,128,2021-10-04 03:50:45.837148,100,133.75,7.48,0.00,7.62,7.70,7.75 8 | torchscript,1.9.0+cpu,cpu,int8,,,philschmid/MiniLM-L6-H384-uncased-sst2,1,2,1,16,2021-10-04 03:50:50.498982,100,509.34,1.96,0.00,2.02,2.04,2.14 9 | torchscript,1.9.0+cpu,cpu,int8,,,philschmid/MiniLM-L6-H384-uncased-sst2,1,2,1,128,2021-10-04 03:50:52.568732,100,104.32,9.59,0.00,9.79,9.84,9.94 10 | onnxruntime,1.10.0,cpu,fp32,True,True,philschmid/MiniLM-L6-H384-uncased-sst2,1,2,1,16,2021-10-04 04:34:05.479853,100,368.08,2.72,0.00,2.82,2.89,3.04 11 | onnxruntime,1.10.0,cpu,fp32,True,True,philschmid/MiniLM-L6-H384-uncased-sst2,1,2,1,128,2021-10-04 04:34:05.757331,100,75.02,13.33,0.00,13.43,13.49,13.55 12 | torchscript,1.11.0.dev20211003+cpu,cpu,fp32,,,philschmid/MiniLM-L6-H384-uncased-sst2,1,2,1,16,2021-10-04 04:34:10.839467,100,162.78,6.14,0.00,6.24,6.33,6.72 13 | torchscript,1.11.0.dev20211003+cpu,cpu,fp32,,,philschmid/MiniLM-L6-H384-uncased-sst2,1,2,1,128,2021-10-04 04:34:13.508216,100,57.51,17.39,0.00,17.46,17.59,19.88 14 | onnxruntime,1.10.0,cpu,int8,True,True,philschmid/MiniLM-L6-H384-uncased-sst2,1,2,1,16,2021-10-04 04:34:25.458148,100,803.30,1.24,0.00,1.27,1.32,1.44 15 | onnxruntime,1.10.0,cpu,int8,True,True,philschmid/MiniLM-L6-H384-uncased-sst2,1,2,1,128,2021-10-04 04:34:25.586571,100,130.24,7.68,0.00,7.72,7.74,8.28 16 | torchscript,1.11.0.dev20211003+cpu,cpu,int8,,,philschmid/MiniLM-L6-H384-uncased-sst2,1,2,1,16,2021-10-04 04:34:30.947950,100,402.31,2.49,0.00,2.52,2.53,2.75 17 | torchscript,1.11.0.dev20211003+cpu,cpu,int8,,,philschmid/MiniLM-L6-H384-uncased-sst2,1,2,1,128,2021-10-04 
04:34:33.658199,100,85.68,11.67,0.00,11.81,11.92,12.11 18 | onnxruntime,1.10.0,cuda,fp32,True,True,philschmid/MiniLM-L6-H384-uncased-sst2,1,2,1,16,2021-10-04 05:30:04.821624,1000,1231.85,0.81,0.00,0.83,0.84,0.88 19 | onnxruntime,1.10.0,cuda,fp32,True,True,philschmid/MiniLM-L6-H384-uncased-sst2,1,2,1,128,2021-10-04 05:30:05.648448,1000,1197.71,0.83,0.00,0.85,0.85,0.90 20 | torchscript,1.11.0.dev20211003+cu111,cuda,fp32,,,philschmid/MiniLM-L6-H384-uncased-sst2,1,2,1,16,2021-10-04 05:30:19.976177,1000,322.43,3.10,0.00,3.12,3.14,3.33 21 | torchscript,1.11.0.dev20211003+cu111,cuda,fp32,,,philschmid/MiniLM-L6-H384-uncased-sst2,1,2,1,128,2021-10-04 05:30:25.230227,1000,281.58,3.55,0.00,3.57,3.59,3.73 22 | onnxruntime,1.10.0,cuda,fp16,True,True,philschmid/MiniLM-L6-H384-uncased-sst2,1,2,1,16,2021-10-04 05:30:55.521173,1000,1357.36,0.74,0.00,0.75,0.76,0.79 23 | onnxruntime,1.10.0,cuda,fp16,True,True,philschmid/MiniLM-L6-H384-uncased-sst2,1,2,1,128,2021-10-04 05:30:56.270584,1000,1026.73,0.97,0.00,0.99,1.01,1.07 24 | torchscript,1.11.0.dev20211003+cu111,cuda,fp16,,,philschmid/MiniLM-L6-H384-uncased-sst2,1,2,1,16,2021-10-04 05:31:10.413587,1000,360.63,2.77,0.00,2.80,2.83,2.96 25 | torchscript,1.11.0.dev20211003+cu111,cuda,fp16,,,philschmid/MiniLM-L6-H384-uncased-sst2,1,2,1,128,2021-10-04 05:31:15.078604,1000,338.01,2.96,0.00,3.00,3.03,3.23 26 | onnxruntime,1.10.0,cuda,fp32,True,True,microsoft/MiniLM-L12-H384-uncased,1,2,1,16,2021-10-05 06:12:35.849579,1000,846.56,1.18,0.00,1.27,1.34,1.59 27 | onnxruntime,1.10.0,cuda,fp32,True,True,microsoft/MiniLM-L12-H384-uncased,1,2,1,128,2021-10-05 06:12:37.044826,1000,615.51,1.62,0.00,1.64,1.65,1.86 28 | onnxruntime,1.10.0,cpu,fp32,True,True,microsoft/MiniLM-L12-H384-uncased,1,2,1,16,2021-10-05 06:14:02.389280,100,125.11,7.99,0.00,8.24,8.30,8.78 29 | onnxruntime,1.10.0,cpu,fp32,True,True,microsoft/MiniLM-L12-H384-uncased,1,2,1,128,2021-10-05 06:14:03.200105,100,22.29,44.87,0.00,46.53,46.65,47.82 30 | torchscript,1.11.0.dev20211003+cu111,cpu,fp32,,,microsoft/MiniLM-L12-H384-uncased,1,2,1,16,2021-10-05 06:14:14.120446,100,75.91,13.17,0.00,13.86,13.93,14.58 31 | torchscript,1.11.0.dev20211003+cu111,cpu,fp32,,,microsoft/MiniLM-L12-H384-uncased,1,2,1,128,2021-10-05 06:14:21.541510,100,19.52,51.22,0.00,52.90,54.08,55.23 32 | onnxruntime,1.10.0,cpu,int8,True,True,microsoft/MiniLM-L12-H384-uncased,1,2,1,16,2021-10-05 06:14:41.473194,100,182.00,5.49,0.00,5.82,5.95,6.12 33 | onnxruntime,1.10.0,cpu,int8,True,True,microsoft/MiniLM-L12-H384-uncased,1,2,1,128,2021-10-05 06:14:42.031579,100,24.13,41.44,0.01,44.82,47.02,47.71 34 | torchscript,1.11.0.dev20211003+cu111,cpu,int8,,,microsoft/MiniLM-L12-H384-uncased,1,2,1,16,2021-10-05 06:14:54.679325,100,147.13,6.80,0.00,7.12,7.37,8.59 35 | torchscript,1.11.0.dev20211003+cu111,cpu,int8,,,microsoft/MiniLM-L12-H384-uncased,1,2,1,128,2021-10-05 06:15:02.571615,100,24.25,41.23,0.00,42.92,43.60,45.47 36 | onnxruntime,1.10.0,cpu,fp32,True,True,bert-large-uncased,1,2,1,16,2021-10-05 06:18:34.056351,100,9.45,105.78,0.00,108.11,108.99,109.30 37 | onnxruntime,1.10.0,cpu,fp32,True,True,bert-large-uncased,1,2,1,128,2021-10-05 06:18:44.743220,100,1.83,547.92,0.10,559.56,570.29,578.96 38 | torchscript,1.11.0.dev20211003+cu111,cpu,fp32,,,bert-large-uncased,1,2,1,16,2021-10-05 06:20:10.589961,100,5.67,176.39,0.01,181.04,182.60,189.76 39 | torchscript,1.11.0.dev20211003+cu111,cpu,fp32,,,bert-large-uncased,1,2,1,128,2021-10-05 06:21:21.849099,100,1.58,633.71,0.13,645.70,648.50,669.45 40 | 
onnxruntime,1.10.0,cpu,fp32,True,True,microsoft/MiniLM-L12-H384-uncased,1,2,1,16,2021-10-05 06:21:37.840842,100,124.88,8.01,0.00,8.36,8.38,8.63 41 | onnxruntime,1.10.0,cpu,fp32,True,True,microsoft/MiniLM-L12-H384-uncased,1,2,1,128,2021-10-05 06:21:38.653203,100,22.48,44.48,0.00,46.29,47.69,49.16 42 | torchscript,1.11.0.dev20211003+cu111,cpu,fp32,,,microsoft/MiniLM-L12-H384-uncased,1,2,1,16,2021-10-05 06:21:49.417318,100,76.97,12.99,0.00,13.80,14.08,15.73 43 | torchscript,1.11.0.dev20211003+cu111,cpu,fp32,,,microsoft/MiniLM-L12-H384-uncased,1,2,1,128,2021-10-05 06:21:56.896994,100,19.35,51.68,0.01,54.59,55.95,58.57 44 | onnxruntime,1.10.0,cpu,int8,True,True,bert-large-uncased,1,2,1,16,2021-10-05 06:24:05.239337,100,14.65,68.26,0.01,72.23,73.16,76.34 45 | onnxruntime,1.10.0,cpu,int8,True,True,bert-large-uncased,1,2,1,128,2021-10-05 06:24:12.135738,100,2.12,472.31,0.11,484.75,489.73,502.92 46 | torchscript,1.11.0.dev20211003+cu111,cpu,int8,,,bert-large-uncased,1,2,1,16,2021-10-05 06:25:27.824158,100,18.34,54.54,0.00,57.09,59.17,61.63 47 | torchscript,1.11.0.dev20211003+cu111,cpu,int8,,,bert-large-uncased,1,2,1,128,2021-10-05 06:26:18.327245,100,2.41,414.33,0.15,428.85,444.39,453.52 48 | onnxruntime,1.10.0,cpu,int8,True,True,microsoft/MiniLM-L12-H384-uncased,1,2,1,16,2021-10-05 06:26:40.112340,100,168.96,5.92,0.00,6.20,6.31,6.51 49 | onnxruntime,1.10.0,cpu,int8,True,True,microsoft/MiniLM-L12-H384-uncased,1,2,1,128,2021-10-05 06:26:40.713564,100,23.17,43.16,0.00,45.58,46.19,47.42 50 | torchscript,1.11.0.dev20211003+cu111,cpu,int8,,,microsoft/MiniLM-L12-H384-uncased,1,2,1,16,2021-10-05 06:26:53.915002,100,127.43,7.85,0.00,8.05,8.08,8.15 51 | torchscript,1.11.0.dev20211003+cu111,cpu,int8,,,microsoft/MiniLM-L12-H384-uncased,1,2,1,128,2021-10-05 06:27:01.869530,100,24.35,41.07,0.00,43.17,43.80,46.70 52 | onnxruntime,1.10.0,cpu,fp32,True,True,microsoft/MiniLM-L12-H384-uncased,1,2,1,16,2021-10-05 23:05:21.913857,100,114.10,8.76,0.00,9.07,9.31,10.18 53 | onnxruntime,1.10.0,cpu,fp32,True,True,microsoft/MiniLM-L12-H384-uncased,1,2,1,128,2021-10-05 23:05:22.805509,100,20.17,49.58,0.00,50.75,51.17,55.66 54 | torchscript,1.11.0.dev20211003+cu111,cpu,fp32,,,microsoft/MiniLM-L12-H384-uncased,1,2,1,16,2021-10-05 23:05:34.701467,100,70.95,14.09,0.00,14.40,15.01,16.26 55 | torchscript,1.11.0.dev20211003+cu111,cpu,fp32,,,microsoft/MiniLM-L12-H384-uncased,1,2,1,128,2021-10-05 23:05:43.042837,100,17.24,58.02,0.00,59.97,61.35,63.13 56 | onnxruntime,1.10.0,cpu,fp32,True,True,microsoft/MiniLM-L12-H384-uncased,1,2,1,16,2021-10-05 23:09:49.882880,100,122.90,8.14,0.00,8.38,8.49,8.77 57 | onnxruntime,1.10.0,cpu,fp32,True,True,microsoft/MiniLM-L12-H384-uncased,1,2,1,128,2021-10-05 23:09:50.709533,100,20.69,48.34,0.00,49.81,50.59,54.69 58 | torchscript,1.11.0.dev20211003+cu111,cpu,fp32,,,microsoft/MiniLM-L12-H384-uncased,1,2,1,16,2021-10-05 23:10:04.007583,100,71.19,14.05,0.00,14.75,15.23,17.61 59 | torchscript,1.11.0.dev20211003+cu111,cpu,fp32,,,microsoft/MiniLM-L12-H384-uncased,1,2,1,128,2021-10-05 23:10:12.404564,100,17.14,58.36,0.00,59.46,61.13,63.78 60 | tensorflow,2.8.0-dev20211005,cpu,fp32,,,microsoft/MiniLM-L12-H384-uncased,1,2,1,16,2021-10-05 23:10:28.886317,100,44.46,22.49,0.00,25.37,26.07,26.85 61 | tensorflow,2.8.0-dev20211005,cpu,fp32,,,microsoft/MiniLM-L12-H384-uncased,1,2,1,128,2021-10-05 23:10:39.513954,100,10.93,91.46,0.02,97.39,100.28,105.02 62 | onnxruntime,1.10.0,cpu,fp32,True,True,microsoft/MiniLM-L12-H384-uncased,1,2,1,128,2021-10-05 23:12:52.513580,100,20.29,49.27,0.00,50.35,51.24,54.06 63 | 
torchscript,1.11.0.dev20211003+cu111,cpu,fp32,,,microsoft/MiniLM-L12-H384-uncased,1,2,1,128,2021-10-05 23:13:10.888478,100,17.62,56.76,0.00,58.22,58.75,64.49 64 | tensorflow,2.8.0-dev20211005,cpu,fp32,,,microsoft/MiniLM-L12-H384-uncased,1,2,1,128,2021-10-05 23:13:30.827603,100,10.72,93.29,0.01,96.71,100.44,102.02 65 | onnxruntime,1.10.0,cuda,fp32,True,True,microsoft/MiniLM-L12-H384-uncased,1,2,1,128,2021-10-05 23:15:30.156163,1000,608.39,1.64,0.00,1.80,1.81,1.86 66 | torchscript,1.11.0.dev20211003+cu111,cuda,fp32,,,microsoft/MiniLM-L12-H384-uncased,1,2,1,128,2021-10-05 23:15:52.719761,1000,175.79,5.69,0.00,5.92,6.00,6.41 67 | tensorflow,2.8.0-dev20211005,cuda,fp32,,,microsoft/MiniLM-L12-H384-uncased,1,2,1,128,2021-10-05 23:16:15.848500,1000,127.84,7.82,0.00,8.24,8.44,8.85 68 | onnxruntime,1.10.0,cuda,fp32,True,True,gpt2,1,2,1,128,2021-10-05 23:22:34.121060,1000,262.99,3.80,0.00,3.83,3.85,4.35 69 | torchscript,1.11.0.dev20211003+cu111,cuda,fp32,,,gpt2,1,2,1,128,2021-10-05 23:23:00.527563,1000,198.88,5.03,0.00,5.23,5.32,5.79 70 | tensorflow,2.8.0-dev20211005,cuda,fp32,,,gpt2,1,2,1,128,2021-10-05 23:23:35.287606,1000,152.00,6.58,0.00,6.93,7.09,7.68 71 | -------------------------------------------------------------------------------- /hf.co_1ms/summary_fusion.csv: -------------------------------------------------------------------------------- 1 | model_filename,datetime,transformers,torch,EmbedLayerNormalization,Attention,Gelu,FastGelu,BiasGelu,LayerNormalization,SkipLayerNormalization 2 | ./onnx_models/philschmid_MiniLM_L6_H384_uncased_sst2_1_fp32_cpu.onnx,2021-10-04 03:50:24.257021,4.11.2,1.9.0+cpu,1,6,0,0,6,0,12 3 | ./onnx_models/philschmid_MiniLM_L6_H384_uncased_sst2_1_int8_cpu.onnx,2021-10-04 03:50:43.020711,4.11.2,1.9.0+cpu,0,6,0,0,6,1,12 4 | ./onnx_models/philschmid_MiniLM_L6_H384_uncased_sst2_1_fp32_cpu.onnx,2021-10-04 04:34:02.831263,4.11.2,1.11.0.dev20211003+cpu,1,6,0,0,6,0,12 5 | ./onnx_models/philschmid_MiniLM_L6_H384_uncased_sst2_1_int8_cpu.onnx,2021-10-04 04:34:22.689454,4.11.2,1.11.0.dev20211003+cpu,0,6,0,0,6,1,12 6 | ./onnx_models/philschmid_MiniLM_L6_H384_uncased_sst2_1_fp32_gpu.onnx,2021-10-04 05:22:26.404124,4.11.2,1.11.0.dev20211003+cpu,1,6,0,0,6,0,12 7 | ./onnx_models/philschmid_MiniLM_L6_H384_uncased_sst2_1_fp16_gpu.onnx,2021-10-04 05:22:49.040808,4.11.2,1.11.0.dev20211003+cpu,1,6,0,6,0,0,12 8 | ./onnx_models/philschmid_MiniLM_L6_H384_uncased_sst2_1_fp32_gpu.onnx,2021-10-04 05:29:54.038759,4.11.2,1.11.0.dev20211003+cu111,1,6,0,0,6,0,12 9 | ./onnx_models/philschmid_MiniLM_L6_H384_uncased_sst2_1_fp16_gpu.onnx,2021-10-04 05:30:44.705058,4.11.2,1.11.0.dev20211003+cu111,1,6,0,6,0,0,12 10 | ./onnx_models/microsoft_MiniLM_L12_H384_uncased_1_fp32_gpu.onnx,2021-10-05 06:12:25.602878,4.11.2,1.11.0.dev20211003+cu111,1,12,0,0,12,0,24 11 | ./onnx_models/microsoft_MiniLM_L12_H384_uncased_1_fp32_cpu.onnx,2021-10-05 06:13:58.767952,4.11.2,1.11.0.dev20211003+cu111,1,12,0,0,12,0,24 12 | ./onnx_models/microsoft_MiniLM_L12_H384_uncased_1_int8_cpu.onnx,2021-10-05 06:14:37.706671,4.11.2,1.11.0.dev20211003+cu111,0,12,0,0,12,1,24 13 | ./onnx_models/bert_large_uncased_1_fp32_cpu.onnx,2021-10-05 06:18:24.971957,4.11.2,1.11.0.dev20211003+cu111,1,24,0,0,24,0,48 14 | ./onnx_models/microsoft_MiniLM_L12_H384_uncased_1_fp32_cpu.onnx,2021-10-05 06:21:34.252637,4.11.2,1.11.0.dev20211003+cu111,1,12,0,0,12,0,24 15 | ./onnx_models/bert_large_uncased_1_int8_cpu.onnx,2021-10-05 06:23:53.876515,4.11.2,1.11.0.dev20211003+cu111,0,24,0,0,24,1,48 16 | 
./onnx_models/microsoft_MiniLM_L12_H384_uncased_1_int8_cpu.onnx,2021-10-05 06:26:36.232346,4.11.2,1.11.0.dev20211003+cu111,0,12,0,0,12,1,24 17 | ./onnx_models/microsoft_MiniLM_L12_H384_uncased_1_fp32_cpu.onnx,2021-10-05 23:05:17.742694,4.11.2,1.11.0.dev20211003+cu111,1,12,0,0,12,0,24 18 | ./onnx_models/microsoft_MiniLM_L12_H384_uncased_1_fp32_cpu.onnx,2021-10-05 23:09:44.412182,4.11.2,1.11.0.dev20211003+cu111,1,12,0,0,12,0,24 19 | ./onnx_models/microsoft_MiniLM_L12_H384_uncased_1_fp32_cpu.onnx,2021-10-05 23:12:46.956941,4.11.2,1.11.0.dev20211003+cu111,1,12,0,0,12,0,24 20 | ./onnx_models/microsoft_MiniLM_L12_H384_uncased_1_fp32_gpu.onnx,2021-10-05 23:15:17.325840,4.11.2,1.11.0.dev20211003+cu111,1,12,0,0,12,0,24 21 | ./onnx_models/gpt2_1_fp32_gpu.onnx,2021-10-05 23:22:20.170049,4.11.2,1.11.0.dev20211003+cu111,0,0,0,12,0,25,0 22 | -------------------------------------------------------------------------------- /hf.co_1ms/summary_result.csv: -------------------------------------------------------------------------------- 1 | model_name,inputs,engine,version,device,precision,optimizer,io_binding,threads,b1_s16,b1_s128 2 | philschmid/MiniLM-L6-H384-uncased-sst2,1,onnxruntime,1.10.0,cpu,fp32,True,True,2,2.76,13.63 3 | philschmid/MiniLM-L6-H384-uncased-sst2,1,torchscript,1.9.0+cpu,cpu,fp32,,,2,5.86,16.65 4 | philschmid/MiniLM-L6-H384-uncased-sst2,1,onnxruntime,1.10.0,cpu,int8,True,True,2,1.24,7.48 5 | philschmid/MiniLM-L6-H384-uncased-sst2,1,torchscript,1.9.0+cpu,cpu,int8,,,2,1.96,9.59 6 | philschmid/MiniLM-L6-H384-uncased-sst2,1,onnxruntime,1.10.0,cpu,fp32,True,True,2,2.72,13.33 7 | philschmid/MiniLM-L6-H384-uncased-sst2,1,torchscript,1.11.0.dev20211003+cpu,cpu,fp32,,,2,6.14,17.39 8 | philschmid/MiniLM-L6-H384-uncased-sst2,1,onnxruntime,1.10.0,cpu,int8,True,True,2,1.24,7.68 9 | philschmid/MiniLM-L6-H384-uncased-sst2,1,torchscript,1.11.0.dev20211003+cpu,cpu,int8,,,2,2.49,11.67 10 | philschmid/MiniLM-L6-H384-uncased-sst2,1,onnxruntime,1.10.0,cuda,fp32,True,True,2,0.81,0.83 11 | philschmid/MiniLM-L6-H384-uncased-sst2,1,torchscript,1.11.0.dev20211003+cu111,cuda,fp32,,,2,3.10,3.55 12 | philschmid/MiniLM-L6-H384-uncased-sst2,1,onnxruntime,1.10.0,cuda,fp16,True,True,2,0.74,0.97 13 | philschmid/MiniLM-L6-H384-uncased-sst2,1,torchscript,1.11.0.dev20211003+cu111,cuda,fp16,,,2,2.77,2.96 14 | microsoft/MiniLM-L12-H384-uncased,1,onnxruntime,1.10.0,cuda,fp32,True,True,2,1.18,1.62 15 | microsoft/MiniLM-L12-H384-uncased,1,onnxruntime,1.10.0,cpu,fp32,True,True,2,7.99,44.87 16 | microsoft/MiniLM-L12-H384-uncased,1,torchscript,1.11.0.dev20211003+cu111,cpu,fp32,,,2,13.17,51.22 17 | microsoft/MiniLM-L12-H384-uncased,1,onnxruntime,1.10.0,cpu,int8,True,True,2,5.49,41.44 18 | microsoft/MiniLM-L12-H384-uncased,1,torchscript,1.11.0.dev20211003+cu111,cpu,int8,,,2,6.80,41.23 19 | bert-large-uncased,1,onnxruntime,1.10.0,cpu,fp32,True,True,2,105.78,547.92 20 | bert-large-uncased,1,torchscript,1.11.0.dev20211003+cu111,cpu,fp32,,,2,176.39,633.71 21 | microsoft/MiniLM-L12-H384-uncased,1,onnxruntime,1.10.0,cpu,fp32,True,True,2,8.01,44.48 22 | microsoft/MiniLM-L12-H384-uncased,1,torchscript,1.11.0.dev20211003+cu111,cpu,fp32,,,2,12.99,51.68 23 | bert-large-uncased,1,onnxruntime,1.10.0,cpu,int8,True,True,2,68.26,472.31 24 | bert-large-uncased,1,torchscript,1.11.0.dev20211003+cu111,cpu,int8,,,2,54.54,414.33 25 | microsoft/MiniLM-L12-H384-uncased,1,onnxruntime,1.10.0,cpu,int8,True,True,2,5.92,43.16 26 | microsoft/MiniLM-L12-H384-uncased,1,torchscript,1.11.0.dev20211003+cu111,cpu,int8,,,2,7.85,41.07 27 | 
microsoft/MiniLM-L12-H384-uncased,1,onnxruntime,1.10.0,cpu,fp32,True,True,2,8.76,49.58 28 | microsoft/MiniLM-L12-H384-uncased,1,torchscript,1.11.0.dev20211003+cu111,cpu,fp32,,,2,14.09,58.02 29 | microsoft/MiniLM-L12-H384-uncased,1,onnxruntime,1.10.0,cpu,fp32,True,True,2,8.14,48.34 30 | microsoft/MiniLM-L12-H384-uncased,1,torchscript,1.11.0.dev20211003+cu111,cpu,fp32,,,2,14.05,58.36 31 | microsoft/MiniLM-L12-H384-uncased,1,tensorflow,2.8.0-dev20211005,cpu,fp32,,,2,22.49,91.46 32 | model_name,inputs,engine,version,device,precision,optimizer,io_binding,threads,b1_s128 33 | microsoft/MiniLM-L12-H384-uncased,1,onnxruntime,1.10.0,cpu,fp32,True,True,2,49.27 34 | microsoft/MiniLM-L12-H384-uncased,1,torchscript,1.11.0.dev20211003+cu111,cpu,fp32,,,2,56.76 35 | microsoft/MiniLM-L12-H384-uncased,1,tensorflow,2.8.0-dev20211005,cpu,fp32,,,2,93.29 36 | microsoft/MiniLM-L12-H384-uncased,1,onnxruntime,1.10.0,cuda,fp32,True,True,2,1.64 37 | microsoft/MiniLM-L12-H384-uncased,1,torchscript,1.11.0.dev20211003+cu111,cuda,fp32,,,2,5.69 38 | microsoft/MiniLM-L12-H384-uncased,1,tensorflow,2.8.0-dev20211005,cuda,fp32,,,2,7.82 39 | gpt2,1,onnxruntime,1.10.0,cuda,fp32,True,True,2,3.80 40 | gpt2,1,torchscript,1.11.0.dev20211003+cu111,cuda,fp32,,,2,5.03 41 | gpt2,1,tensorflow,2.8.0-dev20211005,cuda,fp32,,,2,6.58 42 | -------------------------------------------------------------------------------- /hf_co_models.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. See License.txt in the project root for 4 | # license information. 5 | # -------------------------------------------------------------------------- 6 | 7 | # Maps model class name to a tuple of model class 8 | MODEL_CLASSES = [ 9 | 'AutoModel', 'AutoModelWithLMHead', 'AutoModelForSequenceClassification', 'AutoModelForQuestionAnswering' 10 | ] 11 | 12 | # List of pretrained models: https://huggingface.co/transformers/pretrained_models.html 13 | # Pretrained model name to a tuple of input names, opset_version, use_external_data_format, optimization model type 14 | MODELS = { 15 | # BERT 16 | "bert-base-uncased": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 17 | "bert-large-uncased": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 18 | "bert-base-cased": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 19 | "philschmid/MiniLM-L6-H384-uncased-sst2": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 20 | # "bert-large-cased": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 21 | # "bert-base-multilingual-uncased": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 22 | # "bert-base-multilingual-cased": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 23 | # "bert-base-chinese": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 24 | # "bert-base-german-cased": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 25 | # "bert-large-uncased-whole-word-masking": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 26 | # "bert-large-cased-whole-word-masking": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 27 | # "bert-large-uncased-whole-word-masking-finetuned-squad": (["input_ids", "attention_mask", 28 | # 
"token_type_ids"], 12, False, "bert"), 29 | # "bert-large-cased-whole-word-masking-finetuned-squad": (["input_ids", "attention_mask", 30 | # "token_type_ids"], 12, False, "bert"), 31 | # "bert-base-cased-finetuned-mrpc": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 32 | # "bert-base-german-dbmdz-cased": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 33 | # "bert-base-german-dbmdz-uncased": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 34 | # todo: more models to add 35 | # GPT (no past state) 36 | "openai-gpt": (["input_ids"], 11, False, "gpt2"), 37 | # GPT-2 (no past state, use benchmark_gpt2.py for past_key_values) 38 | "gpt2": (["input_ids"], 11, False, "gpt2"), 39 | "gpt2-medium": (["input_ids"], 11, False, "gpt2"), 40 | "gpt2-large": (["input_ids"], 11, True, "gpt2"), 41 | "gpt2-xl": (["input_ids"], 11, True, "gpt2"), 42 | "distilgpt2": (["input_ids"], 11, False, "gpt2"), 43 | # Transformer-XL (Models uses Einsum, which need opset version 12 or later.) 44 | "transfo-xl-wt103": (["input_ids", "mems"], 12, False, "bert"), 45 | # XLNet 46 | "xlnet-base-cased": (["input_ids"], 12, False, "bert"), 47 | "xlnet-large-cased": (["input_ids"], 12, False, "bert"), 48 | # XLM 49 | "xlm-mlm-en-2048": (["input_ids"], 11, True, "bert"), 50 | "xlm-mlm-ende-1024": (["input_ids"], 11, False, "bert"), 51 | "xlm-mlm-enfr-1024": (["input_ids"], 11, False, "bert"), 52 | # RoBERTa 53 | "roberta-base": (["input_ids", "attention_mask"], 12, False, "bert"), 54 | "roberta-large": (["input_ids", "attention_mask"], 12, False, "bert"), 55 | "roberta-large-mnli": (["input_ids", "attention_mask"], 12, False, "bert"), 56 | "deepset/roberta-base-squad2": (["input_ids", "attention_mask"], 11, False, "bert"), 57 | "distilroberta-base": (["input_ids", "attention_mask"], 12, False, "bert"), 58 | 59 | # DistilBERT 60 | "distilbert-base-uncased": (["input_ids", "attention_mask"], 11, False, "bert"), 61 | "distilbert-base-uncased-distilled-squad": (["input_ids", "attention_mask"], 11, False, "bert"), 62 | # CTRL 63 | "ctrl": (["input_ids"], 11, True, "bert"), 64 | # CamemBERT 65 | "camembert-base": (["input_ids"], 11, False, "bert"), 66 | # ALBERT 67 | "albert-base-v1": (["input_ids"], 12, False, "bert"), 68 | "albert-large-v1": (["input_ids"], 12, False, "bert"), 69 | "albert-xlarge-v1": (["input_ids"], 12, True, "bert"), 70 | #"albert-xxlarge-v1": (["input_ids"], 12, True, "bert"), 71 | "albert-base-v2": (["input_ids"], 12, False, "bert"), 72 | "albert-large-v2": (["input_ids"], 12, False, "bert"), 73 | "albert-xlarge-v2": (["input_ids"], 12, True, "bert"), 74 | #"albert-xxlarge-v2": (["input_ids"], 12, True, "bert"), 75 | # T5 (use benchmark_t5.py instead) 76 | # "t5-small": (["input_ids", "decoder_input_ids"], 12, False, "bert"), 77 | # "t5-base": (["input_ids", "decoder_input_ids"], 12, False, "bert"), 78 | # "t5-large": (["input_ids", "decoder_input_ids"], 12, True, "bert"), 79 | # "t5-3b": (["input_ids", "decoder_input_ids"], 12, True, "bert"), 80 | # "t5-11b": (["input_ids", "decoder_input_ids"], 12, True, "bert"), 81 | #"valhalla/t5-small-qa-qg-hl": (["input_ids"], 12, True, "bert"), 82 | # XLM-RoBERTa 83 | "xlm-roberta-base": (["input_ids"], 11, False, "bert"), 84 | "xlm-roberta-large": (["input_ids"], 11, True, "bert"), 85 | # FlauBERT 86 | "flaubert/flaubert_small_cased": (["input_ids"], 11, False, "bert"), 87 | #"flaubert/flaubert_base_uncased": (["input_ids"], 11, False, "bert"), 88 | "flaubert/flaubert_base_cased": (["input_ids"], 
11, False, "bert"), 89 | #"flaubert/flaubert_large_cased": (["input_ids"], 11, False, "bert"), 90 | # Bart 91 | "facebook/bart-large": (["input_ids", "attention_mask"], 11, False, "bart"), 92 | "facebook/bart-base": (["input_ids", "attention_mask"], 11, False, "bart"), 93 | "facebook/bart-large-mnli": (["input_ids", "attention_mask"], 11, False, "bart"), 94 | "facebook/bart-large-cnn": (["input_ids", "attention_mask"], 11, False, "bart"), 95 | 96 | # DialoGPT 97 | "microsoft/DialoGPT-small": (["input_ids"], 11, False, "gpt2"), 98 | "microsoft/DialoGPT-medium": (["input_ids"], 11, False, "gpt2"), 99 | #"microsoft/DialoGPT-large": (["input_ids"], 11, True, "gpt2"), 100 | # Reformer 101 | #"google/reformer-enwik8": (["input_ids"], 11, False, "bert"), 102 | #"google/reformer-crime-and-punishment": (["input_ids"], 11, False, "bert"), 103 | # MarianMT 104 | #"Helsinki-NLP/opus-mt-ROMANCE-en": (["input_ids"], 12, False, "bert"), 105 | # Longformer (use benchmark_longformer.py instead) 106 | #"allenai/longformer-base-4096": (["input_ids"], 12, False, "bert"), 107 | #"allenai/longformer-large-4096": (["input_ids"], 12, False, "bert"), 108 | # MBart 109 | "facebook/mbart-large-cc25": (["input_ids"], 11, True, "bert"), 110 | "facebook/mbart-large-en-ro": (["input_ids"], 11, True, "bert"), 111 | # "Helsinki-NLP/opus-mt-ROMANCE-en": (["input_ids"], 12, False, "bert"), 112 | # # Longformer 113 | # "allenai/longformer-base-4096": (["input_ids"], 12, False, "bert"), 114 | # "allenai/longformer-large-4096": (["input_ids"], 12, True, "bert"), 115 | # "funnel-transformer/small": (["input_ids"], 12, False, "bert"), 116 | # "funnel-transformer/small-base": (["input_ids"], 12, False, "bert"), 117 | # "funnel-transformer/medium": (["input_ids"], 12, False, "bert"), 118 | # "funnel-transformer/medium-base": (["input_ids"], 12, False, "bert"), 119 | # "funnel-transformer/intermediate": (["input_ids"], 12, False, "bert"), 120 | # "funnel-transformer/intermediate-base": (["input_ids"], 12, False, "bert"), 121 | # "funnel-transformer/large": (["input_ids"], 12, True, "bert"), 122 | # "funnel-transformer/large-base": (["input_ids"], 12, True, "bert"), 123 | # "funnel-transformer/xlarge": (["input_ids"], 12, True, "bert"), 124 | # "funnel-transformer/xlarge-base": (["input_ids"], 12, True, "bert"), 125 | # Layoutlm 126 | "microsoft/layoutlm-base-uncased": (["input_ids"], 11, False, "bert"), 127 | "microsoft/layoutlm-large-uncased": (["input_ids"], 11, False, "bert"), 128 | # Squeezebert 129 | "squeezebert/squeezebert-uncased": (["input_ids"], 11, False, "bert"), 130 | "squeezebert/squeezebert-mnli": (["input_ids"], 11, False, "bert"), 131 | "squeezebert/squeezebert-mnli-headless": (["input_ids"], 11, False, "bert"), 132 | "unc-nlp/lxmert-base-uncased": (["input_ids", "visual_feats", "visual_pos"], 11, False, "bert"), 133 | # "google/pegasus-xsum": (["input_ids"], 11, False, "bert"), 134 | # "google/pegasus-large": (["input_ids"], 11, False, "bert"), 135 | } 136 | -------------------------------------------------------------------------------- /huggingface_MiniLM_loadsave.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from iree import runtime as ireert 4 | from iree.compiler import tf as tfc 5 | from iree.compiler import compile_str 6 | import sys 7 | from absl import app 8 | 9 | import numpy as np 10 | import os 11 | import tempfile 12 | import tensorflow as tf 13 | 14 | import time 15 | import cProfile 16 | from transformers import BertModel, BertTokenizer, TFBertModel 
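# This script loads microsoft/MiniLM-L12-H384-uncased from Hugging Face, imports it to MLIR
# via the IREE TensorFlow importer, compiles it for the selected backend, and times inference
# with the IREE runtime; it needs the IREE compiler/runtime Python packages plus tensorflow
# and transformers installed (see the imports above).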
17 | 18 | MAX_SEQUENCE_LENGTH = 128 19 | BATCH_SIZE = 1 20 | 21 | # Create a set of 2-dimensional inputs 22 | bert_input = [tf.TensorSpec(shape=[BATCH_SIZE,MAX_SEQUENCE_LENGTH],dtype=tf.int32), 23 | tf.TensorSpec(shape=[BATCH_SIZE,MAX_SEQUENCE_LENGTH], dtype=tf.int32), 24 | tf.TensorSpec(shape=[BATCH_SIZE,MAX_SEQUENCE_LENGTH], dtype=tf.int32)] 25 | 26 | class BertModule(tf.Module): 27 | def __init__(self): 28 | super(BertModule, self).__init__() 29 | # Load the pretrained MiniLM model, converting the PyTorch weights to TensorFlow. 30 | self.m = TFBertModel.from_pretrained("microsoft/MiniLM-L12-H384-uncased", from_pt=True) 31 | 32 | # Wrap the model call so predict() runs pure inference (training=False). 33 | self.m.predict = lambda x,y,z: self.m.call(input_ids=x, attention_mask=y, token_type_ids=z, training=False) 34 | 35 | @tf.function(input_signature=bert_input) 36 | def predict(self, input_ids, attention_mask, token_type_ids): 37 | return self.m.predict(input_ids, attention_mask, token_type_ids) 38 | 39 | if __name__ == "__main__": 40 | # Prepare the input data. 41 | tokenizer = BertTokenizer.from_pretrained("microsoft/MiniLM-L12-H384-uncased") 42 | text = "Replace me by any text you'd like." 43 | encoded_input = tokenizer(text, padding='max_length', truncation=True, max_length=MAX_SEQUENCE_LENGTH) 44 | for key in encoded_input: 45 | encoded_input[key] = tf.expand_dims(tf.convert_to_tensor(encoded_input[key]),0) 46 | 47 | # Import the model to MLIR (import_only=True) and save the raw MLIR to disk. 48 | compiler_module = tfc.compile_module(BertModule(), exported_names = ["predict"], import_only=True) 49 | ARTIFACTS_DIR = os.getcwd() 50 | mlir_path = os.path.join(ARTIFACTS_DIR, "model_raw.mlir") 51 | with open(mlir_path, "wb") as output_file: 52 | output_file.write(compiler_module) 53 | with open(mlir_path, "rb") as input_file: 54 | compiled_data = input_file.read() 55 | 56 | # Compile the model using IREE 57 | #backend = "dylib-llvm-aot" 58 | #args = ["--iree-llvm-target-cpu-features=host"] 59 | #backend_config = "dylib" 60 | backend = "cuda" 61 | backend_config = "cuda" 62 | #args = ["--iree-cuda-llvm-target-arch=sm_75", "--iree-hal-cuda-disable-loop-nounroll-wa", "--iree-enable-fusion-with-reduction-ops"] 63 | # FIXME: Stella's GPU is only 7.5 64 | args = ["--iree-cuda-llvm-target-arch=sm_80", "--iree-hal-cuda-disable-loop-nounroll-wa", "--iree-enable-fusion-with-reduction-ops"] 65 | flatbuffer_blob = compile_str(compiler_module, target_backends=[backend], extra_args=args, input_type="mhlo") 66 | #flatbuffer_blob = compile_str(compiled_data, target_backends=["dylib-llvm-aot"]) 67 | 68 | # Set up the IREE runtime and load the compiled module. 69 | ireert.flags.FUNCTION_INPUT_VALIDATION = False 70 | ireert.flags.parse_flags("--cuda_allow_inline_execution") 71 | vm_module = ireert.VmModule.from_flatbuffer(flatbuffer_blob) 72 | #tracer = ireert.Tracer(os.getcwd()) 73 | config = ireert.Config(backend_config) 74 | ctx = ireert.SystemContext(config=config) 75 | ctx.add_vm_module(vm_module) 76 | 77 | # Set up the benchmark inputs and iteration count. 
78 | total_iter = 15 79 | host_inputs =[encoded_input["input_ids"], encoded_input["attention_mask"], encoded_input["token_type_ids"]] 80 | device_inputs = [ireert.asdevicearray(config.device, a) for a in host_inputs] 81 | BertCompiled = ctx.modules.module 82 | predict_f = BertCompiled.predict 83 | device_outputs = predict_f(*device_inputs) 84 | with cProfile.Profile(timer=time.perf_counter_ns, timeunit=0.000001) as pr: 85 | start = time.time() 86 | for i in range(total_iter): 87 | device_outputs = predict_f(*device_inputs) 88 | end = time.time() 89 | 90 | print("RESULTS:", {k:v.to_host() for k, v in device_outputs.items()}) 91 | 92 | total_time = end - start 93 | print("time: "+str(total_time)) 94 | print("time/iter: "+str(total_time/total_iter)) 95 | -------------------------------------------------------------------------------- /huggingface_models.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. See License.txt in the project root for 4 | # license information. 5 | # -------------------------------------------------------------------------- 6 | 7 | # Maps model class name to a tuple of model class 8 | MODEL_CLASSES = [ 9 | 'AutoModel', 'AutoModelWithLMHead', 'AutoModelForSequenceClassification', 'AutoModelForQuestionAnswering' 10 | ] 11 | 12 | # List of pretrained models: https://huggingface.co/transformers/pretrained_models.html 13 | # Pretrained model name to a tuple of input names, opset_version, use_external_data_format, optimization model type 14 | MODELS = { 15 | # BERT 16 | "bert-base-uncased": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 17 | "bert-large-uncased": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 18 | "bert-base-cased": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 19 | "philschmid/MiniLM-L6-H384-uncased-sst2": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 20 | "microsoft/MiniLM-L12-H384-uncased": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 21 | # "bert-large-cased": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 22 | # "bert-base-multilingual-uncased": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 23 | # "bert-base-multilingual-cased": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 24 | # "bert-base-chinese": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 25 | # "bert-base-german-cased": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 26 | # "bert-large-uncased-whole-word-masking": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 27 | # "bert-large-cased-whole-word-masking": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 28 | # "bert-large-uncased-whole-word-masking-finetuned-squad": (["input_ids", "attention_mask", 29 | # "token_type_ids"], 12, False, "bert"), 30 | # "bert-large-cased-whole-word-masking-finetuned-squad": (["input_ids", "attention_mask", 31 | # "token_type_ids"], 12, False, "bert"), 32 | # "bert-base-cased-finetuned-mrpc": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 33 | # "bert-base-german-dbmdz-cased": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 34 | # 
"bert-base-german-dbmdz-uncased": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 35 | # todo: more models to add 36 | # GPT (no past state) 37 | "openai-gpt": (["input_ids"], 11, False, "gpt2"), 38 | # GPT-2 (no past state, use benchmark_gpt2.py for past_key_values) 39 | "gpt2": (["input_ids"], 11, False, "gpt2"), 40 | "gpt2-medium": (["input_ids"], 11, False, "gpt2"), 41 | "gpt2-large": (["input_ids"], 11, True, "gpt2"), 42 | "gpt2-xl": (["input_ids"], 11, True, "gpt2"), 43 | "distilgpt2": (["input_ids"], 11, False, "gpt2"), 44 | # Transformer-XL (Models uses Einsum, which need opset version 12 or later.) 45 | "transfo-xl-wt103": (["input_ids", "mems"], 12, False, "bert"), 46 | # XLNet 47 | "xlnet-base-cased": (["input_ids"], 12, False, "bert"), 48 | "xlnet-large-cased": (["input_ids"], 12, False, "bert"), 49 | # XLM 50 | "xlm-mlm-en-2048": (["input_ids"], 11, True, "bert"), 51 | "xlm-mlm-ende-1024": (["input_ids"], 11, False, "bert"), 52 | "xlm-mlm-enfr-1024": (["input_ids"], 11, False, "bert"), 53 | # RoBERTa 54 | "roberta-base": (["input_ids", "attention_mask"], 12, False, "bert"), 55 | "roberta-large": (["input_ids", "attention_mask"], 12, False, "bert"), 56 | "roberta-large-mnli": (["input_ids", "attention_mask"], 12, False, "bert"), 57 | "deepset/roberta-base-squad2": (["input_ids", "attention_mask"], 11, False, "bert"), 58 | "distilroberta-base": (["input_ids", "attention_mask"], 12, False, "bert"), 59 | 60 | # DistilBERT 61 | "distilbert-base-uncased": (["input_ids", "attention_mask"], 11, False, "bert"), 62 | "distilbert-base-uncased-distilled-squad": (["input_ids", "attention_mask"], 11, False, "bert"), 63 | # CTRL 64 | "ctrl": (["input_ids"], 11, True, "bert"), 65 | # CamemBERT 66 | "camembert-base": (["input_ids"], 11, False, "bert"), 67 | # ALBERT 68 | "albert-base-v1": (["input_ids"], 12, False, "bert"), 69 | "albert-large-v1": (["input_ids"], 12, False, "bert"), 70 | "albert-xlarge-v1": (["input_ids"], 12, True, "bert"), 71 | #"albert-xxlarge-v1": (["input_ids"], 12, True, "bert"), 72 | "albert-base-v2": (["input_ids"], 12, False, "bert"), 73 | "albert-large-v2": (["input_ids"], 12, False, "bert"), 74 | "albert-xlarge-v2": (["input_ids"], 12, True, "bert"), 75 | #"albert-xxlarge-v2": (["input_ids"], 12, True, "bert"), 76 | # T5 (use benchmark_t5.py instead) 77 | # "t5-small": (["input_ids", "decoder_input_ids"], 12, False, "bert"), 78 | # "t5-base": (["input_ids", "decoder_input_ids"], 12, False, "bert"), 79 | # "t5-large": (["input_ids", "decoder_input_ids"], 12, True, "bert"), 80 | # "t5-3b": (["input_ids", "decoder_input_ids"], 12, True, "bert"), 81 | # "t5-11b": (["input_ids", "decoder_input_ids"], 12, True, "bert"), 82 | #"valhalla/t5-small-qa-qg-hl": (["input_ids"], 12, True, "bert"), 83 | # XLM-RoBERTa 84 | "xlm-roberta-base": (["input_ids"], 11, False, "bert"), 85 | "xlm-roberta-large": (["input_ids"], 11, True, "bert"), 86 | # FlauBERT 87 | "flaubert/flaubert_small_cased": (["input_ids"], 11, False, "bert"), 88 | #"flaubert/flaubert_base_uncased": (["input_ids"], 11, False, "bert"), 89 | "flaubert/flaubert_base_cased": (["input_ids"], 11, False, "bert"), 90 | #"flaubert/flaubert_large_cased": (["input_ids"], 11, False, "bert"), 91 | # Bart 92 | "facebook/bart-large": (["input_ids", "attention_mask"], 11, False, "bart"), 93 | "facebook/bart-base": (["input_ids", "attention_mask"], 11, False, "bart"), 94 | "facebook/bart-large-mnli": (["input_ids", "attention_mask"], 11, False, "bart"), 95 | "facebook/bart-large-cnn": (["input_ids", 
"attention_mask"], 11, False, "bart"), 96 | 97 | # DialoGPT 98 | "microsoft/DialoGPT-small": (["input_ids"], 11, False, "gpt2"), 99 | "microsoft/DialoGPT-medium": (["input_ids"], 11, False, "gpt2"), 100 | #"microsoft/DialoGPT-large": (["input_ids"], 11, True, "gpt2"), 101 | # Reformer 102 | #"google/reformer-enwik8": (["input_ids"], 11, False, "bert"), 103 | #"google/reformer-crime-and-punishment": (["input_ids"], 11, False, "bert"), 104 | # MarianMT 105 | #"Helsinki-NLP/opus-mt-ROMANCE-en": (["input_ids"], 12, False, "bert"), 106 | # Longformer (use benchmark_longformer.py instead) 107 | #"allenai/longformer-base-4096": (["input_ids"], 12, False, "bert"), 108 | #"allenai/longformer-large-4096": (["input_ids"], 12, False, "bert"), 109 | # MBart 110 | "facebook/mbart-large-cc25": (["input_ids"], 11, True, "bert"), 111 | "facebook/mbart-large-en-ro": (["input_ids"], 11, True, "bert"), 112 | # "Helsinki-NLP/opus-mt-ROMANCE-en": (["input_ids"], 12, False, "bert"), 113 | # # Longformer 114 | # "allenai/longformer-base-4096": (["input_ids"], 12, False, "bert"), 115 | # "allenai/longformer-large-4096": (["input_ids"], 12, True, "bert"), 116 | # "funnel-transformer/small": (["input_ids"], 12, False, "bert"), 117 | # "funnel-transformer/small-base": (["input_ids"], 12, False, "bert"), 118 | # "funnel-transformer/medium": (["input_ids"], 12, False, "bert"), 119 | # "funnel-transformer/medium-base": (["input_ids"], 12, False, "bert"), 120 | # "funnel-transformer/intermediate": (["input_ids"], 12, False, "bert"), 121 | # "funnel-transformer/intermediate-base": (["input_ids"], 12, False, "bert"), 122 | # "funnel-transformer/large": (["input_ids"], 12, True, "bert"), 123 | # "funnel-transformer/large-base": (["input_ids"], 12, True, "bert"), 124 | # "funnel-transformer/xlarge": (["input_ids"], 12, True, "bert"), 125 | # "funnel-transformer/xlarge-base": (["input_ids"], 12, True, "bert"), 126 | # Layoutlm 127 | "microsoft/layoutlm-base-uncased": (["input_ids"], 11, False, "bert"), 128 | "microsoft/layoutlm-large-uncased": (["input_ids"], 11, False, "bert"), 129 | # Squeezebert 130 | "squeezebert/squeezebert-uncased": (["input_ids"], 11, False, "bert"), 131 | "squeezebert/squeezebert-mnli": (["input_ids"], 11, False, "bert"), 132 | "squeezebert/squeezebert-mnli-headless": (["input_ids"], 11, False, "bert"), 133 | "unc-nlp/lxmert-base-uncased": (["input_ids", "visual_feats", "visual_pos"], 11, False, "bert"), 134 | # "google/pegasus-xsum": (["input_ids"], 11, False, "bert"), 135 | # "google/pegasus-large": (["input_ids"], 11, False, "bert"), 136 | } 137 | -------------------------------------------------------------------------------- /nightly_job.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | TIMESTAMP=`date +%Y-%m-%d_%H-%M-%S` 4 | [ -d $HOME/ci ] || mkdir $HOME/ci 5 | log_file=$HOME/ci/nightly_log_${TIMESTAMP}.txt 6 | exec &> >(tee -a "$log_file") 7 | 8 | rm -rf $HOME/ci/nightly 9 | mkdir -p $HOME/ci/nightly 10 | cd $HOME/ci/nightly 11 | curl -O --proto '=https' --tlsv1.2 -sSf https://raw.githubusercontent.com/nod-ai/transformer-benchmarks/main/perf-ci.sh 12 | chmod +x $HOME/ci/nightly/perf-ci.sh 13 | $HOME/ci/nightly/perf-ci.sh 14 | gsutil cp $log_file gs://iree-shared-files/nod-perf/logs/ 15 | -------------------------------------------------------------------------------- /onnx_model_bart.py: -------------------------------------------------------------------------------- 1 | 
#------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | #-------------------------------------------------------------------------- 5 | import logging 6 | from fusion_attention import FusionAttention, AttentionMask 7 | from fusion_reshape import FusionReshape 8 | from onnx import numpy_helper 9 | from onnx_model import OnnxModel 10 | from onnx_model_bert import BertOnnxModel 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | class FusionBartEncoderAttention(FusionAttention): 16 | """ 17 | Fuse Bart Attention subgraph into one Attention node. 18 | """ 19 | def __init__(self, model: OnnxModel, hidden_size: int, num_heads: int, attention_mask: AttentionMask): 20 | super().__init__(model, hidden_size, num_heads, attention_mask) 21 | 22 | def check_runtime_shape_path(self, reshape_qkv_2, reshape_qkv_1, reshape_q_2, reshape_k_2, reshape_v_2, root_input): 23 | concat_qkv_2_path = self.model.match_parent_path(reshape_qkv_2, ['Concat'], [1]) 24 | if concat_qkv_2_path is None: 25 | return False 26 | concat_qkv_2 = concat_qkv_2_path[0] 27 | 28 | reshape_qkv_2_path_1 = self.model.match_parent_path(concat_qkv_2, ['Unsqueeze', 'Gather', 'Shape'], [0, 0, 0]) 29 | reshape_qkv_2_path_2 = self.model.match_parent_path(concat_qkv_2, ['Unsqueeze', 'Gather', 'Shape'], [1, 0, 0]) 30 | reshape_qkv_2_path_3 = self.model.match_parent_path(concat_qkv_2, ['Unsqueeze', 'Gather', 'Shape'], [2, 0, 0]) 31 | if reshape_qkv_2_path_1 is None or reshape_qkv_2_path_2 is None or reshape_qkv_2_path_3 is None: 32 | return False 33 | 34 | _, gather_1, shape_1 = reshape_qkv_2_path_1 35 | _, gather_2, shape_2 = reshape_qkv_2_path_2 36 | _, _, shape_3 = reshape_qkv_2_path_3 37 | 38 | if shape_1.input[0] != root_input or shape_2.input[0] != root_input or shape_3.input[0] != root_input: 39 | return False 40 | 41 | reshape_qkv_1_path_1 = self.model.match_parent_path(reshape_qkv_1, ['Concat', 'Unsqueeze', 'Gather'], [1, 0, 0]) 42 | reshape_qkv_1_path_2 = self.model.match_parent_path(reshape_qkv_1, ['Concat', 'Unsqueeze', 'Gather'], [1, 2, 0]) 43 | if reshape_qkv_1_path_1 is None or reshape_qkv_1_path_2 is None: 44 | return False 45 | if reshape_qkv_1_path_1[-1].name != gather_1.name or reshape_qkv_1_path_2[-1].name != gather_2.name: 46 | return False 47 | 48 | reshape_q_2_path = self.model.match_parent_path(reshape_q_2, ['Concat', 'Unsqueeze', 'Mul'], [1, 0, 0]) 49 | reshape_k_2_path = self.model.match_parent_path(reshape_k_2, ['Concat', 'Unsqueeze', 'Mul'], [1, 0, 0]) 50 | reshape_v_2_path = self.model.match_parent_path(reshape_v_2, ['Concat', 'Unsqueeze', 'Mul'], [1, 0, 0]) 51 | if reshape_q_2_path is None or reshape_k_2_path is None or reshape_v_2_path is None: 52 | return False 53 | 54 | mul_q = reshape_q_2_path[-1] 55 | mul_k = reshape_k_2_path[-1] 56 | mul_v = reshape_v_2_path[-1] 57 | 58 | gather_1_out = gather_1.output[0] 59 | if mul_q.input[0] != gather_1_out or mul_k.input[0] != gather_1_out or mul_v.input[0] != gather_1_out: 60 | return False 61 | 62 | return True 63 | 64 | def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): 65 | # SkipLayerNormalization has two inputs, and one of them is the root input for attention. 
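        # The fusion works backwards from the SkipLayerNormalization node: first match the output
        # projection path (Add -> MatMul -> Reshape -> Transpose -> Reshape -> MatMul), then locate
        # the root input that feeds the three Q/K/V MatMul projections, match the individual Q, K
        # and V subgraphs, and finally replace the whole matched subgraph with one Attention node.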
66 | qkv_nodes = self.model.match_parent_path(normalize_node, 67 | ['Add', 'MatMul', 'Reshape', 'Transpose', 'Reshape', 'MatMul'], 68 | [None, 1, 0, 0, 0, 0]) 69 | if qkv_nodes is not None: 70 | (add_out, matmul_out, reshape_qkv_2, transpose_qkv, reshape_qkv_1, matmul_qkv) = qkv_nodes 71 | else: 72 | return 73 | 74 | other_inputs = [] 75 | for i, input in enumerate(normalize_node.input): 76 | if input not in output_name_to_node: 77 | continue 78 | if input == qkv_nodes[0].output[0]: 79 | continue 80 | other_inputs.append(input) 81 | if len(other_inputs) != 1: 82 | return 83 | 84 | root_input = other_inputs[0] 85 | children = input_name_to_nodes[root_input] 86 | children_types = [child.op_type for child in children] 87 | if children_types.count('MatMul') != 3: 88 | return 89 | 90 | v_nodes = self.model.match_parent_path(matmul_qkv, ['Reshape', 'Transpose', 'Reshape', 'Add', 'MatMul'], 91 | [1, 0, 0, 0, None]) 92 | if v_nodes is None: 93 | logger.debug("fuse_attention: failed to match v path") 94 | return 95 | (reshape_v_2, transpose_v, reshape_v_1, add_v, matmul_v) = v_nodes 96 | 97 | qk_nodes = self.model.match_parent_path(matmul_qkv, ['Softmax', 'MatMul'], [0, 0]) 98 | if qk_nodes is not None: 99 | _, matmul_qk = qk_nodes 100 | else: 101 | return 102 | 103 | q_nodes = self.model.match_parent_path(matmul_qk, ['Reshape', 'Transpose', 'Reshape', 'Mul', 'Add', 'MatMul'], 104 | [0, 0, 0, 0, 0, 1]) 105 | if q_nodes is not None: 106 | reshape_q_2, _, reshape_q_1, _, add_q, matmul_q = q_nodes 107 | else: 108 | return 109 | 110 | k_nodes = self.model.match_parent_path(matmul_qk, 111 | ['Transpose', 'Reshape', 'Transpose', 'Reshape', 'Add', 'MatMul'], 112 | [1, 0, 0, 0, 0, 1]) 113 | if k_nodes is not None: 114 | _, reshape_k_2, _, reshape_k_1, add_k, matmul_k = k_nodes 115 | else: 116 | return 117 | 118 | if not self.check_runtime_shape_path(reshape_qkv_2, reshape_qkv_1, reshape_q_2, reshape_k_2, reshape_v_2, 119 | root_input): 120 | return 121 | 122 | if matmul_v.input[0] == root_input and matmul_q.input[0] == root_input and matmul_v.input[0] == root_input: 123 | 124 | mask_nodes = [] 125 | mask_index = None 126 | attention_last_node = reshape_qkv_2 127 | 128 | num_heads, hidden_size = self.get_num_heads_and_hidden_size(reshape_q_1) 129 | 130 | if num_heads <= 0 or hidden_size <= 0 or (hidden_size % num_heads) != 0: 131 | logger.debug("fuse_attention: failed to detect num_heads or hidden_size") 132 | return 133 | 134 | new_node = self.create_attention_node(mask_index, matmul_q, matmul_k, matmul_v, add_q, add_k, add_v, 135 | num_heads, hidden_size, root_input, attention_last_node.output[0], 136 | None) 137 | if new_node is None: 138 | return 139 | 140 | self.nodes_to_add.append(new_node) 141 | self.node_name_to_graph_name[new_node.name] = self.this_graph_name 142 | 143 | self.nodes_to_remove.extend([attention_last_node, transpose_qkv, matmul_qkv]) 144 | self.nodes_to_remove.extend(qk_nodes) 145 | self.nodes_to_remove.extend(q_nodes) 146 | self.nodes_to_remove.extend(k_nodes) 147 | self.nodes_to_remove.extend(v_nodes) 148 | 149 | # Use prune graph to remove mask nodes since they are shared by all attention nodes. 
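Before the matched Q/K/V subgraph is pruned below, create_attention_node (inherited from FusionAttention) has already built the single replacement node. Roughly, and heavily hedged, it emits a com.microsoft Attention contrib op; the real helper also merges the three MatMul weights and Add biases into combined initializers, and the exact input layout depends on whether a mask index is present (it is not in this Bart path). The tensor names and num_heads value below are illustrative.

```
# Hedged sketch of the replacement node (com.microsoft Attention contrib op).
import onnx

attention_node = onnx.helper.make_node(
    "Attention",
    inputs=["hidden_states", "qkv_weight", "qkv_bias", ""],  # "" = no mask_index in this path
    outputs=["attention_out"],
    name="Attention_example",
    domain="com.microsoft",
    num_heads=16,   # illustrative; the fusion reads it via get_num_heads_and_hidden_size above
)
print(attention_node)
```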
150 | self.nodes_to_remove.extend(mask_nodes) 151 | self.prune_graph = True 152 | 153 | 154 | class FusionBartReshape(FusionReshape): 155 | def __init__(self, model: OnnxModel): 156 | super().__init__(model) 157 | 158 | def fuse(self, reshape_node, input_name_to_nodes, output_name_to_node): 159 | if reshape_node.input[1] not in output_name_to_node: 160 | return 161 | 162 | concat_node = output_name_to_node[reshape_node.input[1]] 163 | if concat_node.op_type != 'Concat' or len(concat_node.input) != 4: 164 | return 165 | 166 | path0 = self.model.match_parent_path(concat_node, ['Unsqueeze', 'Gather', 'Shape'], [0, 0, 0], 167 | output_name_to_node) 168 | if path0 is None: 169 | return 170 | 171 | (_, gather_0, shape_0) = path0 172 | 173 | shape = [] 174 | gather_value = self.model.get_constant_value(gather_0.input[1]) 175 | if gather_value == 0: 176 | shape.append(0) 177 | 178 | path1 = self.model.match_parent_path(concat_node, ['Unsqueeze', 'Gather', 'Shape'], [1, 0, 0], 179 | output_name_to_node) 180 | if path1 is None: 181 | input_1_proto = self.model.get_initializer(concat_node.input[1]) 182 | input_2_proto = self.model.get_initializer(concat_node.input[2]) 183 | input_3_proto = self.model.get_initializer(concat_node.input[3]) 184 | if input_1_proto is None or input_2_proto is None or input_3_proto is None: 185 | return 186 | 187 | input_1 = numpy_helper.to_array(input_1_proto) 188 | input_2 = numpy_helper.to_array(input_2_proto) 189 | input_3 = numpy_helper.to_array(input_3_proto) 190 | if len(input_1) != 1 or len(input_2) != 1 or len(input_3) != 1: 191 | return 192 | 193 | if not (input_1[0] == -1 and input_2[0] > 0 and input_3[0] > 0): 194 | return 195 | 196 | shape.extend(input_1) 197 | shape.extend(input_2) 198 | shape.extend(input_3) 199 | gemm_path = self.model.match_parent_path(reshape_node, ['Add', 'MatMul'], [0, 1], output_name_to_node) 200 | if gemm_path is None: 201 | return 202 | 203 | top_matmul = gemm_path[-1] 204 | root_input = top_matmul.input[0] 205 | if shape_0.input[0] != root_input: 206 | return 207 | 208 | self.replace_reshape_node(shape, reshape_node, concat_node) 209 | else: 210 | (_, gather_1, shape_1) = path1 211 | 212 | gather_value = self.model.get_constant_value(gather_1.input[1]) 213 | if gather_value == 1: 214 | shape.append(0) 215 | 216 | input_2_proto = self.model.get_initializer(concat_node.input[2]) 217 | input_3_proto = self.model.get_initializer(concat_node.input[3]) 218 | if input_2_proto is None or input_3_proto is None: 219 | return 220 | 221 | input_2 = numpy_helper.to_array(input_2_proto) 222 | input_3 = numpy_helper.to_array(input_3_proto) 223 | if len(input_2) != 1 or len(input_3) != 1: 224 | return 225 | 226 | if not (input_2[0] > 0 and input_3[0] > 0): 227 | return 228 | 229 | shape.extend(input_2) 230 | shape.extend(input_3) 231 | gemm_path = self.model.match_parent_path(reshape_node, ['Mul', 'Add', 'MatMul'], [0, 0, 1], 232 | output_name_to_node) 233 | if gemm_path is None: 234 | return 235 | 236 | top_matmul = gemm_path[-1] 237 | root_input = top_matmul.input[0] 238 | if shape_0.input[0] != root_input or shape_1.input[0] != root_input: 239 | return 240 | 241 | self.replace_reshape_node(shape, reshape_node, concat_node) 242 | 243 | 244 | class BartOnnxModel(BertOnnxModel): 245 | def __init__(self, model, num_heads, hidden_size): 246 | super().__init__(model, num_heads, hidden_size) 247 | self.attention_mask = AttentionMask(self) 248 | self.attention_fusion = FusionBartEncoderAttention(self, self.hidden_size, self.num_heads, 
self.attention_mask) 249 | self.bart_reshape_fusion_preprocess = FusionBartReshape(self) 250 | 251 | def fuse_attention(self): 252 | self.attention_fusion.apply() 253 | 254 | def preprocess(self): 255 | self.adjust_reshape_and_expand() 256 | self.bart_reshape_fusion_preprocess.apply() 257 | -------------------------------------------------------------------------------- /onnx_model_gpt2.py: -------------------------------------------------------------------------------- 1 | #------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | #-------------------------------------------------------------------------- 5 | import logging 6 | import onnx 7 | from onnx_model_bert import BertOnnxModel 8 | from fusion_gpt_attention_no_past import FusionGptAttentionNoPast 9 | from fusion_gpt_attention import FusionGptAttention 10 | from fusion_gpt_attention_megatron import FusionGptAttentionMegatron 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | class Gpt2OnnxModel(BertOnnxModel): 16 | def __init__(self, model, num_heads, hidden_size): 17 | super().__init__(model, num_heads, hidden_size) 18 | 19 | def fuse_attention(self): 20 | if len(self.model.graph.input) == 1 or len(self.model.graph.output) == 1: 21 | fusion = FusionGptAttentionNoPast(self, self.num_heads) 22 | fusion.apply() 23 | else: 24 | fusion = FusionGptAttention(self, self.num_heads) 25 | fusion.apply() 26 | fusion = FusionGptAttentionMegatron(self, self.num_heads) 27 | fusion.apply() 28 | 29 | def postprocess(self): 30 | """ 31 | Remove extra reshape nodes. 32 | """ 33 | logger.debug(f"start postprocessing...") 34 | 35 | input_name_to_nodes = self.input_name_to_nodes() 36 | output_name_to_node = self.output_name_to_node() 37 | 38 | reshape_count = 0 39 | for gemm_node in self.get_nodes_by_op_type("Gemm"): 40 | reshape_after_gemm = self.find_first_child_by_type(gemm_node, 41 | 'Reshape', 42 | input_name_to_nodes, 43 | recursive=False) 44 | 45 | return_indice = [] 46 | nodes = self.match_parent_path(gemm_node, ['Reshape', 'FastGelu'], [0, 0], output_name_to_node) 47 | if nodes is None: 48 | nodes = self.match_parent_path(gemm_node, ['Reshape', 'LayerNormalization'], [0, 0], 49 | output_name_to_node) 50 | if nodes is None: 51 | continue 52 | (reshape_before_gemm, root_node) = nodes 53 | 54 | matmul_node_name = self.create_node_name('MatMul', 'FullyConnect_MatMul') 55 | matmul_node = onnx.helper.make_node('MatMul', 56 | inputs=[matmul_node_name + "_input", gemm_node.input[1]], 57 | outputs=[matmul_node_name + "_output"], 58 | name=matmul_node_name) 59 | 60 | add_node_name = self.create_node_name('Add', 'FullyConnect_Add') 61 | add_node = onnx.helper.make_node('Add', 62 | inputs=[matmul_node_name + "_output", gemm_node.input[2]], 63 | outputs=[add_node_name + "_output"], 64 | name=add_node_name) 65 | 66 | self.replace_input_of_all_nodes(reshape_after_gemm.output[0], add_node_name + "_output") 67 | 68 | # Link root node output with MatMul 69 | self.replace_input_of_all_nodes(root_node.output[0], matmul_node_name + "_input") 70 | root_node.output[0] = matmul_node_name + "_input" 71 | 72 | self.replace_input_of_all_nodes(reshape_after_gemm.output[0], add_node_name + "_output") 73 | 74 | self.add_node(matmul_node) 75 | self.add_node(add_node) 76 | 77 | reshape_count += 2 78 | 79 | self.prune_graph() 80 | logger.info(f"postprocess: remove Reshape count:{reshape_count}") 81 | 
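BartOnnxModel and Gpt2OnnxModel are not normally instantiated directly; optimizer.py selects the class from a model_type string and applies the fusions. A hedged usage sketch follows, assuming an exported model.onnx is on disk and that this repo's optimizer.py mirrors the onnxruntime.transformers API it was copied from; the path and head/hidden values are placeholders.

```
# Hedged sketch: running the graph optimizer on an exported model.
from optimizer import optimize_model

opt_model = optimize_model(
    "model.onnx",          # path assumed to come from onnx_exporter.py
    model_type="gpt2",     # or "bart" / "bert", selecting Gpt2OnnxModel / BartOnnxModel / BertOnnxModel
    num_heads=12,
    hidden_size=768,
)
opt_model.save_model_to_file("model_opt.onnx")
print(opt_model.get_fused_operator_statistics())
```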
-------------------------------------------------------------------------------- /perf-ci.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -x 2 | 3 | NO_SRC=false 4 | 5 | TVM_TUNED_CPU=$HOME/tvm_tuned_cpu 6 | TVM_TUNED_GPU=$HOME/tvm_tuned_gpu 7 | 8 | while getopts “n” OPTION 9 | do 10 | case $OPTION in 11 | n) 12 | echo "Not checking out src tree.. running from current checkout.." 13 | NO_SRC=true 14 | ;; 15 | ?) 16 | echo "Unsupported option.. -n for no checkout and run as developer instead of a CI" 17 | exit 18 | ;; 19 | esac 20 | done 21 | 22 | if [ "$NO_SRC" = true ]; then 23 | echo "Using existing checkout" 24 | else 25 | echo "Checking out transformer-benchmarks..." 26 | git clone https://github.com/nod-ai/transformer-benchmarks --recursive 27 | cd transformer-benchmarks 28 | git submodule update --init --recursive 29 | cd mmperf/external/iree 30 | git submodule update --init --recursive 31 | cd - 32 | #echo "Updating submodules to origin/main...things may break.. but that is the point.." 33 | #./update_submodules.sh 34 | fi 35 | 36 | #Gather results 37 | TIMESTAMP=`date +%Y-%m-%d_%H-%M-%S` 38 | 39 | #. $HOME/miniconda3/etc/profile.d/conda.sh 40 | #conda env remove -n perf_env 41 | #conda create -n perf_env python=3.9 -y 42 | #conda activate perf_env 43 | 44 | rm -rf perf_env 45 | python3 -m venv perf_env 46 | source perf_env/bin/activate 47 | 48 | #E2E Transformer benchmarks 49 | ./run_benchmark.sh --cpu_fp32=true --gpu_fp32=false --create_venv=true --ort=true --torchscript=true --tensorflow=true --iree=true --ort_optimizer=false 50 | #./run_benchmark.sh --gpu_fp32=true --cpu_fp32=false --create_venv=true --ort=true --torchscript=true --tensorflow=true --iree=true --ort_optimizer=false 51 | 52 | 53 | mkdir -p transformer-bench-results/${TIMESTAMP}/BERT_e2e/ 54 | cp *.csv transformer-bench-results/${TIMESTAMP}/BERT_e2e/ 55 | cp model.mlir transformer-bench-results/${TIMESTAMP}/BERT_e2e/model_${TIMESTAMP}.mlir 56 | 57 | #mmperf tests 58 | cd mmperf 59 | 60 | rm -rf mmperf_env 61 | python3 -m venv mmperf_env 62 | source mmperf_env/bin/activate 63 | pip install --upgrade pip 64 | pip install -r requirements.txt 65 | 66 | #CPU tests 67 | 68 | if [ -d ${TVM_TUNED_CPU} ]; then 69 | echo "Using TVM TUNED for CPU" 70 | cmake -GNinja -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_COMPILER=clang -DMKL_DIR=/opt/intel/oneapi/mkl/latest/ -DUSE_TVM=ON -DUSE_MKL=ON -DUSE_MLIR=ON -DUSE_IREE=ON -DIREE_DYLIB=ON -DUSE_TVM_TUNED=ON -DTVM_LIB_DIR=${TVM_TUNED_CPU} -B build . 71 | else 72 | echo "No TVM tuned libs so skipping.." 73 | cmake -GNinja -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_COMPILER=clang -DMKL_DIR=/opt/intel/oneapi/mkl/latest/ -DUSE_MKL=ON -DUSE_MLIR=ON -DUSE_IREE=ON -DIREE_DYLIB=ON -B build . 74 | fi 75 | 76 | #build mmperf 77 | cmake --build build 78 | #Sometimes bad things happen to MLIR deps and ninja deps. Lets do another try. 
79 | cmake --build build 80 | 81 | #Run all tests and generate the plots 82 | cmake --build build/matmul --target run_all_tests 83 | 84 | python mmperf.py build/matmul ../transformer-bench-results/${TIMESTAMP}/mmperf-cpu/ 85 | 86 | mv build build.cpu 87 | 88 | #GPU tests 89 | if [ -d ${TVM_TUNED_GPU} ] ; then 90 | echo "Using TVM TUNED for GPU" 91 | # cmake -GNinja -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_COMPILER=clang -DMKL_DIR=/opt/intel/oneapi/mkl/latest/ -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc -DUSE_TVM=ON -DUSE_MLIR=ON -DUSE_IREE=ON -DIREE_CUDA=ON -DUSE_CUBLAS=ON -DUSE_TVM_CUDA=ON -DTVM_ENABLE_CUDA=ON -DUSE_TVM_TUNED=ON -DTVM_LIB_DIR=${TVM_TUNED_GPU} -B build . 92 | else 93 | echo "No TVM tuned libs so skipping.." 94 | # cmake -GNinja -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_COMPILER=clang -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc -DUSE_MLIR_CUDA=ON -DUSE_IREE=ON -DIREE_CUDA=ON -DUSE_CUBLAS=ON -B build . 95 | fi 96 | 97 | #build mmperf 98 | #cmake --build build 99 | #Sometimes bad things happen to MLIR deps and ninja deps. Lets do another try. 100 | #cmake --build build 101 | 102 | #Run all tests and generate the plots 103 | #cmake --build build/matmul --target run_all_tests 104 | 105 | #python mmperf.py build/matmul ../transformer-bench-results/${TIMESTAMP}/mmperf-gpu/ 106 | 107 | #mv build build.gpu 108 | 109 | cd .. 110 | 111 | cd transformer-bench-results 112 | ln -s ${TIMESTAMP} latest 113 | cd ../ 114 | 115 | echo "Remove old symlink.." 116 | gsutil rm -rf gs://shark-public/nod-perf/results/transformer-bench/latest 117 | 118 | echo "Copying to Google Storage.." 119 | gsutil cp -r transformer-bench-results/* gs://shark-public/nod-perf/results/transformer-bench/ 120 | 121 | if [ "$NO_SRC" = true ]; then 122 | echo "leaving sources and results for manual clean up" 123 | else 124 | cd ../.. 125 | echo "deleting transformer-benchmarks..." 126 | echo `pwd` 127 | rm -rf transformer-bench 128 | fi 129 | -------------------------------------------------------------------------------- /quantize_helper.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. See License.txt in the project root for 4 | # license information. 
5 | # -------------------------------------------------------------------------- 6 | 7 | import logging 8 | import torch 9 | import onnx 10 | import os 11 | from transformers.modeling_utils import Conv1D 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | def _conv1d_to_linear(module): 17 | in_size, out_size = module.weight.shape 18 | linear = torch.nn.Linear(in_size, out_size) 19 | linear.weight.data = module.weight.data.T.contiguous() 20 | linear.bias.data = module.bias.data 21 | return linear 22 | 23 | 24 | def conv1d_to_linear(model): 25 | '''in-place 26 | This is for Dynamic Quantization, as Conv1D is not recognized by PyTorch, convert it to nn.Linear 27 | ''' 28 | logger.debug("replace Conv1D with Linear") 29 | for name in list(model._modules): 30 | module = model._modules[name] 31 | if isinstance(module, Conv1D): 32 | linear = _conv1d_to_linear(module) 33 | model._modules[name] = linear 34 | else: 35 | conv1d_to_linear(module) 36 | 37 | 38 | def _get_size_of_pytorch_model(model): 39 | torch.save(model.state_dict(), "temp.p") 40 | size = os.path.getsize("temp.p") / (1024 * 1024) 41 | os.remove('temp.p') 42 | return size 43 | 44 | 45 | class QuantizeHelper: 46 | @staticmethod 47 | def quantize_torch_model(model, dtype=torch.qint8): 48 | ''' 49 | Usage: model = quantize_model(model) 50 | 51 | TODO: mix of in-place and return, but results are different 52 | ''' 53 | conv1d_to_linear(model) 54 | quantized_model = torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=dtype) 55 | logger.info(f'Size of full precision Torch model(MB):{_get_size_of_pytorch_model(model)}') 56 | logger.info(f'Size of quantized Torch model(MB):{_get_size_of_pytorch_model(quantized_model)}') 57 | return quantized_model 58 | 59 | @staticmethod 60 | def quantize_onnx_model(onnx_model_path, quantized_model_path, use_external_data_format=False): 61 | from onnxruntime.quantization import quantize, QuantizationMode 62 | logger.info(f'Size of full precision ONNX model(MB):{os.path.getsize(onnx_model_path)/(1024*1024)}') 63 | onnx_opt_model = onnx.load_model(onnx_model_path) 64 | quantized_onnx_model = quantize(onnx_opt_model, 65 | quantization_mode=QuantizationMode.IntegerOps, 66 | symmetric_weight=True, 67 | force_fusions=True) 68 | 69 | if use_external_data_format: 70 | from pathlib import Path 71 | Path(quantized_model_path).parent.mkdir(parents=True, exist_ok=True) 72 | onnx.external_data_helper.convert_model_to_external_data(quantized_onnx_model, 73 | all_tensors_to_one_file=True, 74 | location=Path(quantized_model_path).name + ".data") 75 | onnx.save_model(quantized_onnx_model, quantized_model_path) 76 | 77 | logger.info(f"quantized model saved to:{quantized_model_path}") 78 | #TODO: inlcude external data in total model size. 
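quantize_helper.py exposes the two entry points used for int8 runs (the -p int8 path in run_benchmark.sh): dynamic quantization of a PyTorch model after Conv1D layers are rewritten as nn.Linear, and integer-ops quantization of an exported ONNX model. A short usage sketch, with the checkpoint and file paths as illustrative stand-ins:

```
# Hedged usage sketch for QuantizeHelper (checkpoint and paths are illustrative).
import torch
from transformers import GPT2LMHeadModel
from quantize_helper import QuantizeHelper

model = GPT2LMHeadModel.from_pretrained("distilgpt2")       # uses Conv1D, so conv1d_to_linear applies
int8_model = QuantizeHelper.quantize_torch_model(model, dtype=torch.qint8)

QuantizeHelper.quantize_onnx_model("model.onnx", "model_int8.onnx",
                                   use_external_data_format=False)
```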
79 | logger.info(f'Size of quantized ONNX model(MB):{os.path.getsize(quantized_model_path)/(1024*1024)}') 80 | -------------------------------------------------------------------------------- /resnet50.py: -------------------------------------------------------------------------------- 1 | import time 2 | import os 3 | from transformers import AutoFeatureExtractor, ResNetForImageClassification 4 | from datasets import load_dataset 5 | 6 | dataset = load_dataset("huggingface/cats-image") 7 | image = dataset["test"]["image"][0] 8 | 9 | feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/resnet-50") 10 | 11 | 12 | #PyTorch 13 | import torch 14 | 15 | model = ResNetForImageClassification.from_pretrained("microsoft/resnet-50") 16 | warmup = 5 17 | total_iter = 100 18 | num_iter = total_iter - warmup 19 | for i in range(num_iter): 20 | if(i == warmup-1): 21 | start = time.time() 22 | inputs = feature_extractor(image, return_tensors="pt") 23 | with torch.no_grad(): 24 | logits = model(**inputs).logits 25 | predicted_label = logits.argmax(-1).item() 26 | end = time.time() 27 | total_time = end - start 28 | print("PyTorch: time/iter in ms : "+str(total_time*1000/num_iter)) 29 | #print(model.config.id2label[predicted_label]) 30 | 31 | 32 | # OnnxRuntime 33 | from onnxruntime import InferenceSession 34 | import urllib.request 35 | 36 | if not os.path.isfile("model.onnx"): 37 | urllib.request.urlretrieve('https://huggingface.co/OWG/resnet-50/resolve/main/onnx/model.onnx',"model.onnx") 38 | 39 | session = InferenceSession("model.onnx") 40 | 41 | warmup = 5 42 | total_iter = 100 43 | num_iter = total_iter - warmup 44 | for i in range(num_iter): 45 | if(i == warmup-1): 46 | start = time.time() 47 | #print(BertCompiled.learn(predict_sample_input,np.random.randint(5, size=(BATCH_SIZE)))) 48 | # ONNX Runtime expects NumPy arrays as input 49 | inputs = feature_extractor(image, return_tensors="np") 50 | outputs = session.run(output_names=["last_hidden_state"], input_feed=dict(inputs)) 51 | end = time.time() 52 | total_time = end - start 53 | print("Onnx: time/iter in ms : "+str(total_time*1000/num_iter)) 54 | 55 | -------------------------------------------------------------------------------- /run_benchmark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | # ------------------------------------------------------------------------- 3 | # Copyright (c) Microsoft Corporation. All rights reserved. 4 | # Copyright (c) Nod, Inc. All rights reserved. 5 | # Licensed under the MIT License. See License.txt in the project root for 6 | # license information. 7 | # -------------------------------------------------------------------------- 8 | # This measures the performance of OnnxRuntime, PyTorch and TorchScript on transformer models. 9 | # Please install PyTorch or Tensorflow or MLIR Runtime (see https://pytorch.org/) before running this benchmark. Like the following: 10 | # GPU: conda install pytorch torchvision cudatoolkit=11.0 -c pytorch 11 | # CPU: conda install pytorch torchvision cpuonly -c pytorch 12 | 13 | # When use_package=true, you need not copy other files to run benchmarks except this sh file. 14 | # Otherwise, it will use python script (*.py) files in this directory. 
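One methodological note on the resnet50.py timing loops above: the clock is started inside the main loop at i == warmup-1 and the elapsed time is divided by num_iter, so the divisor does not exactly match the number of timed iterations. Below is a hedged sketch of a warmup/measure split that keeps the bookkeeping explicit; the callable is a stand-in for either the PyTorch or the ONNX Runtime inference step, and the iteration counts mirror the script's defaults.

```
# Hedged sketch: separate warmup from timed iterations so the average is exact.
import time

def benchmark(run_inference, warmup=5, iters=95):
    for _ in range(warmup):          # untimed warmup (caches, JIT, allocator, etc.)
        run_inference()
    start = time.time()
    for _ in range(iters):
        run_inference()
    return (time.time() - start) * 1000.0 / iters   # ms per iteration

# Example: latency_ms = benchmark(lambda: session.run(None, dict(inputs)))
```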
15 | 16 | ARGUMENT_LIST=( 17 | "gpu_fp32" 18 | "gpu_fp16" 19 | "cpu_fp32" 20 | "cpu_int8" 21 | "ort" 22 | "torch" 23 | "torchscript" 24 | "tensorflow" 25 | "iree" 26 | "shark" 27 | "pip_install_pkg" 28 | "ort_optimizer" 29 | "create_venv" 30 | "with_nsys" 31 | ) 32 | 33 | #setup defaults 34 | # Devices to test (You can run either CPU or GPU, but not both: gpu need onnxruntime-gpu, and CPU need onnxruntime). 35 | run_gpu_fp32=true 36 | run_gpu_fp16=false 37 | run_cpu_fp32=false 38 | run_cpu_int8=false 39 | # Engines to test. 40 | run_ort=true 41 | run_shark=false 42 | run_torch=false 43 | run_torchscript=true 44 | run_tensorflow=true 45 | run_iree=true 46 | 47 | # only need once 48 | run_create_venv=false 49 | install_pkg=true 50 | run_with_nsys=true 51 | 52 | # Enable optimizer (use script instead of OnnxRuntime for graph optimization) 53 | use_optimizer=false 54 | 55 | # read arguments 56 | opts=$(getopt \ 57 | --longoptions "$(printf "%s:," "${ARGUMENT_LIST[@]}")" \ 58 | --name "$(basename "$0")" \ 59 | --options "" \ 60 | -- "$@" 61 | ) 62 | 63 | eval set --$opts 64 | 65 | while [[ $# -gt 0 ]]; do 66 | case "$1" in 67 | --with_nsys) 68 | run_with_nsys=$2 69 | shift 2 70 | ;; 71 | --create_venv) 72 | run_create_venv=$2 73 | echo "Removing old bench_venv.." 74 | rm -rf bench_venv 75 | echo "Creating new bench_venv.." 76 | python3 -m venv bench_venv 77 | echo "sourcing new env.." 78 | source bench_venv/bin/activate 79 | shift 2 80 | ;; 81 | 82 | --ort_optimizer) 83 | use_optimizer=$2 84 | shift 2 85 | ;; 86 | 87 | --pip_install_pkg) 88 | install_pkg=$2 89 | shift 2 90 | ;; 91 | 92 | --iree) 93 | run_iree=$2 94 | shift 2 95 | ;; 96 | 97 | --shark) 98 | run_shark=$2 99 | shift 2 100 | ;; 101 | 102 | --tensorflow) 103 | run_tensorflow=$2 104 | shift 2 105 | ;; 106 | 107 | --torchscript) 108 | run_torchscript=$2 109 | shift 2 110 | ;; 111 | 112 | --torch) 113 | run_torch=$2 114 | shift 2 115 | ;; 116 | 117 | --ort) 118 | run_ort=$2 119 | shift 2 120 | ;; 121 | 122 | --gpu_fp32) 123 | run_gpu_fp32=$2 124 | shift 2 125 | ;; 126 | 127 | --gpu_fp16) 128 | run_gpu_fp16=$2 129 | shift 2 130 | ;; 131 | 132 | --cpu_fp32) 133 | run_cpu_fp32=$2 134 | shift 2 135 | ;; 136 | 137 | --cpu_int8) 138 | run_cpu_int8=$2 139 | shift 2 140 | ;; 141 | 142 | *) 143 | echo "Using defaults...: " 144 | echo " you can change them with --var=true or false" 145 | break 146 | ;; 147 | esac 148 | done 149 | 150 | 151 | echo "Parsed command line args as:" 152 | echo "gpu_fp32 $run_gpu_fp32" 153 | echo "gpu_fp16 $run_gpu_fp16" 154 | echo "cpu_fp32 $run_cpu_fp32" 155 | echo "cpu_int8 $run_cpu_int8" 156 | 157 | echo "ort $run_ort" 158 | echo "ort_optimizer $use_optimizer" 159 | echo "shark $run_shark" 160 | echo "torch $run_torch" 161 | echo "torchscript $run_torchscript" 162 | echo "tensorflow $run_tensorflow" 163 | echo "iree $run_iree" 164 | echo "create_venv $run_create_venv" 165 | echo "run_with_nsys $run_with_nsys" 166 | echo "pip_install_pkg $install_pkg" 167 | 168 | echo "Check python path.. " 169 | which python 170 | 171 | use_package=false 172 | # Onnx model source (default is from pytorch, set export_onnx_from_tf=true to convert from tensorflow model) 173 | export_onnx_from_tf=false 174 | 175 | 176 | average_over=1000 177 | # CPU takes longer time to run, only run 100 inferences to get average latency. 
178 | if [ "$run_cpu_fp32" = true ] || [ "$run_cpu_int8" = true ]; then 179 | average_over=100 180 | fi 181 | 182 | # Batch Sizes and Sequence Lengths 183 | batch_sizes="1" 184 | sequence_lengths="128" 185 | 186 | # Number of inputs (input_ids, token_type_ids, attention_mask) for ONNX model. 187 | # Not that different input count might lead to different performance 188 | # Here we only test one input (input_ids) for fair comparison with PyTorch. 189 | input_counts=1 190 | 191 | # Pretrained transformers models can be a subset of: bert-base-cased roberta-base gpt2 distilgpt2 distilbert-base-uncased 192 | #models_to_test="bert-base-cased roberta-base distilbert-base-uncased" 193 | #models_to_test="philschmid/MiniLM-L6-H384-uncased-sst2" 194 | models_to_test="microsoft/MiniLM-L12-H384-uncased" 195 | #models_to_test="gpt2" 196 | 197 | # If you have mutliple GPUs, you can choose one GPU for test. Here is an example to use the second GPU: 198 | # export CUDA_VISIBLE_DEVICES=1 199 | 200 | # This script will generate a logs file with a list of commands used in tests. 201 | echo echo "ort=$run_ort torch=$run_torch torchscript=$run_torchscript tensorflow=$run_tensorflow iree=$run_iree gpu_fp32=$run_gpu_fp32 gpu_fp16=$run_gpu_fp16 cpu=$run_cpu optimizer=$use_optimizer batch=$batch_sizes sequence=$sequence_length models=$models_to_test" >> benchmark.log 202 | 203 | # Set it to false to skip testing. You can use it to dry run this script with the log file. 204 | run_tests=true 205 | 206 | # Directory for downloading pretrained models. 207 | cache_dir="./cache_models" 208 | 209 | # Directory for ONNX models 210 | onnx_dir="./onnx_models" 211 | 212 | # ------------------------------------------- 213 | if [ "$run_cpu_fp32" = true ] || [ "$run_cpu_int8" = true ]; then 214 | if [ "$run_gpu_fp32" = true ] ; then 215 | echo "cannot test cpu and gpu at same time" 216 | exit 1 217 | fi 218 | if [ "$run_gpu_fp16" = true ] ; then 219 | echo "cannot test cpu and gpu at same time" 220 | exit 1 221 | fi 222 | fi 223 | 224 | 225 | if [ "$install_pkg" = true ] ; then 226 | pip install --upgrade pip 227 | pip uninstall --yes ort-nightly ort-gpu-nightly 228 | pip uninstall --yes onnxruntime 229 | pip uninstall --yes onnxruntime-gpu 230 | pip uninstall --yes torch 231 | pip uninstall --yes iree-compiler iree-runtime iree-tools-tf iree-tools-tflite iree-tools-xla 232 | 233 | if [ "$run_cpu_fp32" = true ] || [ "$run_cpu_int8" = true ]; then 234 | pip install onnxruntime 235 | pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html 236 | else 237 | pip install onnxruntime-gpu 238 | pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu111/torch_nightly.html 239 | #pip3 install --pre torch torchvision -f https://download.pytorch.org/whl/nightly/rocm4.2/torch_nightly.html 240 | fi 241 | pip install tf-nightly 242 | pip install --upgrade onnx coloredlogs packaging psutil py3nvml onnxconverter_common numpy transformers sympy wheel 243 | pip install gin-config 244 | 245 | ### Installing IREE-Python 246 | python -m pip install iree-compiler iree-runtime iree-tools-tf iree-tools-tflite iree-tools-xla --find-links https://github.com/google/iree/releases 247 | 248 | if [ "$run_shark" = true ] ; then 249 | ### Installing shark 250 | git submodule update --init 251 | pip install -r `pwd`/thirdparty/SHARK/requirements.txt --no-cache-dir 252 | python -m pip install --find-links https://github.com/llvm/torch-mlir/releases torch-mlir 253 | 
python -m pip install ninja 254 | python -m pip install thirdparty/SHARK 255 | fi 256 | 257 | fi 258 | 259 | if [ "$use_package" = true ] ; then 260 | echo "Use onnxruntime.transformers.benchmark" 261 | benchmark_script="-m onnxruntime.transformers.benchmark" 262 | else 263 | benchmark_script="benchmark.py" 264 | fi 265 | 266 | onnx_export_options="-i $input_counts -v -b 0 --overwrite -f fusion.csv -c $cache_dir --onnx_dir $onnx_dir" 267 | benchmark_options="-b $batch_sizes -s $sequence_lengths -t $average_over -f fusion.csv -r result.csv -d detail.csv -c $cache_dir --onnx_dir $onnx_dir" 268 | 269 | if [ "$export_onnx_from_tf" = true ] ; then 270 | onnx_export_options="$onnx_export_options --model_source tf" 271 | benchmark_options="$benchmark_options --model_source tf" 272 | fi 273 | 274 | if [ "$use_optimizer" = true ] ; then 275 | onnx_export_options="$onnx_export_options -o" 276 | benchmark_options="$benchmark_options -o" 277 | fi 278 | 279 | # ------------------------------------------- 280 | run_one_test() { 281 | if [ "$run_ort" = true ] ; then 282 | echo python $benchmark_script -m $1 $onnx_export_options $2 $3 $4 >> benchmark.log 283 | echo python $benchmark_script -m $1 $benchmark_options $2 $3 $4 -i $input_counts >> benchmark.log 284 | if [ "$run_tests" = true ] ; then 285 | python $benchmark_script -m $1 $onnx_export_options $2 $3 $4 286 | python $benchmark_script -m $1 $benchmark_options $2 $3 $4 -i $input_counts 287 | fi 288 | fi 289 | 290 | if [ "$run_shark" = true ] ; then 291 | echo python $benchmark_script -e shark -m $1 $benchmark_options $2 $3 $4 >> benchmark.log 292 | if [ "$run_tests" = true ] ; then 293 | python $benchmark_script -e shark -m $1 $benchmark_options $2 $3 $4 294 | fi 295 | fi 296 | 297 | if [ "$run_torch" = true ] ; then 298 | echo python $benchmark_script -e torch -m $1 $benchmark_options $2 $3 $4 >> benchmark.log 299 | if [ "$run_tests" = true ] ; then 300 | python $benchmark_script -e torch -m $1 $benchmark_options $2 $3 $4 301 | fi 302 | fi 303 | 304 | if [ "$run_torchscript" = true ] ; then 305 | echo python $benchmark_script -e torchscript -m $1 $benchmark_options $2 $3 $4 >> benchmark.log 306 | if [ "$run_tests" = true ] ; then 307 | python $benchmark_script -e torchscript -m $1 $benchmark_options $2 $3 $4 308 | fi 309 | fi 310 | 311 | if [ "$run_tensorflow" = true ] ; then 312 | echo python $benchmark_script -e tensorflow -m $1 $benchmark_options $2 $3 $4 >> benchmark.log 313 | if [ "$run_tests" = true ] ; then 314 | python $benchmark_script -e tensorflow -m $1 $benchmark_options $2 $3 $4 315 | fi 316 | fi 317 | 318 | if [ "$run_iree" = true ] ; then 319 | echo python $benchmark_script -e iree -m $1 $benchmark_options $2 $3 $4 >> benchmark.log 320 | if [ "$run_tests" = true ] ; then 321 | python $benchmark_script -e iree -m $1 $benchmark_options $2 $3 $4 322 | fi 323 | fi 324 | } 325 | 326 | # ------------------------------------------- 327 | if [ "$run_gpu_fp32" = true ] ; then 328 | for m in $models_to_test 329 | do 330 | echo Run GPU FP32 Benchmark on model ${m} 331 | run_one_test "${m}" -g 332 | done 333 | fi 334 | 335 | if [ "$run_gpu_fp16" = true ] ; then 336 | for m in $models_to_test 337 | do 338 | echo Run GPU FP16 Benchmark on model ${m} 339 | run_one_test "${m}" -g -p fp16 340 | done 341 | fi 342 | 343 | if [ "$run_cpu_fp32" = true ] ; then 344 | for m in $models_to_test 345 | do 346 | echo Run CPU Benchmark on model ${m} 347 | run_one_test "${m}" 348 | done 349 | fi 350 | 351 | if [ "$run_cpu_int8" = true ] ; then 352 | for m in 
$models_to_test 353 | do 354 | echo Run CPU Benchmark on model ${m} 355 | run_one_test "${m}" -p int8 356 | done 357 | fi 358 | 359 | if [ "run_tests" = false ] ; then 360 | more $log_file 361 | fi 362 | 363 | # Remove duplicated lines 364 | awk '!x[$0]++' ./result.csv > summary_result.csv 365 | awk '!x[$0]++' ./fusion.csv > summary_fusion.csv 366 | awk '!x[$0]++' ./detail.csv > summary_detail.csv 367 | -------------------------------------------------------------------------------- /shape_infer_helper.py: -------------------------------------------------------------------------------- 1 | #------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | #-------------------------------------------------------------------------- 5 | 6 | import os 7 | import sys 8 | import onnx 9 | 10 | # In ORT Package the symbolic_shape_infer.py is in ../tools 11 | file_path = os.path.dirname(__file__) 12 | if os.path.exists(os.path.join(file_path, "../tools/symbolic_shape_infer.py")): 13 | sys.path.append(os.path.join(file_path, '../tools')) 14 | else: 15 | sys.path.append(os.path.join(file_path, '..')) 16 | 17 | from symbolic_shape_infer import SymbolicShapeInference, get_shape_from_type_proto, sympy 18 | 19 | 20 | class SymbolicShapeInferenceHelper(SymbolicShapeInference): 21 | def __init__(self, model, verbose=0, int_max=2**31 - 1, auto_merge=True, guess_output_rank=False): 22 | super().__init__(int_max, auto_merge, guess_output_rank, verbose) 23 | self.model_ = onnx.ModelProto() 24 | self.model_.CopyFrom(model) 25 | self.all_shapes_inferred_ = False 26 | self.inferred_ = False 27 | 28 | # The goal is to remove dynamic_axis_mapping 29 | def infer(self, dynamic_axis_mapping): 30 | if self.inferred_: 31 | return self.all_shapes_inferred_ 32 | 33 | self.dynamic_axis_mapping_ = dynamic_axis_mapping # e.g {"batch_size" : 4, "seq_len" :7} 34 | 35 | self._preprocess(self.model_) 36 | while self.run_: 37 | self.all_shapes_inferred_ = self._infer_impl() 38 | 39 | self.inferred_ = True 40 | return self.all_shapes_inferred_ 41 | 42 | # override _preprocess() to avoid unnecessary model copy since ctor copies the model 43 | def _preprocess(self, in_mp): 44 | self.out_mp_ = in_mp 45 | self.graph_inputs_ = dict([(i.name, i) for i in list(self.out_mp_.graph.input)]) 46 | self.initializers_ = dict([(i.name, i) for i in self.out_mp_.graph.initializer]) 47 | self.known_vi_ = dict([(i.name, i) for i in list(self.out_mp_.graph.input)]) 48 | self.known_vi_.update( 49 | dict([(i.name, onnx.helper.make_tensor_value_info(i.name, i.data_type, list(i.dims))) 50 | for i in self.out_mp_.graph.initializer])) 51 | 52 | # Override _get_sympy_shape() in symbolic_shape_infer.py to ensure shape inference by giving the actual value of dynamic axis 53 | def _get_sympy_shape(self, node, idx): 54 | sympy_shape = [] 55 | for d in self._get_shape(node, idx): 56 | if type(d) == str: 57 | if d in self.dynamic_axis_mapping_.keys(): 58 | sympy_shape.append(self.dynamic_axis_mapping_[d]) 59 | elif d in self.symbolic_dims_: 60 | sympy_shape.append(self.symbolic_dims_[d]) 61 | else: 62 | sympy_shape.append(sympy.Symbol(d, integer=True)) 63 | else: 64 | assert None != d 65 | sympy_shape.append(d) 66 | return sympy_shape 67 | 68 | def get_edge_shape(self, edge): 69 | assert (self.all_shapes_inferred_ == True) 70 | if edge not in self.known_vi_: 71 | print("Cannot retrive the shape of " + str(edge)) 72 | return None 73 | type_proto = 
self.known_vi_[edge].type 74 | shape = get_shape_from_type_proto(type_proto) 75 | for i in range(len(shape)): 76 | d = shape[i] 77 | if type(d) == str and d in self.dynamic_axis_mapping_.keys(): 78 | shape[i] = self.dynamic_axis_mapping_[d] 79 | return shape 80 | 81 | def compare_shape(self, edge, edge_other): 82 | assert (self.all_shapes_inferred_ == True) 83 | shape = self.get_edge_shape(edge) 84 | shape_other = self.get_edge_shape(edge_other) 85 | if shape is None or shape_other is None: 86 | raise Exception("At least one shape is missed for edges to compare") 87 | return shape == shape_other 88 | -------------------------------------------------------------------------------- /update_submodules.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -x 2 | 3 | COMMIT_PUSH=false 4 | 5 | while getopts “p” OPTION 6 | do 7 | case $OPTION in 8 | p) 9 | echo "Pushing changes up.." 10 | COMMIT_PUSH=true 11 | ;; 12 | ?) 13 | echo "Unsupported option.. -p for pushing changes up after update" 14 | exit 15 | ;; 16 | esac 17 | done 18 | 19 | echo "Updating repos.." 20 | 21 | cd mmperf && git fetch --all && git checkout origin/main 22 | 23 | #update mmperf submodules first 24 | git submodule update --init 25 | #Update the submodules inside mmperf too 26 | ./update_submodules.sh 27 | 28 | if [ "$COMMIT_PUSH" = true ]; then 29 | echo "Checking out transformer-benchmarks..." 30 | git add . 31 | git commit -m "Roll external deps" 32 | echo git push https://github.com/mmperf/mmperf 33 | fi 34 | --------------------------------------------------------------------------------
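Finally, a short usage sketch for SymbolicShapeInferenceHelper from shape_infer_helper.py above: the helper is constructed once per model, infer() binds the named dynamic axes to concrete values, and get_edge_shape / compare_shape then query the resolved shapes. The model path, axis names, and edge names below are illustrative placeholders, not values taken from this repo.

```
# Hedged usage sketch for SymbolicShapeInferenceHelper.
import onnx
from shape_infer_helper import SymbolicShapeInferenceHelper

model = onnx.load("model.onnx")
helper = SymbolicShapeInferenceHelper(model)

if helper.infer({"batch_size": 1, "seq_len": 128}):
    print(helper.get_edge_shape("input_ids"))        # e.g. [1, 128] once axes are bound
    print(helper.compare_shape("edge_a", "edge_b"))  # True if both resolved shapes match
else:
    print("symbolic shape inference did not fully resolve the graph")
```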