├── .gitignore ├── .gitmodules ├── LICENSE ├── README.md ├── affinity_helper.py ├── benchmark.py ├── benchmark_gpt2.py ├── benchmark_helper.py ├── fusion_attention.py ├── fusion_base.py ├── fusion_biasgelu.py ├── fusion_embedlayer.py ├── fusion_fastgelu.py ├── fusion_gelu.py ├── fusion_gelu_approximation.py ├── fusion_gpt_attention.py ├── fusion_gpt_attention_megatron.py ├── fusion_gpt_attention_no_past.py ├── fusion_layernorm.py ├── fusion_options.py ├── fusion_reshape.py ├── fusion_shape.py ├── fusion_skiplayernorm.py ├── fusion_utils.py ├── gpt2_beamsearch_helper.py ├── gpt2_beamsearch_tester.py ├── gpt2_helper.py ├── gpt2_parity.py ├── gpt2_tester.py ├── hf.co_1ms ├── README.md ├── benchmark.log ├── detail.csv ├── fusion.csv ├── images │ ├── cpu_16_2_5ms.png │ ├── cpu_9_7ms.png │ ├── gpu_128_2_6ms.png │ ├── gpu_16_1_7ms.png │ ├── infinity_model.png │ └── model_dir.png ├── onnx.diff ├── onnx_with_eigen.diff ├── output.txt ├── requirements.txt ├── result.csv ├── run_benchmark.sh ├── summary_detail.csv ├── summary_fusion.csv └── summary_result.csv ├── hf_co_models.py ├── huggingface_MiniLM_loadsave.py ├── huggingface_models.py ├── nightly_job.sh ├── onnx_exporter.py ├── onnx_model.py ├── onnx_model_bart.py ├── onnx_model_bert.py ├── onnx_model_bert_keras.py ├── onnx_model_bert_tf.py ├── onnx_model_gpt2.py ├── optimizer.py ├── perf-ci.sh ├── quantize_helper.py ├── resnet50.py ├── run_benchmark.sh ├── shape_infer_helper.py ├── shape_optimizer.py ├── symbolic_shape_infer.py └── update_submodules.sh /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | cache_models 3 | onnx_models 4 | benchmark.log 5 | model.mlir 6 | detail.csv 7 | fusion.csv 8 | result.csv 9 | summary_detail.csv 10 | summary_result.csv 11 | summary_fusion.csv 12 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "mmperf"] 2 | path = mmperf 3 | url = https://github.com/mmperf/mmperf 4 | [submodule "thirdparty/SHARK"] 5 | path = thirdparty/SHARK 6 | url = https://github.com/nod-ai/SHARK.git 7 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2021, powderluv 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | 3. Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # transformer-benchmarks 2 | benchmarking some transformers 3 | 4 | ## Quickstart 5 | 6 | ``` 7 | git clone https://github.com/powderluv/transformer-benchmarks 8 | 9 | cd transformer-benchmarks 10 | 11 | python -m venv myenv 12 | 13 | source myenv/bin/activate 14 | 15 | ./run_benchmark.sh  # edit the variables in the script to choose cpu/gpu and which backends to use: org, torchscript, tf, mlir 16 | 17 | ``` 18 | 19 | ![Measuring up Transformers](https://i0.wp.com/cdnssl.ubergizmo.com/wp-content/uploads/2021/04/optimus-prime-toy.jpg) 20 | -------------------------------------------------------------------------------- /affinity_helper.py: -------------------------------------------------------------------------------- 1 | #------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | #-------------------------------------------------------------------------- 5 | 6 | # Get/Set cpu affinity. Currently only supports some Unix systems 7 | import logging 8 | import os 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | class AffinitySetting(): 14 | def __init__(self): 15 | self.pid = os.getpid() 16 | self.affinity = None 17 | self.is_os_supported = hasattr(os, 'sched_getaffinity') and hasattr(os, 'sched_setaffinity') 18 | if not self.is_os_supported: 19 | logger.warning("Current OS does not support os.sched_getaffinity() and os.sched_setaffinity()") 20 | 21 | def get_affinity(self): 22 | if self.is_os_supported: 23 | self.affinity = os.sched_getaffinity(self.pid) 24 | 25 | def set_affinity(self): 26 | if self.is_os_supported: 27 | current_affinity = os.sched_getaffinity(self.pid) 28 | if (self.affinity != current_affinity): 29 | logger.warning("Replacing affinity setting %s with %s", str(current_affinity), str(self.affinity)) 30 | os.sched_setaffinity(self.pid, self.affinity) 31 | 32 | 33 | if __name__ == '__main__': 34 | affi_helper = AffinitySetting() 35 | affi_helper.get_affinity() 36 | affi_helper.set_affinity() 37 | -------------------------------------------------------------------------------- /benchmark_helper.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. See License.txt in the project root for 4 | # license information.
5 | # -------------------------------------------------------------------------- 6 | 7 | import os 8 | import sys 9 | import csv 10 | import numpy 11 | import time 12 | import timeit 13 | from datetime import datetime 14 | import argparse 15 | import logging 16 | import coloredlogs 17 | import torch 18 | import onnx 19 | from enum import Enum 20 | from packaging import version 21 | 22 | logger = logging.getLogger(__name__) 23 | 24 | 25 | class Precision(Enum): 26 | FLOAT32 = 'fp32' 27 | FLOAT16 = 'fp16' 28 | INT8 = 'int8' 29 | 30 | def __str__(self): 31 | return self.value 32 | 33 | 34 | IO_BINDING_DATA_TYPE_MAP = { 35 | "float32": numpy.float32, 36 | # TODO: Add more. 37 | } 38 | 39 | 40 | def create_onnxruntime_session(onnx_model_path, 41 | use_gpu, 42 | enable_all_optimization=True, 43 | num_threads=-1, 44 | enable_profiling=False, 45 | verbose=False): 46 | session = None 47 | try: 48 | from onnxruntime import SessionOptions, InferenceSession, GraphOptimizationLevel, __version__ as onnxruntime_version 49 | sess_options = SessionOptions() 50 | 51 | if enable_all_optimization: 52 | sess_options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL 53 | else: 54 | sess_options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_BASIC 55 | 56 | if enable_profiling: 57 | sess_options.enable_profiling = True 58 | 59 | if num_threads > 0: 60 | sess_options.intra_op_num_threads = num_threads 61 | logger.debug(f"Session option: intra_op_num_threads={sess_options.intra_op_num_threads}") 62 | 63 | if verbose: 64 | sess_options.log_severity_level = 0 65 | else: 66 | sess_options.log_severity_level = 4 67 | 68 | logger.debug(f"Create session for onnx model: {onnx_model_path}") 69 | execution_providers = ['CPUExecutionProvider' 70 | ] if not use_gpu else ['CUDAExecutionProvider', 'CPUExecutionProvider'] 71 | session = InferenceSession(onnx_model_path, sess_options, providers=execution_providers) 72 | except: 73 | logger.error(f"Exception", exc_info=True) 74 | 75 | return session 76 | 77 | 78 | def setup_logger(verbose=True): 79 | if verbose: 80 | coloredlogs.install(level='DEBUG', fmt='[%(filename)s:%(lineno)s - %(funcName)20s()] %(message)s') 81 | else: 82 | coloredlogs.install(fmt='%(message)s') 83 | logging.getLogger("transformers").setLevel(logging.WARNING) 84 | 85 | 86 | def prepare_environment(cache_dir, output_dir, use_gpu): 87 | if cache_dir and not os.path.exists(cache_dir): 88 | os.makedirs(cache_dir) 89 | 90 | if output_dir and not os.path.exists(output_dir): 91 | os.makedirs(output_dir) 92 | 93 | import onnxruntime 94 | if use_gpu: 95 | assert 'CUDAExecutionProvider' in onnxruntime.get_available_providers( 96 | ), "Please install onnxruntime-gpu package to test GPU inference." 97 | 98 | import transformers 99 | logger.info(f'PyTorch Version:{torch.__version__}') 100 | logger.info(f'Transformers Version:{transformers.__version__}') 101 | logger.info(f'Onnxruntime Version:{onnxruntime.__version__}') 102 | 103 | # Support three major versions of PyTorch and OnnxRuntime, and up to 6 months of transformers. 
104 | from packaging import version 105 | assert version.parse(torch.__version__) >= version.parse('1.5.0') 106 | assert version.parse(transformers.__version__) >= version.parse('3.0.0') 107 | assert version.parse(onnxruntime.__version__) >= version.parse('1.4.0') 108 | 109 | 110 | def get_latency_result(runtimes, batch_size): 111 | latency_ms = sum(runtimes) / float(len(runtimes)) * 1000.0 112 | latency_variance = numpy.var(runtimes, dtype=numpy.float64) * 1000.0 113 | throughput = batch_size * (1000.0 / latency_ms) 114 | 115 | return { 116 | "test_times": len(runtimes), 117 | "latency_variance": "{:.2f}".format(latency_variance), 118 | "latency_90_percentile": "{:.2f}".format(numpy.percentile(runtimes, 90) * 1000.0), 119 | "latency_95_percentile": "{:.2f}".format(numpy.percentile(runtimes, 95) * 1000.0), 120 | "latency_99_percentile": "{:.2f}".format(numpy.percentile(runtimes, 99) * 1000.0), 121 | "average_latency_ms": "{:.2f}".format(latency_ms), 122 | "QPS": "{:.2f}".format(throughput), 123 | } 124 | 125 | 126 | def output_details(results, csv_filename): 127 | with open(csv_filename, mode="a", newline='') as csv_file: 128 | column_names = [ 129 | "engine", "version", "device", "precision", "optimizer", "io_binding", "model_name", "inputs", "threads", 130 | "batch_size", "sequence_length", "datetime", "test_times", "QPS", "average_latency_ms", "latency_variance", 131 | "latency_90_percentile", "latency_95_percentile", "latency_99_percentile" 132 | ] 133 | 134 | csv_writer = csv.DictWriter(csv_file, fieldnames=column_names) 135 | csv_writer.writeheader() 136 | for result in results: 137 | csv_writer.writerow(result) 138 | 139 | logger.info(f"Detail results are saved to csv file: {csv_filename}") 140 | 141 | 142 | def output_summary(results, csv_filename, args): 143 | with open(csv_filename, mode="a", newline='') as csv_file: 144 | header_names = [ 145 | "model_name", "inputs", "engine", "version", "device", "precision", "optimizer", "io_binding", "threads" 146 | ] 147 | data_names = [] 148 | for batch_size in args.batch_sizes: 149 | for sequence_length in args.sequence_lengths: 150 | data_names.append(f"b{batch_size}_s{sequence_length}") 151 | 152 | csv_writer = csv.DictWriter(csv_file, fieldnames=header_names + data_names) 153 | csv_writer.writeheader() 154 | for model_name in args.models: 155 | for input_count in [1, 2, 3]: 156 | for engine_name in args.engines: 157 | for io_binding in [True, False, ""]: 158 | for threads in args.num_threads: 159 | row = {} 160 | for result in results: 161 | if result["model_name"] == model_name and result["inputs"] == input_count and result[ 162 | "engine"] == engine_name and result["io_binding"] == io_binding and result[ 163 | "threads"] == threads: 164 | headers = {k: v for k, v in result.items() if k in header_names} 165 | if not row: 166 | row.update(headers) 167 | row.update({k: "" for k in data_names}) 168 | else: 169 | for k in header_names: 170 | assert row[k] == headers[k] 171 | b = result["batch_size"] 172 | s = result["sequence_length"] 173 | row[f"b{b}_s{s}"] = result["average_latency_ms"] 174 | if row: 175 | csv_writer.writerow(row) 176 | 177 | logger.info(f"Summary results are saved to csv file: {csv_filename}") 178 | 179 | 180 | def output_fusion_statistics(model_fusion_statistics, csv_filename): 181 | from transformers import __version__ as transformers_version 182 | with open(csv_filename, mode="a", newline='') as csv_file: 183 | column_names = ["model_filename", "datetime", "transformers", "torch"] + list( 184 | 
next(iter(model_fusion_statistics.values())).keys()) 185 | csv_writer = csv.DictWriter(csv_file, fieldnames=column_names) 186 | csv_writer.writeheader() 187 | for key in model_fusion_statistics.keys(): 188 | model_fusion_statistics[key]["datetime"] = str(datetime.now()) 189 | model_fusion_statistics[key]["transformers"] = transformers_version 190 | model_fusion_statistics[key]["torch"] = torch.__version__ 191 | model_fusion_statistics[key]["model_filename"] = key 192 | csv_writer.writerow(model_fusion_statistics[key]) 193 | logger.info(f"Fusion statistics is saved to csv file: {csv_filename}") 194 | 195 | 196 | def inference_ort(ort_session, ort_inputs, result_template, repeat_times, batch_size): 197 | result = {} 198 | runtimes = timeit.repeat(lambda: ort_session.run(None, ort_inputs), number=1, repeat=repeat_times) 199 | result.update(result_template) 200 | result.update({"io_binding": False}) 201 | result.update(get_latency_result(runtimes, batch_size)) 202 | return result 203 | 204 | 205 | def inference_ort_with_io_binding(ort_session, 206 | ort_inputs, 207 | result_template, 208 | repeat_times, 209 | ort_output_names, 210 | ort_outputs, 211 | output_buffers, 212 | output_buffer_max_sizes, 213 | batch_size, 214 | device, 215 | data_type=numpy.longlong): 216 | result = {} 217 | 218 | # Bind inputs and outputs to onnxruntime session 219 | io_binding = ort_session.io_binding() 220 | # Bind inputs to device 221 | for name in ort_inputs.keys(): 222 | np_input = torch.from_numpy(ort_inputs[name]).to(device) 223 | input_type = IO_BINDING_DATA_TYPE_MAP[str(ort_inputs[name].dtype)] if str( 224 | ort_inputs[name].dtype) in IO_BINDING_DATA_TYPE_MAP else data_type 225 | io_binding.bind_input(name, np_input.device.type, 0, input_type, np_input.shape, np_input.data_ptr()) 226 | # Bind outputs buffers with the sizes needed if not allocated already 227 | if len(output_buffers) == 0: 228 | allocateOutputBuffers(output_buffers, output_buffer_max_sizes, device) 229 | 230 | for i in range(len(ort_output_names)): 231 | io_binding.bind_output(ort_output_names[i], output_buffers[i].device.type, 0, numpy.float32, 232 | ort_outputs[i].shape, output_buffers[i].data_ptr()) 233 | runtimes = timeit.repeat(lambda: ort_session.run_with_iobinding(io_binding), number=1, repeat=repeat_times) 234 | result.update(result_template) 235 | result.update({"io_binding": True}) 236 | result.update(get_latency_result(runtimes, batch_size)) 237 | return result 238 | 239 | 240 | def allocateOutputBuffers(output_buffers, output_buffer_max_sizes, device): 241 | # Allocate output tensors with the largest test size needed. So the allocated memory can be reused 242 | # for each test run. 
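    # Illustrative example (hypothetical sizes): if the largest test configuration produces a
    # last_hidden_state of shape [8, 128, 768], then output_buffer_max_sizes would contain
    # 8*128*768 = 786432, and a single flat float32 buffer of that length is allocated once;
    # bind_output() later binds its data_ptr together with the actual output shape for each run.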
243 | 244 | for i in output_buffer_max_sizes: 245 | output_buffers.append(torch.empty(i, dtype=torch.float32, device=device)) 246 | 247 | 248 | def set_random_seed(seed=123): 249 | """Set random seed manully to get deterministic results""" 250 | import random 251 | random.seed(seed) 252 | numpy.random.seed(seed) 253 | torch.manual_seed(seed) 254 | torch.cuda.manual_seed(seed) 255 | torch.cuda.manual_seed_all(seed) 256 | #torch.backends.cudnn.enabled = False 257 | #torch.backends.cudnn.benchmark = False 258 | #torch.backends.cudnn.deterministic = True 259 | 260 | 261 | def measure_memory(is_gpu, func): 262 | import os 263 | import psutil 264 | from time import sleep 265 | 266 | class MemoryMonitor: 267 | def __init__(self, keep_measuring=True): 268 | self.keep_measuring = keep_measuring 269 | 270 | def measure_cpu_usage(self): 271 | max_usage = 0 272 | while True: 273 | max_usage = max(max_usage, psutil.Process(os.getpid()).memory_info().rss / 1024**2) 274 | sleep(0.005) # 5ms 275 | if not self.keep_measuring: 276 | break 277 | return max_usage 278 | 279 | def measure_gpu_usage(self): 280 | from py3nvml.py3nvml import nvmlInit, nvmlDeviceGetCount, nvmlDeviceGetHandleByIndex, \ 281 | nvmlDeviceGetMemoryInfo, nvmlDeviceGetName, nvmlShutdown, NVMLError 282 | max_gpu_usage = [] 283 | gpu_name = [] 284 | try: 285 | nvmlInit() 286 | deviceCount = nvmlDeviceGetCount() 287 | max_gpu_usage = [0 for i in range(deviceCount)] 288 | gpu_name = [nvmlDeviceGetName(nvmlDeviceGetHandleByIndex(i)) for i in range(deviceCount)] 289 | while True: 290 | for i in range(deviceCount): 291 | info = nvmlDeviceGetMemoryInfo(nvmlDeviceGetHandleByIndex(i)) 292 | max_gpu_usage[i] = max(max_gpu_usage[i], info.used / 1024**2) 293 | sleep(0.005) # 5ms 294 | if not self.keep_measuring: 295 | break 296 | nvmlShutdown() 297 | return [{ 298 | "device_id": i, 299 | "name": gpu_name[i], 300 | "max_used_MB": max_gpu_usage[i] 301 | } for i in range(deviceCount)] 302 | except NVMLError as error: 303 | if not self.silent: 304 | self.logger.error("Error fetching GPU information using nvml: %s", error) 305 | return None 306 | 307 | monitor = MemoryMonitor(False) 308 | 309 | memory_before_test = monitor.measure_gpu_usage() if is_gpu else monitor.measure_cpu_usage() 310 | 311 | from concurrent.futures import ThreadPoolExecutor 312 | with ThreadPoolExecutor() as executor: 313 | monitor = MemoryMonitor() 314 | mem_thread = executor.submit(monitor.measure_gpu_usage if is_gpu else monitor.measure_cpu_usage) 315 | try: 316 | fn_thread = executor.submit(func) 317 | result = fn_thread.result() 318 | finally: 319 | monitor.keep_measuring = False 320 | max_usage = mem_thread.result() 321 | 322 | if is_gpu: 323 | print(f"GPU memory usage: before={memory_before_test} peak={max_usage}") 324 | if len(memory_before_test) >= 1 and len(max_usage) >= 1: 325 | before = memory_before_test[0]["max_used_MB"] 326 | after = max_usage[0]["max_used_MB"] 327 | return after - before 328 | else: 329 | return None 330 | else: 331 | print(f"CPU memory usage: before={memory_before_test:.1f} MB, peak={max_usage:.1f} MB") 332 | return max_usage - memory_before_test 333 | -------------------------------------------------------------------------------- /fusion_base.py: -------------------------------------------------------------------------------- 1 | #------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
4 | #-------------------------------------------------------------------------- 5 | from logging import getLogger 6 | from onnx_model import OnnxModel 7 | from typing import Union, List 8 | from onnx import GraphProto 9 | 10 | logger = getLogger(__name__) 11 | 12 | 13 | class Fusion: 14 | def __init__(self, 15 | model: OnnxModel, 16 | fused_op_type: str, 17 | search_op_types: Union[str, List[str]], 18 | description: str = None): 19 | self.search_op_types: List[str] = [search_op_types] if isinstance(search_op_types, str) else search_op_types 20 | self.fused_op_type: str = fused_op_type 21 | self.description: str = f"{fused_op_type}({description})" if description else fused_op_type 22 | self.model: OnnxModel = model 23 | self.nodes_to_remove: List = [] 24 | self.nodes_to_add: List = [] 25 | self.prune_graph: bool = False 26 | self.node_name_to_graph_name: dict = {} 27 | self.this_graph_name: str = None 28 | # It is optional that subclass updates fused_count since we will also check nodes_to_add to get counter. 29 | self.fused_count: int = 0 30 | 31 | def apply(self): 32 | logger.debug(f"start {self.description} fusion...") 33 | input_name_to_nodes = self.model.input_name_to_nodes() 34 | output_name_to_node = self.model.output_name_to_node() 35 | 36 | # This assumes that two search ops will not be fused at same time! 37 | for search_op_type in self.search_op_types: 38 | for node in self.model.get_nodes_by_op_type(search_op_type): 39 | graph = self.model.get_graph_by_node(node) 40 | if graph is None: 41 | raise Exception("Can not find node in any graphs") 42 | self.this_graph_name = graph.name 43 | self.fuse(node, input_name_to_nodes, output_name_to_node) 44 | 45 | op_list = [node.op_type for node in self.nodes_to_add] 46 | count = max(self.fused_count, op_list.count(self.fused_op_type)) 47 | if count > 0: 48 | logger.info(f"Fused {self.description} count: {count}") 49 | 50 | self.model.remove_nodes(self.nodes_to_remove) 51 | self.model.add_nodes(self.nodes_to_add, self.node_name_to_graph_name) 52 | 53 | if self.prune_graph: 54 | self.model.prune_graph() 55 | elif self.nodes_to_remove or self.nodes_to_add: 56 | self.model.update_graph() 57 | -------------------------------------------------------------------------------- /fusion_biasgelu.py: -------------------------------------------------------------------------------- 1 | #------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
4 | #-------------------------------------------------------------------------- 5 | 6 | from logging import getLogger 7 | from onnx import helper 8 | from onnx_model import OnnxModel 9 | from fusion_base import Fusion 10 | from fusion_utils import NumpyHelper 11 | 12 | logger = getLogger(__name__) 13 | 14 | 15 | class FusionBiasGelu(Fusion): 16 | def __init__(self, model: OnnxModel, is_fastgelu): 17 | if is_fastgelu: 18 | super().__init__(model, 'FastGelu', 'FastGelu', 'add bias') 19 | else: 20 | super().__init__(model, 'BiasGelu', 'Gelu') 21 | 22 | def fuse(self, node, input_name_to_nodes, output_name_to_node): 23 | gelu_op_type = node.op_type 24 | fuse_op_type = 'BiasGelu' if gelu_op_type == 'Gelu' else 'FastGelu' 25 | 26 | if len(node.input) != 1: 27 | return 28 | 29 | nodes = self.model.match_parent_path(node, ['Add', 'MatMul'], [0, None]) 30 | if nodes is None: 31 | return 32 | (add, matmul) = nodes 33 | 34 | bias_weight = None 35 | # bias should be one dimension 36 | bias_index = -1 37 | for i, input in enumerate(add.input): 38 | initializer = self.model.get_initializer(input) 39 | if initializer is None: 40 | continue 41 | bias_index = i 42 | bias_weight = NumpyHelper.to_array(initializer) 43 | break 44 | if bias_weight is None: 45 | return 46 | if len(bias_weight.shape) != 1: 47 | return 48 | 49 | subgraph_nodes = [node, add] 50 | if not self.model.is_safe_to_fuse_nodes(subgraph_nodes, [node.output[0]], input_name_to_nodes, 51 | output_name_to_node): 52 | return 53 | 54 | self.nodes_to_remove.extend(subgraph_nodes) 55 | 56 | fused_node = helper.make_node(fuse_op_type, 57 | inputs=[matmul.output[0], add.input[bias_index]], 58 | outputs=node.output, 59 | name=self.model.create_node_name(fuse_op_type, gelu_op_type + "_AddBias_")) 60 | fused_node.domain = "com.microsoft" 61 | self.nodes_to_add.append(fused_node) 62 | self.node_name_to_graph_name[fused_node.name] = self.this_graph_name 63 | -------------------------------------------------------------------------------- /fusion_fastgelu.py: -------------------------------------------------------------------------------- 1 | #------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | #-------------------------------------------------------------------------- 5 | from typing import Dict, Optional 6 | from logging import getLogger 7 | from onnx import helper 8 | from onnx_model import OnnxModel 9 | from fusion_base import Fusion 10 | 11 | logger = getLogger(__name__) 12 | 13 | 14 | class FusionFastGelu(Fusion): 15 | def __init__(self, model: OnnxModel): 16 | super().__init__(model, "FastGelu", "Tanh") 17 | 18 | def fuse(self, tanh_node, input_name_to_nodes: Dict, output_name_to_node: Dict): 19 | if self.fuse_1(tanh_node, input_name_to_nodes, output_name_to_node): 20 | return 21 | 22 | if self.fuse_2(tanh_node, input_name_to_nodes, output_name_to_node): 23 | return 24 | 25 | if self.fuse_3(tanh_node, input_name_to_nodes, output_name_to_node): 26 | return 27 | 28 | def fuse_1(self, tanh_node, input_name_to_nodes, output_name_to_node) -> Optional[bool]: 29 | """ 30 | Fuse Gelu with tanh into one node: 31 | +---------------------------+ 32 | | | 33 | | v 34 | [root] --> Pow --> Mul -----> Add --> Mul --> Tanh --> Add --> Mul 35 | | (Y=3) (B=0.0447...) (B=0.7978...) 
(B=1) ^ 36 | | | 37 | +------> Mul(B=0.5)--------------------------------------------+ 38 | Note that constant input for Add and Mul could be first or second input: like either A=0.5 or B=0.5 is fine. 39 | """ 40 | if tanh_node.output[0] not in input_name_to_nodes: 41 | return 42 | children = input_name_to_nodes[tanh_node.output[0]] 43 | if len(children) != 1 or children[0].op_type != 'Add': 44 | return 45 | add_after_tanh = children[0] 46 | 47 | if not self.model.has_constant_input(add_after_tanh, 1.0): 48 | return 49 | 50 | if add_after_tanh.output[0] not in input_name_to_nodes: 51 | return 52 | children = input_name_to_nodes[add_after_tanh.output[0]] 53 | if len(children) != 1 or children[0].op_type != 'Mul': 54 | return 55 | mul_after_tanh = children[0] 56 | 57 | mul_half = self.model.match_parent(mul_after_tanh, 'Mul', None, output_name_to_node) 58 | if mul_half is None: 59 | return 60 | 61 | i = self.model.find_constant_input(mul_half, 0.5) 62 | if i < 0: 63 | return 64 | 65 | root_input = mul_half.input[0 if i == 1 else 1] 66 | 67 | #root_node could be None when root_input is graph input 68 | root_node = self.model.get_parent(mul_half, 0 if i == 1 else 1, output_name_to_node) 69 | 70 | mul_before_tanh = self.model.match_parent(tanh_node, 'Mul', 0, output_name_to_node) 71 | if mul_before_tanh is None: 72 | return 73 | 74 | i = self.model.find_constant_input(mul_before_tanh, 0.7978, delta=0.0001) 75 | if i < 0: 76 | return 77 | 78 | add_before_tanh = self.model.match_parent(mul_before_tanh, 'Add', 0 if i == 1 else 1, output_name_to_node) 79 | if add_before_tanh is None: 80 | return 81 | 82 | mul_after_pow = self.model.match_parent(add_before_tanh, 83 | 'Mul', 84 | None, 85 | output_name_to_node, 86 | exclude=[root_node] if root_node else []) 87 | if mul_after_pow is None: 88 | return 89 | 90 | i = self.model.find_constant_input(mul_after_pow, 0.0447, delta=0.0001) 91 | if i < 0: 92 | return 93 | 94 | pow = self.model.match_parent(mul_after_pow, 'Pow', 0 if i == 1 else 1, output_name_to_node) 95 | if pow is None: 96 | return 97 | 98 | if not self.model.has_constant_input(pow, 3.0): 99 | return 100 | 101 | if pow.input[0] != root_input: 102 | return 103 | 104 | subgraph_nodes = [ 105 | mul_after_tanh, mul_half, add_after_tanh, tanh_node, mul_before_tanh, add_before_tanh, mul_after_pow, pow 106 | ] 107 | if not self.model.is_safe_to_fuse_nodes(subgraph_nodes, [mul_after_tanh.output[0]], input_name_to_nodes, 108 | output_name_to_node): 109 | return 110 | 111 | self.nodes_to_remove.extend(subgraph_nodes) 112 | fused_node = helper.make_node('FastGelu', 113 | inputs=[root_input], 114 | outputs=mul_after_tanh.output, 115 | name=self.model.create_node_name('FastGelu')) 116 | fused_node.domain = "com.microsoft" 117 | self.nodes_to_add.append(fused_node) 118 | self.node_name_to_graph_name[fused_node.name] = self.this_graph_name 119 | return True 120 | 121 | def fuse_2(self, tanh_node, input_name_to_nodes: Dict, output_name_to_node: Dict) -> Optional[bool]: 122 | """ 123 | This pattern is from Tensorflow model. 124 | Fuse Gelu with tanh into one node: 125 | +---------------------------+ 126 | | | 127 | | v 128 | [root] --> Pow --> Mul -----> Add --> Mul --> Tanh --> Add --> Mul(B=0.5)-->Mul--> 129 | | (Y=3) (B=0.0447...) (B=0.7978...) (B=1) ^ 130 | | | 131 | +---------------------------------------------------------------------------+ 132 | Note that constant input for Add and Mul could be first or second input: like either A=0.5 or B=0.5 is fine. 
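        For reference, this subgraph computes the tanh-based Gelu approximation: Gelu(x) ≈ 0.5 * x * (1 + tanh(0.7978845608 * (x + 0.044715 * x^3))), with 0.7978845608 ≈ sqrt(2/pi); the fusion matches these constants approximately (delta=0.0001).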
133 | """ 134 | if tanh_node.output[0] not in input_name_to_nodes: 135 | return 136 | children = input_name_to_nodes[tanh_node.output[0]] 137 | if len(children) != 1 or children[0].op_type != 'Add': 138 | return 139 | add_after_tanh = children[0] 140 | 141 | if not self.model.has_constant_input(add_after_tanh, 1.0): 142 | return 143 | 144 | if add_after_tanh.output[0] not in input_name_to_nodes: 145 | return 146 | children = input_name_to_nodes[add_after_tanh.output[0]] 147 | if len(children) != 1 or children[0].op_type != 'Mul': 148 | return 149 | mul_half = children[0] 150 | 151 | i = self.model.find_constant_input(mul_half, 0.5) 152 | if i < 0: 153 | return 154 | 155 | if mul_half.output[0] not in input_name_to_nodes: 156 | return 157 | children = input_name_to_nodes[mul_half.output[0]] 158 | if len(children) != 1 or children[0].op_type != 'Mul': 159 | return 160 | mul_after_mul_half = children[0] 161 | 162 | root_node = self.model.get_parent(mul_after_mul_half, 163 | 0 if mul_after_mul_half.input[1] == mul_half.output[0] else 1, 164 | output_name_to_node) 165 | if root_node is None: 166 | return 167 | 168 | mul_before_tanh = self.model.match_parent(tanh_node, 'Mul', 0, output_name_to_node) 169 | if mul_before_tanh is None: 170 | return 171 | 172 | i = self.model.find_constant_input(mul_before_tanh, 0.7978, delta=0.0001) 173 | if i < 0: 174 | return 175 | 176 | add_before_tanh = self.model.match_parent(mul_before_tanh, 'Add', 0 if i == 1 else 1, output_name_to_node) 177 | if add_before_tanh is None: 178 | return 179 | 180 | mul_after_pow = self.model.match_parent(add_before_tanh, 'Mul', None, output_name_to_node, exclude=[root_node]) 181 | if mul_after_pow is None: 182 | return 183 | 184 | i = self.model.find_constant_input(mul_after_pow, 0.0447, delta=0.0001) 185 | if i < 0: 186 | return 187 | 188 | pow = self.model.match_parent(mul_after_pow, 'Pow', 0 if i == 1 else 1, output_name_to_node) 189 | if pow is None: 190 | return 191 | 192 | if not self.model.has_constant_input(pow, 3.0): 193 | return 194 | 195 | if pow.input[0] != root_node.output[0]: 196 | return 197 | 198 | subgraph_nodes = [ 199 | mul_after_mul_half, mul_half, add_after_tanh, tanh_node, mul_before_tanh, add_before_tanh, mul_after_pow, 200 | pow 201 | ] 202 | if not self.model.is_safe_to_fuse_nodes(subgraph_nodes, [mul_after_mul_half.output[0]], input_name_to_nodes, 203 | output_name_to_node): 204 | return 205 | 206 | self.nodes_to_remove.extend(subgraph_nodes) 207 | fused_node = helper.make_node('FastGelu', 208 | inputs=[root_node.output[0]], 209 | outputs=mul_after_mul_half.output, 210 | name=self.model.create_node_name('FastGelu')) 211 | fused_node.domain = "com.microsoft" 212 | self.nodes_to_add.append(fused_node) 213 | self.node_name_to_graph_name[fused_node.name] = self.this_graph_name 214 | return True 215 | 216 | def fuse_3(self, tanh_node, input_name_to_nodes: Dict, output_name_to_node: Dict) -> Optional[bool]: 217 | """ 218 | OpenAI's gelu implementation, also used in Megatron: 219 | Gelu(x) = x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1.0 + 0.044715 * x * x))) 220 | 221 | Fuse subgraph into a FastGelu node: 222 | +------------ Mul (B=0.79788456) -------------------+ 223 | | | 224 | +-------------------------------+ | 225 | | | | 226 | | v v 227 | [root] --> Mul (B=0.044715) --> Mul --> Add(B=1) --> Mul --> Tanh --> Add(B=1) --> Mul--> 228 | | ^ 229 | | | 230 | +-----------> Mul (B=0.5) --------------------------------------------------------+ 231 | """ 232 | if tanh_node.output[0] not in 
input_name_to_nodes: 233 | return 234 | 235 | children = input_name_to_nodes[tanh_node.output[0]] 236 | if len(children) != 1 or children[0].op_type != 'Add': 237 | return 238 | add_after_tanh = children[0] 239 | 240 | if not self.model.has_constant_input(add_after_tanh, 1.0): 241 | return 242 | 243 | if add_after_tanh.output[0] not in input_name_to_nodes: 244 | return 245 | children = input_name_to_nodes[add_after_tanh.output[0]] 246 | if len(children) != 1 or children[0].op_type != 'Mul': 247 | return 248 | mul_last = children[0] 249 | 250 | mul_half = self.model.match_parent(mul_last, 'Mul', None, output_name_to_node) 251 | if mul_half is None: 252 | return 253 | 254 | i = self.model.find_constant_input(mul_half, 0.5) 255 | if i < 0: 256 | return 257 | 258 | root_input = mul_half.input[0 if i == 1 else 1] 259 | 260 | mul_before_tanh = self.model.match_parent(tanh_node, 'Mul', 0, output_name_to_node) 261 | if mul_before_tanh is None: 262 | return 263 | 264 | add_1 = self.model.match_parent(mul_before_tanh, 'Add', None, output_name_to_node) 265 | if add_1 is None: 266 | return 267 | j = self.model.find_constant_input(add_1, 1.0) 268 | if j < 0: 269 | return 270 | 271 | mul_7978 = self.model.match_parent(mul_before_tanh, 'Mul', None, output_name_to_node) 272 | if mul_7978 is None: 273 | return 274 | k = self.model.find_constant_input(mul_7978, 0.7978, delta=0.0001) 275 | if k < 0: 276 | return 277 | if mul_7978.input[0 if k == 1 else 1] != root_input: 278 | return 279 | 280 | mul_before_add_1 = self.model.match_parent(add_1, 'Mul', 0 if j == 1 else 1, output_name_to_node) 281 | if mul_before_add_1 is None: 282 | return 283 | 284 | if mul_before_add_1.input[0] == root_input: 285 | another = 1 286 | elif mul_before_add_1.input[1] == root_input: 287 | another = 0 288 | else: 289 | return 290 | 291 | mul_0447 = self.model.match_parent(mul_before_add_1, 'Mul', another, output_name_to_node) 292 | if mul_0447 is None: 293 | return 294 | m = self.model.find_constant_input(mul_0447, 0.0447, delta=0.0001) 295 | if m < 0: 296 | return 297 | 298 | if mul_0447.input[0 if m == 1 else 1] != root_input: 299 | return 300 | 301 | subgraph_nodes = [ 302 | mul_0447, mul_before_add_1, add_1, mul_before_tanh, tanh_node, add_after_tanh, mul_7978, mul_half, mul_last 303 | ] 304 | if not self.model.is_safe_to_fuse_nodes(subgraph_nodes, [mul_last.output[0]], input_name_to_nodes, 305 | output_name_to_node): 306 | return 307 | 308 | self.nodes_to_remove.extend(subgraph_nodes) 309 | fused_node = helper.make_node('FastGelu', 310 | inputs=[root_input], 311 | outputs=mul_last.output, 312 | name=self.model.create_node_name('FastGelu')) 313 | fused_node.domain = "com.microsoft" 314 | self.nodes_to_add.append(fused_node) 315 | self.node_name_to_graph_name[fused_node.name] = self.this_graph_name 316 | return True 317 | -------------------------------------------------------------------------------- /fusion_gelu.py: -------------------------------------------------------------------------------- 1 | #------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
4 | #-------------------------------------------------------------------------- 5 | from typing import Dict, Optional 6 | from logging import getLogger 7 | from onnx import helper 8 | from onnx_model import OnnxModel 9 | from fusion_base import Fusion 10 | 11 | logger = getLogger(__name__) 12 | 13 | 14 | class FusionGelu(Fusion): 15 | def __init__(self, model: OnnxModel): 16 | super().__init__(model, "Gelu", "Erf") 17 | 18 | def fuse(self, erf_node, input_name_to_nodes: Dict, output_name_to_node: Dict): 19 | if self.fuse_1(erf_node, input_name_to_nodes, output_name_to_node): 20 | return 21 | if self.fuse_2(erf_node, input_name_to_nodes, output_name_to_node): 22 | return 23 | self.fuse_3(erf_node, input_name_to_nodes, output_name_to_node) 24 | 25 | def fuse_1(self, erf_node, input_name_to_nodes: Dict, output_name_to_node: Dict) -> Optional[bool]: 26 | """ 27 | This pattern is from PyTorch model 28 | Fuse Gelu with Erf into one node: 29 | Pattern 1: 30 | +-------Mul(0.5)---------------------+ 31 | | | 32 | | v 33 | [root] --> Div -----> Erf --> Add --> Mul --> 34 | (B=1.4142...) (1) 35 | 36 | Pattern 2: 37 | +------------------------------------+ 38 | | | 39 | | v 40 | [root] --> Div -----> Erf --> Add --> Mul -->Mul --> 41 | (B=1.4142...) (1) (0.5) 42 | 43 | Note that constant input for Add and Mul could be first or second input: like either A=0.5 or B=0.5 is fine. 44 | """ 45 | if erf_node.output[0] not in input_name_to_nodes: 46 | return 47 | children = input_name_to_nodes[erf_node.output[0]] 48 | if len(children) != 1 or children[0].op_type != 'Add': 49 | return 50 | add_after_erf = children[0] 51 | 52 | if not self.model.has_constant_input(add_after_erf, 1): 53 | return 54 | 55 | if add_after_erf.output[0] not in input_name_to_nodes: 56 | return 57 | children = input_name_to_nodes[add_after_erf.output[0]] 58 | if len(children) != 1 or children[0].op_type != 'Mul': 59 | return 60 | mul_after_erf = children[0] 61 | 62 | div = self.model.match_parent(erf_node, 'Div', 0, output_name_to_node) 63 | if div is None: 64 | return 65 | 66 | if self.model.find_constant_input(div, 1.4142, delta=0.001) != 1: 67 | return 68 | 69 | subgraph_input = div.input[0] 70 | 71 | another = 1 if mul_after_erf.input[0] == add_after_erf.output[0] else 0 72 | if subgraph_input == mul_after_erf.input[another]: # pattern 2 73 | children = input_name_to_nodes[mul_after_erf.output[0]] 74 | if len(children) != 1 or children[0].op_type != 'Mul': 75 | return 76 | mul_half = children[0] 77 | if not self.model.has_constant_input(mul_half, 0.5): 78 | return 79 | subgraph_output = mul_half.output[0] 80 | else: # pattern 1 81 | mul_half = self.model.match_parent(mul_after_erf, 'Mul', another, output_name_to_node) 82 | if mul_half is None: 83 | return 84 | 85 | if not self.model.has_constant_input(mul_half, 0.5): 86 | return 87 | 88 | if subgraph_input not in mul_half.input: 89 | return 90 | 91 | subgraph_output = mul_after_erf.output[0] 92 | 93 | subgraph_nodes = [div, erf_node, add_after_erf, mul_after_erf, mul_half] 94 | if not self.model.is_safe_to_fuse_nodes(subgraph_nodes, [subgraph_output], input_name_to_nodes, 95 | output_name_to_node): 96 | return 97 | 98 | self.nodes_to_remove.extend(subgraph_nodes) 99 | fused_node = helper.make_node('Gelu', inputs=[subgraph_input], outputs=[subgraph_output]) 100 | fused_node.domain = "com.microsoft" 101 | self.nodes_to_add.append(fused_node) 102 | self.node_name_to_graph_name[fused_node.name] = self.this_graph_name 103 | return True 104 | 105 | def fuse_2(self, erf_node, 
input_name_to_nodes: Dict, output_name_to_node: Dict) -> Optional[bool]: 106 | """ 107 | This pattern is from Keras model 108 | Fuse Gelu with Erf into one node: 109 | +------------------------------------------+ 110 | | | 111 | | v 112 | [root] --> Div -----> Erf --> Add --> Mul -->Mul 113 | (B=1.4142...) (A=1) (A=0.5) 114 | 115 | Note that constant input for Add and Mul could be first or second input: like either A=0.5 or B=0.5 is fine. 116 | """ 117 | if erf_node.output[0] not in input_name_to_nodes: 118 | return 119 | children = input_name_to_nodes[erf_node.output[0]] 120 | if len(children) != 1 or children[0].op_type != 'Add': 121 | return 122 | add_after_erf = children[0] 123 | 124 | if not self.model.has_constant_input(add_after_erf, 1): 125 | return 126 | 127 | if add_after_erf.output[0] not in input_name_to_nodes: 128 | return 129 | children = input_name_to_nodes[add_after_erf.output[0]] 130 | if len(children) != 1 or children[0].op_type != 'Mul': 131 | return 132 | mul_after_erf = children[0] 133 | 134 | if not self.model.has_constant_input(mul_after_erf, 0.5): 135 | return 136 | 137 | if mul_after_erf.output[0] not in input_name_to_nodes: 138 | return 139 | children = input_name_to_nodes[mul_after_erf.output[0]] 140 | if len(children) != 1 or children[0].op_type != 'Mul': 141 | return 142 | mul = children[0] 143 | 144 | div = self.model.match_parent(erf_node, 'Div', 0, output_name_to_node) 145 | if div is None: 146 | return 147 | 148 | sqrt_node = None 149 | if self.model.find_constant_input(div, 1.4142, delta=0.001) != 1: 150 | sqrt_node = self.model.match_parent(div, 'Sqrt', 1, output_name_to_node) 151 | if sqrt_node is None: 152 | return 153 | if not self.model.has_constant_input(sqrt_node, 2.0): 154 | return 155 | 156 | root_node = self.model.get_parent(div, 0, output_name_to_node) 157 | if root_node is None: 158 | return 159 | 160 | if root_node.output[0] not in mul.input: 161 | return 162 | 163 | subgraph_nodes = [div, erf_node, add_after_erf, mul_after_erf, mul] 164 | if sqrt_node: 165 | subgraph_nodes.append(sqrt_node) 166 | 167 | if not self.model.is_safe_to_fuse_nodes(subgraph_nodes, [mul.output[0]], input_name_to_nodes, 168 | output_name_to_node): 169 | return 170 | 171 | self.nodes_to_remove.extend(subgraph_nodes) 172 | fused_node = helper.make_node('Gelu', inputs=[root_node.output[0]], outputs=[mul.output[0]]) 173 | fused_node.domain = "com.microsoft" 174 | self.nodes_to_add.append(fused_node) 175 | self.node_name_to_graph_name[fused_node.name] = self.this_graph_name 176 | return True 177 | 178 | def fuse_3(self, erf_node, input_name_to_nodes: Dict, output_name_to_node: Dict) -> Optional[bool]: 179 | """ 180 | This pattern is from TensorFlow model 181 | Fuse Gelu with Erf into one node: 182 | +----------------------------------------------+ 183 | | | 184 | | v 185 | [root] --> Mul -----> Erf --> Add --> Mul -->Mul 186 | (A=0.7071067690849304) (B=1) (B=0.5) 187 | 188 | Note that constant input for Add and Mul could be first or second input: like either A=0.5 or B=0.5 is fine. 
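        For reference, 0.7071067690849304 ≈ 1/sqrt(2), so this subgraph computes the exact (erf-based) Gelu: Gelu(x) = 0.5 * x * (1 + erf(x / sqrt(2))).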
189 | """ 190 | 191 | if erf_node.output[0] not in input_name_to_nodes: 192 | return 193 | children = input_name_to_nodes[erf_node.output[0]] 194 | if len(children) != 1 or children[0].op_type != 'Add': 195 | return 196 | add_after_erf = children[0] 197 | 198 | if not self.model.has_constant_input(add_after_erf, 1): 199 | return 200 | 201 | if add_after_erf.output[0] not in input_name_to_nodes: 202 | return 203 | children = input_name_to_nodes[add_after_erf.output[0]] 204 | if len(children) != 1 or children[0].op_type != 'Mul': 205 | return 206 | mul_half = children[0] 207 | 208 | if not self.model.has_constant_input(mul_half, 0.5): 209 | return 210 | 211 | first_mul = self.model.match_parent(erf_node, 'Mul', 0, output_name_to_node) 212 | if first_mul is None: 213 | return 214 | 215 | i = self.model.find_constant_input(first_mul, 0.7071067690849304, delta=0.001) 216 | if i < 0: 217 | return 218 | 219 | root_node = self.model.get_parent(first_mul, 0 if i == 1 else 1, output_name_to_node) 220 | if root_node is None: 221 | return 222 | 223 | if mul_half.output[0] not in input_name_to_nodes: 224 | return 225 | children = input_name_to_nodes[mul_half.output[0]] 226 | if len(children) != 1 or children[0].op_type != 'Mul': 227 | return 228 | last_mul = children[0] 229 | 230 | if not (last_mul.input[0] == root_node.output[0] or last_mul.input[1] == root_node.output[0]): 231 | return 232 | 233 | subgraph_nodes = [first_mul, erf_node, add_after_erf, mul_half, last_mul] 234 | if not self.model.is_safe_to_fuse_nodes(subgraph_nodes, [last_mul.output[0]], input_name_to_nodes, 235 | output_name_to_node): 236 | return 237 | 238 | self.nodes_to_remove.extend(subgraph_nodes) 239 | fused_node = helper.make_node('Gelu', inputs=[root_node.output[0]], outputs=[last_mul.output[0]]) 240 | fused_node.domain = "com.microsoft" 241 | self.nodes_to_add.append(fused_node) 242 | self.node_name_to_graph_name[fused_node.name] = self.this_graph_name 243 | return True 244 | -------------------------------------------------------------------------------- /fusion_gelu_approximation.py: -------------------------------------------------------------------------------- 1 | #------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | #-------------------------------------------------------------------------- 5 | 6 | from logging import getLogger 7 | from onnx import helper 8 | from onnx_model import OnnxModel 9 | from fusion_base import Fusion 10 | 11 | 12 | class FusionGeluApproximation(Fusion): 13 | def __init__(self, model: OnnxModel): 14 | super().__init__(model, 'FastGelu', ['Gelu', 'BiasGelu'], 'GeluApproximation') 15 | 16 | def fuse(self, node, input_name_to_nodes, output_name_to_node): 17 | new_node = helper.make_node("FastGelu", 18 | inputs=node.input, 19 | outputs=node.output, 20 | name=self.model.create_node_name("FastGelu", node.op_type + "_Approximation")) 21 | new_node.domain = "com.microsoft" 22 | self.nodes_to_remove.append(node) 23 | self.nodes_to_add.append(new_node) 24 | self.node_name_to_graph_name[new_node.name] = self.this_graph_name 25 | -------------------------------------------------------------------------------- /fusion_gpt_attention_megatron.py: -------------------------------------------------------------------------------- 1 | #------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 
3 | # Licensed under the MIT License. 4 | #-------------------------------------------------------------------------- 5 | import numpy as np 6 | from logging import getLogger 7 | from onnx import helper, numpy_helper, TensorProto 8 | from onnx_model import OnnxModel 9 | from fusion_base import Fusion 10 | from fusion_utils import FusionUtils 11 | from fusion_gpt_attention import FusionGptAttentionPastBase 12 | 13 | logger = getLogger(__name__) 14 | 15 | 16 | def is_close(value, expected_value): 17 | return abs(value - expected_value) <= 1e-6 18 | 19 | 20 | class FusionGptAttentionMegatron(FusionGptAttentionPastBase): 21 | """ 22 | Fuse GPT-2 Attention with past state subgraph from Megatron into one Attention node. 23 | """ 24 | def __init__(self, model: OnnxModel, num_heads: int): 25 | super().__init__(model, num_heads) 26 | 27 | def fuse_attention_node(self, matmul_before_split, add_before_split, past, present, input, reshape_qkv, mask): 28 | attention_node_name = self.model.create_node_name('GptAttention') 29 | int32_mask = self.cast_attention_mask(mask) 30 | output = reshape_qkv.output[0] 31 | i = 1 if (add_before_split.input[0] == matmul_before_split.output[0]) else 0 32 | attention_node = helper.make_node( 33 | 'Attention', 34 | inputs=[input, matmul_before_split.input[1], add_before_split.input[i], int32_mask, past], 35 | outputs=[output, present], 36 | name=attention_node_name) 37 | attention_node.domain = "com.microsoft" 38 | attention_node.attribute.extend([ 39 | helper.make_attribute("num_heads", self.num_heads), 40 | helper.make_attribute("unidirectional", 0) # unidirectional shall not be ON for 4D attention mask 41 | ]) 42 | 43 | nodes_to_add = [attention_node] 44 | self.nodes_to_add.extend(nodes_to_add) 45 | 46 | for node in nodes_to_add: 47 | self.node_name_to_graph_name[node.name] = self.this_graph_name 48 | 49 | self.nodes_to_remove.append(reshape_qkv) 50 | 51 | # we rely on prune_graph() to clean old subgraph nodes 52 | self.prune_graph = True 53 | 54 | def match_mask(self, sub_qk, mul_qk, matmul_qk, layernorm_before_attention): 55 | mask_nodes = self.model.match_parent_path( 56 | sub_qk, 57 | ['Mul', 'Sub', 'Slice', 'Slice'], 58 | [1, 0, 1, 0]) # yapf: disable 59 | if mask_nodes is None: 60 | logger.debug("fuse_attention: failed to match unidirectional mask path") 61 | return None 62 | (mul_mask, sub_mask, last_slice_mask, slice_mask) = mask_nodes 63 | 64 | if mul_qk.input[1] != last_slice_mask.output[0]: 65 | logger.debug("fuse_attention failed: mul_qk.input[1] != last_slice_mask.output[0]") 66 | return None 67 | 68 | if not self.utils.check_node_input_value(mul_mask, 1, 10000.0): 69 | logger.debug("fuse_attention failed: mul_mask input 1 is not constant 10000.0") 70 | return None 71 | 72 | if not self.utils.check_node_input_value(sub_mask, 0, 1.0): 73 | logger.debug("fuse_attention failed: sub_mask input 0 is not constant 1.0") 74 | return None 75 | 76 | if not self.model.find_graph_input(slice_mask.input[0]): 77 | logger.info("expect slick_mask input 0 to be graph input") 78 | return None 79 | 80 | if not self.utils.check_node_input_value(last_slice_mask, 1, [0]): 81 | logger.debug("fuse_attention failed: last_slice_mask input 1 (starts) is not constant [0]") 82 | return None 83 | 84 | if not self.utils.check_node_input_value(last_slice_mask, 3, [3]): 85 | logger.debug("fuse_attention failed: last_slice_mask input 3 (axes) is not constant [3]") 86 | return False 87 | 88 | if not self.utils.check_node_input_value(last_slice_mask, 4, [1]): 89 | 
logger.debug("fuse_attention failed: last_slice_mask input 4 (steps) is not constant [1]") 90 | return False 91 | 92 | if not self.utils.check_node_input_value(slice_mask, 3, [2]): 93 | logger.debug("fuse_attention failed: slice_mask input 3 (axes) is not constant [2]") 94 | return None 95 | 96 | if not self.utils.check_node_input_value(slice_mask, 4, [1]): 97 | logger.debug("fuse_attention failed: slice_mask input 4 (steps) is not constant [1]") 98 | return None 99 | 100 | last_slice_path = self.model.match_parent_path(last_slice_mask, ['Unsqueeze', 'Gather', 'Shape', 'MatMul'], 101 | [2, 0, 0, 0]) 102 | if last_slice_path is None or last_slice_path[-1] != matmul_qk: 103 | logger.debug("fuse_attention: failed to match last slice path") 104 | return None 105 | 106 | first_slice_path = self.model.match_parent_path(slice_mask, ['Unsqueeze', 'Gather', 'Shape', 'MatMul'], 107 | [2, 0, 0, 0]) 108 | if first_slice_path is None or first_slice_path[-1] != matmul_qk: 109 | logger.debug("fuse_attention: failed to match first slice path") 110 | return None 111 | 112 | first_slice_sub = self.model.match_parent_path(slice_mask, ['Unsqueeze', 'Sub', 'Gather', 'Shape', 'MatMul'], 113 | [1, 0, 0, 0, 0]) 114 | if first_slice_sub is None or first_slice_sub[-1] != matmul_qk: 115 | logger.debug("fuse_attention: failed to match last slice sub path") 116 | return None 117 | 118 | first_slice_sub_1 = self.model.match_parent_path(slice_mask, 119 | ['Unsqueeze', 'Sub', 'Gather', 'Shape', 'LayerNormalization'], 120 | [1, 0, 1, 0, 0]) 121 | if first_slice_sub_1 is None or first_slice_sub_1[-1] != layernorm_before_attention: 122 | logger.debug("fuse_attention: failed to match last slice sub path 1") 123 | return None 124 | 125 | return slice_mask.input[0] 126 | 127 | def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): 128 | past = None 129 | present = None 130 | 131 | qkv_nodes = self.model.match_parent_path( 132 | normalize_node, 133 | ['Add', 'Add', 'MatMul', 'Reshape', 'Transpose', 'MatMul'], 134 | [ 0, 1, None, 0, 0, 0], 135 | output_name_to_node=output_name_to_node, 136 | ) # yapf: disable 137 | if qkv_nodes is None: 138 | return 139 | (add_skip, add_after_attention, matmul_after_attention, reshape_qkv, transpose_qkv, matmul_qkv) = qkv_nodes 140 | 141 | skip_input = add_skip.input[0] 142 | 143 | v_nodes = self.model.match_parent_path( 144 | matmul_qkv, 145 | ['Concat', 'Transpose', 'Reshape', 'Split', 'Add', 'MatMul', 'LayerNormalization'], 146 | [1, 1, 0, 0, 0, None, 0]) # yapf: disable 147 | if v_nodes is None: 148 | logger.debug("fuse_attention: failed to match v path") 149 | return 150 | (concat_v, transpose_v, reshape_v, split_v, add_before_split, matmul_before_split, 151 | layernorm_before_attention) = v_nodes 152 | if skip_input != layernorm_before_attention.input[0]: 153 | logger.debug("fuse_attention: skip_input != layernorm_before_attention.input[0]") 154 | return 155 | 156 | qk_nodes = self.model.match_parent_path(matmul_qkv, ['Softmax', 'Sub', 'Mul', 'MatMul'], [0, 0, 0, 0]) 157 | if qk_nodes is None: 158 | logger.debug("fuse_attention: failed to match qk path") 159 | return None 160 | (softmax_qk, sub_qk, mul_qk, matmul_qk) = qk_nodes 161 | if self.model.get_node_attribute(softmax_qk, "axis") != 3: 162 | logger.debug("fuse_attention failed: softmax_qk axis != 3") 163 | return None 164 | 165 | attention_mask = self.match_mask(sub_qk, mul_qk, matmul_qk, layernorm_before_attention) 166 | 167 | q_nodes = self.model.match_parent_path(matmul_qk, ['Div', 'Transpose', 'Reshape', 
'Split'], [0, 0, 0, 0]) 168 | if q_nodes is None: 169 | logger.debug("fuse_attention: failed to match q path") 170 | return 171 | (div_q, transpose_q, reshape_q, split_q) = q_nodes 172 | if split_v != split_q: 173 | logger.debug("fuse_attention: skip since split_v != split_q") 174 | return 175 | 176 | k_nodes = self.model.match_parent_path(matmul_qk, 177 | ['Div', 'Transpose', 'Concat', 'Transpose', 'Reshape', 'Split'], 178 | [1, 0, 0, 1, 0, 0]) 179 | if k_nodes is None: 180 | logger.debug("fuse_attention: failed to match k path") 181 | return 182 | (div_k, _, concat_k, transpose_k, reshape_k, split_k) = k_nodes 183 | if split_v != split_k: 184 | logger.debug("fuse_attention: skip since split_v != split_k") 185 | return 186 | 187 | i, value = self.model.get_constant_input(reshape_k) 188 | if not (isinstance(value, np.ndarray) and list(value.shape) == [4] and value[0] == 0 and value[1] == 0 189 | and value[2] > 0 and value[3] > 0): 190 | logger.debug("fuse_attention: reshape constant input is not [0, 0, N, H]") 191 | return 192 | 193 | num_heads = value[2] 194 | if num_heads != self.num_heads: 195 | logger.info(f"Detected num_heads={num_heads}. Ignore user specified value {self.num_heads}") 196 | self.num_heads = num_heads 197 | 198 | hidden_size_per_head = value[3] 199 | i, value = self.model.get_constant_input(div_k) 200 | expected_value = float(np.sqrt(np.sqrt(hidden_size_per_head))) 201 | if not is_close(value, expected_value): 202 | logger.debug(f"fuse_attention: div_k value={value} expected={expected_value}") 203 | return 204 | 205 | i, value = self.model.get_constant_input(div_q) 206 | if not is_close(value, expected_value): 207 | logger.debug(f"fuse_attention: div_q value={value} expected={expected_value}") 208 | return 209 | 210 | # Match past and present paths 211 | past = self.match_past_pattern_2(concat_k, concat_v, output_name_to_node) 212 | if past is None: 213 | logger.debug("fuse_attention: match past failed") 214 | return 215 | if not self.model.find_graph_input(past): 216 | logger.debug("fuse_attention: past is not graph input.") 217 | # For GPT2LMHeadModel_BeamSearchStep, there is an extra Gather node to select beam index so it is not graph input. 218 | 219 | present = self.match_present(concat_v, input_name_to_nodes) 220 | if present is None: 221 | logger.debug("fuse_attention: match present failed") 222 | return 223 | if not self.model.find_graph_output(present): 224 | logger.info("fuse_attention: expect present to be graph output") 225 | return 226 | 227 | self.fuse_attention_node(matmul_before_split, add_before_split, past, present, 228 | layernorm_before_attention.output[0], reshape_qkv, attention_mask) 229 | -------------------------------------------------------------------------------- /fusion_gpt_attention_no_past.py: -------------------------------------------------------------------------------- 1 | #------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | #-------------------------------------------------------------------------- 5 | import numpy as np 6 | from logging import getLogger 7 | from onnx import helper, numpy_helper, TensorProto 8 | from onnx_model import OnnxModel 9 | from fusion_base import Fusion 10 | from fusion_utils import FusionUtils 11 | 12 | logger = getLogger(__name__) 13 | 14 | 15 | class FusionGptAttentionNoPast(Fusion): 16 | """ 17 | Fuse GPT-2 Attention without past state into one Attention node. 
18 | This does not support attention_mask graph input right now. 19 | """ 20 | def __init__(self, model: OnnxModel, num_heads: int): 21 | super().__init__(model, "Attention", "LayerNormalization", "without past") 22 | # TODO: detect num_heads from graph like FusionAttention 23 | self.num_heads = num_heads 24 | 25 | def create_attention_node(self, gemm, gemm_qkv, input, output): 26 | attention_node_name = self.model.create_node_name('Attention') 27 | attention_node = helper.make_node('Attention', 28 | inputs=[input, gemm.input[1], gemm.input[2]], 29 | outputs=[attention_node_name + "_output"], 30 | name=attention_node_name) 31 | attention_node.domain = "com.microsoft" 32 | attention_node.attribute.extend( 33 | [helper.make_attribute("num_heads", self.num_heads), 34 | helper.make_attribute("unidirectional", 1)]) 35 | 36 | matmul_node = helper.make_node('MatMul', 37 | inputs=[attention_node_name + "_output", gemm_qkv.input[1]], 38 | outputs=[attention_node_name + "_matmul_output"], 39 | name=attention_node_name + "_matmul") 40 | 41 | add_node = helper.make_node('Add', 42 | inputs=[attention_node_name + "_matmul_output", gemm_qkv.input[2]], 43 | outputs=[output], 44 | name=attention_node_name + "_add") 45 | 46 | self.nodes_to_add.extend([attention_node, matmul_node, add_node]) 47 | self.node_name_to_graph_name[attention_node.name] = self.this_graph_name 48 | self.node_name_to_graph_name[matmul_node.name] = self.this_graph_name 49 | self.node_name_to_graph_name[add_node.name] = self.this_graph_name 50 | 51 | def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): 52 | return_indice = [] 53 | qkv_nodes = self.model.match_parent_path( 54 | normalize_node, 55 | ['Add', 'Reshape', 'Gemm', 'Reshape', 'Reshape', 'Transpose', 'MatMul'], 56 | [0, None, 0, 0, 0, 0, 0], 57 | output_name_to_node=output_name_to_node, 58 | return_indice=return_indice 59 | ) # yapf: disable 60 | if qkv_nodes is None: 61 | return 62 | (add_qkv, reshape_qkv, gemm_qkv, reshape_1, reshape_2, transpose_qkv, matmul_qkv) = qkv_nodes 63 | 64 | another_input = add_qkv.input[1 - return_indice[0]] 65 | 66 | v_nodes = self.model.match_parent_path( 67 | matmul_qkv, 68 | ['Transpose', 'Reshape', 'Split', 'Reshape', 'Gemm', 'Reshape'], 69 | [1, 0, 0, 0, 0, 0]) # yapf: disable 70 | if v_nodes is None: 71 | logger.debug("fuse_attention: failed to match v path") 72 | return 73 | (transpose_v, reshape_v, split_v, reshape_after_gemm, gemm, reshape_before_gemm) = v_nodes 74 | 75 | layernorm_before_attention = self.model.get_parent(reshape_before_gemm, 0, output_name_to_node) 76 | if layernorm_before_attention is None or layernorm_before_attention.op_type != 'LayerNormalization': 77 | if layernorm_before_attention.op_type != 'Add': 78 | logger.debug(f"failed to get layernorm before gemm. 
Got {layernorm_before_attention.op_type}") 79 | return 80 | 81 | if not another_input in layernorm_before_attention.input: 82 | # match openai-gpt 83 | if not another_input in layernorm_before_attention.output: 84 | logger.debug("Add and LayerNormalization shall have one same input") 85 | return 86 | 87 | qk_nodes = self.model.match_parent_path(matmul_qkv, ['Softmax', 'Sub', 'Mul', 'Div', 'MatMul'], [0, 0, 0, 0, 0]) 88 | if qk_nodes is not None: 89 | (softmax_qk, sub_qk, mul_qk, div_qk, matmul_qk) = qk_nodes 90 | mask_nodes = self.model.match_parent_path( 91 | sub_qk, 92 | ['Mul', 'Sub', 'Slice', 'Slice', 'Unsqueeze', 'Sub', 'Squeeze', 'Slice', 'Shape', 'Div'], 93 | [1, 0, 1, 0, 1, 0, 0, 0, 0, 0]) # yapf: disable 94 | if mask_nodes is None: 95 | logger.debug("fuse_attention: failed to match mask path") 96 | return 97 | div_mask = mask_nodes[-1] 98 | 99 | if div_qk != div_mask: 100 | logger.debug("fuse_attention: skip since div_qk != div_mask") 101 | return 102 | else: 103 | # New pattern for gpt2 from PyTorch 1.5.0 and Transformers 2.9.0. 104 | qk_nodes = self.model.match_parent_path(matmul_qkv, ['Softmax', 'Where', 'Div', 'MatMul'], [0, 0, 1, 0]) 105 | if qk_nodes is not None: 106 | (softmax_qk, where_qk, div_qk, matmul_qk) = qk_nodes 107 | mask_nodes = self.model.match_parent_path( 108 | where_qk, 109 | ['Cast', 'Slice', 'Slice', 'Unsqueeze', 'Sub', 'Squeeze', 'Slice', 'Shape', 'Div'], 110 | [ 0, 0, 0, 1, 0, 0, 0, 0, 0]) # yapf: disable 111 | if mask_nodes is None: 112 | logger.debug("fuse_attention: failed to match mask path") 113 | return 114 | div_mask = mask_nodes[-1] 115 | 116 | if div_qk != div_mask: 117 | logger.debug("fuse_attention: skip since div_qk != div_mask") 118 | return 119 | else: 120 | # match openai-gpt 121 | qk_nodes = self.model.match_parent_path(matmul_qkv, ['Softmax', 'Add', 'Mul', 'Div', 'MatMul'], 122 | [0, 0, 0, 0, 0]) 123 | if qk_nodes is None: 124 | logger.debug("fuse_attention: failed to match qk path") 125 | return 126 | (softmax_qk, add_qk, mul_qk, div_qk, matmul_qk) = qk_nodes 127 | mask_nodes = self.model.match_parent_path( 128 | mul_qk, 129 | ['Slice', 'Slice', 'Unsqueeze', 'Squeeze', 'Slice', 'Shape', 'Div'], 130 | [ 1, 0, 2, 0, 0, 0, 0]) # yapf: disable 131 | if mask_nodes is None: 132 | logger.debug("fuse_attention: failed to match mask path") 133 | return 134 | div_mask = mask_nodes[-1] 135 | 136 | if div_qk != div_mask: 137 | logger.debug("fuse_attention: skip since div_qk != div_mask") 138 | return 139 | 140 | q_nodes = self.model.match_parent_path(matmul_qk, ['Transpose', 'Reshape', 'Split'], [0, 0, 0]) 141 | if q_nodes is None: 142 | logger.debug("fuse_attention: failed to match q path") 143 | return 144 | (transpose_q, reshape_q, split_q) = q_nodes 145 | if split_v != split_q: 146 | logger.debug("fuse_attention: skip since split_v != split_q") 147 | return 148 | 149 | k_nodes = self.model.match_parent_path(matmul_qk, ['Transpose', 'Reshape', 'Split'], [1, 0, 0]) 150 | if k_nodes is None: 151 | logger.debug("fuse_attention: failed to match k path") 152 | return 153 | (transpose_k, reshape_k, split_k) = k_nodes 154 | if split_v != split_k: 155 | logger.debug("fuse_attention: skip since split_v != split_k") 156 | return 157 | 158 | self.create_attention_node(gemm, gemm_qkv, layernorm_before_attention.output[0], reshape_qkv.output[0]) 159 | 160 | # we rely on prune_graph() to clean old subgraph nodes: 161 | # qk_nodes + q_nodes + k_nodes + v_nodes + mask_nodes + [reshape_qkv, transpose_qkv, matmul_qkv] 162 | self.prune_graph = True 163 | 
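# Editor's sketch (not part of the original file): a minimal driver showing how this
# fusion could be invoked on its own, mirroring the __main__ convention used in
# affinity_helper.py. It assumes, based on the rest of this repo, that the Fusion base
# class (fusion_base.py) provides an apply() entry point that visits every matching
# LayerNormalization node and calls fuse() on it, and that OnnxModel exposes
# prune_graph() and keeps the loaded ModelProto on `.model`; the file paths and the
# num_heads value below are hypothetical.
#
# if __name__ == '__main__':
#     import onnx
#     model = OnnxModel(onnx.load("gpt2_no_past.onnx"))
#     fusion = FusionGptAttentionNoPast(model, num_heads=12)
#     fusion.apply()           # assumed entry point inherited from Fusion
#     model.prune_graph()      # drops the matched subgraph nodes (see self.prune_graph above)
#     onnx.save(model.model, "gpt2_no_past_fused.onnx")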
-------------------------------------------------------------------------------- /fusion_layernorm.py: -------------------------------------------------------------------------------- 1 | #------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | #-------------------------------------------------------------------------- 5 | from typing import Dict 6 | from logging import getLogger 7 | from onnx import helper 8 | from onnx_model import OnnxModel 9 | from fusion_base import Fusion 10 | 11 | logger = getLogger(__name__) 12 | 13 | 14 | class FusionLayerNormalization(Fusion): 15 | def __init__(self, model: OnnxModel): 16 | super().__init__(model, "LayerNormalization", "ReduceMean") 17 | 18 | def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict): 19 | """ 20 | Fuse Layer Normalization subgraph into one node LayerNormalization: 21 | +----------------------+ 22 | | | 23 | | v 24 | [Root] --> ReduceMean --> Sub --> Pow --> ReduceMean --> Add --> Sqrt --> Div --> Mul --> Add 25 | (axis=2 or -1) | (Y=2) (axis=2 or -1) (E-6 or E-12 or 0) ^ 26 | | | 27 | +-----------------------------------------------+ 28 | 29 | It also handles cases of duplicated sub nodes exported from older version of PyTorch: 30 | +----------------------+ 31 | | v 32 | | +-------> Sub-----------------------------------------------+ 33 | | | | 34 | | | v 35 | [Root] --> ReduceMean --> Sub --> Pow --> ReduceMean --> Add --> Sqrt --> Div --> Mul --> Add 36 | | ^ 37 | | | 38 | +----------------------+ 39 | """ 40 | children = self.model.get_children(node, input_name_to_nodes) 41 | if len(children) == 0 or len(children) > 2: 42 | return 43 | 44 | root_input = node.input[0] 45 | 46 | if children[0].op_type != 'Sub' or children[0].input[0] != root_input: 47 | return 48 | 49 | if len(children) == 2: 50 | if children[1].op_type != 'Sub' or children[1].input[0] != root_input: 51 | return 52 | 53 | div_node = None 54 | for child in children: 55 | div_node = self.model.find_first_child_by_type(child, 'Div', input_name_to_nodes, recursive=False) 56 | if div_node is not None: 57 | break 58 | if div_node is None: 59 | return 60 | 61 | path_id, parent_nodes, _ = self.model.match_parent_paths( 62 | div_node, [(['Sqrt', 'Add', 'ReduceMean', 'Pow', 'Sub'], [1, 0, 0, 0, 0]), 63 | (['Sqrt', 'Add', 'ReduceMean', 'Pow', 'Cast', 'Sub'], [1, 0, 0, 0, 0, 0])], output_name_to_node) 64 | if path_id < 0: 65 | return 66 | 67 | sub_node = parent_nodes[-1] 68 | if sub_node not in children: 69 | return 70 | 71 | second_add_node = parent_nodes[1] 72 | i, add_weight = self.model.get_constant_input(second_add_node) 73 | if add_weight is None or add_weight <= 0 or add_weight > 1.0E-4: 74 | logger.warning(f"epsilon value is not expeced: {add_weight}") 75 | return 76 | 77 | pow_node = parent_nodes[3] 78 | if not self.model.find_constant_input(pow_node, 2.0) == 1: 79 | return 80 | 81 | mul_node = input_name_to_nodes[div_node.output[0]][0] 82 | if mul_node.op_type != 'Mul': 83 | return 84 | 85 | last_add_node = input_name_to_nodes[mul_node.output[0]][0] 86 | if last_add_node.op_type != 'Add': 87 | return 88 | 89 | subgraph_nodes = [node] 90 | subgraph_nodes.extend(children) 91 | subgraph_nodes.extend(parent_nodes[:-1]) 92 | 93 | subgraph_nodes.extend([last_add_node, mul_node, div_node]) 94 | if not self.model.is_safe_to_fuse_nodes(subgraph_nodes, last_add_node.output, input_name_to_nodes, 95 | output_name_to_node): 96 | 
logger.debug(f"It is not safe to fuse LayerNormalization node. Skip") 97 | return 98 | 99 | weight_input = mul_node.input[1 - self.model.input_index(div_node.output[0], mul_node)] 100 | if not self.model.is_constant_with_specified_dimension(weight_input, 1, "layernorm weight"): 101 | return 102 | 103 | bias_input = last_add_node.input[1 - self.model.input_index(mul_node.output[0], last_add_node)] 104 | if not self.model.is_constant_with_specified_dimension(bias_input, 1, "layernorm bias"): 105 | return 106 | 107 | self.nodes_to_remove.extend(subgraph_nodes) 108 | 109 | normalize_node = helper.make_node('LayerNormalization', 110 | inputs=[node.input[0], weight_input, bias_input], 111 | outputs=[last_add_node.output[0]], 112 | name=self.model.create_node_name("LayerNormalization", 113 | name_prefix="LayerNorm")) 114 | normalize_node.attribute.extend([helper.make_attribute("epsilon", float(add_weight))]) 115 | self.nodes_to_add.append(normalize_node) 116 | self.node_name_to_graph_name[normalize_node.name] = self.this_graph_name 117 | 118 | 119 | class FusionLayerNormalizationTF(Fusion): 120 | def __init__(self, model: OnnxModel): 121 | super().__init__(model, "LayerNormalization", "Add", "TF") 122 | 123 | def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict): 124 | """ 125 | Layer Norm from Tensorflow model(using keras2onnx or tf2onnx): 126 | +------------------------------------+ 127 | | | 128 | | | 129 | (Cast_1) | 130 | | | 131 | | v (B) (B) (A) 132 | Add --> (Cast_1) --> ReduceMean --> Sub --> Mul --> ReduceMean --> (Cast_3) --> Add --> Sqrt --> Reciprocol --> Mul --> Mul --> Sub --> Add 133 | | | | ^ ^ 134 | | | | | | 135 | | +--------------------------------------------------(Cast_2)-------------------------------|-------+ | 136 | | v | 137 | +---------------------------------------------------------------------------------------------------------------> Mul--------------------+ 138 | """ 139 | return_indice = [] 140 | _, parent_nodes, return_indice = self.model.match_parent_paths( 141 | node, 142 | [(['Sub', 'Mul', 'Mul', 'Reciprocal', 'Sqrt', 'Add', 'ReduceMean', 'Mul', 'Sub', 'ReduceMean'], 143 | [ 1, 1, None, 0, 0, 0, None, 0, 0, None]), 144 | (['Sub', 'Mul', 'Mul', 'Reciprocal', 'Sqrt', 'Add', 'Cast', 'ReduceMean', 'Mul', 'Sub', 'ReduceMean'], 145 | [ 1, 1, None, 0, 0, 0, 0, None, 0, 0, None])], 146 | output_name_to_node) # yapf: disable 147 | 148 | if parent_nodes is None: 149 | return 150 | 151 | assert len(return_indice) == 3 152 | if not (return_indice[0] in [0, 1] and return_indice[1] in [0, 1] and return_indice[2] in [0, 1]): 153 | logger.debug("return indice is exepected in [0, 1], but got {return_indice}") 154 | return 155 | 156 | sub_node_0, mul_node_0, mul_node_1, reciprocol_node, sqrt_node, add_node_0 = parent_nodes[:6] 157 | reduce_mean_node_0, mul_node_2, sub_node_1, reduce_mean_node_1 = parent_nodes[-4:] 158 | 159 | cast_node_3 = None 160 | if len(parent_nodes) == 11: 161 | cast_node_3 = parent_nodes[6] 162 | assert (cast_node_3.op_type == 'Cast') 163 | 164 | mul_node_3 = self.model.match_parent(node, 'Mul', 0, output_name_to_node) 165 | if mul_node_3 is None: 166 | logger.debug("mul_node_3 not found") 167 | return 168 | 169 | node_before_reduce = self.model.get_parent(reduce_mean_node_1, 0, output_name_to_node) 170 | root_node = node_before_reduce if cast_node_3 is None else self.model.get_parent( 171 | node_before_reduce, 0, output_name_to_node) 172 | if root_node is None: 173 | logger.debug("root node is none") 174 | return 175 | 176 | i, 
epsilon = self.model.get_constant_input(add_node_0) 177 | if epsilon is None or epsilon <= 0 or (epsilon > 1.0E-5 and cast_node_3 is None): 178 | logger.debug("epsilon is not matched") 179 | return 180 | 181 | if cast_node_3 is None and (reduce_mean_node_1.input[0] not in mul_node_3.input 182 | or reduce_mean_node_1.input[0] not in sub_node_1.input): 183 | logger.debug("reduce_mean_node_1 and mul_node_3 shall link from root node") 184 | return 185 | 186 | if cast_node_3 is not None and (node_before_reduce.input[0] not in mul_node_3.input 187 | or reduce_mean_node_1.input[0] not in sub_node_1.input): 188 | logger.debug("reduce_mean_node_1 and mul_node_3 shall link from root node") 189 | return 190 | 191 | if mul_node_2.input[0] != mul_node_2.input[1]: 192 | logger.debug("mul_node_2 shall have two same inputs") 193 | return 194 | 195 | subgraph_nodes = [ 196 | node, sub_node_0, mul_node_0, mul_node_1, reciprocol_node, sqrt_node, add_node_0, reduce_mean_node_0, 197 | mul_node_2, sub_node_1, reduce_mean_node_1, mul_node_3 198 | ] 199 | 200 | if cast_node_3 is not None: 201 | cast_node_2 = self.model.match_parent(mul_node_0, 'Cast', 0, output_name_to_node) 202 | if cast_node_2 is None: 203 | logger.debug("cast_node_2 not found") 204 | return 205 | subgraph_nodes.extend([node_before_reduce, cast_node_2, cast_node_3]) 206 | 207 | if not self.model.is_safe_to_fuse_nodes(subgraph_nodes, node.output, self.model.input_name_to_nodes(), 208 | self.model.output_name_to_node()): 209 | logger.debug("not safe to fuse layer normalization") 210 | return 211 | 212 | self.nodes_to_remove.extend(subgraph_nodes) 213 | 214 | weight_input = mul_node_1.input[1] 215 | bias_input = sub_node_0.input[0] 216 | 217 | #TODO: add epsilon attribute 218 | fused_node = helper.make_node('LayerNormalization', 219 | inputs=[mul_node_3.input[0], weight_input, bias_input], 220 | outputs=[node.output[0]], 221 | name=self.model.create_node_name("LayerNormalization", name_prefix="LayerNorm")) 222 | fused_node.attribute.extend([helper.make_attribute("epsilon", float(epsilon))]) 223 | self.nodes_to_add.append(fused_node) 224 | self.node_name_to_graph_name[fused_node.name] = self.this_graph_name 225 | -------------------------------------------------------------------------------- /fusion_options.py: -------------------------------------------------------------------------------- 1 | #------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
4 | #-------------------------------------------------------------------------- 5 | from argparse import ArgumentParser 6 | 7 | 8 | class AttentionMaskFormat: 9 | MaskIndexEnd = 0 10 | MaskIndexEndAndStart = 1 11 | AttentionMask = 2 12 | NoMask = 3 13 | 14 | 15 | class FusionOptions: 16 | """ Options of fusion in graph optimization 17 | """ 18 | def __init__(self, model_type): 19 | self.enable_gelu = True 20 | self.enable_layer_norm = True 21 | self.enable_attention = True 22 | self.enable_skip_layer_norm = True 23 | self.enable_embed_layer_norm = True 24 | self.enable_bias_skip_layer_norm = True 25 | self.enable_bias_gelu = True 26 | self.enable_gelu_approximation = False 27 | self.attention_mask_format = AttentionMaskFormat.AttentionMask 28 | 29 | if model_type == 'gpt2': 30 | self.enable_skip_layer_norm = False 31 | 32 | def use_raw_attention_mask(self, use_raw_mask=True): 33 | if use_raw_mask: 34 | self.attention_mask_format = AttentionMaskFormat.AttentionMask 35 | else: 36 | self.attention_mask_format = AttentionMaskFormat.MaskIndexEnd 37 | 38 | def disable_attention_mask(self): 39 | self.attention_mask_format = AttentionMaskFormat.NoMask 40 | 41 | @staticmethod 42 | def parse(args): 43 | options = FusionOptions(args.model_type) 44 | if args.disable_gelu: 45 | options.enable_gelu = False 46 | if args.disable_layer_norm: 47 | options.enable_layer_norm = False 48 | if args.disable_attention: 49 | options.enable_attention = False 50 | if args.disable_skip_layer_norm: 51 | options.enable_skip_layer_norm = False 52 | if args.disable_embed_layer_norm: 53 | options.enable_embed_layer_norm = False 54 | if args.disable_bias_skip_layer_norm: 55 | options.enable_bias_skip_layer_norm = False 56 | if args.disable_bias_gelu: 57 | options.enable_bias_gelu = False 58 | if args.enable_gelu_approximation: 59 | options.enable_gelu_approximation = True 60 | if args.use_mask_index: 61 | options.use_raw_attention_mask(False) 62 | if args.no_attention_mask: 63 | options.disable_attention_mask() 64 | return options 65 | 66 | @staticmethod 67 | def add_arguments(parser: ArgumentParser): 68 | parser.add_argument('--disable_attention', required=False, action='store_true', help="disable Attention fusion") 69 | parser.set_defaults(disable_attention=False) 70 | 71 | parser.add_argument('--disable_skip_layer_norm', 72 | required=False, 73 | action='store_true', 74 | help="disable SkipLayerNormalization fusion") 75 | parser.set_defaults(disable_skip_layer_norm=False) 76 | 77 | parser.add_argument('--disable_embed_layer_norm', 78 | required=False, 79 | action='store_true', 80 | help="disable EmbedLayerNormalization fusion") 81 | parser.set_defaults(disable_embed_layer_norm=False) 82 | 83 | parser.add_argument('--disable_bias_skip_layer_norm', 84 | required=False, 85 | action='store_true', 86 | help="disable Add Bias and SkipLayerNormalization fusion") 87 | parser.set_defaults(disable_bias_skip_layer_norm=False) 88 | 89 | parser.add_argument('--disable_bias_gelu', 90 | required=False, 91 | action='store_true', 92 | help="disable Add Bias and Gelu/FastGelu fusion") 93 | parser.set_defaults(disable_bias_gelu=False) 94 | 95 | parser.add_argument('--disable_layer_norm', 96 | required=False, 97 | action='store_true', 98 | help="disable LayerNormalization fusion") 99 | parser.set_defaults(disable_layer_norm=False) 100 | 101 | parser.add_argument('--disable_gelu', required=False, action='store_true', help="disable Gelu fusion") 102 | parser.set_defaults(disable_gelu=False) 103 | 104 | 
parser.add_argument('--enable_gelu_approximation', 105 | required=False, 106 | action='store_true', 107 | help="enable Gelu/BiasGelu to FastGelu conversion") 108 | parser.set_defaults(enable_gelu_approximation=False) 109 | 110 | parser.add_argument('--use_mask_index', 111 | required=False, 112 | action='store_true', 113 | help="use mask index instead of raw attention mask in attention operator") 114 | parser.set_defaults(use_mask_index=False) 115 | 116 | parser.add_argument('--no_attention_mask', 117 | required=False, 118 | action='store_true', 119 | help="no attention mask. Only works for model_type=bert") 120 | parser.set_defaults(no_attention_mask=False) 121 | -------------------------------------------------------------------------------- /fusion_reshape.py: -------------------------------------------------------------------------------- 1 | #------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | #-------------------------------------------------------------------------- 5 | 6 | from fusion_base import Fusion 7 | from logging import getLogger 8 | import numpy as np 9 | from onnx import helper, numpy_helper, TensorProto 10 | from onnx_model import OnnxModel 11 | 12 | logger = getLogger(__name__) 13 | 14 | 15 | class FusionReshape(Fusion): 16 | def __init__(self, model: OnnxModel): 17 | super().__init__(model, "Reshape", "Reshape") 18 | 19 | def replace_reshape_node(self, shape, reshape_node, concat_node): 20 | shape_value = np.asarray(shape, dtype=np.int64) 21 | constant_shape_name = self.model.create_node_name('Constant', 'constant_shape') 22 | new_node = helper.make_node('Constant', 23 | inputs=[], 24 | outputs=[constant_shape_name], 25 | value=helper.make_tensor(name='const_tensor', 26 | data_type=TensorProto.INT64, 27 | dims=shape_value.shape, 28 | vals=bytes(shape_value), 29 | raw=True)) 30 | reshape_node.input[1] = constant_shape_name 31 | reshape_node.name = self.model.create_node_name('Reshape', 'Reshape_Fuse') 32 | self.nodes_to_remove.extend([concat_node]) 33 | self.nodes_to_add.append(new_node) 34 | self.node_name_to_graph_name[new_node.name] = self.this_graph_name 35 | 36 | def fuse(self, reshape_node, input_name_to_nodes, output_name_to_node): 37 | if reshape_node.input[1] not in output_name_to_node: 38 | return 39 | 40 | concat_node = output_name_to_node[reshape_node.input[1]] 41 | if concat_node.op_type != 'Concat' or len(concat_node.input) < 3 or len(concat_node.input) > 4: 42 | return 43 | 44 | path0 = self.model.match_parent_path(concat_node, ['Unsqueeze', 'Gather', 'Shape'], [0, 0, 0], 45 | output_name_to_node) 46 | if path0 is None: 47 | return 48 | 49 | (unsqueeze_0, gather_0, shape_0) = path0 50 | 51 | path1 = self.model.match_parent_path(concat_node, ['Unsqueeze', 'Gather', 'Shape'], [1, 0, 0], 52 | output_name_to_node) 53 | if path1 is None: 54 | return 55 | (unsqueeze_1, gather_1, shape_1) = path1 56 | 57 | shape = [] 58 | gather_value = self.model.get_constant_value(gather_0.input[1]) 59 | if gather_value == 0: 60 | shape.append(0) 61 | 62 | gather_value = self.model.get_constant_value(gather_1.input[1]) 63 | if gather_value == 1: 64 | shape.append(0) 65 | 66 | if len(shape) != 2: 67 | return 68 | 69 | path2 = [] 70 | path3 = [] 71 | shape_nodes = [shape_0, shape_1] 72 | if len(concat_node.input) == 3 and self.model.get_initializer(concat_node.input[2]) is None: 73 | path2 = self.model.match_parent_path(concat_node, ['Unsqueeze', 'Mul', 
'Gather', 'Shape'], [2, 0, 0, 0], 74 | output_name_to_node) 75 | if path2 is None: 76 | path2 = self.model.match_parent_path( 77 | concat_node, ['Unsqueeze', 'Mul', 'Squeeze', 'Slice', 'Shape'], [2, 0, 0, 0, 0], 78 | output_name_to_node) # GPT2 exported by PyTorch 1.4 with opset_version=11 79 | if path2 is None: 80 | return 81 | 82 | path3 = self.model.match_parent_path(concat_node, ['Unsqueeze', 'Mul', 'Gather', 'Shape'], [2, 0, 1, 0], 83 | output_name_to_node) 84 | if path3 is None: 85 | path3 = self.model.match_parent_path( 86 | concat_node, ['Unsqueeze', 'Mul', 'Squeeze', 'Slice', 'Shape'], [2, 0, 1, 0, 0], 87 | output_name_to_node) # GPT2 exported by PyTorch 1.4 with opset_version=11 88 | if path3 is None: 89 | return 90 | 91 | shape_nodes.extend([path2[-1], path3[-1]]) 92 | shape.append(-1) 93 | elif (len(concat_node.input) > 2): 94 | concat_2 = self.model.get_initializer(concat_node.input[2]) 95 | if concat_2 is None: 96 | return 97 | concat_value = numpy_helper.to_array(concat_2) 98 | if isinstance(concat_value, list): 99 | shape.extend(concat_value) 100 | else: 101 | shape.append(concat_value) 102 | 103 | if len(concat_node.input) == 4 and self.model.get_initializer(concat_node.input[3]) is None: 104 | if -1 in shape: 105 | return 106 | 107 | path2 = self.model.match_parent_path(concat_node, ['Unsqueeze', 'Div', 'Gather', 'Shape'], [3, 0, 0, 0], 108 | output_name_to_node) 109 | if path2 is None: 110 | path2 = self.model.match_parent_path( 111 | concat_node, ['Unsqueeze', 'Div', 'Squeeze', 'Slice', 'Shape'], [3, 0, 0, 0, 0], 112 | output_name_to_node) # GPT2 exported by PyTorch 1.4 with opset_version=11 113 | if path2 is None: 114 | return 115 | shape_nodes.extend([path2[-1]]) 116 | shape.append(-1) 117 | elif (len(concat_node.input) > 3): 118 | concat_3 = self.model.get_initializer(concat_node.input[3]) 119 | if concat_3 is None: 120 | return 121 | 122 | concat_value = numpy_helper.to_array(concat_3) 123 | if isinstance(concat_value, list): 124 | shape.extend(concat_value) 125 | else: 126 | shape.append(concat_value) 127 | 128 | root_input = reshape_node.input[0] 129 | same_shape_input = True 130 | for shape_node in shape_nodes: 131 | if shape_node.input[0] != root_input: 132 | same_shape_input = False 133 | 134 | if not same_shape_input: 135 | return 136 | 137 | self.replace_reshape_node(shape, reshape_node, concat_node) 138 | 139 | self.nodes_to_remove.extend(path0) 140 | self.nodes_to_remove.extend(path1) 141 | self.nodes_to_remove.extend(path2) 142 | self.nodes_to_remove.extend(path3) 143 | -------------------------------------------------------------------------------- /fusion_shape.py: -------------------------------------------------------------------------------- 1 | #------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
4 | #-------------------------------------------------------------------------- 5 | 6 | from fusion_base import Fusion 7 | from logging import getLogger 8 | from onnx import TensorProto, NodeProto 9 | from onnx_model import OnnxModel 10 | from fusion_utils import FusionUtils 11 | from typing import Union, Dict, List 12 | 13 | logger = getLogger(__name__) 14 | 15 | 16 | class FusionShape(Fusion): 17 | def __init__(self, model: OnnxModel): 18 | super().__init__(model, "Shape", "Concat") 19 | self.utils = FusionUtils(model) 20 | self.shape_infer = None 21 | self.shape_infer_done = False 22 | 23 | def get_dimensions_from_tensor_proto(self, tensor_proto: TensorProto) -> Union[int, None]: 24 | if tensor_proto.type.tensor_type.HasField('shape'): 25 | return len(tensor_proto.type.tensor_type.shape.dim) 26 | else: 27 | return None 28 | 29 | def get_dimensions(self, input_name: str) -> Union[int, None]: 30 | graph_input = self.model.find_graph_input(input_name) 31 | if graph_input: 32 | return self.get_dimensions_from_tensor_proto(graph_input) 33 | 34 | if not self.shape_infer_done: 35 | self.shape_infer = self.model.infer_runtime_shape({}, update=True) 36 | self.shape_infer_done = True 37 | 38 | if self.shape_infer is not None: 39 | return self.get_dimensions_from_tensor_proto(self.shape_infer.known_vi_[input_name]) 40 | 41 | return None 42 | 43 | def fuse(self, concat_node: NodeProto, input_name_to_nodes: Dict[str, List[NodeProto]], 44 | output_name_to_node: Dict[str, NodeProto]): 45 | """ 46 | Smplify subgraph like 47 | 48 | (2d_input) 49 | / \ 50 | Shape shape 51 | / \ 52 | Gather(indices=0) Gather(indices=1) 53 | | | 54 | Unsqueeze(axes=0) Unsqueeze(axes=0) 55 | \ / 56 | Concat 57 | | 58 | 59 | into (2d_input) --> Shape --> 60 | """ 61 | opset_version = self.model.get_opset_version() 62 | 63 | inputs = len(concat_node.input) 64 | root = None 65 | shape_output = None 66 | for i in range(inputs): 67 | path = self.model.match_parent_path(concat_node, ['Unsqueeze', 'Gather', 'Shape'], [i, 0, 0], 68 | output_name_to_node) 69 | if path is None: 70 | return 71 | 72 | unsqueeze, gather, shape = path 73 | if i == 0: 74 | shape_output = shape.output[0] 75 | if root is None: 76 | root = shape.input[0] 77 | if self.get_dimensions(root) != inputs: 78 | return 79 | elif shape.input[0] != root: 80 | return 81 | 82 | if not FusionUtils.check_node_attribute(unsqueeze, 'axis', 0, default_value=0): 83 | return 84 | 85 | if opset_version < 13: 86 | if not FusionUtils.check_node_attribute(unsqueeze, 'axes', [0]): 87 | return 88 | else: 89 | if not self.utils.check_node_input_value(unsqueeze, 1, [0]): 90 | return 91 | 92 | value = self.model.get_constant_value(gather.input[1]) 93 | from numpy import ndarray, array_equal 94 | if not (isinstance(value, ndarray) and value.size == 1 and value.item() == i): 95 | return 96 | 97 | if self.model.find_graph_output(concat_node.output[0]) is None: 98 | self.model.replace_input_of_all_nodes(concat_node.output[0], shape_output) 99 | self.fused_count += 1 100 | self.prune_graph = True 101 | -------------------------------------------------------------------------------- /fusion_skiplayernorm.py: -------------------------------------------------------------------------------- 1 | #------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
4 | #-------------------------------------------------------------------------- 5 | 6 | from logging import getLogger 7 | from onnx import helper 8 | from onnx_model import OnnxModel 9 | from fusion_base import Fusion 10 | from fusion_utils import NumpyHelper 11 | 12 | logger = getLogger(__name__) 13 | 14 | 15 | class FusionSkipLayerNormalization(Fusion): 16 | """ 17 | Fuse Add + LayerNormalization into one node: SkipLayerNormalization 18 | Note: This fusion does not check the input shape of Add and LayerNormalization. 19 | """ 20 | def __init__(self, model: OnnxModel): 21 | super().__init__(model, "SkipLayerNormalization", "LayerNormalization") 22 | # Update shape inference is needed since other fusions might add new edge which does not have shape info yet. 23 | self.shape_infer_helper = self.model.infer_runtime_shape({"batch_size": 4, "seq_len": 7}, update=True) 24 | 25 | def fuse(self, node, input_name_to_nodes, output_name_to_node): 26 | add = self.model.get_parent(node, 0, output_name_to_node) 27 | 28 | # In some models there is input_ids->gather->add->LayerNorm and one of input of the 29 | # add node is initializer with fixed shape which should not be fused into SkipLayerNorm 30 | if add is None: 31 | return 32 | 33 | for add_input in add.input: 34 | if self.model.get_initializer(add_input) != None: 35 | return 36 | 37 | # The number of input node of add should be 2 38 | if len(self.model.get_parents(add)) != 2: 39 | return 40 | 41 | if self.shape_infer_helper is not None: 42 | if not self.shape_infer_helper.compare_shape(add.input[0], add.input[1]): 43 | logger.debug( 44 | f"skip skiplayernorm fusion since shape of inputs ({add.input[0]}, {add.input[1]}) are not same") 45 | return 46 | else: 47 | # shape_infer_helper can not handle subgraphs. Current work around is to disable skiplayernorm fusion 48 | # longterm todo: support subgraph in symbolic_shape_infer or support add broadcasting in skiplayernorm op 49 | logger.warning( 50 | "symbolic shape infer failed. 
it's safe to ignore this message if there is no issue with optimized model" 51 | ) 52 | 53 | gather_path = self.model.match_parent_path(add, ['Gather'], [None]) 54 | if gather_path is not None and self.model.find_graph_input(gather_path[0].input[1]) is None: 55 | if self.model.match_parent_path(gather_path[0], ['ConstantOfShape'], [1]) is None: 56 | return 57 | 58 | if add is not None and add.op_type == 'Add' and self.model.is_safe_to_fuse_nodes( 59 | [add, node], node.output, input_name_to_nodes, output_name_to_node): 60 | self.nodes_to_remove.extend([add, node]) 61 | 62 | inputs = [add.input[0], add.input[1], node.input[1], node.input[2]] 63 | normalize_node = helper.make_node("SkipLayerNormalization", 64 | inputs=inputs, 65 | outputs=[node.output[0]], 66 | name=self.model.create_node_name("SkipLayerNormalization", 67 | name_prefix="SkipLayerNorm")) 68 | normalize_node.domain = "com.microsoft" 69 | 70 | # Pass attribute "epsilon" from layernorm node to SkipLayerNormalization 71 | for att in node.attribute: 72 | if att.name == 'epsilon': 73 | normalize_node.attribute.extend([att]) 74 | 75 | # Set default epsilon if no epsilon exists from layernorm 76 | if len(normalize_node.attribute) == 0: 77 | normalize_node.attribute.extend([helper.make_attribute("epsilon", 1.0E-12)]) 78 | 79 | self.nodes_to_add.append(normalize_node) 80 | self.node_name_to_graph_name[normalize_node.name] = self.this_graph_name 81 | 82 | 83 | class FusionBiasSkipLayerNormalization(Fusion): 84 | def __init__(self, model: OnnxModel): 85 | super().__init__(model, "SkipLayerNormalization", "SkipLayerNormalization", "add bias") 86 | 87 | def fuse(self, node, input_name_to_nodes, output_name_to_node): 88 | if len(node.input) != 4: 89 | return 90 | 91 | return_indice = [] 92 | nodes = self.model.match_parent_path(node, ['Add', 'MatMul'], [None, None], None, return_indice) 93 | if nodes is None: 94 | return 95 | assert len(return_indice) == 2 96 | add_input_index = return_indice[0] 97 | if add_input_index >= 2: 98 | return 99 | 100 | (add, matmul) = nodes 101 | 102 | # bias should be one dimension 103 | bias_index = -1 104 | for i, input in enumerate(add.input): 105 | initializer = self.model.get_initializer(input) 106 | if initializer is None: 107 | continue 108 | bias_index = i 109 | bias_weight = NumpyHelper.to_array(initializer) 110 | break 111 | if bias_weight is None: 112 | logger.debug(f"Bias weight not found") 113 | return 114 | if len(bias_weight.shape) != 1: 115 | logger.debug(f"Bias weight is not 1D") 116 | return 117 | 118 | subgraph_nodes = [node, add] 119 | if not self.model.is_safe_to_fuse_nodes(subgraph_nodes, [node.output[0]], input_name_to_nodes, 120 | output_name_to_node): 121 | logger.debug(f"Skip fusing SkipLayerNormalization with Bias since it is not safe") 122 | return 123 | 124 | self.nodes_to_remove.extend(subgraph_nodes) 125 | inputs = [ 126 | node.input[1 - add_input_index], matmul.output[0], node.input[2], node.input[3], add.input[bias_index] 127 | ] 128 | new_node = helper.make_node("SkipLayerNormalization", 129 | inputs=inputs, 130 | outputs=node.output, 131 | name=self.model.create_node_name("SkipLayerNormalization", 132 | "SkipLayerNorm_AddBias_")) 133 | new_node.domain = "com.microsoft" 134 | 135 | # Pass attribute "epsilon" from skiplayernorm node to skiplayernorm(add bias) 136 | for att in node.attribute: 137 | if att.name == 'epsilon': 138 | new_node.attribute.extend([att]) 139 | 140 | # Set default epsilon if no epsilon exists from skiplayernorm 141 | if len(new_node.attribute) == 0: 142 | 
new_node.attribute.extend([helper.make_attribute("epsilon", 1.0E-12)]) 143 | 144 | self.nodes_to_add.append(new_node) 145 | self.node_name_to_graph_name[new_node.name] = self.this_graph_name 146 | -------------------------------------------------------------------------------- /fusion_utils.py: -------------------------------------------------------------------------------- 1 | #------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | #-------------------------------------------------------------------------- 5 | from logging import getLogger 6 | from typing import Tuple 7 | from onnx import helper, numpy_helper, TensorProto 8 | from numpy import ndarray, array_equal 9 | from onnx_model import OnnxModel 10 | 11 | logger = getLogger(__name__) 12 | 13 | 14 | class FusionUtils: 15 | def __init__(self, model: OnnxModel): 16 | self.model: OnnxModel = model 17 | 18 | def cast_graph_input_to_int32(self, input_name: str) -> Tuple[bool, str]: 19 | graph_input = self.model.find_graph_input(input_name) 20 | if graph_input is not None and graph_input.type.tensor_type.elem_type != TensorProto.INT32: 21 | cast_output, cast_node = self.cast_input_to_int32(input_name) 22 | logger.debug(f"Casted graph input {input_name} to int32") 23 | return True, cast_output 24 | 25 | logger.debug(f"Did not cast graph input {input_name} to int32: found {graph_input is not None}") 26 | return False, input_name 27 | 28 | def cast_input_to_int32(self, input_name: str): 29 | cast_output = input_name + '_int32' 30 | 31 | # Avoid consequent Cast nodes. 32 | inputs = [input_name] 33 | output_name_to_node = self.model.output_name_to_node() 34 | if input_name in output_name_to_node: 35 | parent_node = output_name_to_node[input_name] 36 | if parent_node and parent_node.op_type == 'Cast': 37 | inputs = [parent_node.input[0]] 38 | 39 | cast_node = helper.make_node('Cast', inputs=inputs, outputs=[cast_output]) 40 | cast_node.attribute.extend([helper.make_attribute("to", int(TensorProto.INT32))]) 41 | self.model.add_node(cast_node) 42 | 43 | return cast_output, cast_node 44 | 45 | def remove_cast_int32(self, input_name: str): 46 | input_name_to_nodes = self.model.input_name_to_nodes() 47 | nodes = input_name_to_nodes[input_name] 48 | for node in nodes: 49 | if node.op_type == "Cast": 50 | is_int32 = False 51 | for att in node.attribute: 52 | if att.name == 'to' and att.i == int(TensorProto.INT32): 53 | is_int32 = True 54 | break 55 | if is_int32: 56 | output_name = node.output[0] 57 | self.model.remove_node(node) 58 | self.model.replace_input_of_all_nodes(output_name, input_name) 59 | 60 | @staticmethod 61 | def check_node_attribute(node, attribute_name: str, expected_value, default_value=None): 62 | """Verify that a node has expected value for an attribute. 63 | 64 | Args: 65 | node (NodeProto): a node to check 66 | attribute_name (str): name of attribute 67 | expected_value (Any): expected value of the attribute 68 | default_value (Any, optional): default value if the attribute does not exist. Defaults to None. 
69 | 70 | Returns: 71 | bool: whether the check is passed or not 72 | """ 73 | value = default_value 74 | for attr in node.attribute: 75 | if attr.name == attribute_name: 76 | value = helper.get_attribute_value(attr) 77 | 78 | if isinstance(expected_value, list): 79 | return (isinstance(value, ndarray) or isinstance(value, list)) and array_equal( 80 | expected_value, value, equal_nan=False) 81 | else: 82 | return value == expected_value 83 | 84 | def check_node_input_value(self, node, input_index: int, expected_value): 85 | """Verify that a node has expected input value 86 | 87 | Args: 88 | node (NodeProto): a node to check 89 | input_index (int): index of its input to be verified 90 | expected_value (Any): expected value of the input 91 | 92 | Returns: 93 | bool: whether the check is passed or not 94 | """ 95 | assert len(node.input) > input_index 96 | 97 | value = self.model.get_constant_value(node.input[input_index]) 98 | 99 | if isinstance(expected_value, list): 100 | return (isinstance(value, ndarray) or isinstance(value, list)) and array_equal( 101 | expected_value, value, equal_nan=False) 102 | else: 103 | return value == expected_value 104 | 105 | def get_dtype(self, shape_infer_helper, input_or_output_name: str) -> int: 106 | """Get data type of an input or output. 107 | 108 | Args: 109 | shape_infer_helper (SymbolicShapeInferenceHelper): object of symbolic shape inference 110 | input_or_output_name (str): name of input or output 111 | 112 | Returns: 113 | int: tensor data type 114 | """ 115 | dtype = self.model.get_dtype(input_or_output_name) 116 | if dtype is not None: 117 | return dtype 118 | 119 | if shape_infer_helper: 120 | tensor_proto = shape_infer_helper.known_vi_[input_or_output_name] 121 | if tensor_proto.type.tensor_type.HasField('elem_type'): 122 | return tensor_proto.type.tensor_type.elem_type 123 | 124 | return None 125 | 126 | def remove_useless_cast_nodes(self): 127 | """Remove cast nodes that are not needed: input and output has same data type. 
128 | """ 129 | shape_infer = self.model.infer_runtime_shape(update=True) 130 | if shape_infer is None: 131 | return 132 | 133 | nodes_to_remove = [] 134 | for node in self.model.nodes(): 135 | if node.op_type == 'Cast': 136 | input_dtype = self.get_dtype(shape_infer, node.input[0]) 137 | output_dtype = self.get_dtype(shape_infer, node.output[0]) 138 | if input_dtype and input_dtype == output_dtype: 139 | nodes_to_remove.append(node) 140 | 141 | if nodes_to_remove: 142 | graph_input_names = set(self.model.get_graphs_input_names()) 143 | graph_output_names = set(self.model.get_graphs_output_names()) 144 | for node in nodes_to_remove: 145 | if bool(set(node.output) & graph_output_names): 146 | if not bool(set(node.input) & graph_input_names): 147 | self.model.replace_output_of_all_nodes(node.input[0], node.output[0]) 148 | else: 149 | continue 150 | else: 151 | self.model.replace_input_of_all_nodes(node.output[0], node.input[0]) 152 | self.model.remove_node(node) 153 | logger.info(f"Removed {len(nodes_to_remove)} Cast nodes with output type same as input") 154 | 155 | def remove_useless_reshape_nodes(self): 156 | """Remove reshape node that is not needed based on symbolic shape inference: input and output has same shape 157 | """ 158 | shape_infer = self.model.infer_runtime_shape(update=True) 159 | if shape_infer is None: 160 | return 161 | 162 | nodes_to_remove = [] 163 | for node in self.model.nodes(): 164 | if node.op_type == 'Reshape': 165 | input_shape = shape_infer.get_edge_shape(node.input[0]) 166 | output_shape = shape_infer.get_edge_shape(node.output[0]) 167 | if input_shape and output_shape and input_shape == output_shape: 168 | logger.info( 169 | f"Remove reshape node {node.name} since its input shape is same as output: {input_shape}") 170 | nodes_to_remove.append(node) 171 | 172 | if nodes_to_remove: 173 | graph_input_names = set(self.model.get_graphs_input_names()) 174 | graph_output_names = set(self.model.get_graphs_output_names()) 175 | for node in nodes_to_remove: 176 | if bool(set(node.output) & graph_output_names): 177 | if not bool(set(node.input) & graph_input_names): 178 | self.model.replace_output_of_all_nodes(node.input[0], node.output[0]) 179 | else: 180 | continue 181 | else: 182 | self.model.replace_input_of_all_nodes(node.output[0], node.input[0]) 183 | self.model.remove_node(node) 184 | 185 | 186 | class NumpyHelper: 187 | @staticmethod 188 | def to_array(tensor: TensorProto, fill_zeros: bool = False) -> ndarray: 189 | # When weights are in external data format but not presented, we can still test the optimizer with two changes: 190 | # (1) set fill_zeros = True (2) change load_external_data=False in optimizer.py 191 | if fill_zeros: 192 | from onnx import mapping 193 | return ndarray(shape=tensor.dims, dtype=mapping.TENSOR_TYPE_TO_NP_TYPE[tensor.data_type]) 194 | 195 | return numpy_helper.to_array(tensor) 196 | -------------------------------------------------------------------------------- /hf.co_1ms/README.md: -------------------------------------------------------------------------------- 1 | 2 | ### Quick Summary - Use vendor supplied Pytorch and you will get the same performance as Infinity (as of 10/3/2021) 3 | tl;dr: Repackage OneDNN/DNNL on CPU and CUDNN for TensorRT/Tensorcore and you have Infinity without $20k/cpu/yr 4 | 5 | 6 | Reconstructed Demos from launch Video here: https://www.youtube.com/watch?v=jiftCAhOYQA 7 | 8 | Infinity CPU Inference Dual-core Cascade lake VM: 9 | Seq length 16: 2.6ms 10 | ![cpu 16](images/cpu_16_2_5ms.png) 11 | Seq length 
128: 9.7ms
12 | ![cpu 128](images/cpu_9_7ms.png)
13 | 
14 | Infinity GPU Inference Quad-core Cascade lake VM + 1 T4 GPU:
15 | Seq length 16: 1.7ms
16 | ![cpu 16](images/gpu_16_1_7ms.png)
17 | Seq length 128: 2.6ms
18 | ![gpu 128](images/gpu_128_2_6ms.png)
19 | 
20 | 
21 | The original model used in the video is available here: https://huggingface.co/philschmid/MiniLM-L6-H384-uncased-sst2
22 | 
23 | The optimized "Infinity Model" switch is basically the quantized ONNX model, available here:
24 | https://huggingface.co/philschmid/Infinity_cpu_MiniLM_L6_H384_uncased_sst2
25 | 
26 | # To Infinity and Beyond
27 | For our experiments we want to start from the original model to see if we can reach the demo'ed metrics.
28 | 
29 | Set up your Python ENV
30 | ```
31 | python3.9 -m venv ~/1msenv
32 | source ~/1msenv/bin/activate
33 | pip install --upgrade pip
34 | pip install --upgrade onnx coloredlogs packaging psutil py3nvml onnxconverter_common numpy transformers sympy wheel
35 | 
36 | # This is to compare regular PyTorch / Torchscript performance
37 | # To install Intel's PyTorch enhancements -- you will recreate the "1ms" demos with this at 9.8ms
38 | # uninstall with pip uninstall torch torch-ipex
39 | pip install torch_ipex==1.9.0 -f https://software.intel.com/ipex-whl-stable
40 | 
41 | # To install the stock PyTorch nightly -- you will run a couple ms slower at 11ms
42 | pip3 install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html
43 | 
44 | ```
45 | 
46 | ## Build ONNX Runtime with OneDNN and CUDNN
47 | 
48 | ```
49 | ./build.sh --config Release --build_wheel --parallel --use_openmp --use_dnnl --skip_tests --use_cuda --cuda_home /usr/local/cuda --cudnn_home /usr/local/cuda
50 | #find . -name *.whl
51 | ./build/Linux/Release/dist/onnxruntime_gpu-1.10.0-cp39-cp39-linux_x86_64.whl
52 | ./build/Linux/Release/dist/onnxruntime_dnnl-1.10.0-cp39-cp39-linux_x86_64.whl
53 | 
54 | pip install ./build/Linux/Release/dist/onnxruntime_dnnl-1.10.0-cp39-cp39-linux_x86_64.whl ./build/Linux/Release/dist/onnxruntime_gpu-1.10.0-cp39-cp39-linux_x86_64.whl
55 | ```
56 | 
57 | ## Approaching Infinity
58 | Run the benchmark script in this folder. Change the parameters to GPU if you are doing a GPU run.
59 | 
60 | ```
61 | ./hf.co_1ms/run_benchmark.sh
62 | ```
63 | 
64 | ## Are we there yet?
65 | 
66 | ## CPU Benchmark Results
67 | 
68 | | Seq.Len | 1.11-dev Torchscript (FP32) | 1.11-dev Torchscript (INT8) | Intel 1.9.0 Torchscript (FP32) | Intel 1.9.0 Torchscript (Int8) | ONNX (FP32) | ONNX (Int8) |
69 | |---------| ----------- | ----------- | ----------- | ----------- | ----------- | ----------- |
70 | | 16 |6.14|2.49|5.86|1.96|2.76|1.24|
71 | | 128 |17.39|11.67|16.65|9.59|13.63|7.48|
72 | 
73 | ## GPU Benchmark Results on A100 *NOT* T4 demo'ed in Infinity Video
74 | 
75 | | Seq.Len | 1.11.0.dev20211003+cu111 Torchscript (FP32) | 1.11.0.dev20211003+cu111 Torchscript (FP16) | ONNX (FP32) | ONNX (FP16) |
76 | |---------| ----------- | ----------- | ----------- | ----------- |
77 | | 16 |3.10|2.77|0.81|0.83|
78 | | 128 |3.55|2.96|0.74|0.97|
79 | 
80 | Detailed results [here](result.csv)
81 | 
82 | 
83 | ### Sample CPU run
84 | 
85 | # 'average_latency_ms': '9.59' vs Infinity's '9.7ms'
86 | 
87 | ```
88 | ./hf.co_1ms/run_benchmark.sh
89 | ...
90 | Run PyTorch on philschmid/MiniLM-L6-H384-uncased-sst2 with input shape [1, 128] 91 | {'engine': 'torchscript', 'version': '1.9.0+cpu', 'device': 'cpu', 'optimizer': '', 'precision': , 'io_binding': '', 'model_name': 'philschmid/MiniLM-L6-H384-uncased-sst2', 'inputs': 1, 'threads': 2, 'batch_size': 1, 'sequence_length': 128, 'datetime': '2021-10-04 03:50:52.568732', 'test_times': 100, 'latency_variance': '0.00', 'latency_90_percentile': '9.79', 'latency_95_percentile': '9.84', 'latency_99_percentile': '9.94', 'average_latency_ms': '9.59', 'QPS': '104.32'} 92 | ``` 93 | 94 | 95 | -------------------------------------------------------------------------------- /hf.co_1ms/fusion.csv: -------------------------------------------------------------------------------- 1 | model_filename,datetime,transformers,torch,EmbedLayerNormalization,Attention,Gelu,FastGelu,BiasGelu,LayerNormalization,SkipLayerNormalization 2 | ./onnx_models/philschmid_MiniLM_L6_H384_uncased_sst2_1_fp32_cpu.onnx,2021-10-04 03:50:24.257021,4.11.2,1.9.0+cpu,1,6,0,0,6,0,12 3 | model_filename,datetime,transformers,torch,EmbedLayerNormalization,Attention,Gelu,FastGelu,BiasGelu,LayerNormalization,SkipLayerNormalization 4 | ./onnx_models/philschmid_MiniLM_L6_H384_uncased_sst2_1_int8_cpu.onnx,2021-10-04 03:50:43.020711,4.11.2,1.9.0+cpu,0,6,0,0,6,1,12 5 | model_filename,datetime,transformers,torch,EmbedLayerNormalization,Attention,Gelu,FastGelu,BiasGelu,LayerNormalization,SkipLayerNormalization 6 | ./onnx_models/philschmid_MiniLM_L6_H384_uncased_sst2_1_fp32_cpu.onnx,2021-10-04 04:34:02.831263,4.11.2,1.11.0.dev20211003+cpu,1,6,0,0,6,0,12 7 | model_filename,datetime,transformers,torch,EmbedLayerNormalization,Attention,Gelu,FastGelu,BiasGelu,LayerNormalization,SkipLayerNormalization 8 | ./onnx_models/philschmid_MiniLM_L6_H384_uncased_sst2_1_int8_cpu.onnx,2021-10-04 04:34:22.689454,4.11.2,1.11.0.dev20211003+cpu,0,6,0,0,6,1,12 9 | model_filename,datetime,transformers,torch,EmbedLayerNormalization,Attention,Gelu,FastGelu,BiasGelu,LayerNormalization,SkipLayerNormalization 10 | ./onnx_models/philschmid_MiniLM_L6_H384_uncased_sst2_1_fp32_gpu.onnx,2021-10-04 05:22:26.404124,4.11.2,1.11.0.dev20211003+cpu,1,6,0,0,6,0,12 11 | model_filename,datetime,transformers,torch,EmbedLayerNormalization,Attention,Gelu,FastGelu,BiasGelu,LayerNormalization,SkipLayerNormalization 12 | ./onnx_models/philschmid_MiniLM_L6_H384_uncased_sst2_1_fp16_gpu.onnx,2021-10-04 05:22:49.040808,4.11.2,1.11.0.dev20211003+cpu,1,6,0,6,0,0,12 13 | model_filename,datetime,transformers,torch,EmbedLayerNormalization,Attention,Gelu,FastGelu,BiasGelu,LayerNormalization,SkipLayerNormalization 14 | ./onnx_models/philschmid_MiniLM_L6_H384_uncased_sst2_1_fp32_gpu.onnx,2021-10-04 05:29:54.038759,4.11.2,1.11.0.dev20211003+cu111,1,6,0,0,6,0,12 15 | model_filename,datetime,transformers,torch,EmbedLayerNormalization,Attention,Gelu,FastGelu,BiasGelu,LayerNormalization,SkipLayerNormalization 16 | ./onnx_models/philschmid_MiniLM_L6_H384_uncased_sst2_1_fp16_gpu.onnx,2021-10-04 05:30:44.705058,4.11.2,1.11.0.dev20211003+cu111,1,6,0,6,0,0,12 17 | model_filename,datetime,transformers,torch,EmbedLayerNormalization,Attention,Gelu,FastGelu,BiasGelu,LayerNormalization,SkipLayerNormalization 18 | ./onnx_models/microsoft_MiniLM_L12_H384_uncased_1_fp32_gpu.onnx,2021-10-05 06:12:25.602878,4.11.2,1.11.0.dev20211003+cu111,1,12,0,0,12,0,24 19 | model_filename,datetime,transformers,torch,EmbedLayerNormalization,Attention,Gelu,FastGelu,BiasGelu,LayerNormalization,SkipLayerNormalization 20 | 
./onnx_models/microsoft_MiniLM_L12_H384_uncased_1_fp32_cpu.onnx,2021-10-05 06:13:58.767952,4.11.2,1.11.0.dev20211003+cu111,1,12,0,0,12,0,24 21 | model_filename,datetime,transformers,torch,EmbedLayerNormalization,Attention,Gelu,FastGelu,BiasGelu,LayerNormalization,SkipLayerNormalization 22 | ./onnx_models/microsoft_MiniLM_L12_H384_uncased_1_int8_cpu.onnx,2021-10-05 06:14:37.706671,4.11.2,1.11.0.dev20211003+cu111,0,12,0,0,12,1,24 23 | model_filename,datetime,transformers,torch,EmbedLayerNormalization,Attention,Gelu,FastGelu,BiasGelu,LayerNormalization,SkipLayerNormalization 24 | ./onnx_models/bert_large_uncased_1_fp32_cpu.onnx,2021-10-05 06:18:24.971957,4.11.2,1.11.0.dev20211003+cu111,1,24,0,0,24,0,48 25 | model_filename,datetime,transformers,torch,EmbedLayerNormalization,Attention,Gelu,FastGelu,BiasGelu,LayerNormalization,SkipLayerNormalization 26 | ./onnx_models/microsoft_MiniLM_L12_H384_uncased_1_fp32_cpu.onnx,2021-10-05 06:21:34.252637,4.11.2,1.11.0.dev20211003+cu111,1,12,0,0,12,0,24 27 | model_filename,datetime,transformers,torch,EmbedLayerNormalization,Attention,Gelu,FastGelu,BiasGelu,LayerNormalization,SkipLayerNormalization 28 | ./onnx_models/bert_large_uncased_1_int8_cpu.onnx,2021-10-05 06:23:53.876515,4.11.2,1.11.0.dev20211003+cu111,0,24,0,0,24,1,48 29 | model_filename,datetime,transformers,torch,EmbedLayerNormalization,Attention,Gelu,FastGelu,BiasGelu,LayerNormalization,SkipLayerNormalization 30 | ./onnx_models/microsoft_MiniLM_L12_H384_uncased_1_int8_cpu.onnx,2021-10-05 06:26:36.232346,4.11.2,1.11.0.dev20211003+cu111,0,12,0,0,12,1,24 31 | model_filename,datetime,transformers,torch,EmbedLayerNormalization,Attention,Gelu,FastGelu,BiasGelu,LayerNormalization,SkipLayerNormalization 32 | ./onnx_models/microsoft_MiniLM_L12_H384_uncased_1_fp32_cpu.onnx,2021-10-05 23:05:17.742694,4.11.2,1.11.0.dev20211003+cu111,1,12,0,0,12,0,24 33 | model_filename,datetime,transformers,torch,EmbedLayerNormalization,Attention,Gelu,FastGelu,BiasGelu,LayerNormalization,SkipLayerNormalization 34 | ./onnx_models/microsoft_MiniLM_L12_H384_uncased_1_fp32_cpu.onnx,2021-10-05 23:09:44.412182,4.11.2,1.11.0.dev20211003+cu111,1,12,0,0,12,0,24 35 | model_filename,datetime,transformers,torch,EmbedLayerNormalization,Attention,Gelu,FastGelu,BiasGelu,LayerNormalization,SkipLayerNormalization 36 | ./onnx_models/microsoft_MiniLM_L12_H384_uncased_1_fp32_cpu.onnx,2021-10-05 23:12:46.956941,4.11.2,1.11.0.dev20211003+cu111,1,12,0,0,12,0,24 37 | model_filename,datetime,transformers,torch,EmbedLayerNormalization,Attention,Gelu,FastGelu,BiasGelu,LayerNormalization,SkipLayerNormalization 38 | ./onnx_models/microsoft_MiniLM_L12_H384_uncased_1_fp32_gpu.onnx,2021-10-05 23:15:17.325840,4.11.2,1.11.0.dev20211003+cu111,1,12,0,0,12,0,24 39 | model_filename,datetime,transformers,torch,EmbedLayerNormalization,Attention,Gelu,FastGelu,BiasGelu,LayerNormalization,SkipLayerNormalization 40 | ./onnx_models/gpt2_1_fp32_gpu.onnx,2021-10-05 23:22:20.170049,4.11.2,1.11.0.dev20211003+cu111,0,0,0,12,0,25,0 41 | -------------------------------------------------------------------------------- /hf.co_1ms/images/cpu_16_2_5ms.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nod-ai/transformer-benchmarks/c9cc725effa7b8070b69487c54dee5ba28cffa9b/hf.co_1ms/images/cpu_16_2_5ms.png -------------------------------------------------------------------------------- /hf.co_1ms/images/cpu_9_7ms.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/nod-ai/transformer-benchmarks/c9cc725effa7b8070b69487c54dee5ba28cffa9b/hf.co_1ms/images/cpu_9_7ms.png -------------------------------------------------------------------------------- /hf.co_1ms/images/gpu_128_2_6ms.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nod-ai/transformer-benchmarks/c9cc725effa7b8070b69487c54dee5ba28cffa9b/hf.co_1ms/images/gpu_128_2_6ms.png -------------------------------------------------------------------------------- /hf.co_1ms/images/gpu_16_1_7ms.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nod-ai/transformer-benchmarks/c9cc725effa7b8070b69487c54dee5ba28cffa9b/hf.co_1ms/images/gpu_16_1_7ms.png -------------------------------------------------------------------------------- /hf.co_1ms/images/infinity_model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nod-ai/transformer-benchmarks/c9cc725effa7b8070b69487c54dee5ba28cffa9b/hf.co_1ms/images/infinity_model.png -------------------------------------------------------------------------------- /hf.co_1ms/images/model_dir.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nod-ai/transformer-benchmarks/c9cc725effa7b8070b69487c54dee5ba28cffa9b/hf.co_1ms/images/model_dir.png -------------------------------------------------------------------------------- /hf.co_1ms/onnx.diff: -------------------------------------------------------------------------------- 1 | diff --git a/onnxruntime/python/tools/transformers/benchmark.py b/onnxruntime/python/tools/transformers/benchmark.py 2 | index 6e5d5b98e..a9f0e3a93 100644 3 | --- a/onnxruntime/python/tools/transformers/benchmark.py 4 | +++ b/onnxruntime/python/tools/transformers/benchmark.py 5 | @@ -483,7 +483,7 @@ def parse_arguments(): 6 | help='Disable running ONNX Runtime with binded inputs and outputs. 
') 7 | parser.set_defaults(disable_ort_io_binding=False) 8 | 9 | - parser.add_argument("-n", "--num_threads", required=False, nargs="+", type=int, default=[0], help="Threads to use") 10 | + parser.add_argument("-n", "--num_threads", required=False, nargs="+", type=int, default=[2], help="Threads to use") 11 | 12 | args = parser.parse_args() 13 | return args 14 | diff --git a/onnxruntime/python/tools/transformers/huggingface_models.py b/onnxruntime/python/tools/transformers/huggingface_models.py 15 | index 051480ebb..31bd05b87 100644 16 | --- a/onnxruntime/python/tools/transformers/huggingface_models.py 17 | +++ b/onnxruntime/python/tools/transformers/huggingface_models.py 18 | @@ -16,6 +16,7 @@ MODELS = { 19 | "bert-base-uncased": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 20 | "bert-large-uncased": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 21 | "bert-base-cased": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 22 | + "philschmid/MiniLM-L6-H384-uncased-sst2": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 23 | # "bert-large-cased": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 24 | # "bert-base-multilingual-uncased": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 25 | # "bert-base-multilingual-cased": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 26 | -------------------------------------------------------------------------------- /hf.co_1ms/onnx_with_eigen.diff: -------------------------------------------------------------------------------- 1 | diff --git a/.gitmodules b/.gitmodules 2 | index 5c2838373..cae75f4b7 100644 3 | --- a/.gitmodules 4 | +++ b/.gitmodules 5 | @@ -24,7 +24,7 @@ 6 | url = https://github.com/google/re2.git 7 | [submodule "cmake/external/eigen"] 8 | path = cmake/external/eigen 9 | - url = https://gitlab.com/libeigen/eigen.git 10 | + url = https://gitlab.com/cantonios/eigen.git 11 | [submodule "cmake/external/cxxopts"] 12 | path = cmake/external/cxxopts 13 | url = https://github.com/jarro2783/cxxopts.git 14 | diff --git a/cgmanifests/submodules/cgmanifest.json b/cgmanifests/submodules/cgmanifest.json 15 | index 41c43a6ff..1388141ca 100644 16 | --- a/cgmanifests/submodules/cgmanifest.json 17 | +++ b/cgmanifests/submodules/cgmanifest.json 18 | @@ -115,7 +115,7 @@ 19 | "type": "git", 20 | "git": { 21 | "commitHash": "efd9867ff0e8df23016ac6c9828d0d7bf8bec1b1", 22 | - "repositoryUrl": "https://gitlab.com/libeigen/eigen.git" 23 | + "repositoryUrl": "https://gitlab.com/cantonios/eigen.git" 24 | }, 25 | "comments": "git submodule at cmake/external/FeaturizersLibrary/src/3rdParty/eigen" 26 | } 27 | @@ -195,7 +195,7 @@ 28 | "type": "git", 29 | "git": { 30 | "commitHash": "d10b27fe37736d2944630ecd7557cefa95cf87c9", 31 | - "repositoryUrl": "https://gitlab.com/libeigen/eigen.git" 32 | + "repositoryUrl": "https://gitlab.com/cantonios/eigen.git" 33 | }, 34 | "comments": "git submodule at cmake/external/eigen" 35 | } 36 | diff --git a/cmake/external/FeaturizersLibrary b/cmake/external/FeaturizersLibrary 37 | --- a/cmake/external/FeaturizersLibrary 38 | +++ b/cmake/external/FeaturizersLibrary 39 | @@ -1 +1 @@ 40 | -Subproject commit fd5fe3de507d4a19f5923c5d4c267e3d730500a9 41 | +Subproject commit fd5fe3de507d4a19f5923c5d4c267e3d730500a9-dirty 42 | diff --git a/cmake/external/eigen b/cmake/external/eigen 43 | --- a/cmake/external/eigen 44 | +++ b/cmake/external/eigen 45 | @@ -1 +1 @@ 46 | -Subproject 
commit d10b27fe37736d2944630ecd7557cefa95cf87c9 47 | +Subproject commit d10b27fe37736d2944630ecd7557cefa95cf87c9-dirty 48 | diff --git a/cmake/external/onnx b/cmake/external/onnx 49 | --- a/cmake/external/onnx 50 | +++ b/cmake/external/onnx 51 | @@ -1 +1 @@ 52 | -Subproject commit 1f63dcb7fcc3a8bf5c3c8e326867ecd6f5c43f35 53 | +Subproject commit 1f63dcb7fcc3a8bf5c3c8e326867ecd6f5c43f35-dirty 54 | diff --git a/onnxruntime/python/tools/transformers/benchmark.py b/onnxruntime/python/tools/transformers/benchmark.py 55 | index 6e5d5b98e..a9f0e3a93 100644 56 | --- a/onnxruntime/python/tools/transformers/benchmark.py 57 | +++ b/onnxruntime/python/tools/transformers/benchmark.py 58 | @@ -483,7 +483,7 @@ def parse_arguments(): 59 | help='Disable running ONNX Runtime with binded inputs and outputs. ') 60 | parser.set_defaults(disable_ort_io_binding=False) 61 | 62 | - parser.add_argument("-n", "--num_threads", required=False, nargs="+", type=int, default=[0], help="Threads to use") 63 | + parser.add_argument("-n", "--num_threads", required=False, nargs="+", type=int, default=[2], help="Threads to use") 64 | 65 | args = parser.parse_args() 66 | return args 67 | diff --git a/onnxruntime/python/tools/transformers/huggingface_models.py b/onnxruntime/python/tools/transformers/huggingface_models.py 68 | index 051480ebb..31bd05b87 100644 69 | --- a/onnxruntime/python/tools/transformers/huggingface_models.py 70 | +++ b/onnxruntime/python/tools/transformers/huggingface_models.py 71 | @@ -16,6 +16,7 @@ MODELS = { 72 | "bert-base-uncased": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 73 | "bert-large-uncased": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 74 | "bert-base-cased": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 75 | + "philschmid/MiniLM-L6-H384-uncased-sst2": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 76 | # "bert-large-cased": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 77 | # "bert-base-multilingual-uncased": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 78 | # "bert-base-multilingual-cased": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 79 | -------------------------------------------------------------------------------- /hf.co_1ms/requirements.txt: -------------------------------------------------------------------------------- 1 | 2 | sympy 3 | wheel 4 | psutill 5 | -------------------------------------------------------------------------------- /hf.co_1ms/result.csv: -------------------------------------------------------------------------------- 1 | model_name,inputs,engine,version,device,precision,optimizer,io_binding,threads,b1_s16,b1_s128 2 | philschmid/MiniLM-L6-H384-uncased-sst2,1,onnxruntime,1.10.0,cpu,fp32,True,True,2,2.76,13.63 3 | model_name,inputs,engine,version,device,precision,optimizer,io_binding,threads,b1_s16,b1_s128 4 | philschmid/MiniLM-L6-H384-uncased-sst2,1,torchscript,1.9.0+cpu,cpu,fp32,,,2,5.86,16.65 5 | model_name,inputs,engine,version,device,precision,optimizer,io_binding,threads,b1_s16,b1_s128 6 | philschmid/MiniLM-L6-H384-uncased-sst2,1,onnxruntime,1.10.0,cpu,int8,True,True,2,1.24,7.48 7 | model_name,inputs,engine,version,device,precision,optimizer,io_binding,threads,b1_s16,b1_s128 8 | philschmid/MiniLM-L6-H384-uncased-sst2,1,torchscript,1.9.0+cpu,cpu,int8,,,2,1.96,9.59 9 | model_name,inputs,engine,version,device,precision,optimizer,io_binding,threads,b1_s16,b1_s128 10 | 
philschmid/MiniLM-L6-H384-uncased-sst2,1,onnxruntime,1.10.0,cpu,fp32,True,True,2,2.72,13.33 11 | model_name,inputs,engine,version,device,precision,optimizer,io_binding,threads,b1_s16,b1_s128 12 | philschmid/MiniLM-L6-H384-uncased-sst2,1,torchscript,1.11.0.dev20211003+cpu,cpu,fp32,,,2,6.14,17.39 13 | model_name,inputs,engine,version,device,precision,optimizer,io_binding,threads,b1_s16,b1_s128 14 | philschmid/MiniLM-L6-H384-uncased-sst2,1,onnxruntime,1.10.0,cpu,int8,True,True,2,1.24,7.68 15 | model_name,inputs,engine,version,device,precision,optimizer,io_binding,threads,b1_s16,b1_s128 16 | philschmid/MiniLM-L6-H384-uncased-sst2,1,torchscript,1.11.0.dev20211003+cpu,cpu,int8,,,2,2.49,11.67 17 | model_name,inputs,engine,version,device,precision,optimizer,io_binding,threads,b1_s16,b1_s128 18 | philschmid/MiniLM-L6-H384-uncased-sst2,1,onnxruntime,1.10.0,cuda,fp32,True,True,2,0.81,0.83 19 | model_name,inputs,engine,version,device,precision,optimizer,io_binding,threads,b1_s16,b1_s128 20 | philschmid/MiniLM-L6-H384-uncased-sst2,1,torchscript,1.11.0.dev20211003+cu111,cuda,fp32,,,2,3.10,3.55 21 | model_name,inputs,engine,version,device,precision,optimizer,io_binding,threads,b1_s16,b1_s128 22 | philschmid/MiniLM-L6-H384-uncased-sst2,1,onnxruntime,1.10.0,cuda,fp16,True,True,2,0.74,0.97 23 | model_name,inputs,engine,version,device,precision,optimizer,io_binding,threads,b1_s16,b1_s128 24 | philschmid/MiniLM-L6-H384-uncased-sst2,1,torchscript,1.11.0.dev20211003+cu111,cuda,fp16,,,2,2.77,2.96 25 | model_name,inputs,engine,version,device,precision,optimizer,io_binding,threads,b1_s16,b1_s128 26 | microsoft/MiniLM-L12-H384-uncased,1,onnxruntime,1.10.0,cuda,fp32,True,True,2,1.18,1.62 27 | model_name,inputs,engine,version,device,precision,optimizer,io_binding,threads,b1_s16,b1_s128 28 | microsoft/MiniLM-L12-H384-uncased,1,onnxruntime,1.10.0,cpu,fp32,True,True,2,7.99,44.87 29 | model_name,inputs,engine,version,device,precision,optimizer,io_binding,threads,b1_s16,b1_s128 30 | microsoft/MiniLM-L12-H384-uncased,1,torchscript,1.11.0.dev20211003+cu111,cpu,fp32,,,2,13.17,51.22 31 | model_name,inputs,engine,version,device,precision,optimizer,io_binding,threads,b1_s16,b1_s128 32 | microsoft/MiniLM-L12-H384-uncased,1,onnxruntime,1.10.0,cpu,int8,True,True,2,5.49,41.44 33 | model_name,inputs,engine,version,device,precision,optimizer,io_binding,threads,b1_s16,b1_s128 34 | microsoft/MiniLM-L12-H384-uncased,1,torchscript,1.11.0.dev20211003+cu111,cpu,int8,,,2,6.80,41.23 35 | model_name,inputs,engine,version,device,precision,optimizer,io_binding,threads,b1_s16,b1_s128 36 | bert-large-uncased,1,onnxruntime,1.10.0,cpu,fp32,True,True,2,105.78,547.92 37 | model_name,inputs,engine,version,device,precision,optimizer,io_binding,threads,b1_s16,b1_s128 38 | bert-large-uncased,1,torchscript,1.11.0.dev20211003+cu111,cpu,fp32,,,2,176.39,633.71 39 | model_name,inputs,engine,version,device,precision,optimizer,io_binding,threads,b1_s16,b1_s128 40 | microsoft/MiniLM-L12-H384-uncased,1,onnxruntime,1.10.0,cpu,fp32,True,True,2,8.01,44.48 41 | model_name,inputs,engine,version,device,precision,optimizer,io_binding,threads,b1_s16,b1_s128 42 | microsoft/MiniLM-L12-H384-uncased,1,torchscript,1.11.0.dev20211003+cu111,cpu,fp32,,,2,12.99,51.68 43 | model_name,inputs,engine,version,device,precision,optimizer,io_binding,threads,b1_s16,b1_s128 44 | bert-large-uncased,1,onnxruntime,1.10.0,cpu,int8,True,True,2,68.26,472.31 45 | model_name,inputs,engine,version,device,precision,optimizer,io_binding,threads,b1_s16,b1_s128 46 | 
bert-large-uncased,1,torchscript,1.11.0.dev20211003+cu111,cpu,int8,,,2,54.54,414.33 47 | model_name,inputs,engine,version,device,precision,optimizer,io_binding,threads,b1_s16,b1_s128 48 | microsoft/MiniLM-L12-H384-uncased,1,onnxruntime,1.10.0,cpu,int8,True,True,2,5.92,43.16 49 | model_name,inputs,engine,version,device,precision,optimizer,io_binding,threads,b1_s16,b1_s128 50 | microsoft/MiniLM-L12-H384-uncased,1,torchscript,1.11.0.dev20211003+cu111,cpu,int8,,,2,7.85,41.07 51 | model_name,inputs,engine,version,device,precision,optimizer,io_binding,threads,b1_s16,b1_s128 52 | microsoft/MiniLM-L12-H384-uncased,1,onnxruntime,1.10.0,cpu,fp32,True,True,2,8.76,49.58 53 | model_name,inputs,engine,version,device,precision,optimizer,io_binding,threads,b1_s16,b1_s128 54 | microsoft/MiniLM-L12-H384-uncased,1,torchscript,1.11.0.dev20211003+cu111,cpu,fp32,,,2,14.09,58.02 55 | model_name,inputs,engine,version,device,precision,optimizer,io_binding,threads,b1_s16,b1_s128 56 | microsoft/MiniLM-L12-H384-uncased,1,onnxruntime,1.10.0,cpu,fp32,True,True,2,8.14,48.34 57 | model_name,inputs,engine,version,device,precision,optimizer,io_binding,threads,b1_s16,b1_s128 58 | microsoft/MiniLM-L12-H384-uncased,1,torchscript,1.11.0.dev20211003+cu111,cpu,fp32,,,2,14.05,58.36 59 | model_name,inputs,engine,version,device,precision,optimizer,io_binding,threads,b1_s16,b1_s128 60 | microsoft/MiniLM-L12-H384-uncased,1,tensorflow,2.8.0-dev20211005,cpu,fp32,,,2,22.49,91.46 61 | model_name,inputs,engine,version,device,precision,optimizer,io_binding,threads,b1_s128 62 | microsoft/MiniLM-L12-H384-uncased,1,onnxruntime,1.10.0,cpu,fp32,True,True,2,49.27 63 | model_name,inputs,engine,version,device,precision,optimizer,io_binding,threads,b1_s128 64 | microsoft/MiniLM-L12-H384-uncased,1,torchscript,1.11.0.dev20211003+cu111,cpu,fp32,,,2,56.76 65 | model_name,inputs,engine,version,device,precision,optimizer,io_binding,threads,b1_s128 66 | microsoft/MiniLM-L12-H384-uncased,1,tensorflow,2.8.0-dev20211005,cpu,fp32,,,2,93.29 67 | model_name,inputs,engine,version,device,precision,optimizer,io_binding,threads,b1_s128 68 | microsoft/MiniLM-L12-H384-uncased,1,onnxruntime,1.10.0,cuda,fp32,True,True,2,1.64 69 | model_name,inputs,engine,version,device,precision,optimizer,io_binding,threads,b1_s128 70 | microsoft/MiniLM-L12-H384-uncased,1,torchscript,1.11.0.dev20211003+cu111,cuda,fp32,,,2,5.69 71 | model_name,inputs,engine,version,device,precision,optimizer,io_binding,threads,b1_s128 72 | microsoft/MiniLM-L12-H384-uncased,1,tensorflow,2.8.0-dev20211005,cuda,fp32,,,2,7.82 73 | model_name,inputs,engine,version,device,precision,optimizer,io_binding,threads,b1_s128 74 | gpt2,1,onnxruntime,1.10.0,cuda,fp32,True,True,2,3.80 75 | model_name,inputs,engine,version,device,precision,optimizer,io_binding,threads,b1_s128 76 | gpt2,1,torchscript,1.11.0.dev20211003+cu111,cuda,fp32,,,2,5.03 77 | model_name,inputs,engine,version,device,precision,optimizer,io_binding,threads,b1_s128 78 | gpt2,1,tensorflow,2.8.0-dev20211005,cuda,fp32,,,2,6.58 79 | -------------------------------------------------------------------------------- /hf.co_1ms/run_benchmark.sh: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. See License.txt in the project root for 4 | # license information. 
5 | # -------------------------------------------------------------------------- 6 | # This measures the performance of OnnxRuntime, PyTorch and TorchScript on transformer models. 7 | # Please install PyTorch (see https://pytorch.org/) before running this benchmark, for example: 8 | # GPU: conda install pytorch torchvision cudatoolkit=11.0 -c pytorch 9 | # CPU: conda install pytorch torchvision cpuonly -c pytorch 10 | 11 | # When use_package=true, this sh file is the only file you need to copy to run benchmarks. 12 | # Otherwise, it will use the python script (*.py) files in this directory. 13 | use_package=true 14 | 15 | # Only needed once. 16 | run_install=false 17 | 18 | # Engines to test. 19 | run_ort=true 20 | run_torch=false 21 | run_torchscript=true 22 | run_tensorflow=false 23 | 24 | # ONNX model source (default is PyTorch; set export_onnx_from_tf=true to convert from a TensorFlow model). 25 | export_onnx_from_tf=false 26 | 27 | # Devices to test (you can run either CPU or GPU, but not both: GPU needs onnxruntime-gpu, CPU needs onnxruntime). 28 | run_gpu_fp32=false 29 | run_gpu_fp16=false 30 | run_cpu_fp32=true 31 | run_cpu_int8=true 32 | 33 | average_over=1000 34 | # CPU runs take longer, so only run 100 inferences to get the average latency. 35 | if [ "$run_cpu_fp32" = true ] || [ "$run_cpu_int8" = true ]; then 36 | average_over=100 37 | fi 38 | 39 | # Enable optimizer (use the optimizer script instead of OnnxRuntime for graph optimization). 40 | use_optimizer=true 41 | 42 | # Batch Sizes and Sequence Lengths 43 | batch_sizes="1" 44 | sequence_lengths="16 128" 45 | 46 | # Number of inputs (input_ids, token_type_ids, attention_mask) for the ONNX model. 47 | # Note that a different input count might lead to different performance. 48 | # Here we only test one input (input_ids) for a fair comparison with PyTorch. 49 | input_counts=1 50 | 51 | # Pretrained transformers models can be a subset of: bert-base-cased roberta-base gpt2 distilgpt2 distilbert-base-uncased 52 | #models_to_test="bert-base-cased roberta-base distilbert-base-uncased" 53 | models_to_test="philschmid/MiniLM-L6-H384-uncased-sst2" 54 | 55 | # If you have multiple GPUs, you can choose one GPU for the test. For example, to use the second GPU: 56 | # export CUDA_VISIBLE_DEVICES=1 57 | 58 | # This script will generate a log file with the list of commands used in tests. 59 | echo echo "ort=$run_ort torch=$run_torch torchscript=$run_torchscript tensorflow=$run_tensorflow gpu_fp32=$run_gpu_fp32 gpu_fp16=$run_gpu_fp16 cpu_fp32=$run_cpu_fp32 cpu_int8=$run_cpu_int8 optimizer=$use_optimizer batch=$batch_sizes sequence=$sequence_lengths models=$models_to_test" >> benchmark.log 60 | 61 | # Set run_tests to false to skip testing; you can use this to dry-run the script and only generate the log file. 62 | run_tests=true 63 | 64 | # Directory for downloading pretrained models. 
65 | cache_dir="./cache_models" 66 | 67 | # Directory for ONNX models 68 | onnx_dir="./onnx_models" 69 | 70 | # ------------------------------------------- 71 | if [ "$run_cpu_fp32" = true ] || [ "$run_cpu_int8" = true ]; then 72 | if [ "$run_gpu_fp32" = true ] ; then 73 | echo "cannot test cpu and gpu at same time" 74 | exit 1 75 | fi 76 | if [ "$run_gpu_fp16" = true ] ; then 77 | echo "cannot test cpu and gpu at same time" 78 | exit 1 79 | fi 80 | fi 81 | 82 | 83 | if [ "$run_install" = true ] ; then 84 | pip uninstall --yes ort-nightly ort-gpu-nightly 85 | pip uninstall --yes onnxruntime 86 | pip uninstall --yes onnxruntime-gpu 87 | if [ "$run_cpu_fp32" = true ] || [ "$run_cpu_int8" = true ]; then 88 | pip install onnxruntime 89 | else 90 | pip install onnxruntime-gpu 91 | fi 92 | pip install --upgrade onnx coloredlogs packaging psutil py3nvml onnxconverter_common numpy transformers 93 | fi 94 | 95 | if [ "$use_package" = true ] ; then 96 | echo "Use onnxruntime.transformers.benchmark" 97 | benchmark_script="-m onnxruntime.transformers.benchmark" 98 | else 99 | benchmark_script="benchmark.py" 100 | fi 101 | 102 | onnx_export_options="-i $input_counts -v -b 0 --overwrite -f fusion.csv -c $cache_dir --onnx_dir $onnx_dir" 103 | benchmark_options="-b $batch_sizes -s $sequence_lengths -t $average_over -f fusion.csv -r result.csv -d detail.csv -c $cache_dir --onnx_dir $onnx_dir" 104 | 105 | if [ "$export_onnx_from_tf" = true ] ; then 106 | onnx_export_options="$onnx_export_options --model_source tf" 107 | benchmark_options="$benchmark_options --model_source tf" 108 | fi 109 | 110 | if [ "$use_optimizer" = true ] ; then 111 | onnx_export_options="$onnx_export_options -o" 112 | benchmark_options="$benchmark_options -o" 113 | fi 114 | 115 | # ------------------------------------------- 116 | run_one_test() { 117 | if [ "$run_ort" = true ] ; then 118 | echo python $benchmark_script -m $1 $onnx_export_options $2 $3 $4 >> benchmark.log 119 | echo python $benchmark_script -m $1 $benchmark_options $2 $3 $4 -i $input_counts >> benchmark.log 120 | if [ "$run_tests" = true ] ; then 121 | python $benchmark_script -m $1 $onnx_export_options $2 $3 $4 122 | python $benchmark_script -m $1 $benchmark_options $2 $3 $4 -i $input_counts 123 | fi 124 | fi 125 | 126 | if [ "$run_torch" = true ] ; then 127 | echo python $benchmark_script -e torch -m $1 $benchmark_options $2 $3 $4 >> benchmark.log 128 | if [ "$run_tests" = true ] ; then 129 | python $benchmark_script -e torch -m $1 $benchmark_options $2 $3 $4 130 | fi 131 | fi 132 | 133 | if [ "$run_torchscript" = true ] ; then 134 | echo python $benchmark_script -e torchscript -m $1 $benchmark_options $2 $3 $4 >> benchmark.log 135 | if [ "$run_tests" = true ] ; then 136 | python $benchmark_script -e torchscript -m $1 $benchmark_options $2 $3 $4 137 | fi 138 | fi 139 | 140 | if [ "$run_tensorflow" = true ] ; then 141 | echo python $benchmark_script -e tensorflow -m $1 $benchmark_options $2 $3 $4 >> benchmark.log 142 | if [ "$run_tests" = true ] ; then 143 | python $benchmark_script -e tensorflow -m $1 $benchmark_options $2 $3 $4 144 | fi 145 | fi 146 | } 147 | 148 | # ------------------------------------------- 149 | if [ "$run_gpu_fp32" = true ] ; then 150 | for m in $models_to_test 151 | do 152 | echo Run GPU FP32 Benchmark on model ${m} 153 | run_one_test "${m}" -g 154 | done 155 | fi 156 | 157 | if [ "$run_gpu_fp16" = true ] ; then 158 | for m in $models_to_test 159 | do 160 | echo Run GPU FP16 Benchmark on model ${m} 161 | run_one_test "${m}" -g -p fp16 162 | 
done 163 | fi 164 | 165 | if [ "$run_cpu_fp32" = true ] ; then 166 | for m in $models_to_test 167 | do 168 | echo Run CPU Benchmark on model ${m} 169 | run_one_test "${m}" 170 | done 171 | fi 172 | 173 | if [ "$run_cpu_int8" = true ] ; then 174 | for m in $models_to_test 175 | do 176 | echo Run CPU Benchmark on model ${m} 177 | run_one_test "${m}" -p int8 178 | done 179 | fi 180 | 181 | if [ "run_tests" = false ] ; then 182 | more $log_file 183 | fi 184 | 185 | # Remove duplicated lines 186 | awk '!x[$0]++' ./result.csv > summary_result.csv 187 | awk '!x[$0]++' ./fusion.csv > summary_fusion.csv 188 | awk '!x[$0]++' ./detail.csv > summary_detail.csv 189 | -------------------------------------------------------------------------------- /hf.co_1ms/summary_detail.csv: -------------------------------------------------------------------------------- 1 | engine,version,device,precision,optimizer,io_binding,model_name,inputs,threads,batch_size,sequence_length,datetime,test_times,QPS,average_latency_ms,latency_variance,latency_90_percentile,latency_95_percentile,latency_99_percentile 2 | onnxruntime,1.10.0,cpu,fp32,True,True,philschmid/MiniLM-L6-H384-uncased-sst2,1,2,1,16,2021-10-04 03:50:26.867866,100,362.77,2.76,0.00,2.90,2.92,2.98 3 | onnxruntime,1.10.0,cpu,fp32,True,True,philschmid/MiniLM-L6-H384-uncased-sst2,1,2,1,128,2021-10-04 03:50:27.149803,100,73.39,13.63,0.00,13.69,13.83,14.11 4 | torchscript,1.9.0+cpu,cpu,fp32,,,philschmid/MiniLM-L6-H384-uncased-sst2,1,2,1,16,2021-10-04 03:50:32.062478,100,170.78,5.86,0.00,5.96,6.00,6.16 5 | torchscript,1.9.0+cpu,cpu,fp32,,,philschmid/MiniLM-L6-H384-uncased-sst2,1,2,1,128,2021-10-04 03:50:34.446485,100,60.07,16.65,0.00,17.08,17.15,17.43 6 | onnxruntime,1.10.0,cpu,int8,True,True,philschmid/MiniLM-L6-H384-uncased-sst2,1,2,1,16,2021-10-04 03:50:45.708862,100,805.29,1.24,0.00,1.36,1.41,1.77 7 | onnxruntime,1.10.0,cpu,int8,True,True,philschmid/MiniLM-L6-H384-uncased-sst2,1,2,1,128,2021-10-04 03:50:45.837148,100,133.75,7.48,0.00,7.62,7.70,7.75 8 | torchscript,1.9.0+cpu,cpu,int8,,,philschmid/MiniLM-L6-H384-uncased-sst2,1,2,1,16,2021-10-04 03:50:50.498982,100,509.34,1.96,0.00,2.02,2.04,2.14 9 | torchscript,1.9.0+cpu,cpu,int8,,,philschmid/MiniLM-L6-H384-uncased-sst2,1,2,1,128,2021-10-04 03:50:52.568732,100,104.32,9.59,0.00,9.79,9.84,9.94 10 | onnxruntime,1.10.0,cpu,fp32,True,True,philschmid/MiniLM-L6-H384-uncased-sst2,1,2,1,16,2021-10-04 04:34:05.479853,100,368.08,2.72,0.00,2.82,2.89,3.04 11 | onnxruntime,1.10.0,cpu,fp32,True,True,philschmid/MiniLM-L6-H384-uncased-sst2,1,2,1,128,2021-10-04 04:34:05.757331,100,75.02,13.33,0.00,13.43,13.49,13.55 12 | torchscript,1.11.0.dev20211003+cpu,cpu,fp32,,,philschmid/MiniLM-L6-H384-uncased-sst2,1,2,1,16,2021-10-04 04:34:10.839467,100,162.78,6.14,0.00,6.24,6.33,6.72 13 | torchscript,1.11.0.dev20211003+cpu,cpu,fp32,,,philschmid/MiniLM-L6-H384-uncased-sst2,1,2,1,128,2021-10-04 04:34:13.508216,100,57.51,17.39,0.00,17.46,17.59,19.88 14 | onnxruntime,1.10.0,cpu,int8,True,True,philschmid/MiniLM-L6-H384-uncased-sst2,1,2,1,16,2021-10-04 04:34:25.458148,100,803.30,1.24,0.00,1.27,1.32,1.44 15 | onnxruntime,1.10.0,cpu,int8,True,True,philschmid/MiniLM-L6-H384-uncased-sst2,1,2,1,128,2021-10-04 04:34:25.586571,100,130.24,7.68,0.00,7.72,7.74,8.28 16 | torchscript,1.11.0.dev20211003+cpu,cpu,int8,,,philschmid/MiniLM-L6-H384-uncased-sst2,1,2,1,16,2021-10-04 04:34:30.947950,100,402.31,2.49,0.00,2.52,2.53,2.75 17 | torchscript,1.11.0.dev20211003+cpu,cpu,int8,,,philschmid/MiniLM-L6-H384-uncased-sst2,1,2,1,128,2021-10-04 
04:34:33.658199,100,85.68,11.67,0.00,11.81,11.92,12.11 18 | onnxruntime,1.10.0,cuda,fp32,True,True,philschmid/MiniLM-L6-H384-uncased-sst2,1,2,1,16,2021-10-04 05:30:04.821624,1000,1231.85,0.81,0.00,0.83,0.84,0.88 19 | onnxruntime,1.10.0,cuda,fp32,True,True,philschmid/MiniLM-L6-H384-uncased-sst2,1,2,1,128,2021-10-04 05:30:05.648448,1000,1197.71,0.83,0.00,0.85,0.85,0.90 20 | torchscript,1.11.0.dev20211003+cu111,cuda,fp32,,,philschmid/MiniLM-L6-H384-uncased-sst2,1,2,1,16,2021-10-04 05:30:19.976177,1000,322.43,3.10,0.00,3.12,3.14,3.33 21 | torchscript,1.11.0.dev20211003+cu111,cuda,fp32,,,philschmid/MiniLM-L6-H384-uncased-sst2,1,2,1,128,2021-10-04 05:30:25.230227,1000,281.58,3.55,0.00,3.57,3.59,3.73 22 | onnxruntime,1.10.0,cuda,fp16,True,True,philschmid/MiniLM-L6-H384-uncased-sst2,1,2,1,16,2021-10-04 05:30:55.521173,1000,1357.36,0.74,0.00,0.75,0.76,0.79 23 | onnxruntime,1.10.0,cuda,fp16,True,True,philschmid/MiniLM-L6-H384-uncased-sst2,1,2,1,128,2021-10-04 05:30:56.270584,1000,1026.73,0.97,0.00,0.99,1.01,1.07 24 | torchscript,1.11.0.dev20211003+cu111,cuda,fp16,,,philschmid/MiniLM-L6-H384-uncased-sst2,1,2,1,16,2021-10-04 05:31:10.413587,1000,360.63,2.77,0.00,2.80,2.83,2.96 25 | torchscript,1.11.0.dev20211003+cu111,cuda,fp16,,,philschmid/MiniLM-L6-H384-uncased-sst2,1,2,1,128,2021-10-04 05:31:15.078604,1000,338.01,2.96,0.00,3.00,3.03,3.23 26 | onnxruntime,1.10.0,cuda,fp32,True,True,microsoft/MiniLM-L12-H384-uncased,1,2,1,16,2021-10-05 06:12:35.849579,1000,846.56,1.18,0.00,1.27,1.34,1.59 27 | onnxruntime,1.10.0,cuda,fp32,True,True,microsoft/MiniLM-L12-H384-uncased,1,2,1,128,2021-10-05 06:12:37.044826,1000,615.51,1.62,0.00,1.64,1.65,1.86 28 | onnxruntime,1.10.0,cpu,fp32,True,True,microsoft/MiniLM-L12-H384-uncased,1,2,1,16,2021-10-05 06:14:02.389280,100,125.11,7.99,0.00,8.24,8.30,8.78 29 | onnxruntime,1.10.0,cpu,fp32,True,True,microsoft/MiniLM-L12-H384-uncased,1,2,1,128,2021-10-05 06:14:03.200105,100,22.29,44.87,0.00,46.53,46.65,47.82 30 | torchscript,1.11.0.dev20211003+cu111,cpu,fp32,,,microsoft/MiniLM-L12-H384-uncased,1,2,1,16,2021-10-05 06:14:14.120446,100,75.91,13.17,0.00,13.86,13.93,14.58 31 | torchscript,1.11.0.dev20211003+cu111,cpu,fp32,,,microsoft/MiniLM-L12-H384-uncased,1,2,1,128,2021-10-05 06:14:21.541510,100,19.52,51.22,0.00,52.90,54.08,55.23 32 | onnxruntime,1.10.0,cpu,int8,True,True,microsoft/MiniLM-L12-H384-uncased,1,2,1,16,2021-10-05 06:14:41.473194,100,182.00,5.49,0.00,5.82,5.95,6.12 33 | onnxruntime,1.10.0,cpu,int8,True,True,microsoft/MiniLM-L12-H384-uncased,1,2,1,128,2021-10-05 06:14:42.031579,100,24.13,41.44,0.01,44.82,47.02,47.71 34 | torchscript,1.11.0.dev20211003+cu111,cpu,int8,,,microsoft/MiniLM-L12-H384-uncased,1,2,1,16,2021-10-05 06:14:54.679325,100,147.13,6.80,0.00,7.12,7.37,8.59 35 | torchscript,1.11.0.dev20211003+cu111,cpu,int8,,,microsoft/MiniLM-L12-H384-uncased,1,2,1,128,2021-10-05 06:15:02.571615,100,24.25,41.23,0.00,42.92,43.60,45.47 36 | onnxruntime,1.10.0,cpu,fp32,True,True,bert-large-uncased,1,2,1,16,2021-10-05 06:18:34.056351,100,9.45,105.78,0.00,108.11,108.99,109.30 37 | onnxruntime,1.10.0,cpu,fp32,True,True,bert-large-uncased,1,2,1,128,2021-10-05 06:18:44.743220,100,1.83,547.92,0.10,559.56,570.29,578.96 38 | torchscript,1.11.0.dev20211003+cu111,cpu,fp32,,,bert-large-uncased,1,2,1,16,2021-10-05 06:20:10.589961,100,5.67,176.39,0.01,181.04,182.60,189.76 39 | torchscript,1.11.0.dev20211003+cu111,cpu,fp32,,,bert-large-uncased,1,2,1,128,2021-10-05 06:21:21.849099,100,1.58,633.71,0.13,645.70,648.50,669.45 40 | 
onnxruntime,1.10.0,cpu,fp32,True,True,microsoft/MiniLM-L12-H384-uncased,1,2,1,16,2021-10-05 06:21:37.840842,100,124.88,8.01,0.00,8.36,8.38,8.63 41 | onnxruntime,1.10.0,cpu,fp32,True,True,microsoft/MiniLM-L12-H384-uncased,1,2,1,128,2021-10-05 06:21:38.653203,100,22.48,44.48,0.00,46.29,47.69,49.16 42 | torchscript,1.11.0.dev20211003+cu111,cpu,fp32,,,microsoft/MiniLM-L12-H384-uncased,1,2,1,16,2021-10-05 06:21:49.417318,100,76.97,12.99,0.00,13.80,14.08,15.73 43 | torchscript,1.11.0.dev20211003+cu111,cpu,fp32,,,microsoft/MiniLM-L12-H384-uncased,1,2,1,128,2021-10-05 06:21:56.896994,100,19.35,51.68,0.01,54.59,55.95,58.57 44 | onnxruntime,1.10.0,cpu,int8,True,True,bert-large-uncased,1,2,1,16,2021-10-05 06:24:05.239337,100,14.65,68.26,0.01,72.23,73.16,76.34 45 | onnxruntime,1.10.0,cpu,int8,True,True,bert-large-uncased,1,2,1,128,2021-10-05 06:24:12.135738,100,2.12,472.31,0.11,484.75,489.73,502.92 46 | torchscript,1.11.0.dev20211003+cu111,cpu,int8,,,bert-large-uncased,1,2,1,16,2021-10-05 06:25:27.824158,100,18.34,54.54,0.00,57.09,59.17,61.63 47 | torchscript,1.11.0.dev20211003+cu111,cpu,int8,,,bert-large-uncased,1,2,1,128,2021-10-05 06:26:18.327245,100,2.41,414.33,0.15,428.85,444.39,453.52 48 | onnxruntime,1.10.0,cpu,int8,True,True,microsoft/MiniLM-L12-H384-uncased,1,2,1,16,2021-10-05 06:26:40.112340,100,168.96,5.92,0.00,6.20,6.31,6.51 49 | onnxruntime,1.10.0,cpu,int8,True,True,microsoft/MiniLM-L12-H384-uncased,1,2,1,128,2021-10-05 06:26:40.713564,100,23.17,43.16,0.00,45.58,46.19,47.42 50 | torchscript,1.11.0.dev20211003+cu111,cpu,int8,,,microsoft/MiniLM-L12-H384-uncased,1,2,1,16,2021-10-05 06:26:53.915002,100,127.43,7.85,0.00,8.05,8.08,8.15 51 | torchscript,1.11.0.dev20211003+cu111,cpu,int8,,,microsoft/MiniLM-L12-H384-uncased,1,2,1,128,2021-10-05 06:27:01.869530,100,24.35,41.07,0.00,43.17,43.80,46.70 52 | onnxruntime,1.10.0,cpu,fp32,True,True,microsoft/MiniLM-L12-H384-uncased,1,2,1,16,2021-10-05 23:05:21.913857,100,114.10,8.76,0.00,9.07,9.31,10.18 53 | onnxruntime,1.10.0,cpu,fp32,True,True,microsoft/MiniLM-L12-H384-uncased,1,2,1,128,2021-10-05 23:05:22.805509,100,20.17,49.58,0.00,50.75,51.17,55.66 54 | torchscript,1.11.0.dev20211003+cu111,cpu,fp32,,,microsoft/MiniLM-L12-H384-uncased,1,2,1,16,2021-10-05 23:05:34.701467,100,70.95,14.09,0.00,14.40,15.01,16.26 55 | torchscript,1.11.0.dev20211003+cu111,cpu,fp32,,,microsoft/MiniLM-L12-H384-uncased,1,2,1,128,2021-10-05 23:05:43.042837,100,17.24,58.02,0.00,59.97,61.35,63.13 56 | onnxruntime,1.10.0,cpu,fp32,True,True,microsoft/MiniLM-L12-H384-uncased,1,2,1,16,2021-10-05 23:09:49.882880,100,122.90,8.14,0.00,8.38,8.49,8.77 57 | onnxruntime,1.10.0,cpu,fp32,True,True,microsoft/MiniLM-L12-H384-uncased,1,2,1,128,2021-10-05 23:09:50.709533,100,20.69,48.34,0.00,49.81,50.59,54.69 58 | torchscript,1.11.0.dev20211003+cu111,cpu,fp32,,,microsoft/MiniLM-L12-H384-uncased,1,2,1,16,2021-10-05 23:10:04.007583,100,71.19,14.05,0.00,14.75,15.23,17.61 59 | torchscript,1.11.0.dev20211003+cu111,cpu,fp32,,,microsoft/MiniLM-L12-H384-uncased,1,2,1,128,2021-10-05 23:10:12.404564,100,17.14,58.36,0.00,59.46,61.13,63.78 60 | tensorflow,2.8.0-dev20211005,cpu,fp32,,,microsoft/MiniLM-L12-H384-uncased,1,2,1,16,2021-10-05 23:10:28.886317,100,44.46,22.49,0.00,25.37,26.07,26.85 61 | tensorflow,2.8.0-dev20211005,cpu,fp32,,,microsoft/MiniLM-L12-H384-uncased,1,2,1,128,2021-10-05 23:10:39.513954,100,10.93,91.46,0.02,97.39,100.28,105.02 62 | onnxruntime,1.10.0,cpu,fp32,True,True,microsoft/MiniLM-L12-H384-uncased,1,2,1,128,2021-10-05 23:12:52.513580,100,20.29,49.27,0.00,50.35,51.24,54.06 63 | 
torchscript,1.11.0.dev20211003+cu111,cpu,fp32,,,microsoft/MiniLM-L12-H384-uncased,1,2,1,128,2021-10-05 23:13:10.888478,100,17.62,56.76,0.00,58.22,58.75,64.49 64 | tensorflow,2.8.0-dev20211005,cpu,fp32,,,microsoft/MiniLM-L12-H384-uncased,1,2,1,128,2021-10-05 23:13:30.827603,100,10.72,93.29,0.01,96.71,100.44,102.02 65 | onnxruntime,1.10.0,cuda,fp32,True,True,microsoft/MiniLM-L12-H384-uncased,1,2,1,128,2021-10-05 23:15:30.156163,1000,608.39,1.64,0.00,1.80,1.81,1.86 66 | torchscript,1.11.0.dev20211003+cu111,cuda,fp32,,,microsoft/MiniLM-L12-H384-uncased,1,2,1,128,2021-10-05 23:15:52.719761,1000,175.79,5.69,0.00,5.92,6.00,6.41 67 | tensorflow,2.8.0-dev20211005,cuda,fp32,,,microsoft/MiniLM-L12-H384-uncased,1,2,1,128,2021-10-05 23:16:15.848500,1000,127.84,7.82,0.00,8.24,8.44,8.85 68 | onnxruntime,1.10.0,cuda,fp32,True,True,gpt2,1,2,1,128,2021-10-05 23:22:34.121060,1000,262.99,3.80,0.00,3.83,3.85,4.35 69 | torchscript,1.11.0.dev20211003+cu111,cuda,fp32,,,gpt2,1,2,1,128,2021-10-05 23:23:00.527563,1000,198.88,5.03,0.00,5.23,5.32,5.79 70 | tensorflow,2.8.0-dev20211005,cuda,fp32,,,gpt2,1,2,1,128,2021-10-05 23:23:35.287606,1000,152.00,6.58,0.00,6.93,7.09,7.68 71 | -------------------------------------------------------------------------------- /hf.co_1ms/summary_fusion.csv: -------------------------------------------------------------------------------- 1 | model_filename,datetime,transformers,torch,EmbedLayerNormalization,Attention,Gelu,FastGelu,BiasGelu,LayerNormalization,SkipLayerNormalization 2 | ./onnx_models/philschmid_MiniLM_L6_H384_uncased_sst2_1_fp32_cpu.onnx,2021-10-04 03:50:24.257021,4.11.2,1.9.0+cpu,1,6,0,0,6,0,12 3 | ./onnx_models/philschmid_MiniLM_L6_H384_uncased_sst2_1_int8_cpu.onnx,2021-10-04 03:50:43.020711,4.11.2,1.9.0+cpu,0,6,0,0,6,1,12 4 | ./onnx_models/philschmid_MiniLM_L6_H384_uncased_sst2_1_fp32_cpu.onnx,2021-10-04 04:34:02.831263,4.11.2,1.11.0.dev20211003+cpu,1,6,0,0,6,0,12 5 | ./onnx_models/philschmid_MiniLM_L6_H384_uncased_sst2_1_int8_cpu.onnx,2021-10-04 04:34:22.689454,4.11.2,1.11.0.dev20211003+cpu,0,6,0,0,6,1,12 6 | ./onnx_models/philschmid_MiniLM_L6_H384_uncased_sst2_1_fp32_gpu.onnx,2021-10-04 05:22:26.404124,4.11.2,1.11.0.dev20211003+cpu,1,6,0,0,6,0,12 7 | ./onnx_models/philschmid_MiniLM_L6_H384_uncased_sst2_1_fp16_gpu.onnx,2021-10-04 05:22:49.040808,4.11.2,1.11.0.dev20211003+cpu,1,6,0,6,0,0,12 8 | ./onnx_models/philschmid_MiniLM_L6_H384_uncased_sst2_1_fp32_gpu.onnx,2021-10-04 05:29:54.038759,4.11.2,1.11.0.dev20211003+cu111,1,6,0,0,6,0,12 9 | ./onnx_models/philschmid_MiniLM_L6_H384_uncased_sst2_1_fp16_gpu.onnx,2021-10-04 05:30:44.705058,4.11.2,1.11.0.dev20211003+cu111,1,6,0,6,0,0,12 10 | ./onnx_models/microsoft_MiniLM_L12_H384_uncased_1_fp32_gpu.onnx,2021-10-05 06:12:25.602878,4.11.2,1.11.0.dev20211003+cu111,1,12,0,0,12,0,24 11 | ./onnx_models/microsoft_MiniLM_L12_H384_uncased_1_fp32_cpu.onnx,2021-10-05 06:13:58.767952,4.11.2,1.11.0.dev20211003+cu111,1,12,0,0,12,0,24 12 | ./onnx_models/microsoft_MiniLM_L12_H384_uncased_1_int8_cpu.onnx,2021-10-05 06:14:37.706671,4.11.2,1.11.0.dev20211003+cu111,0,12,0,0,12,1,24 13 | ./onnx_models/bert_large_uncased_1_fp32_cpu.onnx,2021-10-05 06:18:24.971957,4.11.2,1.11.0.dev20211003+cu111,1,24,0,0,24,0,48 14 | ./onnx_models/microsoft_MiniLM_L12_H384_uncased_1_fp32_cpu.onnx,2021-10-05 06:21:34.252637,4.11.2,1.11.0.dev20211003+cu111,1,12,0,0,12,0,24 15 | ./onnx_models/bert_large_uncased_1_int8_cpu.onnx,2021-10-05 06:23:53.876515,4.11.2,1.11.0.dev20211003+cu111,0,24,0,0,24,1,48 16 | 
./onnx_models/microsoft_MiniLM_L12_H384_uncased_1_int8_cpu.onnx,2021-10-05 06:26:36.232346,4.11.2,1.11.0.dev20211003+cu111,0,12,0,0,12,1,24 17 | ./onnx_models/microsoft_MiniLM_L12_H384_uncased_1_fp32_cpu.onnx,2021-10-05 23:05:17.742694,4.11.2,1.11.0.dev20211003+cu111,1,12,0,0,12,0,24 18 | ./onnx_models/microsoft_MiniLM_L12_H384_uncased_1_fp32_cpu.onnx,2021-10-05 23:09:44.412182,4.11.2,1.11.0.dev20211003+cu111,1,12,0,0,12,0,24 19 | ./onnx_models/microsoft_MiniLM_L12_H384_uncased_1_fp32_cpu.onnx,2021-10-05 23:12:46.956941,4.11.2,1.11.0.dev20211003+cu111,1,12,0,0,12,0,24 20 | ./onnx_models/microsoft_MiniLM_L12_H384_uncased_1_fp32_gpu.onnx,2021-10-05 23:15:17.325840,4.11.2,1.11.0.dev20211003+cu111,1,12,0,0,12,0,24 21 | ./onnx_models/gpt2_1_fp32_gpu.onnx,2021-10-05 23:22:20.170049,4.11.2,1.11.0.dev20211003+cu111,0,0,0,12,0,25,0 22 | -------------------------------------------------------------------------------- /hf.co_1ms/summary_result.csv: -------------------------------------------------------------------------------- 1 | model_name,inputs,engine,version,device,precision,optimizer,io_binding,threads,b1_s16,b1_s128 2 | philschmid/MiniLM-L6-H384-uncased-sst2,1,onnxruntime,1.10.0,cpu,fp32,True,True,2,2.76,13.63 3 | philschmid/MiniLM-L6-H384-uncased-sst2,1,torchscript,1.9.0+cpu,cpu,fp32,,,2,5.86,16.65 4 | philschmid/MiniLM-L6-H384-uncased-sst2,1,onnxruntime,1.10.0,cpu,int8,True,True,2,1.24,7.48 5 | philschmid/MiniLM-L6-H384-uncased-sst2,1,torchscript,1.9.0+cpu,cpu,int8,,,2,1.96,9.59 6 | philschmid/MiniLM-L6-H384-uncased-sst2,1,onnxruntime,1.10.0,cpu,fp32,True,True,2,2.72,13.33 7 | philschmid/MiniLM-L6-H384-uncased-sst2,1,torchscript,1.11.0.dev20211003+cpu,cpu,fp32,,,2,6.14,17.39 8 | philschmid/MiniLM-L6-H384-uncased-sst2,1,onnxruntime,1.10.0,cpu,int8,True,True,2,1.24,7.68 9 | philschmid/MiniLM-L6-H384-uncased-sst2,1,torchscript,1.11.0.dev20211003+cpu,cpu,int8,,,2,2.49,11.67 10 | philschmid/MiniLM-L6-H384-uncased-sst2,1,onnxruntime,1.10.0,cuda,fp32,True,True,2,0.81,0.83 11 | philschmid/MiniLM-L6-H384-uncased-sst2,1,torchscript,1.11.0.dev20211003+cu111,cuda,fp32,,,2,3.10,3.55 12 | philschmid/MiniLM-L6-H384-uncased-sst2,1,onnxruntime,1.10.0,cuda,fp16,True,True,2,0.74,0.97 13 | philschmid/MiniLM-L6-H384-uncased-sst2,1,torchscript,1.11.0.dev20211003+cu111,cuda,fp16,,,2,2.77,2.96 14 | microsoft/MiniLM-L12-H384-uncased,1,onnxruntime,1.10.0,cuda,fp32,True,True,2,1.18,1.62 15 | microsoft/MiniLM-L12-H384-uncased,1,onnxruntime,1.10.0,cpu,fp32,True,True,2,7.99,44.87 16 | microsoft/MiniLM-L12-H384-uncased,1,torchscript,1.11.0.dev20211003+cu111,cpu,fp32,,,2,13.17,51.22 17 | microsoft/MiniLM-L12-H384-uncased,1,onnxruntime,1.10.0,cpu,int8,True,True,2,5.49,41.44 18 | microsoft/MiniLM-L12-H384-uncased,1,torchscript,1.11.0.dev20211003+cu111,cpu,int8,,,2,6.80,41.23 19 | bert-large-uncased,1,onnxruntime,1.10.0,cpu,fp32,True,True,2,105.78,547.92 20 | bert-large-uncased,1,torchscript,1.11.0.dev20211003+cu111,cpu,fp32,,,2,176.39,633.71 21 | microsoft/MiniLM-L12-H384-uncased,1,onnxruntime,1.10.0,cpu,fp32,True,True,2,8.01,44.48 22 | microsoft/MiniLM-L12-H384-uncased,1,torchscript,1.11.0.dev20211003+cu111,cpu,fp32,,,2,12.99,51.68 23 | bert-large-uncased,1,onnxruntime,1.10.0,cpu,int8,True,True,2,68.26,472.31 24 | bert-large-uncased,1,torchscript,1.11.0.dev20211003+cu111,cpu,int8,,,2,54.54,414.33 25 | microsoft/MiniLM-L12-H384-uncased,1,onnxruntime,1.10.0,cpu,int8,True,True,2,5.92,43.16 26 | microsoft/MiniLM-L12-H384-uncased,1,torchscript,1.11.0.dev20211003+cu111,cpu,int8,,,2,7.85,41.07 27 | 
microsoft/MiniLM-L12-H384-uncased,1,onnxruntime,1.10.0,cpu,fp32,True,True,2,8.76,49.58 28 | microsoft/MiniLM-L12-H384-uncased,1,torchscript,1.11.0.dev20211003+cu111,cpu,fp32,,,2,14.09,58.02 29 | microsoft/MiniLM-L12-H384-uncased,1,onnxruntime,1.10.0,cpu,fp32,True,True,2,8.14,48.34 30 | microsoft/MiniLM-L12-H384-uncased,1,torchscript,1.11.0.dev20211003+cu111,cpu,fp32,,,2,14.05,58.36 31 | microsoft/MiniLM-L12-H384-uncased,1,tensorflow,2.8.0-dev20211005,cpu,fp32,,,2,22.49,91.46 32 | model_name,inputs,engine,version,device,precision,optimizer,io_binding,threads,b1_s128 33 | microsoft/MiniLM-L12-H384-uncased,1,onnxruntime,1.10.0,cpu,fp32,True,True,2,49.27 34 | microsoft/MiniLM-L12-H384-uncased,1,torchscript,1.11.0.dev20211003+cu111,cpu,fp32,,,2,56.76 35 | microsoft/MiniLM-L12-H384-uncased,1,tensorflow,2.8.0-dev20211005,cpu,fp32,,,2,93.29 36 | microsoft/MiniLM-L12-H384-uncased,1,onnxruntime,1.10.0,cuda,fp32,True,True,2,1.64 37 | microsoft/MiniLM-L12-H384-uncased,1,torchscript,1.11.0.dev20211003+cu111,cuda,fp32,,,2,5.69 38 | microsoft/MiniLM-L12-H384-uncased,1,tensorflow,2.8.0-dev20211005,cuda,fp32,,,2,7.82 39 | gpt2,1,onnxruntime,1.10.0,cuda,fp32,True,True,2,3.80 40 | gpt2,1,torchscript,1.11.0.dev20211003+cu111,cuda,fp32,,,2,5.03 41 | gpt2,1,tensorflow,2.8.0-dev20211005,cuda,fp32,,,2,6.58 42 | -------------------------------------------------------------------------------- /hf_co_models.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. See License.txt in the project root for 4 | # license information. 5 | # -------------------------------------------------------------------------- 6 | 7 | # Maps model class name to a tuple of model class 8 | MODEL_CLASSES = [ 9 | 'AutoModel', 'AutoModelWithLMHead', 'AutoModelForSequenceClassification', 'AutoModelForQuestionAnswering' 10 | ] 11 | 12 | # List of pretrained models: https://huggingface.co/transformers/pretrained_models.html 13 | # Pretrained model name to a tuple of input names, opset_version, use_external_data_format, optimization model type 14 | MODELS = { 15 | # BERT 16 | "bert-base-uncased": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 17 | "bert-large-uncased": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 18 | "bert-base-cased": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 19 | "philschmid/MiniLM-L6-H384-uncased-sst2": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 20 | # "bert-large-cased": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 21 | # "bert-base-multilingual-uncased": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 22 | # "bert-base-multilingual-cased": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 23 | # "bert-base-chinese": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 24 | # "bert-base-german-cased": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 25 | # "bert-large-uncased-whole-word-masking": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 26 | # "bert-large-cased-whole-word-masking": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 27 | # "bert-large-uncased-whole-word-masking-finetuned-squad": (["input_ids", "attention_mask", 28 | # 
"token_type_ids"], 12, False, "bert"), 29 | # "bert-large-cased-whole-word-masking-finetuned-squad": (["input_ids", "attention_mask", 30 | # "token_type_ids"], 12, False, "bert"), 31 | # "bert-base-cased-finetuned-mrpc": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 32 | # "bert-base-german-dbmdz-cased": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 33 | # "bert-base-german-dbmdz-uncased": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 34 | # todo: more models to add 35 | # GPT (no past state) 36 | "openai-gpt": (["input_ids"], 11, False, "gpt2"), 37 | # GPT-2 (no past state, use benchmark_gpt2.py for past_key_values) 38 | "gpt2": (["input_ids"], 11, False, "gpt2"), 39 | "gpt2-medium": (["input_ids"], 11, False, "gpt2"), 40 | "gpt2-large": (["input_ids"], 11, True, "gpt2"), 41 | "gpt2-xl": (["input_ids"], 11, True, "gpt2"), 42 | "distilgpt2": (["input_ids"], 11, False, "gpt2"), 43 | # Transformer-XL (Models uses Einsum, which need opset version 12 or later.) 44 | "transfo-xl-wt103": (["input_ids", "mems"], 12, False, "bert"), 45 | # XLNet 46 | "xlnet-base-cased": (["input_ids"], 12, False, "bert"), 47 | "xlnet-large-cased": (["input_ids"], 12, False, "bert"), 48 | # XLM 49 | "xlm-mlm-en-2048": (["input_ids"], 11, True, "bert"), 50 | "xlm-mlm-ende-1024": (["input_ids"], 11, False, "bert"), 51 | "xlm-mlm-enfr-1024": (["input_ids"], 11, False, "bert"), 52 | # RoBERTa 53 | "roberta-base": (["input_ids", "attention_mask"], 12, False, "bert"), 54 | "roberta-large": (["input_ids", "attention_mask"], 12, False, "bert"), 55 | "roberta-large-mnli": (["input_ids", "attention_mask"], 12, False, "bert"), 56 | "deepset/roberta-base-squad2": (["input_ids", "attention_mask"], 11, False, "bert"), 57 | "distilroberta-base": (["input_ids", "attention_mask"], 12, False, "bert"), 58 | 59 | # DistilBERT 60 | "distilbert-base-uncased": (["input_ids", "attention_mask"], 11, False, "bert"), 61 | "distilbert-base-uncased-distilled-squad": (["input_ids", "attention_mask"], 11, False, "bert"), 62 | # CTRL 63 | "ctrl": (["input_ids"], 11, True, "bert"), 64 | # CamemBERT 65 | "camembert-base": (["input_ids"], 11, False, "bert"), 66 | # ALBERT 67 | "albert-base-v1": (["input_ids"], 12, False, "bert"), 68 | "albert-large-v1": (["input_ids"], 12, False, "bert"), 69 | "albert-xlarge-v1": (["input_ids"], 12, True, "bert"), 70 | #"albert-xxlarge-v1": (["input_ids"], 12, True, "bert"), 71 | "albert-base-v2": (["input_ids"], 12, False, "bert"), 72 | "albert-large-v2": (["input_ids"], 12, False, "bert"), 73 | "albert-xlarge-v2": (["input_ids"], 12, True, "bert"), 74 | #"albert-xxlarge-v2": (["input_ids"], 12, True, "bert"), 75 | # T5 (use benchmark_t5.py instead) 76 | # "t5-small": (["input_ids", "decoder_input_ids"], 12, False, "bert"), 77 | # "t5-base": (["input_ids", "decoder_input_ids"], 12, False, "bert"), 78 | # "t5-large": (["input_ids", "decoder_input_ids"], 12, True, "bert"), 79 | # "t5-3b": (["input_ids", "decoder_input_ids"], 12, True, "bert"), 80 | # "t5-11b": (["input_ids", "decoder_input_ids"], 12, True, "bert"), 81 | #"valhalla/t5-small-qa-qg-hl": (["input_ids"], 12, True, "bert"), 82 | # XLM-RoBERTa 83 | "xlm-roberta-base": (["input_ids"], 11, False, "bert"), 84 | "xlm-roberta-large": (["input_ids"], 11, True, "bert"), 85 | # FlauBERT 86 | "flaubert/flaubert_small_cased": (["input_ids"], 11, False, "bert"), 87 | #"flaubert/flaubert_base_uncased": (["input_ids"], 11, False, "bert"), 88 | "flaubert/flaubert_base_cased": (["input_ids"], 
11, False, "bert"), 89 | #"flaubert/flaubert_large_cased": (["input_ids"], 11, False, "bert"), 90 | # Bart 91 | "facebook/bart-large": (["input_ids", "attention_mask"], 11, False, "bart"), 92 | "facebook/bart-base": (["input_ids", "attention_mask"], 11, False, "bart"), 93 | "facebook/bart-large-mnli": (["input_ids", "attention_mask"], 11, False, "bart"), 94 | "facebook/bart-large-cnn": (["input_ids", "attention_mask"], 11, False, "bart"), 95 | 96 | # DialoGPT 97 | "microsoft/DialoGPT-small": (["input_ids"], 11, False, "gpt2"), 98 | "microsoft/DialoGPT-medium": (["input_ids"], 11, False, "gpt2"), 99 | #"microsoft/DialoGPT-large": (["input_ids"], 11, True, "gpt2"), 100 | # Reformer 101 | #"google/reformer-enwik8": (["input_ids"], 11, False, "bert"), 102 | #"google/reformer-crime-and-punishment": (["input_ids"], 11, False, "bert"), 103 | # MarianMT 104 | #"Helsinki-NLP/opus-mt-ROMANCE-en": (["input_ids"], 12, False, "bert"), 105 | # Longformer (use benchmark_longformer.py instead) 106 | #"allenai/longformer-base-4096": (["input_ids"], 12, False, "bert"), 107 | #"allenai/longformer-large-4096": (["input_ids"], 12, False, "bert"), 108 | # MBart 109 | "facebook/mbart-large-cc25": (["input_ids"], 11, True, "bert"), 110 | "facebook/mbart-large-en-ro": (["input_ids"], 11, True, "bert"), 111 | # "Helsinki-NLP/opus-mt-ROMANCE-en": (["input_ids"], 12, False, "bert"), 112 | # # Longformer 113 | # "allenai/longformer-base-4096": (["input_ids"], 12, False, "bert"), 114 | # "allenai/longformer-large-4096": (["input_ids"], 12, True, "bert"), 115 | # "funnel-transformer/small": (["input_ids"], 12, False, "bert"), 116 | # "funnel-transformer/small-base": (["input_ids"], 12, False, "bert"), 117 | # "funnel-transformer/medium": (["input_ids"], 12, False, "bert"), 118 | # "funnel-transformer/medium-base": (["input_ids"], 12, False, "bert"), 119 | # "funnel-transformer/intermediate": (["input_ids"], 12, False, "bert"), 120 | # "funnel-transformer/intermediate-base": (["input_ids"], 12, False, "bert"), 121 | # "funnel-transformer/large": (["input_ids"], 12, True, "bert"), 122 | # "funnel-transformer/large-base": (["input_ids"], 12, True, "bert"), 123 | # "funnel-transformer/xlarge": (["input_ids"], 12, True, "bert"), 124 | # "funnel-transformer/xlarge-base": (["input_ids"], 12, True, "bert"), 125 | # Layoutlm 126 | "microsoft/layoutlm-base-uncased": (["input_ids"], 11, False, "bert"), 127 | "microsoft/layoutlm-large-uncased": (["input_ids"], 11, False, "bert"), 128 | # Squeezebert 129 | "squeezebert/squeezebert-uncased": (["input_ids"], 11, False, "bert"), 130 | "squeezebert/squeezebert-mnli": (["input_ids"], 11, False, "bert"), 131 | "squeezebert/squeezebert-mnli-headless": (["input_ids"], 11, False, "bert"), 132 | "unc-nlp/lxmert-base-uncased": (["input_ids", "visual_feats", "visual_pos"], 11, False, "bert"), 133 | # "google/pegasus-xsum": (["input_ids"], 11, False, "bert"), 134 | # "google/pegasus-large": (["input_ids"], 11, False, "bert"), 135 | } 136 | -------------------------------------------------------------------------------- /huggingface_MiniLM_loadsave.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from iree import runtime as ireert 4 | from iree.compiler import tf as tfc 5 | from iree.compiler import compile_str 6 | import sys 7 | from absl import app 8 | 9 | import numpy as np 10 | import os 11 | import tempfile 12 | import tensorflow as tf 13 | 14 | import time 15 | import cProfile 16 | from transformers import BertModel, BertTokenizer, TFBertModel 
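# This script loads microsoft/MiniLM-L12-H384-uncased from Hugging Face, imports it to MLIR
# via the IREE TensorFlow importer, compiles it for the selected backend, and times inference
# with the IREE runtime; it needs the IREE compiler/runtime Python packages plus tensorflow
# and transformers installed (see the imports above).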
17 | 18 | MAX_SEQUENCE_LENGTH = 128 19 | BATCH_SIZE = 1 20 | 21 | # Create a set of 2-dimensional inputs 22 | bert_input = [tf.TensorSpec(shape=[BATCH_SIZE,MAX_SEQUENCE_LENGTH],dtype=tf.int32), 23 | tf.TensorSpec(shape=[BATCH_SIZE,MAX_SEQUENCE_LENGTH], dtype=tf.int32), 24 | tf.TensorSpec(shape=[BATCH_SIZE,MAX_SEQUENCE_LENGTH], dtype=tf.int32)] 25 | 26 | class BertModule(tf.Module): 27 | def __init__(self): 28 | super(BertModule, self).__init__() 29 | # Load the pretrained MiniLM model, converting the PyTorch weights to TensorFlow. 30 | self.m = TFBertModel.from_pretrained("microsoft/MiniLM-L12-H384-uncased", from_pt=True) 31 | 32 | # Wrap the model call so predict() runs pure inference (training=False). 33 | self.m.predict = lambda x,y,z: self.m.call(input_ids=x, attention_mask=y, token_type_ids=z, training=False) 34 | 35 | @tf.function(input_signature=bert_input) 36 | def predict(self, input_ids, attention_mask, token_type_ids): 37 | return self.m.predict(input_ids, attention_mask, token_type_ids) 38 | 39 | if __name__ == "__main__": 40 | # Prepare the input data. 41 | tokenizer = BertTokenizer.from_pretrained("microsoft/MiniLM-L12-H384-uncased") 42 | text = "Replace me by any text you'd like." 43 | encoded_input = tokenizer(text, padding='max_length', truncation=True, max_length=MAX_SEQUENCE_LENGTH) 44 | for key in encoded_input: 45 | encoded_input[key] = tf.expand_dims(tf.convert_to_tensor(encoded_input[key]),0) 46 | 47 | # Import the model to MLIR (import_only=True) and save the raw MLIR to disk. 48 | compiler_module = tfc.compile_module(BertModule(), exported_names = ["predict"], import_only=True) 49 | ARTIFACTS_DIR = os.getcwd() 50 | mlir_path = os.path.join(ARTIFACTS_DIR, "model_raw.mlir") 51 | with open(mlir_path, "wb") as output_file: 52 | output_file.write(compiler_module) 53 | with open(mlir_path, "rb") as input_file: 54 | compiled_data = input_file.read() 55 | 56 | # Compile the model using IREE 57 | #backend = "dylib-llvm-aot" 58 | #args = ["--iree-llvm-target-cpu-features=host"] 59 | #backend_config = "dylib" 60 | backend = "cuda" 61 | backend_config = "cuda" 62 | #args = ["--iree-cuda-llvm-target-arch=sm_75", "--iree-hal-cuda-disable-loop-nounroll-wa", "--iree-enable-fusion-with-reduction-ops"] 63 | # FIXME: Stella's GPU is only 7.5 64 | args = ["--iree-cuda-llvm-target-arch=sm_80", "--iree-hal-cuda-disable-loop-nounroll-wa", "--iree-enable-fusion-with-reduction-ops"] 65 | flatbuffer_blob = compile_str(compiler_module, target_backends=[backend], extra_args=args, input_type="mhlo") 66 | #flatbuffer_blob = compile_str(compiled_data, target_backends=["dylib-llvm-aot"]) 67 | 68 | # Set up the IREE runtime and load the compiled module. 69 | ireert.flags.FUNCTION_INPUT_VALIDATION = False 70 | ireert.flags.parse_flags("--cuda_allow_inline_execution") 71 | vm_module = ireert.VmModule.from_flatbuffer(flatbuffer_blob) 72 | #tracer = ireert.Tracer(os.getcwd()) 73 | config = ireert.Config(backend_config) 74 | ctx = ireert.SystemContext(config=config) 75 | ctx.add_vm_module(vm_module) 76 | 77 | # Set up the benchmark inputs and iteration count. 
78 | total_iter = 15 79 | host_inputs =[encoded_input["input_ids"], encoded_input["attention_mask"], encoded_input["token_type_ids"]] 80 | device_inputs = [ireert.asdevicearray(config.device, a) for a in host_inputs] 81 | BertCompiled = ctx.modules.module 82 | predict_f = BertCompiled.predict 83 | device_outputs = predict_f(*device_inputs) 84 | with cProfile.Profile(timer=time.perf_counter_ns, timeunit=0.000001) as pr: 85 | start = time.time() 86 | for i in range(total_iter): 87 | device_outputs = predict_f(*device_inputs) 88 | end = time.time() 89 | 90 | print("RESULTS:", {k:v.to_host() for k, v in device_outputs.items()}) 91 | 92 | total_time = end - start 93 | print("time: "+str(total_time)) 94 | print("time/iter: "+str(total_time/total_iter)) 95 | -------------------------------------------------------------------------------- /huggingface_models.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. See License.txt in the project root for 4 | # license information. 5 | # -------------------------------------------------------------------------- 6 | 7 | # Maps model class name to a tuple of model class 8 | MODEL_CLASSES = [ 9 | 'AutoModel', 'AutoModelWithLMHead', 'AutoModelForSequenceClassification', 'AutoModelForQuestionAnswering' 10 | ] 11 | 12 | # List of pretrained models: https://huggingface.co/transformers/pretrained_models.html 13 | # Pretrained model name to a tuple of input names, opset_version, use_external_data_format, optimization model type 14 | MODELS = { 15 | # BERT 16 | "bert-base-uncased": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 17 | "bert-large-uncased": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 18 | "bert-base-cased": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 19 | "philschmid/MiniLM-L6-H384-uncased-sst2": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 20 | "microsoft/MiniLM-L12-H384-uncased": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 21 | # "bert-large-cased": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 22 | # "bert-base-multilingual-uncased": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 23 | # "bert-base-multilingual-cased": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 24 | # "bert-base-chinese": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 25 | # "bert-base-german-cased": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 26 | # "bert-large-uncased-whole-word-masking": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 27 | # "bert-large-cased-whole-word-masking": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 28 | # "bert-large-uncased-whole-word-masking-finetuned-squad": (["input_ids", "attention_mask", 29 | # "token_type_ids"], 12, False, "bert"), 30 | # "bert-large-cased-whole-word-masking-finetuned-squad": (["input_ids", "attention_mask", 31 | # "token_type_ids"], 12, False, "bert"), 32 | # "bert-base-cased-finetuned-mrpc": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 33 | # "bert-base-german-dbmdz-cased": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 34 | # 
"bert-base-german-dbmdz-uncased": (["input_ids", "attention_mask", "token_type_ids"], 12, False, "bert"), 35 | # todo: more models to add 36 | # GPT (no past state) 37 | "openai-gpt": (["input_ids"], 11, False, "gpt2"), 38 | # GPT-2 (no past state, use benchmark_gpt2.py for past_key_values) 39 | "gpt2": (["input_ids"], 11, False, "gpt2"), 40 | "gpt2-medium": (["input_ids"], 11, False, "gpt2"), 41 | "gpt2-large": (["input_ids"], 11, True, "gpt2"), 42 | "gpt2-xl": (["input_ids"], 11, True, "gpt2"), 43 | "distilgpt2": (["input_ids"], 11, False, "gpt2"), 44 | # Transformer-XL (Models uses Einsum, which need opset version 12 or later.) 45 | "transfo-xl-wt103": (["input_ids", "mems"], 12, False, "bert"), 46 | # XLNet 47 | "xlnet-base-cased": (["input_ids"], 12, False, "bert"), 48 | "xlnet-large-cased": (["input_ids"], 12, False, "bert"), 49 | # XLM 50 | "xlm-mlm-en-2048": (["input_ids"], 11, True, "bert"), 51 | "xlm-mlm-ende-1024": (["input_ids"], 11, False, "bert"), 52 | "xlm-mlm-enfr-1024": (["input_ids"], 11, False, "bert"), 53 | # RoBERTa 54 | "roberta-base": (["input_ids", "attention_mask"], 12, False, "bert"), 55 | "roberta-large": (["input_ids", "attention_mask"], 12, False, "bert"), 56 | "roberta-large-mnli": (["input_ids", "attention_mask"], 12, False, "bert"), 57 | "deepset/roberta-base-squad2": (["input_ids", "attention_mask"], 11, False, "bert"), 58 | "distilroberta-base": (["input_ids", "attention_mask"], 12, False, "bert"), 59 | 60 | # DistilBERT 61 | "distilbert-base-uncased": (["input_ids", "attention_mask"], 11, False, "bert"), 62 | "distilbert-base-uncased-distilled-squad": (["input_ids", "attention_mask"], 11, False, "bert"), 63 | # CTRL 64 | "ctrl": (["input_ids"], 11, True, "bert"), 65 | # CamemBERT 66 | "camembert-base": (["input_ids"], 11, False, "bert"), 67 | # ALBERT 68 | "albert-base-v1": (["input_ids"], 12, False, "bert"), 69 | "albert-large-v1": (["input_ids"], 12, False, "bert"), 70 | "albert-xlarge-v1": (["input_ids"], 12, True, "bert"), 71 | #"albert-xxlarge-v1": (["input_ids"], 12, True, "bert"), 72 | "albert-base-v2": (["input_ids"], 12, False, "bert"), 73 | "albert-large-v2": (["input_ids"], 12, False, "bert"), 74 | "albert-xlarge-v2": (["input_ids"], 12, True, "bert"), 75 | #"albert-xxlarge-v2": (["input_ids"], 12, True, "bert"), 76 | # T5 (use benchmark_t5.py instead) 77 | # "t5-small": (["input_ids", "decoder_input_ids"], 12, False, "bert"), 78 | # "t5-base": (["input_ids", "decoder_input_ids"], 12, False, "bert"), 79 | # "t5-large": (["input_ids", "decoder_input_ids"], 12, True, "bert"), 80 | # "t5-3b": (["input_ids", "decoder_input_ids"], 12, True, "bert"), 81 | # "t5-11b": (["input_ids", "decoder_input_ids"], 12, True, "bert"), 82 | #"valhalla/t5-small-qa-qg-hl": (["input_ids"], 12, True, "bert"), 83 | # XLM-RoBERTa 84 | "xlm-roberta-base": (["input_ids"], 11, False, "bert"), 85 | "xlm-roberta-large": (["input_ids"], 11, True, "bert"), 86 | # FlauBERT 87 | "flaubert/flaubert_small_cased": (["input_ids"], 11, False, "bert"), 88 | #"flaubert/flaubert_base_uncased": (["input_ids"], 11, False, "bert"), 89 | "flaubert/flaubert_base_cased": (["input_ids"], 11, False, "bert"), 90 | #"flaubert/flaubert_large_cased": (["input_ids"], 11, False, "bert"), 91 | # Bart 92 | "facebook/bart-large": (["input_ids", "attention_mask"], 11, False, "bart"), 93 | "facebook/bart-base": (["input_ids", "attention_mask"], 11, False, "bart"), 94 | "facebook/bart-large-mnli": (["input_ids", "attention_mask"], 11, False, "bart"), 95 | "facebook/bart-large-cnn": (["input_ids", 
"attention_mask"], 11, False, "bart"), 96 | 97 | # DialoGPT 98 | "microsoft/DialoGPT-small": (["input_ids"], 11, False, "gpt2"), 99 | "microsoft/DialoGPT-medium": (["input_ids"], 11, False, "gpt2"), 100 | #"microsoft/DialoGPT-large": (["input_ids"], 11, True, "gpt2"), 101 | # Reformer 102 | #"google/reformer-enwik8": (["input_ids"], 11, False, "bert"), 103 | #"google/reformer-crime-and-punishment": (["input_ids"], 11, False, "bert"), 104 | # MarianMT 105 | #"Helsinki-NLP/opus-mt-ROMANCE-en": (["input_ids"], 12, False, "bert"), 106 | # Longformer (use benchmark_longformer.py instead) 107 | #"allenai/longformer-base-4096": (["input_ids"], 12, False, "bert"), 108 | #"allenai/longformer-large-4096": (["input_ids"], 12, False, "bert"), 109 | # MBart 110 | "facebook/mbart-large-cc25": (["input_ids"], 11, True, "bert"), 111 | "facebook/mbart-large-en-ro": (["input_ids"], 11, True, "bert"), 112 | # "Helsinki-NLP/opus-mt-ROMANCE-en": (["input_ids"], 12, False, "bert"), 113 | # # Longformer 114 | # "allenai/longformer-base-4096": (["input_ids"], 12, False, "bert"), 115 | # "allenai/longformer-large-4096": (["input_ids"], 12, True, "bert"), 116 | # "funnel-transformer/small": (["input_ids"], 12, False, "bert"), 117 | # "funnel-transformer/small-base": (["input_ids"], 12, False, "bert"), 118 | # "funnel-transformer/medium": (["input_ids"], 12, False, "bert"), 119 | # "funnel-transformer/medium-base": (["input_ids"], 12, False, "bert"), 120 | # "funnel-transformer/intermediate": (["input_ids"], 12, False, "bert"), 121 | # "funnel-transformer/intermediate-base": (["input_ids"], 12, False, "bert"), 122 | # "funnel-transformer/large": (["input_ids"], 12, True, "bert"), 123 | # "funnel-transformer/large-base": (["input_ids"], 12, True, "bert"), 124 | # "funnel-transformer/xlarge": (["input_ids"], 12, True, "bert"), 125 | # "funnel-transformer/xlarge-base": (["input_ids"], 12, True, "bert"), 126 | # Layoutlm 127 | "microsoft/layoutlm-base-uncased": (["input_ids"], 11, False, "bert"), 128 | "microsoft/layoutlm-large-uncased": (["input_ids"], 11, False, "bert"), 129 | # Squeezebert 130 | "squeezebert/squeezebert-uncased": (["input_ids"], 11, False, "bert"), 131 | "squeezebert/squeezebert-mnli": (["input_ids"], 11, False, "bert"), 132 | "squeezebert/squeezebert-mnli-headless": (["input_ids"], 11, False, "bert"), 133 | "unc-nlp/lxmert-base-uncased": (["input_ids", "visual_feats", "visual_pos"], 11, False, "bert"), 134 | # "google/pegasus-xsum": (["input_ids"], 11, False, "bert"), 135 | # "google/pegasus-large": (["input_ids"], 11, False, "bert"), 136 | } 137 | -------------------------------------------------------------------------------- /nightly_job.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | TIMESTAMP=`date +%Y-%m-%d_%H-%M-%S` 4 | [ -d $HOME/ci ] || mkdir $HOME/ci 5 | log_file=$HOME/ci/nightly_log_${TIMESTAMP}.txt 6 | exec &> >(tee -a "$log_file") 7 | 8 | rm -rf $HOME/ci/nightly 9 | mkdir -p $HOME/ci/nightly 10 | cd $HOME/ci/nightly 11 | curl -O --proto '=https' --tlsv1.2 -sSf https://raw.githubusercontent.com/nod-ai/transformer-benchmarks/main/perf-ci.sh 12 | chmod +x $HOME/ci/nightly/perf-ci.sh 13 | $HOME/ci/nightly/perf-ci.sh 14 | gsutil cp $log_file gs://iree-shared-files/nod-perf/logs/ 15 | -------------------------------------------------------------------------------- /onnx_model_bart.py: -------------------------------------------------------------------------------- 1 | 
#------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | #-------------------------------------------------------------------------- 5 | import logging 6 | from fusion_attention import FusionAttention, AttentionMask 7 | from fusion_reshape import FusionReshape 8 | from onnx import numpy_helper 9 | from onnx_model import OnnxModel 10 | from onnx_model_bert import BertOnnxModel 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | class FusionBartEncoderAttention(FusionAttention): 16 | """ 17 | Fuse Bart Attention subgraph into one Attention node. 18 | """ 19 | def __init__(self, model: OnnxModel, hidden_size: int, num_heads: int, attention_mask: AttentionMask): 20 | super().__init__(model, hidden_size, num_heads, attention_mask) 21 | 22 | def check_runtime_shape_path(self, reshape_qkv_2, reshape_qkv_1, reshape_q_2, reshape_k_2, reshape_v_2, root_input): 23 | concat_qkv_2_path = self.model.match_parent_path(reshape_qkv_2, ['Concat'], [1]) 24 | if concat_qkv_2_path is None: 25 | return False 26 | concat_qkv_2 = concat_qkv_2_path[0] 27 | 28 | reshape_qkv_2_path_1 = self.model.match_parent_path(concat_qkv_2, ['Unsqueeze', 'Gather', 'Shape'], [0, 0, 0]) 29 | reshape_qkv_2_path_2 = self.model.match_parent_path(concat_qkv_2, ['Unsqueeze', 'Gather', 'Shape'], [1, 0, 0]) 30 | reshape_qkv_2_path_3 = self.model.match_parent_path(concat_qkv_2, ['Unsqueeze', 'Gather', 'Shape'], [2, 0, 0]) 31 | if reshape_qkv_2_path_1 is None or reshape_qkv_2_path_2 is None or reshape_qkv_2_path_3 is None: 32 | return False 33 | 34 | _, gather_1, shape_1 = reshape_qkv_2_path_1 35 | _, gather_2, shape_2 = reshape_qkv_2_path_2 36 | _, _, shape_3 = reshape_qkv_2_path_3 37 | 38 | if shape_1.input[0] != root_input or shape_2.input[0] != root_input or shape_3.input[0] != root_input: 39 | return False 40 | 41 | reshape_qkv_1_path_1 = self.model.match_parent_path(reshape_qkv_1, ['Concat', 'Unsqueeze', 'Gather'], [1, 0, 0]) 42 | reshape_qkv_1_path_2 = self.model.match_parent_path(reshape_qkv_1, ['Concat', 'Unsqueeze', 'Gather'], [1, 2, 0]) 43 | if reshape_qkv_1_path_1 is None or reshape_qkv_1_path_2 is None: 44 | return False 45 | if reshape_qkv_1_path_1[-1].name != gather_1.name or reshape_qkv_1_path_2[-1].name != gather_2.name: 46 | return False 47 | 48 | reshape_q_2_path = self.model.match_parent_path(reshape_q_2, ['Concat', 'Unsqueeze', 'Mul'], [1, 0, 0]) 49 | reshape_k_2_path = self.model.match_parent_path(reshape_k_2, ['Concat', 'Unsqueeze', 'Mul'], [1, 0, 0]) 50 | reshape_v_2_path = self.model.match_parent_path(reshape_v_2, ['Concat', 'Unsqueeze', 'Mul'], [1, 0, 0]) 51 | if reshape_q_2_path is None or reshape_k_2_path is None or reshape_v_2_path is None: 52 | return False 53 | 54 | mul_q = reshape_q_2_path[-1] 55 | mul_k = reshape_k_2_path[-1] 56 | mul_v = reshape_v_2_path[-1] 57 | 58 | gather_1_out = gather_1.output[0] 59 | if mul_q.input[0] != gather_1_out or mul_k.input[0] != gather_1_out or mul_v.input[0] != gather_1_out: 60 | return False 61 | 62 | return True 63 | 64 | def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): 65 | # SkipLayerNormalization has two inputs, and one of them is the root input for attention. 
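        # The fusion works backwards from the SkipLayerNormalization node: first match the output
        # projection path (Add -> MatMul -> Reshape -> Transpose -> Reshape -> MatMul), then locate
        # the root input that feeds the three Q/K/V MatMul projections, match the individual Q, K
        # and V subgraphs, and finally replace the whole matched subgraph with one Attention node.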
66 | qkv_nodes = self.model.match_parent_path(normalize_node, 67 | ['Add', 'MatMul', 'Reshape', 'Transpose', 'Reshape', 'MatMul'], 68 | [None, 1, 0, 0, 0, 0]) 69 | if qkv_nodes is not None: 70 | (add_out, matmul_out, reshape_qkv_2, transpose_qkv, reshape_qkv_1, matmul_qkv) = qkv_nodes 71 | else: 72 | return 73 | 74 | other_inputs = [] 75 | for i, input in enumerate(normalize_node.input): 76 | if input not in output_name_to_node: 77 | continue 78 | if input == qkv_nodes[0].output[0]: 79 | continue 80 | other_inputs.append(input) 81 | if len(other_inputs) != 1: 82 | return 83 | 84 | root_input = other_inputs[0] 85 | children = input_name_to_nodes[root_input] 86 | children_types = [child.op_type for child in children] 87 | if children_types.count('MatMul') != 3: 88 | return 89 | 90 | v_nodes = self.model.match_parent_path(matmul_qkv, ['Reshape', 'Transpose', 'Reshape', 'Add', 'MatMul'], 91 | [1, 0, 0, 0, None]) 92 | if v_nodes is None: 93 | logger.debug("fuse_attention: failed to match v path") 94 | return 95 | (reshape_v_2, transpose_v, reshape_v_1, add_v, matmul_v) = v_nodes 96 | 97 | qk_nodes = self.model.match_parent_path(matmul_qkv, ['Softmax', 'MatMul'], [0, 0]) 98 | if qk_nodes is not None: 99 | _, matmul_qk = qk_nodes 100 | else: 101 | return 102 | 103 | q_nodes = self.model.match_parent_path(matmul_qk, ['Reshape', 'Transpose', 'Reshape', 'Mul', 'Add', 'MatMul'], 104 | [0, 0, 0, 0, 0, 1]) 105 | if q_nodes is not None: 106 | reshape_q_2, _, reshape_q_1, _, add_q, matmul_q = q_nodes 107 | else: 108 | return 109 | 110 | k_nodes = self.model.match_parent_path(matmul_qk, 111 | ['Transpose', 'Reshape', 'Transpose', 'Reshape', 'Add', 'MatMul'], 112 | [1, 0, 0, 0, 0, 1]) 113 | if k_nodes is not None: 114 | _, reshape_k_2, _, reshape_k_1, add_k, matmul_k = k_nodes 115 | else: 116 | return 117 | 118 | if not self.check_runtime_shape_path(reshape_qkv_2, reshape_qkv_1, reshape_q_2, reshape_k_2, reshape_v_2, 119 | root_input): 120 | return 121 | 122 | if matmul_v.input[0] == root_input and matmul_q.input[0] == root_input and matmul_v.input[0] == root_input: 123 | 124 | mask_nodes = [] 125 | mask_index = None 126 | attention_last_node = reshape_qkv_2 127 | 128 | num_heads, hidden_size = self.get_num_heads_and_hidden_size(reshape_q_1) 129 | 130 | if num_heads <= 0 or hidden_size <= 0 or (hidden_size % num_heads) != 0: 131 | logger.debug("fuse_attention: failed to detect num_heads or hidden_size") 132 | return 133 | 134 | new_node = self.create_attention_node(mask_index, matmul_q, matmul_k, matmul_v, add_q, add_k, add_v, 135 | num_heads, hidden_size, root_input, attention_last_node.output[0], 136 | None) 137 | if new_node is None: 138 | return 139 | 140 | self.nodes_to_add.append(new_node) 141 | self.node_name_to_graph_name[new_node.name] = self.this_graph_name 142 | 143 | self.nodes_to_remove.extend([attention_last_node, transpose_qkv, matmul_qkv]) 144 | self.nodes_to_remove.extend(qk_nodes) 145 | self.nodes_to_remove.extend(q_nodes) 146 | self.nodes_to_remove.extend(k_nodes) 147 | self.nodes_to_remove.extend(v_nodes) 148 | 149 | # Use prune graph to remove mask nodes since they are shared by all attention nodes. 
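Before the matched Q/K/V subgraph is pruned below, create_attention_node (inherited from FusionAttention) has already built the single replacement node. Roughly, and heavily hedged, it emits a com.microsoft Attention contrib op; the real helper also merges the three MatMul weights and Add biases into combined initializers, and the exact input layout depends on whether a mask index is present (it is not in this Bart path). The tensor names and num_heads value below are illustrative.

```
# Hedged sketch of the replacement node (com.microsoft Attention contrib op).
import onnx

attention_node = onnx.helper.make_node(
    "Attention",
    inputs=["hidden_states", "qkv_weight", "qkv_bias", ""],  # "" = no mask_index in this path
    outputs=["attention_out"],
    name="Attention_example",
    domain="com.microsoft",
    num_heads=16,   # illustrative; the fusion reads it via get_num_heads_and_hidden_size above
)
print(attention_node)
```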
150 | self.nodes_to_remove.extend(mask_nodes) 151 | self.prune_graph = True 152 | 153 | 154 | class FusionBartReshape(FusionReshape): 155 | def __init__(self, model: OnnxModel): 156 | super().__init__(model) 157 | 158 | def fuse(self, reshape_node, input_name_to_nodes, output_name_to_node): 159 | if reshape_node.input[1] not in output_name_to_node: 160 | return 161 | 162 | concat_node = output_name_to_node[reshape_node.input[1]] 163 | if concat_node.op_type != 'Concat' or len(concat_node.input) != 4: 164 | return 165 | 166 | path0 = self.model.match_parent_path(concat_node, ['Unsqueeze', 'Gather', 'Shape'], [0, 0, 0], 167 | output_name_to_node) 168 | if path0 is None: 169 | return 170 | 171 | (_, gather_0, shape_0) = path0 172 | 173 | shape = [] 174 | gather_value = self.model.get_constant_value(gather_0.input[1]) 175 | if gather_value == 0: 176 | shape.append(0) 177 | 178 | path1 = self.model.match_parent_path(concat_node, ['Unsqueeze', 'Gather', 'Shape'], [1, 0, 0], 179 | output_name_to_node) 180 | if path1 is None: 181 | input_1_proto = self.model.get_initializer(concat_node.input[1]) 182 | input_2_proto = self.model.get_initializer(concat_node.input[2]) 183 | input_3_proto = self.model.get_initializer(concat_node.input[3]) 184 | if input_1_proto is None or input_2_proto is None or input_3_proto is None: 185 | return 186 | 187 | input_1 = numpy_helper.to_array(input_1_proto) 188 | input_2 = numpy_helper.to_array(input_2_proto) 189 | input_3 = numpy_helper.to_array(input_3_proto) 190 | if len(input_1) != 1 or len(input_2) != 1 or len(input_3) != 1: 191 | return 192 | 193 | if not (input_1[0] == -1 and input_2[0] > 0 and input_3[0] > 0): 194 | return 195 | 196 | shape.extend(input_1) 197 | shape.extend(input_2) 198 | shape.extend(input_3) 199 | gemm_path = self.model.match_parent_path(reshape_node, ['Add', 'MatMul'], [0, 1], output_name_to_node) 200 | if gemm_path is None: 201 | return 202 | 203 | top_matmul = gemm_path[-1] 204 | root_input = top_matmul.input[0] 205 | if shape_0.input[0] != root_input: 206 | return 207 | 208 | self.replace_reshape_node(shape, reshape_node, concat_node) 209 | else: 210 | (_, gather_1, shape_1) = path1 211 | 212 | gather_value = self.model.get_constant_value(gather_1.input[1]) 213 | if gather_value == 1: 214 | shape.append(0) 215 | 216 | input_2_proto = self.model.get_initializer(concat_node.input[2]) 217 | input_3_proto = self.model.get_initializer(concat_node.input[3]) 218 | if input_2_proto is None or input_3_proto is None: 219 | return 220 | 221 | input_2 = numpy_helper.to_array(input_2_proto) 222 | input_3 = numpy_helper.to_array(input_3_proto) 223 | if len(input_2) != 1 or len(input_3) != 1: 224 | return 225 | 226 | if not (input_2[0] > 0 and input_3[0] > 0): 227 | return 228 | 229 | shape.extend(input_2) 230 | shape.extend(input_3) 231 | gemm_path = self.model.match_parent_path(reshape_node, ['Mul', 'Add', 'MatMul'], [0, 0, 1], 232 | output_name_to_node) 233 | if gemm_path is None: 234 | return 235 | 236 | top_matmul = gemm_path[-1] 237 | root_input = top_matmul.input[0] 238 | if shape_0.input[0] != root_input or shape_1.input[0] != root_input: 239 | return 240 | 241 | self.replace_reshape_node(shape, reshape_node, concat_node) 242 | 243 | 244 | class BartOnnxModel(BertOnnxModel): 245 | def __init__(self, model, num_heads, hidden_size): 246 | super().__init__(model, num_heads, hidden_size) 247 | self.attention_mask = AttentionMask(self) 248 | self.attention_fusion = FusionBartEncoderAttention(self, self.hidden_size, self.num_heads, 
self.attention_mask) 249 | self.bart_reshape_fusion_preprocess = FusionBartReshape(self) 250 | 251 | def fuse_attention(self): 252 | self.attention_fusion.apply() 253 | 254 | def preprocess(self): 255 | self.adjust_reshape_and_expand() 256 | self.bart_reshape_fusion_preprocess.apply() 257 | -------------------------------------------------------------------------------- /onnx_model_gpt2.py: -------------------------------------------------------------------------------- 1 | #------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | #-------------------------------------------------------------------------- 5 | import logging 6 | import onnx 7 | from onnx_model_bert import BertOnnxModel 8 | from fusion_gpt_attention_no_past import FusionGptAttentionNoPast 9 | from fusion_gpt_attention import FusionGptAttention 10 | from fusion_gpt_attention_megatron import FusionGptAttentionMegatron 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | class Gpt2OnnxModel(BertOnnxModel): 16 | def __init__(self, model, num_heads, hidden_size): 17 | super().__init__(model, num_heads, hidden_size) 18 | 19 | def fuse_attention(self): 20 | if len(self.model.graph.input) == 1 or len(self.model.graph.output) == 1: 21 | fusion = FusionGptAttentionNoPast(self, self.num_heads) 22 | fusion.apply() 23 | else: 24 | fusion = FusionGptAttention(self, self.num_heads) 25 | fusion.apply() 26 | fusion = FusionGptAttentionMegatron(self, self.num_heads) 27 | fusion.apply() 28 | 29 | def postprocess(self): 30 | """ 31 | Remove extra reshape nodes. 32 | """ 33 | logger.debug(f"start postprocessing...") 34 | 35 | input_name_to_nodes = self.input_name_to_nodes() 36 | output_name_to_node = self.output_name_to_node() 37 | 38 | reshape_count = 0 39 | for gemm_node in self.get_nodes_by_op_type("Gemm"): 40 | reshape_after_gemm = self.find_first_child_by_type(gemm_node, 41 | 'Reshape', 42 | input_name_to_nodes, 43 | recursive=False) 44 | 45 | return_indice = [] 46 | nodes = self.match_parent_path(gemm_node, ['Reshape', 'FastGelu'], [0, 0], output_name_to_node) 47 | if nodes is None: 48 | nodes = self.match_parent_path(gemm_node, ['Reshape', 'LayerNormalization'], [0, 0], 49 | output_name_to_node) 50 | if nodes is None: 51 | continue 52 | (reshape_before_gemm, root_node) = nodes 53 | 54 | matmul_node_name = self.create_node_name('MatMul', 'FullyConnect_MatMul') 55 | matmul_node = onnx.helper.make_node('MatMul', 56 | inputs=[matmul_node_name + "_input", gemm_node.input[1]], 57 | outputs=[matmul_node_name + "_output"], 58 | name=matmul_node_name) 59 | 60 | add_node_name = self.create_node_name('Add', 'FullyConnect_Add') 61 | add_node = onnx.helper.make_node('Add', 62 | inputs=[matmul_node_name + "_output", gemm_node.input[2]], 63 | outputs=[add_node_name + "_output"], 64 | name=add_node_name) 65 | 66 | self.replace_input_of_all_nodes(reshape_after_gemm.output[0], add_node_name + "_output") 67 | 68 | # Link root node output with MatMul 69 | self.replace_input_of_all_nodes(root_node.output[0], matmul_node_name + "_input") 70 | root_node.output[0] = matmul_node_name + "_input" 71 | 72 | self.replace_input_of_all_nodes(reshape_after_gemm.output[0], add_node_name + "_output") 73 | 74 | self.add_node(matmul_node) 75 | self.add_node(add_node) 76 | 77 | reshape_count += 2 78 | 79 | self.prune_graph() 80 | logger.info(f"postprocess: remove Reshape count:{reshape_count}") 81 | 
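BartOnnxModel and Gpt2OnnxModel are not normally instantiated directly; optimizer.py selects the class from a model_type string and applies the fusions. A hedged usage sketch follows, assuming an exported model.onnx is on disk and that this repo's optimizer.py mirrors the onnxruntime.transformers API it was copied from; the path and head/hidden values are placeholders.

```
# Hedged sketch: running the graph optimizer on an exported model.
from optimizer import optimize_model

opt_model = optimize_model(
    "model.onnx",          # path assumed to come from onnx_exporter.py
    model_type="gpt2",     # or "bart" / "bert", selecting Gpt2OnnxModel / BartOnnxModel / BertOnnxModel
    num_heads=12,
    hidden_size=768,
)
opt_model.save_model_to_file("model_opt.onnx")
print(opt_model.get_fused_operator_statistics())
```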
-------------------------------------------------------------------------------- /perf-ci.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -x 2 | 3 | NO_SRC=false 4 | 5 | TVM_TUNED_CPU=$HOME/tvm_tuned_cpu 6 | TVM_TUNED_GPU=$HOME/tvm_tuned_gpu 7 | 8 | while getopts “n” OPTION 9 | do 10 | case $OPTION in 11 | n) 12 | echo "Not checking out src tree.. running from current checkout.." 13 | NO_SRC=true 14 | ;; 15 | ?) 16 | echo "Unsupported option.. -n for no checkout and run as developer instead of a CI" 17 | exit 18 | ;; 19 | esac 20 | done 21 | 22 | if [ "$NO_SRC" = true ]; then 23 | echo "Using existing checkout" 24 | else 25 | echo "Checking out transformer-benchmarks..." 26 | git clone https://github.com/nod-ai/transformer-benchmarks --recursive 27 | cd transformer-benchmarks 28 | git submodule update --init --recursive 29 | cd mmperf/external/iree 30 | git submodule update --init --recursive 31 | cd - 32 | #echo "Updating submodules to origin/main...things may break.. but that is the point.." 33 | #./update_submodules.sh 34 | fi 35 | 36 | #Gather results 37 | TIMESTAMP=`date +%Y-%m-%d_%H-%M-%S` 38 | 39 | #. $HOME/miniconda3/etc/profile.d/conda.sh 40 | #conda env remove -n perf_env 41 | #conda create -n perf_env python=3.9 -y 42 | #conda activate perf_env 43 | 44 | rm -rf perf_env 45 | python3 -m venv perf_env 46 | source perf_env/bin/activate 47 | 48 | #E2E Transformer benchmarks 49 | ./run_benchmark.sh --cpu_fp32=true --gpu_fp32=false --create_venv=true --ort=true --torchscript=true --tensorflow=true --iree=true --ort_optimizer=false 50 | #./run_benchmark.sh --gpu_fp32=true --cpu_fp32=false --create_venv=true --ort=true --torchscript=true --tensorflow=true --iree=true --ort_optimizer=false 51 | 52 | 53 | mkdir -p transformer-bench-results/${TIMESTAMP}/BERT_e2e/ 54 | cp *.csv transformer-bench-results/${TIMESTAMP}/BERT_e2e/ 55 | cp model.mlir transformer-bench-results/${TIMESTAMP}/BERT_e2e/model_${TIMESTAMP}.mlir 56 | 57 | #mmperf tests 58 | cd mmperf 59 | 60 | rm -rf mmperf_env 61 | python3 -m venv mmperf_env 62 | source mmperf_env/bin/activate 63 | pip install --upgrade pip 64 | pip install -r requirements.txt 65 | 66 | #CPU tests 67 | 68 | if [ -d ${TVM_TUNED_CPU} ]; then 69 | echo "Using TVM TUNED for CPU" 70 | cmake -GNinja -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_COMPILER=clang -DMKL_DIR=/opt/intel/oneapi/mkl/latest/ -DUSE_TVM=ON -DUSE_MKL=ON -DUSE_MLIR=ON -DUSE_IREE=ON -DIREE_DYLIB=ON -DUSE_TVM_TUNED=ON -DTVM_LIB_DIR=${TVM_TUNED_CPU} -B build . 71 | else 72 | echo "No TVM tuned libs so skipping.." 73 | cmake -GNinja -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_COMPILER=clang -DMKL_DIR=/opt/intel/oneapi/mkl/latest/ -DUSE_MKL=ON -DUSE_MLIR=ON -DUSE_IREE=ON -DIREE_DYLIB=ON -B build . 74 | fi 75 | 76 | #build mmperf 77 | cmake --build build 78 | #Sometimes bad things happen to MLIR deps and ninja deps. Lets do another try. 
79 | cmake --build build 80 | 81 | #Run all tests and generate the plots 82 | cmake --build build/matmul --target run_all_tests 83 | 84 | python mmperf.py build/matmul ../transformer-bench-results/${TIMESTAMP}/mmperf-cpu/ 85 | 86 | mv build build.cpu 87 | 88 | #GPU tests 89 | if [ -d ${TVM_TUNED_GPU} ] ; then 90 | echo "Using TVM TUNED for GPU" 91 | # cmake -GNinja -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_COMPILER=clang -DMKL_DIR=/opt/intel/oneapi/mkl/latest/ -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc -DUSE_TVM=ON -DUSE_MLIR=ON -DUSE_IREE=ON -DIREE_CUDA=ON -DUSE_CUBLAS=ON -DUSE_TVM_CUDA=ON -DTVM_ENABLE_CUDA=ON -DUSE_TVM_TUNED=ON -DTVM_LIB_DIR=${TVM_TUNED_GPU} -B build . 92 | else 93 | echo "No TVM tuned libs so skipping.." 94 | # cmake -GNinja -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_COMPILER=clang -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc -DUSE_MLIR_CUDA=ON -DUSE_IREE=ON -DIREE_CUDA=ON -DUSE_CUBLAS=ON -B build . 95 | fi 96 | 97 | #build mmperf 98 | #cmake --build build 99 | #Sometimes bad things happen to MLIR deps and ninja deps. Lets do another try. 100 | #cmake --build build 101 | 102 | #Run all tests and generate the plots 103 | #cmake --build build/matmul --target run_all_tests 104 | 105 | #python mmperf.py build/matmul ../transformer-bench-results/${TIMESTAMP}/mmperf-gpu/ 106 | 107 | #mv build build.gpu 108 | 109 | cd .. 110 | 111 | cd transformer-bench-results 112 | ln -s ${TIMESTAMP} latest 113 | cd ../ 114 | 115 | echo "Remove old symlink.." 116 | gsutil rm -rf gs://shark-public/nod-perf/results/transformer-bench/latest 117 | 118 | echo "Copying to Google Storage.." 119 | gsutil cp -r transformer-bench-results/* gs://shark-public/nod-perf/results/transformer-bench/ 120 | 121 | if [ "$NO_SRC" = true ]; then 122 | echo "leaving sources and results for manual clean up" 123 | else 124 | cd ../.. 125 | echo "deleting transformer-benchmarks..." 126 | echo `pwd` 127 | rm -rf transformer-bench 128 | fi 129 | -------------------------------------------------------------------------------- /quantize_helper.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. See License.txt in the project root for 4 | # license information. 
5 | # -------------------------------------------------------------------------- 6 | 7 | import logging 8 | import torch 9 | import onnx 10 | import os 11 | from transformers.modeling_utils import Conv1D 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | def _conv1d_to_linear(module): 17 | in_size, out_size = module.weight.shape 18 | linear = torch.nn.Linear(in_size, out_size) 19 | linear.weight.data = module.weight.data.T.contiguous() 20 | linear.bias.data = module.bias.data 21 | return linear 22 | 23 | 24 | def conv1d_to_linear(model): 25 | '''in-place 26 | This is for Dynamic Quantization, as Conv1D is not recognized by PyTorch, convert it to nn.Linear 27 | ''' 28 | logger.debug("replace Conv1D with Linear") 29 | for name in list(model._modules): 30 | module = model._modules[name] 31 | if isinstance(module, Conv1D): 32 | linear = _conv1d_to_linear(module) 33 | model._modules[name] = linear 34 | else: 35 | conv1d_to_linear(module) 36 | 37 | 38 | def _get_size_of_pytorch_model(model): 39 | torch.save(model.state_dict(), "temp.p") 40 | size = os.path.getsize("temp.p") / (1024 * 1024) 41 | os.remove('temp.p') 42 | return size 43 | 44 | 45 | class QuantizeHelper: 46 | @staticmethod 47 | def quantize_torch_model(model, dtype=torch.qint8): 48 | ''' 49 | Usage: model = quantize_model(model) 50 | 51 | TODO: mix of in-place and return, but results are different 52 | ''' 53 | conv1d_to_linear(model) 54 | quantized_model = torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=dtype) 55 | logger.info(f'Size of full precision Torch model(MB):{_get_size_of_pytorch_model(model)}') 56 | logger.info(f'Size of quantized Torch model(MB):{_get_size_of_pytorch_model(quantized_model)}') 57 | return quantized_model 58 | 59 | @staticmethod 60 | def quantize_onnx_model(onnx_model_path, quantized_model_path, use_external_data_format=False): 61 | from onnxruntime.quantization import quantize, QuantizationMode 62 | logger.info(f'Size of full precision ONNX model(MB):{os.path.getsize(onnx_model_path)/(1024*1024)}') 63 | onnx_opt_model = onnx.load_model(onnx_model_path) 64 | quantized_onnx_model = quantize(onnx_opt_model, 65 | quantization_mode=QuantizationMode.IntegerOps, 66 | symmetric_weight=True, 67 | force_fusions=True) 68 | 69 | if use_external_data_format: 70 | from pathlib import Path 71 | Path(quantized_model_path).parent.mkdir(parents=True, exist_ok=True) 72 | onnx.external_data_helper.convert_model_to_external_data(quantized_onnx_model, 73 | all_tensors_to_one_file=True, 74 | location=Path(quantized_model_path).name + ".data") 75 | onnx.save_model(quantized_onnx_model, quantized_model_path) 76 | 77 | logger.info(f"quantized model saved to:{quantized_model_path}") 78 | #TODO: inlcude external data in total model size. 
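quantize_helper.py exposes the two entry points used for int8 runs (the -p int8 path in run_benchmark.sh): dynamic quantization of a PyTorch model after Conv1D layers are rewritten as nn.Linear, and integer-ops quantization of an exported ONNX model. A short usage sketch, with the checkpoint and file paths as illustrative stand-ins:

```
# Hedged usage sketch for QuantizeHelper (checkpoint and paths are illustrative).
import torch
from transformers import GPT2LMHeadModel
from quantize_helper import QuantizeHelper

model = GPT2LMHeadModel.from_pretrained("distilgpt2")       # uses Conv1D, so conv1d_to_linear applies
int8_model = QuantizeHelper.quantize_torch_model(model, dtype=torch.qint8)

QuantizeHelper.quantize_onnx_model("model.onnx", "model_int8.onnx",
                                   use_external_data_format=False)
```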
79 | logger.info(f'Size of quantized ONNX model(MB):{os.path.getsize(quantized_model_path)/(1024*1024)}') 80 | -------------------------------------------------------------------------------- /resnet50.py: -------------------------------------------------------------------------------- 1 | import time 2 | import os 3 | from transformers import AutoFeatureExtractor, ResNetForImageClassification 4 | from datasets import load_dataset 5 | 6 | dataset = load_dataset("huggingface/cats-image") 7 | image = dataset["test"]["image"][0] 8 | 9 | feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/resnet-50") 10 | 11 | 12 | #PyTorch 13 | import torch 14 | 15 | model = ResNetForImageClassification.from_pretrained("microsoft/resnet-50") 16 | warmup = 5 17 | total_iter = 100 18 | num_iter = total_iter - warmup 19 | for i in range(num_iter): 20 | if(i == warmup-1): 21 | start = time.time() 22 | inputs = feature_extractor(image, return_tensors="pt") 23 | with torch.no_grad(): 24 | logits = model(**inputs).logits 25 | predicted_label = logits.argmax(-1).item() 26 | end = time.time() 27 | total_time = end - start 28 | print("PyTorch: time/iter in ms : "+str(total_time*1000/num_iter)) 29 | #print(model.config.id2label[predicted_label]) 30 | 31 | 32 | # OnnxRuntime 33 | from onnxruntime import InferenceSession 34 | import urllib.request 35 | 36 | if not os.path.isfile("model.onnx"): 37 | urllib.request.urlretrieve('https://huggingface.co/OWG/resnet-50/resolve/main/onnx/model.onnx',"model.onnx") 38 | 39 | session = InferenceSession("model.onnx") 40 | 41 | warmup = 5 42 | total_iter = 100 43 | num_iter = total_iter - warmup 44 | for i in range(num_iter): 45 | if(i == warmup-1): 46 | start = time.time() 47 | #print(BertCompiled.learn(predict_sample_input,np.random.randint(5, size=(BATCH_SIZE)))) 48 | # ONNX Runtime expects NumPy arrays as input 49 | inputs = feature_extractor(image, return_tensors="np") 50 | outputs = session.run(output_names=["last_hidden_state"], input_feed=dict(inputs)) 51 | end = time.time() 52 | total_time = end - start 53 | print("Onnx: time/iter in ms : "+str(total_time*1000/num_iter)) 54 | 55 | -------------------------------------------------------------------------------- /run_benchmark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | # ------------------------------------------------------------------------- 3 | # Copyright (c) Microsoft Corporation. All rights reserved. 4 | # Copyright (c) Nod, Inc. All rights reserved. 5 | # Licensed under the MIT License. See License.txt in the project root for 6 | # license information. 7 | # -------------------------------------------------------------------------- 8 | # This measures the performance of OnnxRuntime, PyTorch and TorchScript on transformer models. 9 | # Please install PyTorch or Tensorflow or MLIR Runtime (see https://pytorch.org/) before running this benchmark. Like the following: 10 | # GPU: conda install pytorch torchvision cudatoolkit=11.0 -c pytorch 11 | # CPU: conda install pytorch torchvision cpuonly -c pytorch 12 | 13 | # When use_package=true, you need not copy other files to run benchmarks except this sh file. 14 | # Otherwise, it will use python script (*.py) files in this directory. 
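One methodological note on the resnet50.py timing loops above: the clock is started inside the main loop at i == warmup-1 and the elapsed time is divided by num_iter, so the divisor does not exactly match the number of timed iterations. Below is a hedged sketch of a warmup/measure split that keeps the bookkeeping explicit; the callable is a stand-in for either the PyTorch or the ONNX Runtime inference step, and the iteration counts mirror the script's defaults.

```
# Hedged sketch: separate warmup from timed iterations so the average is exact.
import time

def benchmark(run_inference, warmup=5, iters=95):
    for _ in range(warmup):          # untimed warmup (caches, JIT, allocator, etc.)
        run_inference()
    start = time.time()
    for _ in range(iters):
        run_inference()
    return (time.time() - start) * 1000.0 / iters   # ms per iteration

# Example: latency_ms = benchmark(lambda: session.run(None, dict(inputs)))
```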
15 | 16 | ARGUMENT_LIST=( 17 | "gpu_fp32" 18 | "gpu_fp16" 19 | "cpu_fp32" 20 | "cpu_int8" 21 | "ort" 22 | "torch" 23 | "torchscript" 24 | "tensorflow" 25 | "iree" 26 | "shark" 27 | "pip_install_pkg" 28 | "ort_optimizer" 29 | "create_venv" 30 | "with_nsys" 31 | ) 32 | 33 | #setup defaults 34 | # Devices to test (You can run either CPU or GPU, but not both: gpu need onnxruntime-gpu, and CPU need onnxruntime). 35 | run_gpu_fp32=true 36 | run_gpu_fp16=false 37 | run_cpu_fp32=false 38 | run_cpu_int8=false 39 | # Engines to test. 40 | run_ort=true 41 | run_shark=false 42 | run_torch=false 43 | run_torchscript=true 44 | run_tensorflow=true 45 | run_iree=true 46 | 47 | # only need once 48 | run_create_venv=false 49 | install_pkg=true 50 | run_with_nsys=true 51 | 52 | # Enable optimizer (use script instead of OnnxRuntime for graph optimization) 53 | use_optimizer=false 54 | 55 | # read arguments 56 | opts=$(getopt \ 57 | --longoptions "$(printf "%s:," "${ARGUMENT_LIST[@]}")" \ 58 | --name "$(basename "$0")" \ 59 | --options "" \ 60 | -- "$@" 61 | ) 62 | 63 | eval set --$opts 64 | 65 | while [[ $# -gt 0 ]]; do 66 | case "$1" in 67 | --with_nsys) 68 | run_with_nsys=$2 69 | shift 2 70 | ;; 71 | --create_venv) 72 | run_create_venv=$2 73 | echo "Removing old bench_venv.." 74 | rm -rf bench_venv 75 | echo "Creating new bench_venv.." 76 | python3 -m venv bench_venv 77 | echo "sourcing new env.." 78 | source bench_venv/bin/activate 79 | shift 2 80 | ;; 81 | 82 | --ort_optimizer) 83 | use_optimizer=$2 84 | shift 2 85 | ;; 86 | 87 | --pip_install_pkg) 88 | install_pkg=$2 89 | shift 2 90 | ;; 91 | 92 | --iree) 93 | run_iree=$2 94 | shift 2 95 | ;; 96 | 97 | --shark) 98 | run_shark=$2 99 | shift 2 100 | ;; 101 | 102 | --tensorflow) 103 | run_tensorflow=$2 104 | shift 2 105 | ;; 106 | 107 | --torchscript) 108 | run_torchscript=$2 109 | shift 2 110 | ;; 111 | 112 | --torch) 113 | run_torch=$2 114 | shift 2 115 | ;; 116 | 117 | --ort) 118 | run_ort=$2 119 | shift 2 120 | ;; 121 | 122 | --gpu_fp32) 123 | run_gpu_fp32=$2 124 | shift 2 125 | ;; 126 | 127 | --gpu_fp16) 128 | run_gpu_fp16=$2 129 | shift 2 130 | ;; 131 | 132 | --cpu_fp32) 133 | run_cpu_fp32=$2 134 | shift 2 135 | ;; 136 | 137 | --cpu_int8) 138 | run_cpu_int8=$2 139 | shift 2 140 | ;; 141 | 142 | *) 143 | echo "Using defaults...: " 144 | echo " you can change them with --var=true or false" 145 | break 146 | ;; 147 | esac 148 | done 149 | 150 | 151 | echo "Parsed command line args as:" 152 | echo "gpu_fp32 $run_gpu_fp32" 153 | echo "gpu_fp16 $run_gpu_fp16" 154 | echo "cpu_fp32 $run_cpu_fp32" 155 | echo "cpu_int8 $run_cpu_int8" 156 | 157 | echo "ort $run_ort" 158 | echo "ort_optimizer $use_optimizer" 159 | echo "shark $run_shark" 160 | echo "torch $run_torch" 161 | echo "torchscript $run_torchscript" 162 | echo "tensorflow $run_tensorflow" 163 | echo "iree $run_iree" 164 | echo "create_venv $run_create_venv" 165 | echo "run_with_nsys $run_with_nsys" 166 | echo "pip_install_pkg $install_pkg" 167 | 168 | echo "Check python path.. " 169 | which python 170 | 171 | use_package=false 172 | # Onnx model source (default is from pytorch, set export_onnx_from_tf=true to convert from tensorflow model) 173 | export_onnx_from_tf=false 174 | 175 | 176 | average_over=1000 177 | # CPU takes longer time to run, only run 100 inferences to get average latency. 
178 | if [ "$run_cpu_fp32" = true ] || [ "$run_cpu_int8" = true ]; then 179 | average_over=100 180 | fi 181 | 182 | # Batch Sizes and Sequence Lengths 183 | batch_sizes="1" 184 | sequence_lengths="128" 185 | 186 | # Number of inputs (input_ids, token_type_ids, attention_mask) for ONNX model. 187 | # Not that different input count might lead to different performance 188 | # Here we only test one input (input_ids) for fair comparison with PyTorch. 189 | input_counts=1 190 | 191 | # Pretrained transformers models can be a subset of: bert-base-cased roberta-base gpt2 distilgpt2 distilbert-base-uncased 192 | #models_to_test="bert-base-cased roberta-base distilbert-base-uncased" 193 | #models_to_test="philschmid/MiniLM-L6-H384-uncased-sst2" 194 | models_to_test="microsoft/MiniLM-L12-H384-uncased" 195 | #models_to_test="gpt2" 196 | 197 | # If you have mutliple GPUs, you can choose one GPU for test. Here is an example to use the second GPU: 198 | # export CUDA_VISIBLE_DEVICES=1 199 | 200 | # This script will generate a logs file with a list of commands used in tests. 201 | echo echo "ort=$run_ort torch=$run_torch torchscript=$run_torchscript tensorflow=$run_tensorflow iree=$run_iree gpu_fp32=$run_gpu_fp32 gpu_fp16=$run_gpu_fp16 cpu=$run_cpu optimizer=$use_optimizer batch=$batch_sizes sequence=$sequence_length models=$models_to_test" >> benchmark.log 202 | 203 | # Set it to false to skip testing. You can use it to dry run this script with the log file. 204 | run_tests=true 205 | 206 | # Directory for downloading pretrained models. 207 | cache_dir="./cache_models" 208 | 209 | # Directory for ONNX models 210 | onnx_dir="./onnx_models" 211 | 212 | # ------------------------------------------- 213 | if [ "$run_cpu_fp32" = true ] || [ "$run_cpu_int8" = true ]; then 214 | if [ "$run_gpu_fp32" = true ] ; then 215 | echo "cannot test cpu and gpu at same time" 216 | exit 1 217 | fi 218 | if [ "$run_gpu_fp16" = true ] ; then 219 | echo "cannot test cpu and gpu at same time" 220 | exit 1 221 | fi 222 | fi 223 | 224 | 225 | if [ "$install_pkg" = true ] ; then 226 | pip install --upgrade pip 227 | pip uninstall --yes ort-nightly ort-gpu-nightly 228 | pip uninstall --yes onnxruntime 229 | pip uninstall --yes onnxruntime-gpu 230 | pip uninstall --yes torch 231 | pip uninstall --yes iree-compiler iree-runtime iree-tools-tf iree-tools-tflite iree-tools-xla 232 | 233 | if [ "$run_cpu_fp32" = true ] || [ "$run_cpu_int8" = true ]; then 234 | pip install onnxruntime 235 | pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html 236 | else 237 | pip install onnxruntime-gpu 238 | pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu111/torch_nightly.html 239 | #pip3 install --pre torch torchvision -f https://download.pytorch.org/whl/nightly/rocm4.2/torch_nightly.html 240 | fi 241 | pip install tf-nightly 242 | pip install --upgrade onnx coloredlogs packaging psutil py3nvml onnxconverter_common numpy transformers sympy wheel 243 | pip install gin-config 244 | 245 | ### Installing IREE-Python 246 | python -m pip install iree-compiler iree-runtime iree-tools-tf iree-tools-tflite iree-tools-xla --find-links https://github.com/google/iree/releases 247 | 248 | if [ "$run_shark" = true ] ; then 249 | ### Installing shark 250 | git submodule update --init 251 | pip install -r `pwd`/thirdparty/SHARK/requirements.txt --no-cache-dir 252 | python -m pip install --find-links https://github.com/llvm/torch-mlir/releases torch-mlir 253 | 
python -m pip install ninja 254 | python -m pip install thirdparty/SHARK 255 | fi 256 | 257 | fi 258 | 259 | if [ "$use_package" = true ] ; then 260 | echo "Use onnxruntime.transformers.benchmark" 261 | benchmark_script="-m onnxruntime.transformers.benchmark" 262 | else 263 | benchmark_script="benchmark.py" 264 | fi 265 | 266 | onnx_export_options="-i $input_counts -v -b 0 --overwrite -f fusion.csv -c $cache_dir --onnx_dir $onnx_dir" 267 | benchmark_options="-b $batch_sizes -s $sequence_lengths -t $average_over -f fusion.csv -r result.csv -d detail.csv -c $cache_dir --onnx_dir $onnx_dir" 268 | 269 | if [ "$export_onnx_from_tf" = true ] ; then 270 | onnx_export_options="$onnx_export_options --model_source tf" 271 | benchmark_options="$benchmark_options --model_source tf" 272 | fi 273 | 274 | if [ "$use_optimizer" = true ] ; then 275 | onnx_export_options="$onnx_export_options -o" 276 | benchmark_options="$benchmark_options -o" 277 | fi 278 | 279 | # ------------------------------------------- 280 | run_one_test() { 281 | if [ "$run_ort" = true ] ; then 282 | echo python $benchmark_script -m $1 $onnx_export_options $2 $3 $4 >> benchmark.log 283 | echo python $benchmark_script -m $1 $benchmark_options $2 $3 $4 -i $input_counts >> benchmark.log 284 | if [ "$run_tests" = true ] ; then 285 | python $benchmark_script -m $1 $onnx_export_options $2 $3 $4 286 | python $benchmark_script -m $1 $benchmark_options $2 $3 $4 -i $input_counts 287 | fi 288 | fi 289 | 290 | if [ "$run_shark" = true ] ; then 291 | echo python $benchmark_script -e shark -m $1 $benchmark_options $2 $3 $4 >> benchmark.log 292 | if [ "$run_tests" = true ] ; then 293 | python $benchmark_script -e shark -m $1 $benchmark_options $2 $3 $4 294 | fi 295 | fi 296 | 297 | if [ "$run_torch" = true ] ; then 298 | echo python $benchmark_script -e torch -m $1 $benchmark_options $2 $3 $4 >> benchmark.log 299 | if [ "$run_tests" = true ] ; then 300 | python $benchmark_script -e torch -m $1 $benchmark_options $2 $3 $4 301 | fi 302 | fi 303 | 304 | if [ "$run_torchscript" = true ] ; then 305 | echo python $benchmark_script -e torchscript -m $1 $benchmark_options $2 $3 $4 >> benchmark.log 306 | if [ "$run_tests" = true ] ; then 307 | python $benchmark_script -e torchscript -m $1 $benchmark_options $2 $3 $4 308 | fi 309 | fi 310 | 311 | if [ "$run_tensorflow" = true ] ; then 312 | echo python $benchmark_script -e tensorflow -m $1 $benchmark_options $2 $3 $4 >> benchmark.log 313 | if [ "$run_tests" = true ] ; then 314 | python $benchmark_script -e tensorflow -m $1 $benchmark_options $2 $3 $4 315 | fi 316 | fi 317 | 318 | if [ "$run_iree" = true ] ; then 319 | echo python $benchmark_script -e iree -m $1 $benchmark_options $2 $3 $4 >> benchmark.log 320 | if [ "$run_tests" = true ] ; then 321 | python $benchmark_script -e iree -m $1 $benchmark_options $2 $3 $4 322 | fi 323 | fi 324 | } 325 | 326 | # ------------------------------------------- 327 | if [ "$run_gpu_fp32" = true ] ; then 328 | for m in $models_to_test 329 | do 330 | echo Run GPU FP32 Benchmark on model ${m} 331 | run_one_test "${m}" -g 332 | done 333 | fi 334 | 335 | if [ "$run_gpu_fp16" = true ] ; then 336 | for m in $models_to_test 337 | do 338 | echo Run GPU FP16 Benchmark on model ${m} 339 | run_one_test "${m}" -g -p fp16 340 | done 341 | fi 342 | 343 | if [ "$run_cpu_fp32" = true ] ; then 344 | for m in $models_to_test 345 | do 346 | echo Run CPU Benchmark on model ${m} 347 | run_one_test "${m}" 348 | done 349 | fi 350 | 351 | if [ "$run_cpu_int8" = true ] ; then 352 | for m in 
$models_to_test 353 | do 354 | echo Run CPU Benchmark on model ${m} 355 | run_one_test "${m}" -p int8 356 | done 357 | fi 358 | 359 | if [ "run_tests" = false ] ; then 360 | more $log_file 361 | fi 362 | 363 | # Remove duplicated lines 364 | awk '!x[$0]++' ./result.csv > summary_result.csv 365 | awk '!x[$0]++' ./fusion.csv > summary_fusion.csv 366 | awk '!x[$0]++' ./detail.csv > summary_detail.csv 367 | -------------------------------------------------------------------------------- /shape_infer_helper.py: -------------------------------------------------------------------------------- 1 | #------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | #-------------------------------------------------------------------------- 5 | 6 | import os 7 | import sys 8 | import onnx 9 | 10 | # In ORT Package the symbolic_shape_infer.py is in ../tools 11 | file_path = os.path.dirname(__file__) 12 | if os.path.exists(os.path.join(file_path, "../tools/symbolic_shape_infer.py")): 13 | sys.path.append(os.path.join(file_path, '../tools')) 14 | else: 15 | sys.path.append(os.path.join(file_path, '..')) 16 | 17 | from symbolic_shape_infer import SymbolicShapeInference, get_shape_from_type_proto, sympy 18 | 19 | 20 | class SymbolicShapeInferenceHelper(SymbolicShapeInference): 21 | def __init__(self, model, verbose=0, int_max=2**31 - 1, auto_merge=True, guess_output_rank=False): 22 | super().__init__(int_max, auto_merge, guess_output_rank, verbose) 23 | self.model_ = onnx.ModelProto() 24 | self.model_.CopyFrom(model) 25 | self.all_shapes_inferred_ = False 26 | self.inferred_ = False 27 | 28 | # The goal is to remove dynamic_axis_mapping 29 | def infer(self, dynamic_axis_mapping): 30 | if self.inferred_: 31 | return self.all_shapes_inferred_ 32 | 33 | self.dynamic_axis_mapping_ = dynamic_axis_mapping # e.g {"batch_size" : 4, "seq_len" :7} 34 | 35 | self._preprocess(self.model_) 36 | while self.run_: 37 | self.all_shapes_inferred_ = self._infer_impl() 38 | 39 | self.inferred_ = True 40 | return self.all_shapes_inferred_ 41 | 42 | # override _preprocess() to avoid unnecessary model copy since ctor copies the model 43 | def _preprocess(self, in_mp): 44 | self.out_mp_ = in_mp 45 | self.graph_inputs_ = dict([(i.name, i) for i in list(self.out_mp_.graph.input)]) 46 | self.initializers_ = dict([(i.name, i) for i in self.out_mp_.graph.initializer]) 47 | self.known_vi_ = dict([(i.name, i) for i in list(self.out_mp_.graph.input)]) 48 | self.known_vi_.update( 49 | dict([(i.name, onnx.helper.make_tensor_value_info(i.name, i.data_type, list(i.dims))) 50 | for i in self.out_mp_.graph.initializer])) 51 | 52 | # Override _get_sympy_shape() in symbolic_shape_infer.py to ensure shape inference by giving the actual value of dynamic axis 53 | def _get_sympy_shape(self, node, idx): 54 | sympy_shape = [] 55 | for d in self._get_shape(node, idx): 56 | if type(d) == str: 57 | if d in self.dynamic_axis_mapping_.keys(): 58 | sympy_shape.append(self.dynamic_axis_mapping_[d]) 59 | elif d in self.symbolic_dims_: 60 | sympy_shape.append(self.symbolic_dims_[d]) 61 | else: 62 | sympy_shape.append(sympy.Symbol(d, integer=True)) 63 | else: 64 | assert None != d 65 | sympy_shape.append(d) 66 | return sympy_shape 67 | 68 | def get_edge_shape(self, edge): 69 | assert (self.all_shapes_inferred_ == True) 70 | if edge not in self.known_vi_: 71 | print("Cannot retrive the shape of " + str(edge)) 72 | return None 73 | type_proto = 
self.known_vi_[edge].type 74 | shape = get_shape_from_type_proto(type_proto) 75 | for i in range(len(shape)): 76 | d = shape[i] 77 | if type(d) == str and d in self.dynamic_axis_mapping_.keys(): 78 | shape[i] = self.dynamic_axis_mapping_[d] 79 | return shape 80 | 81 | def compare_shape(self, edge, edge_other): 82 | assert (self.all_shapes_inferred_ == True) 83 | shape = self.get_edge_shape(edge) 84 | shape_other = self.get_edge_shape(edge_other) 85 | if shape is None or shape_other is None: 86 | raise Exception("At least one shape is missed for edges to compare") 87 | return shape == shape_other 88 | -------------------------------------------------------------------------------- /update_submodules.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -x 2 | 3 | COMMIT_PUSH=false 4 | 5 | while getopts “p” OPTION 6 | do 7 | case $OPTION in 8 | p) 9 | echo "Pushing changes up.." 10 | COMMIT_PUSH=true 11 | ;; 12 | ?) 13 | echo "Unsupported option.. -p for pushing changes up after update" 14 | exit 15 | ;; 16 | esac 17 | done 18 | 19 | echo "Updating repos.." 20 | 21 | cd mmperf && git fetch --all && git checkout origin/main 22 | 23 | #update mmperf submodules first 24 | git submodule update --init 25 | #Update the submodules inside mmperf too 26 | ./update_submodules.sh 27 | 28 | if [ "$COMMIT_PUSH" = true ]; then 29 | echo "Checking out transformer-benchmarks..." 30 | git add . 31 | git commit -m "Roll external deps" 32 | echo git push https://github.com/mmperf/mmperf 33 | fi 34 | --------------------------------------------------------------------------------
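Finally, a short usage sketch for SymbolicShapeInferenceHelper from shape_infer_helper.py above: the helper is constructed once per model, infer() binds the named dynamic axes to concrete values, and get_edge_shape / compare_shape then query the resolved shapes. The model path, axis names, and edge names below are illustrative placeholders, not values taken from this repo.

```
# Hedged usage sketch for SymbolicShapeInferenceHelper.
import onnx
from shape_infer_helper import SymbolicShapeInferenceHelper

model = onnx.load("model.onnx")
helper = SymbolicShapeInferenceHelper(model)

if helper.infer({"batch_size": 1, "seq_len": 128}):
    print(helper.get_edge_shape("input_ids"))        # e.g. [1, 128] once axes are bound
    print(helper.compare_shape("edge_a", "edge_b"))  # True if both resolved shapes match
else:
    print("symbolic shape inference did not fully resolve the graph")
```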