├── matmul_model_float.onnx ├── matmul_model_quant.onnx ├── matmul_model_quant_io.onnx ├── requirements.txt ├── npu_quant_io_profile.csv ├── npu_quant_profile.csv ├── benchmark_matmul_cudnn.py ├── LICENSE ├── README.md └── benchmark_matmul.py /matmul_model_float.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/moonshine-ai/qc_npu_benchmark/HEAD/matmul_model_float.onnx -------------------------------------------------------------------------------- /matmul_model_quant.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/moonshine-ai/qc_npu_benchmark/HEAD/matmul_model_quant.onnx -------------------------------------------------------------------------------- /matmul_model_quant_io.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/moonshine-ai/qc_npu_benchmark/HEAD/matmul_model_quant_io.onnx -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/petewarden/onnx@rel-1.16.2 2 | https://aiinfra.pkgs.visualstudio.com/2692857e-05ef-43b4-ba9c-ccf1c22c437c/_packaging/7982ae20-ed19-4a35-a362-a96ac99897b7/pypi/download/ort-nightly-qnn/1.20.dev20240928001/ort_nightly_qnn-1.20.0.dev20240928001-cp311-cp311-win_arm64.whl#sha256=3b12e3882d1afadf66c2349b2a167dfcbb9ae7a332dc98e0fd51c101d34ddf6e 3 | readable-number 4 | -------------------------------------------------------------------------------- /npu_quant_io_profile.csv: -------------------------------------------------------------------------------- 1 | Msg Timestamp,Message,Time,Unit of Measurement,Timing Source,Event Level,Event Identifier 2 | 0,BACKEND,26825,US,BACKEND,ROOT,RPC (finalize) time 3 | 0,BACKEND,26448,US,BACKEND,ROOT,QNN accelerator (finalize) time 4 | 0,BACKEND,26355,US,BACKEND,ROOT,Accelerator (finalize) time 5 | 0,FINALIZE,1557972,US,BACKEND,ROOT,QNN (finalize) time 6 | 0,BACKEND,4,COUNT,BACKEND,ROOT,Number of HVX threads used 7 | 0,BACKEND,64116,US,BACKEND,ROOT,RPC (execute) time 8 | 0,BACKEND,63551,US,BACKEND,ROOT,QNN accelerator (execute) time 9 | 0,BACKEND,0,COUNT,BACKEND,ROOT,Num times yield occured 10 | 0,BACKEND,699,US,BACKEND,ROOT,Time for initial VTCM acquire 11 | 0,BACKEND,2446,US,BACKEND,ROOT,Time for HVX + HMX power on and acquire 12 | 0,BACKEND,18173387,CYCLES,BACKEND,ROOT,Accelerator (execute) time (cycles) 13 | 0,NODE,211078,CYCLES,BACKEND,SUB-EVENT,Input OpId_2 (cycles) 14 | 0,NODE,10697422,CYCLES,BACKEND,SUB-EVENT,matmul_node:OpId_23 (cycles) 15 | 0,NODE,0,CYCLES,BACKEND,SUB-EVENT,matmul_output_dequant:OpId_27 (cycles) 16 | 0,NODE,7264887,CYCLES,BACKEND,SUB-EVENT,Output OpId_3 (cycles) 17 | 0,BACKEND,10353,US,BACKEND,ROOT,Accelerator (execute) time 18 | 0,BACKEND,10112,US,BACKEND,ROOT,Accelerator (execute excluding wait) time 19 | 0,EXECUTE,67117,US,BACKEND,ROOT,QNN (execute) time 20 | -------------------------------------------------------------------------------- /npu_quant_profile.csv: -------------------------------------------------------------------------------- 1 | Msg Timestamp,Message,Time,Unit of Measurement,Timing Source,Event Level,Event Identifier 2 | 0,BACKEND,43624,US,BACKEND,ROOT,RPC (finalize) time 3 | 0,BACKEND,42986,US,BACKEND,ROOT,QNN accelerator (finalize) time 4 | 0,BACKEND,42885,US,BACKEND,ROOT,Accelerator (finalize) time 5 | 
0,FINALIZE,2049932,US,BACKEND,ROOT,QNN (finalize) time 6 | 0,BACKEND,4,COUNT,BACKEND,ROOT,Number of HVX threads used 7 | 0,BACKEND,88338,US,BACKEND,ROOT,RPC (execute) time 8 | 0,BACKEND,87857,US,BACKEND,ROOT,QNN accelerator (execute) time 9 | 0,BACKEND,0,COUNT,BACKEND,ROOT,Num times yield occured 10 | 0,BACKEND,664,US,BACKEND,ROOT,Time for initial VTCM acquire 11 | 0,BACKEND,1970,US,BACKEND,ROOT,Time for HVX + HMX power on and acquire 12 | 0,BACKEND,51117377,CYCLES,BACKEND,ROOT,Accelerator (execute) time (cycles) 13 | 0,NODE,0,CYCLES,BACKEND,SUB-EVENT,Input OpId_2 (cycles) 14 | 0,NODE,3046243,CYCLES,BACKEND,SUB-EVENT,input1_tensor_QuantizeLinear:OpId_16 (cycles) 15 | 0,NODE,1784335,CYCLES,BACKEND,SUB-EVENT,input0_tensor_QuantizeLinear:OpId_17 (cycles) 16 | 0,NODE,10958875,CYCLES,BACKEND,SUB-EVENT,matmul_node:OpId_18 (cycles) 17 | 0,NODE,32129948,CYCLES,BACKEND,SUB-EVENT,matmul_output_tensor_DequantizeLinear:OpId_22 (cycles) 18 | 0,NODE,3197976,CYCLES,BACKEND,SUB-EVENT,Output OpId_3 (cycles) 19 | 0,BACKEND,26024,US,BACKEND,ROOT,Accelerator (execute) time 20 | 0,BACKEND,25815,US,BACKEND,ROOT,Accelerator (execute excluding wait) time 21 | 0,EXECUTE,99091,US,BACKEND,ROOT,QNN (execute) time 22 | -------------------------------------------------------------------------------- /benchmark_matmul_cudnn.py: -------------------------------------------------------------------------------- 1 | # Benchmark script for the Qualcomm NPU on a Microsoft Surface Pro Tablet. 2 | # See README.md for more information, and LICENSE for copyright information. 3 | 4 | import numpy as np 5 | import onnx 6 | from onnx import helper as h, TensorProto as tp 7 | import onnxruntime as ort 8 | from onnxruntime.quantization import QuantFormat, QuantType, CalibrationDataReader, quantize_static 9 | from readable_number import ReadableNumber 10 | import time 11 | 12 | # Define the shape of the matrix multiplication operation to benchmark. 13 | MATRIX_COUNT = 6 14 | MATRIX_A = 1500 15 | MATRIX_B = 1500 16 | MATRIX_K = 256 17 | INPUT0_SHAPE = [1, MATRIX_COUNT, MATRIX_A, MATRIX_K] 18 | INPUT1_SHAPE = [1, MATRIX_COUNT, MATRIX_K, MATRIX_B] 19 | OUTPUT_SHAPE = [1, MATRIX_COUNT, MATRIX_A, MATRIX_B] 20 | 21 | # A multiply-add counts as two operations, conventionally. 22 | OPS_PER_MUL_ADD = 2 23 | 24 | # Derive the total number of operations from the input shapes. 25 | OPS_PER_INFERENCE = OPS_PER_MUL_ADD * MATRIX_COUNT * MATRIX_A * MATRIX_B * MATRIX_K 26 | 27 | # The float range to distribute random inputs over. 28 | INPUT_RANGE = 1.0 / 5.0 29 | 30 | # Where to save the intermediate model files. These will overwrite whatever is 31 | # in the existing repository by default. 32 | FLOAT_MODEL_PATH = "matmul_model_float.onnx" 33 | QUANT_MODEL_PATH = "matmul_model_quant.onnx" 34 | QUANT_IO_MODEL_PATH = "matmul_model_quant_io.onnx" 35 | 36 | # How many times to run inference on the model, to obtain the mean latency. 37 | ITERATIONS = 20 38 | 39 | # This class is used to provide calibration inputs for the quantization 40 | # process. Since we only care about accuracy for the two inputs we set up, we 41 | # just return those examples and then signal that we're done. 
42 | class MockDataReader(CalibrationDataReader): 43 | def __init__(self, input0, input1): 44 | self.has_run = False 45 | self.input0 = input0 46 | self.input1 = input1 47 | 48 | def get_next(self): 49 | if self.has_run: 50 | return None 51 | else: 52 | self.has_run = True 53 | return { 54 | "input0_tensor": self.input0, 55 | "input1_tensor": self.input1, 56 | } 57 | 58 | def rewind(self): 59 | self.has_run = False 60 | 61 | # Convert a float tensor into an eight-bit equivalent. 62 | def quantize_tensor(input, scale, zero_point): 63 | assert input.dtype == np.float32 64 | return np.clip(np.round(input / scale) + zero_point, 0, 255).astype(np.uint8) 65 | 66 | # Convert an eight-bit quantized tensor into a float result. 67 | def dequantize_tensor(input, scale, zero_point): 68 | assert input.dtype == np.uint8 69 | return (input.astype(np.float32) - zero_point) * scale 70 | 71 | # Use the Onnx model framework to construct a graph containing a single matrix 72 | # multiplication operation with two dynamic inputs, all with float computation. 73 | def make_matmul_float_model(): 74 | 75 | matmul_node = h.make_node( 76 | "MatMul", 77 | inputs=["input0_tensor", "input1_tensor"], 78 | outputs=["matmul_output_tensor"], 79 | name="matmul_node") 80 | 81 | matmul_float_graph = h.make_graph( 82 | nodes=[ 83 | matmul_node, 84 | ], 85 | name="matmul_float_graph", 86 | inputs=[ 87 | h.make_tensor_value_info("input0_tensor", tp.FLOAT, INPUT0_SHAPE), 88 | h.make_tensor_value_info("input1_tensor", tp.FLOAT, INPUT1_SHAPE) 89 | ], 90 | outputs=[ 91 | h.make_tensor_value_info("matmul_output_tensor", tp.FLOAT, OUTPUT_SHAPE), 92 | ], 93 | initializer=[ 94 | ]) 95 | 96 | matmul_float_model = h.make_model(matmul_float_graph, producer_name="matmul_test") 97 | 98 | return matmul_float_model 99 | 100 | # Create the base float model and save it out. 101 | matmul_float_model = make_matmul_float_model() 102 | onnx.checker.check_model(matmul_float_model) 103 | onnx.save(matmul_float_model, FLOAT_MODEL_PATH) 104 | 105 | # Arbitrary but fixed seed value. 106 | rng = np.random.default_rng(7528840384) 107 | 108 | # We generate two input tensors with random values from zero to INPUT_RANGE. 109 | input0_numpy = rng.random((INPUT0_SHAPE)).astype(np.float32) * INPUT_RANGE 110 | input1_numpy = rng.random((INPUT1_SHAPE)).astype(np.float32) * INPUT_RANGE 111 | 112 | matmul_output_numpy = np.zeros(OUTPUT_SHAPE, dtype=np.float32) 113 | 114 | input0_tensor = ort.OrtValue.ortvalue_from_numpy(input0_numpy, 'cuda', 0) 115 | input1_tensor = ort.OrtValue.ortvalue_from_numpy(input1_numpy, 'cuda', 0) 116 | matmul_output_tensor = ort.OrtValue.ortvalue_from_numpy(matmul_output_numpy, 'cuda', 0) 117 | 118 | # Create an Onnx Runtime session to run the model on the CPU. 
gpu_options = ort.SessionOptions()
gpu_session = ort.InferenceSession(
    FLOAT_MODEL_PATH,
    sess_options=gpu_options,
    providers=[("CUDAExecutionProvider", {"enable_cuda_graph": '1'})],
)

io_binding = gpu_session.io_binding()

# Pass gpu_graph_id to RunOptions through RunConfigs
ro = ort.RunOptions()
# gpu_graph_id is optional if the session uses only one cuda graph
ro.add_run_config_entry("gpu_graph_id", "1")

# Bind the input and output
io_binding.bind_ortvalue_input("input0_tensor", input0_tensor)
io_binding.bind_ortvalue_input("input1_tensor", input1_tensor)
io_binding.bind_ortvalue_output("matmul_output_tensor", matmul_output_tensor)

# Run the float model multiple times on the GPU, and calculate the overall latency.
for i in range(ITERATIONS + 1):
    # Skip the first run, since there's setup and caching. See
    # https://onnxruntime.ai/docs/execution-providers/CUDA-ExecutionProvider.html
    if i == 1:
        start_gpu = time.time()
    input0_tensor.update_inplace(input0_numpy)
    input1_tensor.update_inplace(input1_numpy)
    gpu_session.run_with_iobinding(io_binding, ro)
end_gpu = time.time()
# print(matmul_output_tensor.numpy())

print("************ Benchmark Results ************")

gpu_s = (end_gpu - start_gpu) / ITERATIONS

gpu_ms = gpu_s * 1000.0

# Derive the ops per second from the latency and number of ops in the model.
gpu_ops_per_second = round(OPS_PER_INFERENCE / gpu_s)

rn = ReadableNumber(precision=0, digit_group_size=3)

print(f"GPU took {gpu_ms:0.2f}ms, {rn.of(gpu_ops_per_second)} ops per second")

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Benchmarking Qualcomm's NPU on the Microsoft Surface Tablet

TL;DR - We see 1.3% of Qualcomm's NPU 45 Teraops/s claim when benchmarking Windows AI PCs

- [Introduction](#introduction)
- [Installation](#installation)
  - [Python](#python)
  - [Cmake](#cmake)
  - [Visual Studio](#visual-studio)
  - [Pip Packages](#pip-packages)
- [Benchmark](#benchmark)
  - [Running](#running)
  - [Understanding the Output](#understanding-the-output)
  - [What the Benchmark Measures](#what-the-benchmark-measures)
  - [Possible Confounding Factors](#possible-confounding-factors)
    - [Compute Bound](#compute-bound)
    - [Power Settings](#power-settings)
    - [Model Topology](#model-topology)
    - [Configuration Errors](#configuration-errors)
    - [Onnx Framework](#onnx-framework)
- [Interpreting the Results](#interpreting-the-results)

## Introduction

Microsoft now offers Surface tablets that run Windows on a Qualcomm Arm-based
SoC. These are marketed as AI PCs, due to their ability to run machine learning
models faster and more efficiently than other systems. We are fans of
Qualcomm's hardware, and its NPU in particular, so we've invested a lot of time
and resources into porting our third-party app to this platform.

Unfortunately there aren't many code examples or benchmarks available to
demonstrate how to achieve fast results as an external developer, so we've put
together a small standalone project to show the performance we're seeing. It's
significantly below what we'd hoped for, so we're publishing this benchmark to
see if we can get ideas on how to achieve lower latency. I'm hopeful there will
be software changes, either at the application, framework, or driver level,
that will improve these results in the future, since I've seen the underlying
hardware perform very effectively on other platforms like Android.

## Installation

### Python

We're using Python to run our test scripts, and on Windows [there are several ways to install the language](https://docs.python.org/3/using/windows.html).
As of October 2nd, 2024, the Python available on the Microsoft Store doesn't
support the Arm architecture, and so it's not suitable for running the packages
we need to access Qualcomm's NPU. Instead, you should use [the official Python dot org installer](https://www.python.org/downloads/).
For the results reported here I used [version 3.11.9](https://www.python.org/ftp/python/3.11.9/python-3.11.9-arm64.exe).

### Cmake

We'll also need the cmake build tool to compile Onnx (since prebuilt packages
aren't yet available for Windows on Arm). To do this I ran the following
command from a PowerShell:

```
winget install cmake
```

### Visual Studio

The build process also requires Visual Studio for the compiler. Download Visual
Studio Community Edition (not Code!) from [visualstudio.microsoft.com/downloads/](https://visualstudio.microsoft.com/downloads/).

During the installation you will be prompted to select a `Workload` from several options: check the `Desktop C++ Development` checkbox, then press Install.
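
Before moving on to the Python packages, it's worth double-checking that the
interpreter you installed above is actually an Arm64 build, since an x64 Python
won't be able to install the `win_arm64` Onnx Runtime wheel pinned in
`requirements.txt`. A quick sanity check (this snippet is just an illustration,
not part of the benchmark scripts):

```python
# Confirm we're running a native Arm64 build of Python 3.11 before installing wheels.
import platform

print(platform.python_version())  # expect 3.11.x to match the pinned wheel
print(platform.machine())         # should report ARM64 on a native Arm64 build
```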
66 | 67 | ### Pip Packages 68 | 69 | You can install all the required Python packages by running the following 70 | from within this folder: 71 | 72 | ``` 73 | py -m pip install -r requirements.txt 74 | ``` 75 | 76 | This includes a couple of custom packages. The first is [my branch of Onnx](https://github.com/petewarden/onnx/tree/rel-1.16.2), 77 | which has [a fix for compiling using the official `py` launcher](https://github.com/onnx/onnx/pull/6407) 78 | backported to Onnx version 1.16, since the Qualcomm Onnx Runtime doesn't work 79 | with newer Onnx versions (giving an `Unsupported model IR version` error). 80 | 81 | I also grab [a nightly build](https://aiinfra.pkgs.visualstudio.com/2692857e-05ef-43b4-ba9c-ccf1c22c437c/_packaging/7982ae20-ed19-4a35-a362-a96ac99897b7/pypi/download/ort-nightly-qnn/1.20.dev20240928001/ort_nightly_qnn-1.20.0.dev20240928001-cp311-cp311-win_arm64.whl#sha256=3b12e3882d1afadf66c2349b2a167dfcbb9ae7a332dc98e0fd51c101d34ddf6e) 82 | of [Qualcomm's Onnx Runtime package](https://onnxruntime.ai/docs/execution-providers/QNN-ExecutionProvider.html). 83 | If you want to install a more recent version, there's [a list here](https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/pypi/simple/ort-nightly-qnn/). 84 | 85 | ## Benchmark 86 | 87 | ### Running 88 | 89 | To execute the benchmark, run: 90 | 91 | ``` 92 | py benchmark_matmul.py 93 | ``` 94 | 95 | ### Understanding the Output 96 | 97 | The Onnx runtime initially generates a lot of log spam, including: 98 | 99 | ``` 100 | Error in cpuinfo: Unknown chip model name 'Snapdragon(R) X 12-core X1E80100 @ 3.40 GHz'. 101 | Please add new Windows on Arm SoC/chip support to arm/windows/init.c! 102 | unknown Qualcomm CPU part 0x1 ignored 103 | ``` 104 | 105 | and 106 | 107 | ``` 108 | Starting stage: Finalizing Graph Sequence 109 | Completed stage: Finalizing Graph Sequence (115919 us) 110 | Starting stage: Completion 111 | Completed stage: Completion (1025 us) 112 | ``` 113 | 114 | After all those messages, you should see the actual benchmark 115 | results at the end, something like this: 116 | 117 | ```bash 118 | ************ Benchmark Results ************ 119 | NPU quantized compute, float I/O accuracy difference is 0.0100 120 | NPU quantized compute and I/O accuracy difference is 0.0060 121 | CPU took 8.42ms, 821,141,860,688 ops per second 122 | NPU (quantized compute, float I/O) took 30.63ms, 225,667,671,183 ops per second 123 | NPU (quantized compute and I/O) took 12.05ms, 573,475,650,364 ops per second 124 | ``` 125 | 126 | The first two lines confirm that the numerical results of the operations match 127 | between the CPU and the NPU. The final three show the latency of the three 128 | approaches to running a simple model. The latency is the wall time it took to 129 | execute the model from start to finish, and the ops per second is calculated 130 | from that latency to indicate the equivalent computational throughput. 131 | 132 | In this example, we see the CPU is capable of running 821 billion ops/second 133 | (821 Gigaops), the first NPU approach gives us 225 Gigaops, and the second 573 134 | Gigaops. 135 | 136 | ### What the Benchmark Measures 137 | 138 | This benchmark is designed to resemble some real world models we depend on, 139 | running 6 large matrix multiplications that are similar to the most 140 | time-consuming layers in transformer models like OpenAI's Whisper. The shapes 141 | are (6, 1500, 256) X (6, 256, 1500), producing a (6, 1500, 1500) result. 
The 142 | model we running consists of a single MatMul node with two inputs and one 143 | output. 144 | 145 | The models are created on the fly using the Onnx model framework, and then fed 146 | into the Onnx runtime. The control model is a pure float version that runs 147 | entirely on the CPU. 148 | 149 | The NPU mostly requires quantized models to run effectively (though it has 150 | limited support for float16). The first approach we took to quantization used 151 | [the official ORT `quantize_static()` method](https://onnxruntime.ai/docs/performance/model-optimizations/quantization.html#static-quantization). 152 | For convenience this leaves the input and output tensors in 32-bit float and 153 | performs runtime conversions at the start and end of the graph so that the rest 154 | of the computation happens in eight-bit. 155 | 156 | Unfortunately we discovered that the conversion operations as implemented on 157 | the NPU were extremely slow, much slower than the main matrix multiplication 158 | in fact. You can see the results in the `npu_quant_profile.csv` file in this 159 | repository, with conversions taking over 75% of the time. 160 | 161 | To work around this, we constructed an equivalent model graph programmatically 162 | with eight-bit inputs and outputs This is the second "quantized compute and 163 | I/O" approach mentioned in the results. This is usually around three times 164 | faster than the float I/O version, and profiling shows most of the time is 165 | going on the matrix multiplication, as we'd hope. 166 | 167 | ### Possible Confounding Factors 168 | 169 | There are a lot of variables involved in measuring performance. Here are some 170 | of the assumptions we've made: 171 | 172 | #### Compute Bound 173 | 174 | Modern transformer models are based around large matrix multiplications, unlike 175 | older convolutional models. One potential issue is that accelerators could 176 | become memory bound if the layers start to resemble matrix times vectors, since 177 | that doesn't allow reuse of many of the weights, and performance becomes bottle 178 | necked on fetching values from DRAM. We've tried to avoid that by making both 179 | the input matrices more square, so that tiling and reuse should be possible. 180 | 181 | The original matrices from the tiny Whisper model had a k dimension of only 64, 182 | so in case that was too small we bumped it up to 256 in this benchmark to give 183 | as much room for SIMD optimizations as possible. 184 | 185 | #### Power Settings 186 | 187 | Windows has a lot of different configuration options around energy usage, so we 188 | tried to ensure that all of the settings were on "Best Performance" and that we 189 | ran the benchmark with the tablet connected to mains power. There's also a 190 | session option on the Qualcomm Onnx Runtime, `htp_performance_mode`, that we 191 | set to `sustained_high_performance`, since that seemed to give the lowest 192 | overall latency in our experiments. 193 | 194 | #### Model Topology 195 | 196 | We wanted to create a graph of operations that reflected modern AI models, but 197 | was simple enough to easily interpret. We could have added multiple layers, or 198 | used convolutions, or static weights, but settled for a single matrix 199 | multiplication operation with dynamic inputs, since that reflected the 200 | transformer architectures that are widely used for LLMs and other modern 201 | models. 
202 | 203 | #### Configuration Errors 204 | 205 | It's possible that the way we build and run our models causes them to fall off 206 | the fast path of the drivers or accelerator implementation. For example, we're 207 | using unsigned eight-bit quantization, with qdq elements in the graph. We've 208 | attempted to follow best practice from the documentation, but we'd welcome ways 209 | to improve performance, especially since these would improve the performance of 210 | our actual applications. 211 | 212 | #### Onnx Framework 213 | 214 | There are multiple different ways to access AI acceleration on Windows. We 215 | looked at DirectML, but it only seems to support GPU access. OpenVino doesn't 216 | run on our Arm hardware, as far as we can tell. We've seen similar performance 217 | results to those shown here using the [Qualcomm QNN SDK](https://www.qualcomm.com/developer/software/neural-processing-sdk-for-ai) 218 | directly. TensorFlow Lite isn't supported on Windows for Arm. From this 219 | research and our experiments, Onnx is supported by both Microsoft and Qualcomm, 220 | and seems to be the best framework to use to get accelerated performance from 221 | the NPU, but we're interested in learning if other APIs would be more 222 | appropriate. 223 | 224 | ## Interpreting the Results 225 | 226 | The results shown here are current as of October 2nd, 2024, when running on a 227 | Microsoft Surface Pro 11th Edition, with a Snapdragon(R) X 12-core X1E80100 228 | clocked at 3.40 GHz. The first obvious thing is that the NPU results, even 229 | without float conversion, are slower than the CPU. This is not ideal for an 230 | accelerator, even though it could still potentially offer energy or sustained 231 | performance advantages that make it worth using. 232 | 233 | The second conclusion is that the measured performance of 573 billion 234 | operations per second is only 1.3% of the 45 trillion ops/s that [the marketing material](https://www.microsoft.com/en-us/surface/devices/surface-pro-11th-edition) 235 | promises. 236 | 237 | By contrast, running the same model on an Nvidia Geforce RTX 4080 Laptop GPU 238 | runs in 3.2ms, an equivalent of 2,160 billion operations per second, almost 239 | four times the throughput. -------------------------------------------------------------------------------- /benchmark_matmul.py: -------------------------------------------------------------------------------- 1 | # Benchmark script for the Qualcomm NPU on a Microsoft Surface Pro Tablet. 2 | # See README.md for more information, and LICENSE for copyright information. 3 | 4 | import numpy as np 5 | import onnx 6 | from onnx import helper as h, TensorProto as tp 7 | import onnxruntime as ort 8 | from onnxruntime.quantization import QuantFormat, QuantType, CalibrationDataReader, quantize_static 9 | from readable_number import ReadableNumber 10 | import time 11 | 12 | # Define the shape of the matrix multiplication operation to benchmark. 13 | MATRIX_COUNT = 6 14 | MATRIX_A = 1500 15 | MATRIX_B = 1500 16 | MATRIX_K = 256 17 | INPUT0_SHAPE = [1, MATRIX_COUNT, MATRIX_A, MATRIX_K] 18 | INPUT1_SHAPE = [1, MATRIX_COUNT, MATRIX_K, MATRIX_B] 19 | OUTPUT_SHAPE = [1, MATRIX_COUNT, MATRIX_A, MATRIX_B] 20 | 21 | # A multiply-add counts as two operations, conventionally. 22 | OPS_PER_MUL_ADD = 2 23 | 24 | # Derive the total number of operations from the input shapes. 25 | OPS_PER_INFERENCE = OPS_PER_MUL_ADD * MATRIX_COUNT * MATRIX_A * MATRIX_B * MATRIX_K 26 | 27 | # The float range to distribute random inputs over. 
28 | INPUT_RANGE = 1.0 / 5.0 29 | 30 | # Where to save the intermediate model files. These will overwrite whatever is 31 | # in the existing repository by default. 32 | FLOAT_MODEL_PATH = "matmul_model_float.onnx" 33 | QUANT_MODEL_PATH = "matmul_model_quant.onnx" 34 | QUANT_IO_MODEL_PATH = "matmul_model_quant_io.onnx" 35 | 36 | # How many times to run inference on the model, to obtain the mean latency. 37 | ITERATIONS = 20 38 | 39 | # This class is used to provide calibration inputs for the quantization 40 | # process. Since we only care about accuracy for the two inputs we set up, we 41 | # just return those examples and then signal that we're done. 42 | class MockDataReader(CalibrationDataReader): 43 | def __init__(self, input0, input1): 44 | self.has_run = False 45 | self.input0 = input0 46 | self.input1 = input1 47 | 48 | def get_next(self): 49 | if self.has_run: 50 | return None 51 | else: 52 | self.has_run = True 53 | return { 54 | "input0_tensor": self.input0, 55 | "input1_tensor": self.input1, 56 | } 57 | 58 | def rewind(self): 59 | self.has_run = False 60 | 61 | # Convert a float tensor into an eight-bit equivalent. 62 | def quantize_tensor(input, scale, zero_point): 63 | assert input.dtype == np.float32 64 | return np.clip(np.round(input / scale) + zero_point, 0, 255).astype(np.uint8) 65 | 66 | # Convert an eight-bit quantized tensor into a float result. 67 | def dequantize_tensor(input, scale, zero_point): 68 | assert input.dtype == np.uint8 69 | return (input.astype(np.float32) - zero_point) * scale 70 | 71 | # Use the Onnx model framework to construct a graph containing a single matrix 72 | # multiplication operation with two dynamic inputs, all with float computation. 73 | def make_matmul_float_model(): 74 | 75 | matmul_node = h.make_node( 76 | "MatMul", 77 | inputs=["input0_tensor", "input1_tensor"], 78 | outputs=["matmul_output_tensor"], 79 | name="matmul_node") 80 | 81 | matmul_float_graph = h.make_graph( 82 | nodes=[ 83 | matmul_node, 84 | ], 85 | name="matmul_float_graph", 86 | inputs=[ 87 | h.make_tensor_value_info("input0_tensor", tp.FLOAT, INPUT0_SHAPE), 88 | h.make_tensor_value_info("input1_tensor", tp.FLOAT, INPUT1_SHAPE) 89 | ], 90 | outputs=[ 91 | h.make_tensor_value_info("matmul_output_tensor", tp.FLOAT, OUTPUT_SHAPE), 92 | ], 93 | initializer=[ 94 | ]) 95 | 96 | matmul_float_model = h.make_model(matmul_float_graph, producer_name="matmul_test") 97 | 98 | return matmul_float_model 99 | 100 | # Builds a model with a single matrix multiplication operation, computing all 101 | # values in eight-bit, with eight-bit inputs and outputs. 
102 | def make_matmul_quantized_io_model( 103 | input_scale, input_zero_point, 104 | matmul_scale, matmul_zero_point): 105 | input0_scale_tensor = h.make_tensor( 106 | name="input0_scale", 107 | data_type=tp.FLOAT, 108 | dims=[1], 109 | vals=[input_scale]) 110 | 111 | input0_zero_point_tensor = h.make_tensor( 112 | name="input0_zero_point", 113 | data_type=tp.UINT8, 114 | dims=[1], 115 | vals=[input_zero_point]) 116 | 117 | input0_dequant_node = h.make_node( 118 | "DequantizeLinear", 119 | inputs=["input0_quant_tensor", "input0_scale", "input0_zero_point"], 120 | outputs=["input0_dequant_tensor"], 121 | name="input0_dequant") 122 | 123 | input1_scale_tensor = h.make_tensor( 124 | name="input1_scale", 125 | data_type=tp.FLOAT, 126 | dims=[1], 127 | vals=[input_scale]) 128 | 129 | input1_zero_point_tensor = h.make_tensor( 130 | name="input1_zero_point", 131 | data_type=tp.UINT8, 132 | dims=[1], 133 | vals=[input_zero_point]) 134 | 135 | input1_dequant_node = h.make_node( 136 | "DequantizeLinear", 137 | inputs=["input1_quant_tensor", "input1_scale", "input1_zero_point"], 138 | outputs=["input1_dequant_tensor"], 139 | name="input1_dequant") 140 | 141 | matmul_node = h.make_node( 142 | "MatMul", 143 | inputs=["input0_dequant_tensor", "input1_dequant_tensor"], 144 | outputs=["matmul_output_tensor"], 145 | name="matmul_node") 146 | 147 | matmul_output_scale_tensor = h.make_tensor( 148 | name="matmul_output_scale", 149 | data_type=tp.FLOAT, 150 | dims=[1], 151 | vals=[matmul_scale]) 152 | 153 | matmul_output_zero_point_tensor = h.make_tensor( 154 | name="matmul_output_zero_point", 155 | data_type=tp.UINT8, 156 | dims=[1], 157 | vals=[matmul_zero_point]) 158 | 159 | matmul_output_quant_node = h.make_node( 160 | "QuantizeLinear", 161 | inputs=["matmul_output_tensor", "matmul_output_scale", "matmul_output_zero_point"], 162 | outputs=["matmul_output_quant_tensor"], 163 | name="matmul_output_quant") 164 | 165 | matmul_output_dequant_node = h.make_node( 166 | "DequantizeLinear", 167 | inputs=["matmul_output_quant_tensor", "matmul_output_scale", "matmul_output_zero_point"], 168 | outputs=["matmul_output_dequant_tensor"], 169 | name="matmul_output_dequant") 170 | 171 | matmul_quantized_graph = h.make_graph( 172 | nodes=[ 173 | input0_dequant_node, 174 | input1_dequant_node, 175 | matmul_node, 176 | matmul_output_quant_node, 177 | matmul_output_dequant_node, 178 | ], 179 | name="matmul_quantized_graph", 180 | inputs=[ 181 | h.make_tensor_value_info("input0_quant_tensor", tp.UINT8, INPUT0_SHAPE), 182 | h.make_tensor_value_info("input1_quant_tensor", tp.UINT8, INPUT1_SHAPE) 183 | ], 184 | outputs=[ 185 | h.make_tensor_value_info("matmul_output_quant_tensor", tp.UINT8, OUTPUT_SHAPE), 186 | ], 187 | initializer=[ 188 | input0_scale_tensor, 189 | input0_zero_point_tensor, 190 | input1_scale_tensor, 191 | input1_zero_point_tensor, 192 | matmul_output_scale_tensor, 193 | matmul_output_zero_point_tensor, 194 | ]) 195 | 196 | matmul_quantized_model = h.make_model(matmul_quantized_graph, producer_name="matmul_test") 197 | 198 | return matmul_quantized_model 199 | 200 | # Calculates the mean difference between two arrays. 201 | def array_msd(x, y): 202 | difference = x - y 203 | msd = np.mean(np.sqrt(difference * difference)) 204 | return msd 205 | 206 | # Create the base float model and save it out. 207 | matmul_float_model = make_matmul_float_model() 208 | onnx.checker.check_model(matmul_float_model) 209 | onnx.save(matmul_float_model, FLOAT_MODEL_PATH) 210 | 211 | # Arbitrary but fixed seed value. 
212 | rng = np.random.default_rng(7528840384) 213 | 214 | # We generate two input tensors with random values from zero to INPUT_RANGE. 215 | input0_tensor = rng.random((INPUT0_SHAPE)).astype(np.float32) * INPUT_RANGE 216 | input1_tensor = rng.random((INPUT1_SHAPE)).astype(np.float32) * INPUT_RANGE 217 | 218 | # Create an Onnx Runtime session to run the model on the CPU. 219 | cpu_options = ort.SessionOptions() 220 | cpu_session = ort.InferenceSession( 221 | FLOAT_MODEL_PATH, 222 | sess_options=cpu_options, 223 | ) 224 | 225 | # Run the float model multiple times on the CPU, and calculate the overall latency. 226 | start_cpu = time.time() 227 | for i in range(ITERATIONS): 228 | cpu_float_outputs = cpu_session.run( 229 | None, { 230 | "input0_tensor": input0_tensor, 231 | "input1_tensor": input1_tensor, 232 | }) 233 | end_cpu = time.time() 234 | cpu_float_output = cpu_float_outputs[0] 235 | 236 | # Create a quantized model using the recommended ORT method. This has float 237 | # inputs and outputs. 238 | data_reader = MockDataReader(input0_tensor, input1_tensor) 239 | quantize_static( 240 | FLOAT_MODEL_PATH, 241 | QUANT_MODEL_PATH, 242 | data_reader, 243 | quant_format=QuantFormat.QDQ, 244 | per_channel=False, 245 | activation_type=QuantType.QUInt8, 246 | weight_type=QuantType.QUInt8, 247 | ) 248 | 249 | # Create an ORT session that should run on the NPU. 250 | npu_quant_options = ort.SessionOptions() 251 | # Raise an error if any operations aren't runnable on the NPU. 252 | npu_quant_options.add_session_config_entry("session.disable_cpu_ep_fallback", "1") 253 | npu_quant_session = ort.InferenceSession( 254 | QUANT_MODEL_PATH, 255 | sess_options=npu_quant_options, 256 | providers=["QNNExecutionProvider"], 257 | provider_options=[{ 258 | "backend_path": "QnnHtp.dll", 259 | "htp_performance_mode": "sustained_high_performance", 260 | "enable_htp_fp16_precision": "1", 261 | # "profiling_level": "detailed", 262 | # "profiling_file_path": "npu_quant_profile.csv", 263 | }] 264 | ) 265 | 266 | # Run the quantized model with float I/O on the NPU and calculate the latency. 267 | start_npu_quant = time.time() 268 | for i in range(ITERATIONS): 269 | npu_quant_outputs = npu_quant_session.run( 270 | None, { 271 | "input0_tensor": input0_tensor, 272 | "input1_tensor": input1_tensor, 273 | }) 274 | end_npu_quant = time.time() 275 | npu_quant_output = npu_quant_outputs[0] 276 | 277 | # Build a quantized model that has quantized inputs and outputs, to avoid the 278 | # performance problems we've seen with float conversion. We can't use the 279 | # standard quantize_static method, so instead construct the model from scratch. 280 | input_scale = INPUT_RANGE / 255.0 281 | input_zero_point = 0 282 | max_output = np.max(cpu_float_output) 283 | matmul_scale = max_output / 255.0 284 | matmul_zero_point = 0 285 | quant_io_model = make_matmul_quantized_io_model(input_scale, input_zero_point, matmul_scale, matmul_zero_point) 286 | onnx.checker.check_model(quant_io_model) 287 | onnx.save(quant_io_model, QUANT_IO_MODEL_PATH) 288 | 289 | # Convert our float inputs into quantized equivalents. 290 | input0_quant_tensor = quantize_tensor(input0_tensor, input_scale, input_zero_point) 291 | input1_quant_tensor = quantize_tensor(input1_tensor, input_scale, input_zero_point) 292 | 293 | # Build an NPU session to run the fully-quantized model. 
294 | npu_quant_io_options = ort.SessionOptions() 295 | npu_quant_io_options.add_session_config_entry("session.disable_cpu_ep_fallback", "1") 296 | npu_quant_io_session = ort.InferenceSession( 297 | QUANT_IO_MODEL_PATH, 298 | sess_options=npu_quant_io_options, 299 | providers=["QNNExecutionProvider"], 300 | provider_options=[{ 301 | "backend_path": "QnnHtp.dll", 302 | "htp_performance_mode": "sustained_high_performance", 303 | "enable_htp_fp16_precision": "1", 304 | # "profiling_level": "detailed", 305 | # "profiling_file_path": "npu_quant_io_profile.csv", 306 | }] 307 | ) 308 | 309 | # Run the quantized I/O model on the NPU to measure the latency. 310 | start_npu_quant_io = time.time() 311 | for i in range(ITERATIONS): 312 | npu_quant_io_outputs = npu_quant_io_session.run( 313 | None, { 314 | "input0_quant_tensor": input0_quant_tensor, 315 | "input1_quant_tensor": input1_quant_tensor, 316 | }) 317 | end_npu_quant_io = time.time() 318 | npu_quant_io_output = npu_quant_io_outputs[0] 319 | 320 | # Convert the result back into a float tensor. 321 | npu_quant_io_output_float = dequantize_tensor(npu_quant_io_output, matmul_scale, matmul_zero_point) 322 | 323 | print("************ Benchmark Results ************") 324 | 325 | # Verify that the results are approximately what we'd expect. 326 | print(f"NPU quantized compute, float I/O accuracy difference is {array_msd(cpu_float_output, npu_quant_output):0.4f}") 327 | print(f"NPU quantized compute and I/O accuracy difference is {array_msd(cpu_float_output, npu_quant_io_output_float):0.4f}") 328 | 329 | cpu_s = (end_cpu - start_cpu) / ITERATIONS 330 | npu_quant_s = (end_npu_quant - start_npu_quant) / ITERATIONS 331 | npu_quant_io_s = (end_npu_quant_io - start_npu_quant_io) / ITERATIONS 332 | 333 | cpu_ms = cpu_s * 1000.0 334 | npu_quant_ms = npu_quant_s * 1000.0 335 | npu_quant_io_ms = npu_quant_io_s * 1000.0 336 | 337 | # Derive the ops per second from the latency and number of ops in the model. 338 | cpu_ops_per_second = round(OPS_PER_INFERENCE / cpu_s) 339 | npu_quant_ops_per_second = round(OPS_PER_INFERENCE / npu_quant_s) 340 | npu_quant_io_ops_per_second = round(OPS_PER_INFERENCE / npu_quant_io_s) 341 | 342 | rn = ReadableNumber(precision=0, digit_group_size=3) 343 | 344 | print(f"CPU took {cpu_ms:0.2f}ms, {rn.of(cpu_ops_per_second)} ops per second") 345 | print(f"NPU (quantized compute, float I/O) took {npu_quant_ms:0.2f}ms, {rn.of(npu_quant_ops_per_second)} ops per second") 346 | print(f"NPU (quantized compute and I/O) took {npu_quant_io_ms:0.2f}ms, {rn.of(npu_quant_io_ops_per_second)} ops per second") 347 | --------------------------------------------------------------------------------