├── matmul_model_float.onnx ├── matmul_model_quant.onnx ├── matmul_model_quant_io.onnx ├── requirements.txt ├── npu_quant_io_profile.csv ├── npu_quant_profile.csv ├── benchmark_matmul_cudnn.py ├── LICENSE ├── README.md └── benchmark_matmul.py /matmul_model_float.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/moonshine-ai/qc_npu_benchmark/HEAD/matmul_model_float.onnx -------------------------------------------------------------------------------- /matmul_model_quant.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/moonshine-ai/qc_npu_benchmark/HEAD/matmul_model_quant.onnx -------------------------------------------------------------------------------- /matmul_model_quant_io.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/moonshine-ai/qc_npu_benchmark/HEAD/matmul_model_quant_io.onnx -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/petewarden/onnx@rel-1.16.2 2 | https://aiinfra.pkgs.visualstudio.com/2692857e-05ef-43b4-ba9c-ccf1c22c437c/_packaging/7982ae20-ed19-4a35-a362-a96ac99897b7/pypi/download/ort-nightly-qnn/1.20.dev20240928001/ort_nightly_qnn-1.20.0.dev20240928001-cp311-cp311-win_arm64.whl#sha256=3b12e3882d1afadf66c2349b2a167dfcbb9ae7a332dc98e0fd51c101d34ddf6e 3 | readable-number 4 | -------------------------------------------------------------------------------- /npu_quant_io_profile.csv: -------------------------------------------------------------------------------- 1 | Msg Timestamp,Message,Time,Unit of Measurement,Timing Source,Event Level,Event Identifier 2 | 0,BACKEND,26825,US,BACKEND,ROOT,RPC (finalize) time 3 | 0,BACKEND,26448,US,BACKEND,ROOT,QNN accelerator (finalize) time 4 | 0,BACKEND,26355,US,BACKEND,ROOT,Accelerator (finalize) time 5 | 0,FINALIZE,1557972,US,BACKEND,ROOT,QNN (finalize) time 6 | 0,BACKEND,4,COUNT,BACKEND,ROOT,Number of HVX threads used 7 | 0,BACKEND,64116,US,BACKEND,ROOT,RPC (execute) time 8 | 0,BACKEND,63551,US,BACKEND,ROOT,QNN accelerator (execute) time 9 | 0,BACKEND,0,COUNT,BACKEND,ROOT,Num times yield occured 10 | 0,BACKEND,699,US,BACKEND,ROOT,Time for initial VTCM acquire 11 | 0,BACKEND,2446,US,BACKEND,ROOT,Time for HVX + HMX power on and acquire 12 | 0,BACKEND,18173387,CYCLES,BACKEND,ROOT,Accelerator (execute) time (cycles) 13 | 0,NODE,211078,CYCLES,BACKEND,SUB-EVENT,Input OpId_2 (cycles) 14 | 0,NODE,10697422,CYCLES,BACKEND,SUB-EVENT,matmul_node:OpId_23 (cycles) 15 | 0,NODE,0,CYCLES,BACKEND,SUB-EVENT,matmul_output_dequant:OpId_27 (cycles) 16 | 0,NODE,7264887,CYCLES,BACKEND,SUB-EVENT,Output OpId_3 (cycles) 17 | 0,BACKEND,10353,US,BACKEND,ROOT,Accelerator (execute) time 18 | 0,BACKEND,10112,US,BACKEND,ROOT,Accelerator (execute excluding wait) time 19 | 0,EXECUTE,67117,US,BACKEND,ROOT,QNN (execute) time 20 | -------------------------------------------------------------------------------- /npu_quant_profile.csv: -------------------------------------------------------------------------------- 1 | Msg Timestamp,Message,Time,Unit of Measurement,Timing Source,Event Level,Event Identifier 2 | 0,BACKEND,43624,US,BACKEND,ROOT,RPC (finalize) time 3 | 0,BACKEND,42986,US,BACKEND,ROOT,QNN accelerator (finalize) time 4 | 0,BACKEND,42885,US,BACKEND,ROOT,Accelerator (finalize) time 5 | 
0,FINALIZE,2049932,US,BACKEND,ROOT,QNN (finalize) time 6 | 0,BACKEND,4,COUNT,BACKEND,ROOT,Number of HVX threads used 7 | 0,BACKEND,88338,US,BACKEND,ROOT,RPC (execute) time 8 | 0,BACKEND,87857,US,BACKEND,ROOT,QNN accelerator (execute) time 9 | 0,BACKEND,0,COUNT,BACKEND,ROOT,Num times yield occured 10 | 0,BACKEND,664,US,BACKEND,ROOT,Time for initial VTCM acquire 11 | 0,BACKEND,1970,US,BACKEND,ROOT,Time for HVX + HMX power on and acquire 12 | 0,BACKEND,51117377,CYCLES,BACKEND,ROOT,Accelerator (execute) time (cycles) 13 | 0,NODE,0,CYCLES,BACKEND,SUB-EVENT,Input OpId_2 (cycles) 14 | 0,NODE,3046243,CYCLES,BACKEND,SUB-EVENT,input1_tensor_QuantizeLinear:OpId_16 (cycles) 15 | 0,NODE,1784335,CYCLES,BACKEND,SUB-EVENT,input0_tensor_QuantizeLinear:OpId_17 (cycles) 16 | 0,NODE,10958875,CYCLES,BACKEND,SUB-EVENT,matmul_node:OpId_18 (cycles) 17 | 0,NODE,32129948,CYCLES,BACKEND,SUB-EVENT,matmul_output_tensor_DequantizeLinear:OpId_22 (cycles) 18 | 0,NODE,3197976,CYCLES,BACKEND,SUB-EVENT,Output OpId_3 (cycles) 19 | 0,BACKEND,26024,US,BACKEND,ROOT,Accelerator (execute) time 20 | 0,BACKEND,25815,US,BACKEND,ROOT,Accelerator (execute excluding wait) time 21 | 0,EXECUTE,99091,US,BACKEND,ROOT,QNN (execute) time 22 | -------------------------------------------------------------------------------- /benchmark_matmul_cudnn.py: -------------------------------------------------------------------------------- 1 | # Benchmark script for the Qualcomm NPU on a Microsoft Surface Pro Tablet. 2 | # See README.md for more information, and LICENSE for copyright information. 3 | 4 | import numpy as np 5 | import onnx 6 | from onnx import helper as h, TensorProto as tp 7 | import onnxruntime as ort 8 | from onnxruntime.quantization import QuantFormat, QuantType, CalibrationDataReader, quantize_static 9 | from readable_number import ReadableNumber 10 | import time 11 | 12 | # Define the shape of the matrix multiplication operation to benchmark. 13 | MATRIX_COUNT = 6 14 | MATRIX_A = 1500 15 | MATRIX_B = 1500 16 | MATRIX_K = 256 17 | INPUT0_SHAPE = [1, MATRIX_COUNT, MATRIX_A, MATRIX_K] 18 | INPUT1_SHAPE = [1, MATRIX_COUNT, MATRIX_K, MATRIX_B] 19 | OUTPUT_SHAPE = [1, MATRIX_COUNT, MATRIX_A, MATRIX_B] 20 | 21 | # A multiply-add counts as two operations, conventionally. 22 | OPS_PER_MUL_ADD = 2 23 | 24 | # Derive the total number of operations from the input shapes. 25 | OPS_PER_INFERENCE = OPS_PER_MUL_ADD * MATRIX_COUNT * MATRIX_A * MATRIX_B * MATRIX_K 26 | 27 | # The float range to distribute random inputs over. 28 | INPUT_RANGE = 1.0 / 5.0 29 | 30 | # Where to save the intermediate model files. These will overwrite whatever is 31 | # in the existing repository by default. 32 | FLOAT_MODEL_PATH = "matmul_model_float.onnx" 33 | QUANT_MODEL_PATH = "matmul_model_quant.onnx" 34 | QUANT_IO_MODEL_PATH = "matmul_model_quant_io.onnx" 35 | 36 | # How many times to run inference on the model, to obtain the mean latency. 37 | ITERATIONS = 20 38 | 39 | # This class is used to provide calibration inputs for the quantization 40 | # process. Since we only care about accuracy for the two inputs we set up, we 41 | # just return those examples and then signal that we're done. 
42 | class MockDataReader(CalibrationDataReader): 43 | def __init__(self, input0, input1): 44 | self.has_run = False 45 | self.input0 = input0 46 | self.input1 = input1 47 | 48 | def get_next(self): 49 | if self.has_run: 50 | return None 51 | else: 52 | self.has_run = True 53 | return { 54 | "input0_tensor": self.input0, 55 | "input1_tensor": self.input1, 56 | } 57 | 58 | def rewind(self): 59 | self.has_run = False 60 | 61 | # Convert a float tensor into an eight-bit equivalent. 62 | def quantize_tensor(input, scale, zero_point): 63 | assert input.dtype == np.float32 64 | return np.clip(np.round(input / scale) + zero_point, 0, 255).astype(np.uint8) 65 | 66 | # Convert an eight-bit quantized tensor into a float result. 67 | def dequantize_tensor(input, scale, zero_point): 68 | assert input.dtype == np.uint8 69 | return (input.astype(np.float32) - zero_point) * scale 70 | 71 | # Use the Onnx model framework to construct a graph containing a single matrix 72 | # multiplication operation with two dynamic inputs, all with float computation. 73 | def make_matmul_float_model(): 74 | 75 | matmul_node = h.make_node( 76 | "MatMul", 77 | inputs=["input0_tensor", "input1_tensor"], 78 | outputs=["matmul_output_tensor"], 79 | name="matmul_node") 80 | 81 | matmul_float_graph = h.make_graph( 82 | nodes=[ 83 | matmul_node, 84 | ], 85 | name="matmul_float_graph", 86 | inputs=[ 87 | h.make_tensor_value_info("input0_tensor", tp.FLOAT, INPUT0_SHAPE), 88 | h.make_tensor_value_info("input1_tensor", tp.FLOAT, INPUT1_SHAPE) 89 | ], 90 | outputs=[ 91 | h.make_tensor_value_info("matmul_output_tensor", tp.FLOAT, OUTPUT_SHAPE), 92 | ], 93 | initializer=[ 94 | ]) 95 | 96 | matmul_float_model = h.make_model(matmul_float_graph, producer_name="matmul_test") 97 | 98 | return matmul_float_model 99 | 100 | # Create the base float model and save it out. 101 | matmul_float_model = make_matmul_float_model() 102 | onnx.checker.check_model(matmul_float_model) 103 | onnx.save(matmul_float_model, FLOAT_MODEL_PATH) 104 | 105 | # Arbitrary but fixed seed value. 106 | rng = np.random.default_rng(7528840384) 107 | 108 | # We generate two input tensors with random values from zero to INPUT_RANGE. 109 | input0_numpy = rng.random((INPUT0_SHAPE)).astype(np.float32) * INPUT_RANGE 110 | input1_numpy = rng.random((INPUT1_SHAPE)).astype(np.float32) * INPUT_RANGE 111 | 112 | matmul_output_numpy = np.zeros(OUTPUT_SHAPE, dtype=np.float32) 113 | 114 | input0_tensor = ort.OrtValue.ortvalue_from_numpy(input0_numpy, 'cuda', 0) 115 | input1_tensor = ort.OrtValue.ortvalue_from_numpy(input1_numpy, 'cuda', 0) 116 | matmul_output_tensor = ort.OrtValue.ortvalue_from_numpy(matmul_output_numpy, 'cuda', 0) 117 | 118 | # Create an Onnx Runtime session to run the model on the CPU. 
gpu_options = ort.SessionOptions()
gpu_session = ort.InferenceSession(
    FLOAT_MODEL_PATH,
    sess_options=gpu_options,
    providers=[("CUDAExecutionProvider", {"enable_cuda_graph": '1'})],
)

io_binding = gpu_session.io_binding()

# Pass gpu_graph_id to RunOptions through RunConfigs
ro = ort.RunOptions()
# gpu_graph_id is optional if the session uses only one cuda graph
ro.add_run_config_entry("gpu_graph_id", "1")

# Bind the input and output
io_binding.bind_ortvalue_input("input0_tensor", input0_tensor)
io_binding.bind_ortvalue_input("input1_tensor", input1_tensor)
io_binding.bind_ortvalue_output("matmul_output_tensor", matmul_output_tensor)

# Run the float model multiple times on the GPU, and calculate the overall latency.
for i in range(ITERATIONS + 1):
    # Skip the first run, since there's setup and caching. See
    # https://onnxruntime.ai/docs/execution-providers/CUDA-ExecutionProvider.html
    if i == 1:
        start_gpu = time.time()
    input0_tensor.update_inplace(input0_numpy)
    input1_tensor.update_inplace(input1_numpy)
    gpu_session.run_with_iobinding(io_binding, ro)
end_gpu = time.time()
# print(matmul_output_tensor.numpy())

print("************ Benchmark Results ************")

gpu_s = (end_gpu - start_gpu) / ITERATIONS

gpu_ms = gpu_s * 1000.0

# Derive the ops per second from the latency and number of ops in the model.
gpu_ops_per_second = round(OPS_PER_INFERENCE / gpu_s)

rn = ReadableNumber(precision=0, digit_group_size=3)

print(f"GPU took {gpu_ms:0.2f}ms, {rn.of(gpu_ops_per_second)} ops per second")

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Benchmarking Qualcomm's NPU on the Microsoft Surface Tablet

TL;DR - We see 1.3% of Qualcomm's NPU 45 Teraops/s claim when benchmarking Windows AI PCs

- [Introduction](#introduction)
- [Installation](#installation)
  - [Python](#python)
  - [Cmake](#cmake)
  - [Visual Studio](#visual-studio)
  - [Pip Packages](#pip-packages)
- [Benchmark](#benchmark)
  - [Running](#running)
  - [Understanding the Output](#understanding-the-output)
  - [What the Benchmark Measures](#what-the-benchmark-measures)
  - [Possible Confounding Factors](#possible-confounding-factors)
    - [Compute Bound](#compute-bound)
    - [Power Settings](#power-settings)
    - [Model Topology](#model-topology)
    - [Configuration Errors](#configuration-errors)
    - [Onnx Framework](#onnx-framework)
- [Interpreting the Results](#interpreting-the-results)

## Introduction

Microsoft now offers Surface tablets that run Windows on a Qualcomm Arm-based
SoC. These are marketed as AI PCs, due to their ability to run machine learning
models faster and more efficiently than other systems. We are fans of
Qualcomm's hardware, and its NPU in particular, so we've invested a lot of time
and resources into porting our third-party app to this platform.

Unfortunately there aren't many code examples or benchmarks available to
demonstrate how to achieve fast results as an external developer, so we've put
together a small standalone project to show the performance we're seeing. It's
significantly below what we'd hoped for, so we're publishing this benchmark to
see if we can get ideas on how to achieve lower latency. I'm hopeful there will
be software changes, either at the application, framework, or driver level,
that will improve these results in the future, since I've seen the underlying
hardware perform very effectively on other platforms like Android.

## Installation

### Python

We're using Python to run our test scripts, and on Windows [there are several ways to install the language](https://docs.python.org/3/using/windows.html).
As of October 2nd, 2024, the Python available on the Microsoft Store doesn't
support the Arm architecture, and so it's not suitable for running the packages
we need to access Qualcomm's NPU. Instead, you should use [the official Python dot org installer](https://www.python.org/downloads/).
For the results reported here I used [version 3.11.9](https://www.python.org/ftp/python/3.11.9/python-3.11.9-arm64.exe).

### Cmake

We'll also need the cmake build tool to compile Onnx (since prebuilt packages
aren't yet available for Windows on Arm). To do this I ran the following
command from a PowerShell:

```
winget install cmake
```

### Visual Studio

The build process also requires Visual Studio for the compiler. Download Visual
Studio Community Edition (not Code!) from [visualstudio.microsoft.com/downloads/](https://visualstudio.microsoft.com/downloads/).

During the installation you will be prompted to select a `Workload` from several options: check the `Desktop C++ Development` checkbox, then press Install.
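
Before moving on to the Python packages, it's worth double-checking that the
interpreter you installed above is actually an Arm64 build, since an x64 Python
won't be able to install the `win_arm64` Onnx Runtime wheel pinned in
`requirements.txt`. A quick sanity check (this snippet is just an illustration,
not part of the benchmark scripts):

```python
# Confirm we're running a native Arm64 build of Python 3.11 before installing wheels.
import platform

print(platform.python_version())  # expect 3.11.x to match the pinned wheel
print(platform.machine())         # should report ARM64 on a native Arm64 build
```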
66 | 67 | ### Pip Packages 68 | 69 | You can install all the required Python packages by running the following 70 | from within this folder: 71 | 72 | ``` 73 | py -m pip install -r requirements.txt 74 | ``` 75 | 76 | This includes a couple of custom packages. The first is [my branch of Onnx](https://github.com/petewarden/onnx/tree/rel-1.16.2), 77 | which has [a fix for compiling using the official `py` launcher](https://github.com/onnx/onnx/pull/6407) 78 | backported to Onnx version 1.16, since the Qualcomm Onnx Runtime doesn't work 79 | with newer Onnx versions (giving an `Unsupported model IR version` error). 80 | 81 | I also grab [a nightly build](https://aiinfra.pkgs.visualstudio.com/2692857e-05ef-43b4-ba9c-ccf1c22c437c/_packaging/7982ae20-ed19-4a35-a362-a96ac99897b7/pypi/download/ort-nightly-qnn/1.20.dev20240928001/ort_nightly_qnn-1.20.0.dev20240928001-cp311-cp311-win_arm64.whl#sha256=3b12e3882d1afadf66c2349b2a167dfcbb9ae7a332dc98e0fd51c101d34ddf6e) 82 | of [Qualcomm's Onnx Runtime package](https://onnxruntime.ai/docs/execution-providers/QNN-ExecutionProvider.html). 83 | If you want to install a more recent version, there's [a list here](https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/pypi/simple/ort-nightly-qnn/). 84 | 85 | ## Benchmark 86 | 87 | ### Running 88 | 89 | To execute the benchmark, run: 90 | 91 | ``` 92 | py benchmark_matmul.py 93 | ``` 94 | 95 | ### Understanding the Output 96 | 97 | The Onnx runtime initially generates a lot of log spam, including: 98 | 99 | ``` 100 | Error in cpuinfo: Unknown chip model name 'Snapdragon(R) X 12-core X1E80100 @ 3.40 GHz'. 101 | Please add new Windows on Arm SoC/chip support to arm/windows/init.c! 102 | unknown Qualcomm CPU part 0x1 ignored 103 | ``` 104 | 105 | and 106 | 107 | ``` 108 | Starting stage: Finalizing Graph Sequence 109 | Completed stage: Finalizing Graph Sequence (115919 us) 110 | Starting stage: Completion 111 | Completed stage: Completion (1025 us) 112 | ``` 113 | 114 | After all those messages, you should see the actual benchmark 115 | results at the end, something like this: 116 | 117 | ```bash 118 | ************ Benchmark Results ************ 119 | NPU quantized compute, float I/O accuracy difference is 0.0100 120 | NPU quantized compute and I/O accuracy difference is 0.0060 121 | CPU took 8.42ms, 821,141,860,688 ops per second 122 | NPU (quantized compute, float I/O) took 30.63ms, 225,667,671,183 ops per second 123 | NPU (quantized compute and I/O) took 12.05ms, 573,475,650,364 ops per second 124 | ``` 125 | 126 | The first two lines confirm that the numerical results of the operations match 127 | between the CPU and the NPU. The final three show the latency of the three 128 | approaches to running a simple model. The latency is the wall time it took to 129 | execute the model from start to finish, and the ops per second is calculated 130 | from that latency to indicate the equivalent computational throughput. 131 | 132 | In this example, we see the CPU is capable of running 821 billion ops/second 133 | (821 Gigaops), the first NPU approach gives us 225 Gigaops, and the second 573 134 | Gigaops. 135 | 136 | ### What the Benchmark Measures 137 | 138 | This benchmark is designed to resemble some real world models we depend on, 139 | running 6 large matrix multiplications that are similar to the most 140 | time-consuming layers in transformer models like OpenAI's Whisper. The shapes 141 | are (6, 1500, 256) X (6, 256, 1500), producing a (6, 1500, 1500) result. 
The 142 | model we running consists of a single MatMul node with two inputs and one 143 | output. 144 | 145 | The models are created on the fly using the Onnx model framework, and then fed 146 | into the Onnx runtime. The control model is a pure float version that runs 147 | entirely on the CPU. 148 | 149 | The NPU mostly requires quantized models to run effectively (though it has 150 | limited support for float16). The first approach we took to quantization used 151 | [the official ORT `quantize_static()` method](https://onnxruntime.ai/docs/performance/model-optimizations/quantization.html#static-quantization). 152 | For convenience this leaves the input and output tensors in 32-bit float and 153 | performs runtime conversions at the start and end of the graph so that the rest 154 | of the computation happens in eight-bit. 155 | 156 | Unfortunately we discovered that the conversion operations as implemented on 157 | the NPU were extremely slow, much slower than the main matrix multiplication 158 | in fact. You can see the results in the `npu_quant_profile.csv` file in this 159 | repository, with conversions taking over 75% of the time. 160 | 161 | To work around this, we constructed an equivalent model graph programmatically 162 | with eight-bit inputs and outputs This is the second "quantized compute and 163 | I/O" approach mentioned in the results. This is usually around three times 164 | faster than the float I/O version, and profiling shows most of the time is 165 | going on the matrix multiplication, as we'd hope. 166 | 167 | ### Possible Confounding Factors 168 | 169 | There are a lot of variables involved in measuring performance. Here are some 170 | of the assumptions we've made: 171 | 172 | #### Compute Bound 173 | 174 | Modern transformer models are based around large matrix multiplications, unlike 175 | older convolutional models. One potential issue is that accelerators could 176 | become memory bound if the layers start to resemble matrix times vectors, since 177 | that doesn't allow reuse of many of the weights, and performance becomes bottle 178 | necked on fetching values from DRAM. We've tried to avoid that by making both 179 | the input matrices more square, so that tiling and reuse should be possible. 180 | 181 | The original matrices from the tiny Whisper model had a k dimension of only 64, 182 | so in case that was too small we bumped it up to 256 in this benchmark to give 183 | as much room for SIMD optimizations as possible. 184 | 185 | #### Power Settings 186 | 187 | Windows has a lot of different configuration options around energy usage, so we 188 | tried to ensure that all of the settings were on "Best Performance" and that we 189 | ran the benchmark with the tablet connected to mains power. There's also a 190 | session option on the Qualcomm Onnx Runtime, `htp_performance_mode`, that we 191 | set to `sustained_high_performance`, since that seemed to give the lowest 192 | overall latency in our experiments. 193 | 194 | #### Model Topology 195 | 196 | We wanted to create a graph of operations that reflected modern AI models, but 197 | was simple enough to easily interpret. We could have added multiple layers, or 198 | used convolutions, or static weights, but settled for a single matrix 199 | multiplication operation with dynamic inputs, since that reflected the 200 | transformer architectures that are widely used for LLMs and other modern 201 | models. 
202 | 203 | #### Configuration Errors 204 | 205 | It's possible that the way we build and run our models causes them to fall off 206 | the fast path of the drivers or accelerator implementation. For example, we're 207 | using unsigned eight-bit quantization, with qdq elements in the graph. We've 208 | attempted to follow best practice from the documentation, but we'd welcome ways 209 | to improve performance, especially since these would improve the performance of 210 | our actual applications. 211 | 212 | #### Onnx Framework 213 | 214 | There are multiple different ways to access AI acceleration on Windows. We 215 | looked at DirectML, but it only seems to support GPU access. OpenVino doesn't 216 | run on our Arm hardware, as far as we can tell. We've seen similar performance 217 | results to those shown here using the [Qualcomm QNN SDK](https://www.qualcomm.com/developer/software/neural-processing-sdk-for-ai) 218 | directly. TensorFlow Lite isn't supported on Windows for Arm. From this 219 | research and our experiments, Onnx is supported by both Microsoft and Qualcomm, 220 | and seems to be the best framework to use to get accelerated performance from 221 | the NPU, but we're interested in learning if other APIs would be more 222 | appropriate. 223 | 224 | ## Interpreting the Results 225 | 226 | The results shown here are current as of October 2nd, 2024, when running on a 227 | Microsoft Surface Pro 11th Edition, with a Snapdragon(R) X 12-core X1E80100 228 | clocked at 3.40 GHz. The first obvious thing is that the NPU results, even 229 | without float conversion, are slower than the CPU. This is not ideal for an 230 | accelerator, even though it could still potentially offer energy or sustained 231 | performance advantages that make it worth using. 232 | 233 | The second conclusion is that the measured performance of 573 billion 234 | operations per second is only 1.3% of the 45 trillion ops/s that [the marketing material](https://www.microsoft.com/en-us/surface/devices/surface-pro-11th-edition) 235 | promises. 236 | 237 | By contrast, running the same model on an Nvidia Geforce RTX 4080 Laptop GPU 238 | runs in 3.2ms, an equivalent of 2,160 billion operations per second, almost 239 | four times the throughput. -------------------------------------------------------------------------------- /benchmark_matmul.py: -------------------------------------------------------------------------------- 1 | # Benchmark script for the Qualcomm NPU on a Microsoft Surface Pro Tablet. 2 | # See README.md for more information, and LICENSE for copyright information. 3 | 4 | import numpy as np 5 | import onnx 6 | from onnx import helper as h, TensorProto as tp 7 | import onnxruntime as ort 8 | from onnxruntime.quantization import QuantFormat, QuantType, CalibrationDataReader, quantize_static 9 | from readable_number import ReadableNumber 10 | import time 11 | 12 | # Define the shape of the matrix multiplication operation to benchmark. 13 | MATRIX_COUNT = 6 14 | MATRIX_A = 1500 15 | MATRIX_B = 1500 16 | MATRIX_K = 256 17 | INPUT0_SHAPE = [1, MATRIX_COUNT, MATRIX_A, MATRIX_K] 18 | INPUT1_SHAPE = [1, MATRIX_COUNT, MATRIX_K, MATRIX_B] 19 | OUTPUT_SHAPE = [1, MATRIX_COUNT, MATRIX_A, MATRIX_B] 20 | 21 | # A multiply-add counts as two operations, conventionally. 22 | OPS_PER_MUL_ADD = 2 23 | 24 | # Derive the total number of operations from the input shapes. 25 | OPS_PER_INFERENCE = OPS_PER_MUL_ADD * MATRIX_COUNT * MATRIX_A * MATRIX_B * MATRIX_K 26 | 27 | # The float range to distribute random inputs over. 
28 | INPUT_RANGE = 1.0 / 5.0 29 | 30 | # Where to save the intermediate model files. These will overwrite whatever is 31 | # in the existing repository by default. 32 | FLOAT_MODEL_PATH = "matmul_model_float.onnx" 33 | QUANT_MODEL_PATH = "matmul_model_quant.onnx" 34 | QUANT_IO_MODEL_PATH = "matmul_model_quant_io.onnx" 35 | 36 | # How many times to run inference on the model, to obtain the mean latency. 37 | ITERATIONS = 20 38 | 39 | # This class is used to provide calibration inputs for the quantization 40 | # process. Since we only care about accuracy for the two inputs we set up, we 41 | # just return those examples and then signal that we're done. 42 | class MockDataReader(CalibrationDataReader): 43 | def __init__(self, input0, input1): 44 | self.has_run = False 45 | self.input0 = input0 46 | self.input1 = input1 47 | 48 | def get_next(self): 49 | if self.has_run: 50 | return None 51 | else: 52 | self.has_run = True 53 | return { 54 | "input0_tensor": self.input0, 55 | "input1_tensor": self.input1, 56 | } 57 | 58 | def rewind(self): 59 | self.has_run = False 60 | 61 | # Convert a float tensor into an eight-bit equivalent. 62 | def quantize_tensor(input, scale, zero_point): 63 | assert input.dtype == np.float32 64 | return np.clip(np.round(input / scale) + zero_point, 0, 255).astype(np.uint8) 65 | 66 | # Convert an eight-bit quantized tensor into a float result. 67 | def dequantize_tensor(input, scale, zero_point): 68 | assert input.dtype == np.uint8 69 | return (input.astype(np.float32) - zero_point) * scale 70 | 71 | # Use the Onnx model framework to construct a graph containing a single matrix 72 | # multiplication operation with two dynamic inputs, all with float computation. 73 | def make_matmul_float_model(): 74 | 75 | matmul_node = h.make_node( 76 | "MatMul", 77 | inputs=["input0_tensor", "input1_tensor"], 78 | outputs=["matmul_output_tensor"], 79 | name="matmul_node") 80 | 81 | matmul_float_graph = h.make_graph( 82 | nodes=[ 83 | matmul_node, 84 | ], 85 | name="matmul_float_graph", 86 | inputs=[ 87 | h.make_tensor_value_info("input0_tensor", tp.FLOAT, INPUT0_SHAPE), 88 | h.make_tensor_value_info("input1_tensor", tp.FLOAT, INPUT1_SHAPE) 89 | ], 90 | outputs=[ 91 | h.make_tensor_value_info("matmul_output_tensor", tp.FLOAT, OUTPUT_SHAPE), 92 | ], 93 | initializer=[ 94 | ]) 95 | 96 | matmul_float_model = h.make_model(matmul_float_graph, producer_name="matmul_test") 97 | 98 | return matmul_float_model 99 | 100 | # Builds a model with a single matrix multiplication operation, computing all 101 | # values in eight-bit, with eight-bit inputs and outputs. 
102 | def make_matmul_quantized_io_model( 103 | input_scale, input_zero_point, 104 | matmul_scale, matmul_zero_point): 105 | input0_scale_tensor = h.make_tensor( 106 | name="input0_scale", 107 | data_type=tp.FLOAT, 108 | dims=[1], 109 | vals=[input_scale]) 110 | 111 | input0_zero_point_tensor = h.make_tensor( 112 | name="input0_zero_point", 113 | data_type=tp.UINT8, 114 | dims=[1], 115 | vals=[input_zero_point]) 116 | 117 | input0_dequant_node = h.make_node( 118 | "DequantizeLinear", 119 | inputs=["input0_quant_tensor", "input0_scale", "input0_zero_point"], 120 | outputs=["input0_dequant_tensor"], 121 | name="input0_dequant") 122 | 123 | input1_scale_tensor = h.make_tensor( 124 | name="input1_scale", 125 | data_type=tp.FLOAT, 126 | dims=[1], 127 | vals=[input_scale]) 128 | 129 | input1_zero_point_tensor = h.make_tensor( 130 | name="input1_zero_point", 131 | data_type=tp.UINT8, 132 | dims=[1], 133 | vals=[input_zero_point]) 134 | 135 | input1_dequant_node = h.make_node( 136 | "DequantizeLinear", 137 | inputs=["input1_quant_tensor", "input1_scale", "input1_zero_point"], 138 | outputs=["input1_dequant_tensor"], 139 | name="input1_dequant") 140 | 141 | matmul_node = h.make_node( 142 | "MatMul", 143 | inputs=["input0_dequant_tensor", "input1_dequant_tensor"], 144 | outputs=["matmul_output_tensor"], 145 | name="matmul_node") 146 | 147 | matmul_output_scale_tensor = h.make_tensor( 148 | name="matmul_output_scale", 149 | data_type=tp.FLOAT, 150 | dims=[1], 151 | vals=[matmul_scale]) 152 | 153 | matmul_output_zero_point_tensor = h.make_tensor( 154 | name="matmul_output_zero_point", 155 | data_type=tp.UINT8, 156 | dims=[1], 157 | vals=[matmul_zero_point]) 158 | 159 | matmul_output_quant_node = h.make_node( 160 | "QuantizeLinear", 161 | inputs=["matmul_output_tensor", "matmul_output_scale", "matmul_output_zero_point"], 162 | outputs=["matmul_output_quant_tensor"], 163 | name="matmul_output_quant") 164 | 165 | matmul_output_dequant_node = h.make_node( 166 | "DequantizeLinear", 167 | inputs=["matmul_output_quant_tensor", "matmul_output_scale", "matmul_output_zero_point"], 168 | outputs=["matmul_output_dequant_tensor"], 169 | name="matmul_output_dequant") 170 | 171 | matmul_quantized_graph = h.make_graph( 172 | nodes=[ 173 | input0_dequant_node, 174 | input1_dequant_node, 175 | matmul_node, 176 | matmul_output_quant_node, 177 | matmul_output_dequant_node, 178 | ], 179 | name="matmul_quantized_graph", 180 | inputs=[ 181 | h.make_tensor_value_info("input0_quant_tensor", tp.UINT8, INPUT0_SHAPE), 182 | h.make_tensor_value_info("input1_quant_tensor", tp.UINT8, INPUT1_SHAPE) 183 | ], 184 | outputs=[ 185 | h.make_tensor_value_info("matmul_output_quant_tensor", tp.UINT8, OUTPUT_SHAPE), 186 | ], 187 | initializer=[ 188 | input0_scale_tensor, 189 | input0_zero_point_tensor, 190 | input1_scale_tensor, 191 | input1_zero_point_tensor, 192 | matmul_output_scale_tensor, 193 | matmul_output_zero_point_tensor, 194 | ]) 195 | 196 | matmul_quantized_model = h.make_model(matmul_quantized_graph, producer_name="matmul_test") 197 | 198 | return matmul_quantized_model 199 | 200 | # Calculates the mean difference between two arrays. 201 | def array_msd(x, y): 202 | difference = x - y 203 | msd = np.mean(np.sqrt(difference * difference)) 204 | return msd 205 | 206 | # Create the base float model and save it out. 207 | matmul_float_model = make_matmul_float_model() 208 | onnx.checker.check_model(matmul_float_model) 209 | onnx.save(matmul_float_model, FLOAT_MODEL_PATH) 210 | 211 | # Arbitrary but fixed seed value. 
212 | rng = np.random.default_rng(7528840384) 213 | 214 | # We generate two input tensors with random values from zero to INPUT_RANGE. 215 | input0_tensor = rng.random((INPUT0_SHAPE)).astype(np.float32) * INPUT_RANGE 216 | input1_tensor = rng.random((INPUT1_SHAPE)).astype(np.float32) * INPUT_RANGE 217 | 218 | # Create an Onnx Runtime session to run the model on the CPU. 219 | cpu_options = ort.SessionOptions() 220 | cpu_session = ort.InferenceSession( 221 | FLOAT_MODEL_PATH, 222 | sess_options=cpu_options, 223 | ) 224 | 225 | # Run the float model multiple times on the CPU, and calculate the overall latency. 226 | start_cpu = time.time() 227 | for i in range(ITERATIONS): 228 | cpu_float_outputs = cpu_session.run( 229 | None, { 230 | "input0_tensor": input0_tensor, 231 | "input1_tensor": input1_tensor, 232 | }) 233 | end_cpu = time.time() 234 | cpu_float_output = cpu_float_outputs[0] 235 | 236 | # Create a quantized model using the recommended ORT method. This has float 237 | # inputs and outputs. 238 | data_reader = MockDataReader(input0_tensor, input1_tensor) 239 | quantize_static( 240 | FLOAT_MODEL_PATH, 241 | QUANT_MODEL_PATH, 242 | data_reader, 243 | quant_format=QuantFormat.QDQ, 244 | per_channel=False, 245 | activation_type=QuantType.QUInt8, 246 | weight_type=QuantType.QUInt8, 247 | ) 248 | 249 | # Create an ORT session that should run on the NPU. 250 | npu_quant_options = ort.SessionOptions() 251 | # Raise an error if any operations aren't runnable on the NPU. 252 | npu_quant_options.add_session_config_entry("session.disable_cpu_ep_fallback", "1") 253 | npu_quant_session = ort.InferenceSession( 254 | QUANT_MODEL_PATH, 255 | sess_options=npu_quant_options, 256 | providers=["QNNExecutionProvider"], 257 | provider_options=[{ 258 | "backend_path": "QnnHtp.dll", 259 | "htp_performance_mode": "sustained_high_performance", 260 | "enable_htp_fp16_precision": "1", 261 | # "profiling_level": "detailed", 262 | # "profiling_file_path": "npu_quant_profile.csv", 263 | }] 264 | ) 265 | 266 | # Run the quantized model with float I/O on the NPU and calculate the latency. 267 | start_npu_quant = time.time() 268 | for i in range(ITERATIONS): 269 | npu_quant_outputs = npu_quant_session.run( 270 | None, { 271 | "input0_tensor": input0_tensor, 272 | "input1_tensor": input1_tensor, 273 | }) 274 | end_npu_quant = time.time() 275 | npu_quant_output = npu_quant_outputs[0] 276 | 277 | # Build a quantized model that has quantized inputs and outputs, to avoid the 278 | # performance problems we've seen with float conversion. We can't use the 279 | # standard quantize_static method, so instead construct the model from scratch. 280 | input_scale = INPUT_RANGE / 255.0 281 | input_zero_point = 0 282 | max_output = np.max(cpu_float_output) 283 | matmul_scale = max_output / 255.0 284 | matmul_zero_point = 0 285 | quant_io_model = make_matmul_quantized_io_model(input_scale, input_zero_point, matmul_scale, matmul_zero_point) 286 | onnx.checker.check_model(quant_io_model) 287 | onnx.save(quant_io_model, QUANT_IO_MODEL_PATH) 288 | 289 | # Convert our float inputs into quantized equivalents. 290 | input0_quant_tensor = quantize_tensor(input0_tensor, input_scale, input_zero_point) 291 | input1_quant_tensor = quantize_tensor(input1_tensor, input_scale, input_zero_point) 292 | 293 | # Build an NPU session to run the fully-quantized model. 
294 | npu_quant_io_options = ort.SessionOptions() 295 | npu_quant_io_options.add_session_config_entry("session.disable_cpu_ep_fallback", "1") 296 | npu_quant_io_session = ort.InferenceSession( 297 | QUANT_IO_MODEL_PATH, 298 | sess_options=npu_quant_io_options, 299 | providers=["QNNExecutionProvider"], 300 | provider_options=[{ 301 | "backend_path": "QnnHtp.dll", 302 | "htp_performance_mode": "sustained_high_performance", 303 | "enable_htp_fp16_precision": "1", 304 | # "profiling_level": "detailed", 305 | # "profiling_file_path": "npu_quant_io_profile.csv", 306 | }] 307 | ) 308 | 309 | # Run the quantized I/O model on the NPU to measure the latency. 310 | start_npu_quant_io = time.time() 311 | for i in range(ITERATIONS): 312 | npu_quant_io_outputs = npu_quant_io_session.run( 313 | None, { 314 | "input0_quant_tensor": input0_quant_tensor, 315 | "input1_quant_tensor": input1_quant_tensor, 316 | }) 317 | end_npu_quant_io = time.time() 318 | npu_quant_io_output = npu_quant_io_outputs[0] 319 | 320 | # Convert the result back into a float tensor. 321 | npu_quant_io_output_float = dequantize_tensor(npu_quant_io_output, matmul_scale, matmul_zero_point) 322 | 323 | print("************ Benchmark Results ************") 324 | 325 | # Verify that the results are approximately what we'd expect. 326 | print(f"NPU quantized compute, float I/O accuracy difference is {array_msd(cpu_float_output, npu_quant_output):0.4f}") 327 | print(f"NPU quantized compute and I/O accuracy difference is {array_msd(cpu_float_output, npu_quant_io_output_float):0.4f}") 328 | 329 | cpu_s = (end_cpu - start_cpu) / ITERATIONS 330 | npu_quant_s = (end_npu_quant - start_npu_quant) / ITERATIONS 331 | npu_quant_io_s = (end_npu_quant_io - start_npu_quant_io) / ITERATIONS 332 | 333 | cpu_ms = cpu_s * 1000.0 334 | npu_quant_ms = npu_quant_s * 1000.0 335 | npu_quant_io_ms = npu_quant_io_s * 1000.0 336 | 337 | # Derive the ops per second from the latency and number of ops in the model. 338 | cpu_ops_per_second = round(OPS_PER_INFERENCE / cpu_s) 339 | npu_quant_ops_per_second = round(OPS_PER_INFERENCE / npu_quant_s) 340 | npu_quant_io_ops_per_second = round(OPS_PER_INFERENCE / npu_quant_io_s) 341 | 342 | rn = ReadableNumber(precision=0, digit_group_size=3) 343 | 344 | print(f"CPU took {cpu_ms:0.2f}ms, {rn.of(cpu_ops_per_second)} ops per second") 345 | print(f"NPU (quantized compute, float I/O) took {npu_quant_ms:0.2f}ms, {rn.of(npu_quant_ops_per_second)} ops per second") 346 | print(f"NPU (quantized compute and I/O) took {npu_quant_io_ms:0.2f}ms, {rn.of(npu_quant_io_ops_per_second)} ops per second") 347 | --------------------------------------------------------------------------------