├── .dockerignore ├── LICENSE ├── README.md ├── configs ├── backend │ ├── ort.yaml │ ├── pytorch.yaml │ ├── tensorflow.yaml │ ├── tensorflow_graph.yaml │ └── torchscript.yaml └── benchmark.yaml ├── consolidate.py ├── docker ├── .tf_configure.bazelrc ├── Dockerfile ├── Dockerfile.compile └── oneAPI.repo ├── intel-requirements.txt ├── launcher.py ├── requirements.txt └── src ├── backends ├── __init__.py ├── ort.py ├── pytorch.py └── tensorflow.py ├── benchmark.py ├── config.py ├── main.py ├── reports.py └── utils ├── __init__.py ├── cpu.py └── env.py /.dockerignore: -------------------------------------------------------------------------------- 1 | results/ 2 | reports/ 3 | outputs/ 4 | *.iml 5 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Transformers performance & evaluation framework 2 | 3 | The benchmarking repository provides an easy and flexible testbed to generate, run and save multiple configurations 4 | in order to compare Transformers based Neural Network models. 
5 | 
6 | The overall benchmarking project leverages the Hydra framework from Facebook AI Research, which generates
7 | all the requested sweeps from configuration files. Currently, we provide benchmarks for 5 of the most widely
8 | used Deep Learning frameworks:
9 | 
10 | - PyTorch (Eager mode)
11 | - TorchScript (Static Graph mode)
12 | - TensorFlow 2 (Eager mode)
13 | - TensorFlow 2 Graph (Static Graph mode)
14 | - ONNX Runtime for Inference (Static Graph mode + Graph Optimizations)
15 | 
16 | The repository is divided into 2 principal sections:
17 | - `configs/` stores all the configuration files for the supported backends.
18 | - `src/backends/` stores the actual logic to generate textual inputs and execute a forward pass for the targeted backend.
19 | 
20 | ## Getting Started
21 | 
22 | **Instructions presented here have been tested on Ubuntu 20.04**
23 | 
24 | ```bash
25 | apt update && apt -y install python3 python3-pip python3-dev libnuma-dev
26 | cd 
27 | pip install -r requirements.txt
28 | ```
29 | 
30 | 
31 | ## Benchmarking framework
32 | ### How to use this repository to benchmark with a specific configuration
33 | 
34 | Hydra, the configuration framework used in this project, provides a simple command-line interface to specify and
35 | override the configuration to be run.
36 | 
37 | For instance, in order to run a benchmark for ONNX Runtime on CPU with:
38 | - **Backend = ORT**
39 | - **Model = bert-base-cased**
40 | - **Device = CPU**
41 | - **Batch Size = 1**
42 | - **Sequence Length = 32**
43 | 
44 | ```bash
45 | python3 src/main.py model=bert-base-cased batch_size=1 sequence_length=32 backend=ort device=cpu
46 | ```
47 | 
48 | ### Automatically let Hydra generate all the permutations to cover multiple configurations
49 | 
50 | Hydra integrates a very powerful sweep generation utility which is exposed through the `--multirun` command-line flag
51 | when invoking the benchmark script.
52 | 
53 | For instance, in order to run benchmarks for PyTorch on CPU over the following specs:
54 | - **Model = bert-base-cased**
55 | - **Device = CPU**
56 | - **Batch Size = 1**
57 | - **Sequence Length = 128, 512**
58 | 
59 | ```bash
60 | python3 src/main.py --multirun model=bert-base-cased batch_size=1 sequence_length=128,512 backend=pytorch device=cpu
61 | ```
62 | 
63 | ### Overridable configuration properties
64 | 
65 | - `backend`: Specify the backend(s) to use to run the benchmark `{"pytorch", "torchscript", "tensorflow", "xla", "ort"}`
66 | - `device`: Specify on which device to run the benchmark `{"cpu", "cuda"}`
67 | - `precision`: Specify the model's parameters data format. For now, only `float32` (_i.e. full precision_) is supported.
68 | - `num_threads`: Number of threads to use for intra-operation parallelism (`-1` detects the number of CPU cores and uses that value)
69 | - `num_interops_threads`: Number of threads to use for inter-operation parallelism (`-1` detects the number of CPU cores and uses that value)
70 | - `warmup_runs`: Number of warmup forward passes to execute before recording any benchmarking results (especially useful to preallocate memory buffers).
71 | - `benchmark_duration`: Duration (in seconds) of the benchmark; as many forward calls as possible are executed within the specified duration. These runs are executed after `warmup_runs`.
72 | 
73 | ## Backend specific configuration properties
74 | 
75 | Each framework exposes different features which can be enabled to tune the execution of the model on the underlying hardware.
76 | This repository exposes the most common ones, described below; they are overridden with the `backend.` prefix, as shown in the example that follows.
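
For example, a backend-specific property can be overridden from the command line with the `backend.` prefix (a minimal illustration; the property values shown here are arbitrary choices, not recommended settings):

```bash
python3 src/main.py model=bert-base-cased batch_size=1 sequence_length=32 backend=ort backend.num_threads=8 backend.graph_optimisation_level=ORT_ENABLE_BASIC
```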
77 | 
78 | ### PyTorch
79 | 
80 | - `use_torchscript` Boolean indicating if the runtime should trace the eager model to produce an optimized version.
81 | 
82 | This value is `False` when using backend `pytorch` and `True` when using backend `torchscript`.
83 | 
84 | ### TensorFlow
85 | 
86 | - `use_xla` Boolean indicating if the model should be wrapped in `tf.function(jit_compile=True)` in order to compile the underlying graph through XLA.
87 | 
88 | This value is `False` when using backend `tensorflow_graph` and can be enabled through the config file or the command line.
89 | 
90 | 
91 | ### ONNX Runtime (ORT)
92 | 
93 | - `opset` Integer setting which version of the ONNX Opset specification to use when exporting the model.
94 | 
95 | - `graph_optimisation_level` Which level of optimization to apply with ONNX Runtime when loading the model. Possible values are:
96 |   - `ORT_DISABLE_ALL` Use the raw ONNX graph without any further optimization.
97 |   - `ORT_ENABLE_BASIC` Use basic graph optimizations which are not platform dependent.
98 |   - `ORT_ENABLE_EXTENDED` Use more advanced techniques *(might include platform dependent optimizations)*.
99 |   - `ORT_ENABLE_ALL` Enable all the possible optimizations *(might include platform dependent optimizations)*.
100 | 
101 | - `execution_mode` Mode to execute the ONNX Graph. Can be either:
102 |   - `ORT_SEQUENTIAL` Execute the graph sequentially, without looking for subgraphs to execute in parallel.
103 |   - `ORT_PARALLEL` Execute the graph potentially in parallel, looking for independent subgraphs which can be run simultaneously.
104 | 
105 | 
106 | ## Launch utility tool
107 | The benchmarking framework comes with a launcher tool highly inspired by [the one made available by Intel](https://github.com/intel/intel-extension-for-pytorch/blob/master/intel_pytorch_extension_py/launch.py).
108 | The launcher tool helps you handle all the low-level details needed to configure experiments and get the best out of the platform you have.
109 | 
110 | More precisely, it is able to configure the following elements:
111 | 
112 | - Linux transparent huge pages mechanism
113 | - CPU core affinity for OpenMP threads on NUMA platforms
114 | - Memory affinity for OpenMP threads on NUMA platforms
115 | - OpenMP configuration (KMP_AFFINITY, KMP_BLOCKTIME, OMP_NUM_THREADS, OMP_MAX_ACTIVE_LEVELS, etc.)
116 | - Change the OpenMP library to be used at runtime (GNU / Intel)
117 | - Change the memory allocation library to be used (std, tcmalloc, jemalloc)
118 | - Set up multi-instance inference (multiple independent models executing in parallel) with per-instance CPU core/memory affinity
119 | 
120 | The launcher script `launcher.py` is located at the root of the transformers-benchmarks folder.
121 | You can run `python launcher.py --help` to get all the tuning options available.
122 | 
123 | ## Ready-to-use CLI commands
124 | 
125 | ### Benchmarking the out-of-the-box configuration for multiple backends
126 | ```shell
127 | python3 src/main.py --multirun model=bert-base-cased backend=pytorch,torchscript,tensorflow,xla,ort
128 | ```
129 | 
130 | ### Tuning the number of intra/inter ops for parallel sections (OMP_NUM_THREADS, MKL_NUM_THREADS, etc.)
131 | 
132 | ```shell
133 | python3 src/main.py --multirun model=bert-base-cased batch_size=1 sequence_length=32 backend.num_threads=2,4,8 backend.num_interops_threads=2,4,8
134 | ```
135 | 
136 | ### Tuning OpenMP thread affinity
137 | ```shell
138 | python launcher.py --kmp_affinity= -- src/main.py model=bert-base-cased batch_size=1 sequence_length=32 ...
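# Illustrative value only (an assumption, not a repository default): "granularity=fine,compact,1,0" is a commonly used Intel OpenMP affinity setting
python launcher.py --kmp_affinity="granularity=fine,compact,1,0" -- src/main.py model=bert-base-cased batch_size=1 sequence_length=32 ...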
139 | ``` 140 | 141 | ### Tuning number of model instances (multi-instance setup) along with intra/inter ops for parallel sections 142 | ```shell 143 | python launcher.py --ninstances=4 -- src/main.py model=bert-base-cased batch_size=1 sequence_length=32 ... 144 | ``` 145 | 146 | ### Tuning allocation library 147 | ```shell 148 | export TCMALLOC_LIBRARY_PATH= 149 | python launcher.py --enable_tcmalloc -- src/main.py model=bert-base-cased batch_size=1 sequence_length=32 ... 150 | ``` 151 | 152 | ### Tuning OpenMP implementation 153 | ```shell 154 | export INTEL_OPENMP_LIBRARY_PATH= 155 | python launcher.py --enable_iomp -- src/main.py model=bert-base-cased batch_size=1 sequence_length=32 ... 156 | ``` 157 | 158 | ### Enabling Transparent Huge Page 159 | ```shell 160 | python launcher.py --enable_thp -- src/main.py model=bert-base-cased batch_size=1 sequence_length=32 ... 161 | ``` 162 | 163 | ## Hydra FAQ 164 | 165 | ## Executing dry-run to highlight configuration 166 | ```shell 167 | python launcher.py --enable_tcmalloc --enable_iomp --ninstances=2 -- src/main.py --info config model=bert-base-cased batch_size=16 sequence_length=512 168 | ``` 169 | -------------------------------------------------------------------------------- /configs/backend/ort.yaml: -------------------------------------------------------------------------------- 1 | _target_: backends.ort.OnnxRuntimeBackend 2 | name: onnxruntime 3 | version: ${ort_version:} 4 | opset: 12 5 | num_threads: null 6 | num_interops_threads: null 7 | graph_optimisation_level: ORT_ENABLE_ALL 8 | execution_mode: ORT_PARALLEL -------------------------------------------------------------------------------- /configs/backend/pytorch.yaml: -------------------------------------------------------------------------------- 1 | _target_: backends.pytorch.PyTorchBackend 2 | name: pytorch 3 | version: ${pytorch_version:} 4 | use_torchscript: false 5 | use_tf32: false 6 | num_threads: null 7 | num_interops_threads: null 8 | -------------------------------------------------------------------------------- /configs/backend/tensorflow.yaml: -------------------------------------------------------------------------------- 1 | _target_: backends.tensorflow.TensorflowBackend 2 | name: tensorflow 3 | version: ${tensorflow_version:} 4 | use_xla: false 5 | use_saved_model_format: false 6 | eager_mode: true 7 | experimental_compiler: false 8 | num_threads: null 9 | num_interops_threads: null 10 | -------------------------------------------------------------------------------- /configs/backend/tensorflow_graph.yaml: -------------------------------------------------------------------------------- 1 | _target_: backends.tensorflow.TensorflowBackend 2 | name: tensorflow_graph 3 | version: ${tensorflow_version:} 4 | use_xla: false 5 | use_saved_model_format: false 6 | eager_mode: false 7 | experimental_compiler: false 8 | num_threads: null 9 | num_interops_threads: null 10 | -------------------------------------------------------------------------------- /configs/backend/torchscript.yaml: -------------------------------------------------------------------------------- 1 | _target_: backends.pytorch.PyTorchBackend 2 | name: torchscript 3 | version: ${pytorch_version:} 4 | use_torchscript: true 5 | use_tf32: false 6 | num_threads: null 7 | num_interops_threads: null 8 | -------------------------------------------------------------------------------- /configs/benchmark.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - 
backend: pytorch 3 | 4 | hydra: 5 | run: 6 | dir: 7 | outputs/${experiment_name}/${experiment_id}/${instance_id} 8 | sweep: 9 | dir: outputs/${experiment_name}/${experiment_id}/${instance_id} 10 | job: 11 | env_set: 12 | TOKENIZERS_PARALLELISM: "false" 13 | 14 | experiment_name: "default" 15 | python_version: ${python_version:} 16 | model: bert-base-cased 17 | batch_size: 1 18 | sequence_length: 128 19 | benchmark_duration: 5 20 | warmup_runs: 5 21 | device: cpu 22 | precision: float32 23 | num_instances: 1 24 | num_core_per_instance: -1 25 | reference: null -------------------------------------------------------------------------------- /consolidate.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Hugging Face Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | # Licensed under the Apache License, Version 2.0 (the "License"); 16 | # you may not use this file except in compliance with the License. 17 | # You may obtain a copy of the License at 18 | # 19 | # http://www.apache.org/licenses/LICENSE-2.0 20 | # 21 | # Unless required by applicable law or agreed to in writing, software 22 | # distributed under the License is distributed on an "AS IS" BASIS, 23 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 24 | # See the License for the specific language governing permissions and 25 | # limitations under the License. 
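#
# consolidate.py gathers the per-run results.csv files produced by the benchmark, joins each
# one with its Hydra config (.hydra/config.yaml), optionally aggregates multi-instance runs,
# and exports a consolidated CSV/Excel report together with a console summary table.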
26 | from datetime import datetime, timezone 27 | from glob import glob 28 | from itertools import chain 29 | from os import path 30 | from pathlib import Path 31 | from typing import Type, List, Tuple 32 | 33 | import pandas as pd 34 | from argparse import ArgumentParser 35 | 36 | import yaml 37 | from pandas import ExcelWriter 38 | from rich.console import Console 39 | from rich.table import Table 40 | 41 | # Format name -> extension 42 | SUPPORTED_EXPORT_FORMAT = { 43 | "csv": "csv", 44 | "excel": "xlsx" 45 | } 46 | 47 | 48 | SCALING_CHOICES = {"batch-size-scaling", "core-count-scaling"} 49 | SCALING_HELP = "Which scaling metodology was used:\n \ 50 | \t- batch-size-scaling: The total number of cores for the original batch size remains the same - \ 51 | we use all the cores for the given batch size but break up the problem into smaller problems \ 52 | with fewer cores for the smaller problem sizes\n" \ 53 | "\t core-count-scaling: We vary the number of cores for the given batch size" 54 | 55 | 56 | LATENCY_COLUMNS = { 57 | "latency_mean", 58 | "latency_std", 59 | "latency_50", 60 | "latency_90", 61 | "latency_95", 62 | "latency_99", 63 | "latency_999", 64 | } 65 | 66 | LATENCY_THROUGHPUT_COLUMNS = { 67 | "throughput", 68 | }.union(LATENCY_COLUMNS) 69 | 70 | 71 | SUMMARY_SUMMING_COLUMNS = { 72 | "nb_forwards", 73 | "throughput", 74 | "batch_size", 75 | } 76 | 77 | FINAL_COLUMNS_ORDERING = ["backend.name", "batch_size", "sequence_length", "openmp.backend", "malloc", "use_huge_page", "num_instances"] 78 | RICH_DISPLAYED_COLUMNS = { 79 | "backend.name": "Backend", 80 | "malloc": "Malloc", 81 | "openmp.backend": "OpenMP", 82 | "use_huge_page": "Huge Pages", 83 | "batch_size": "Batch", 84 | "sequence_length": "Sequence", 85 | "latency_mean": "Avg. Latency", 86 | "latency_std": "Std. 
Latency", 87 | "throughput": "Throughput", 88 | "num_core_per_instance": "Cores" 89 | } 90 | 91 | MULTI_INSTANCES_VALIDATION_COLUMNS = [ 92 | "batch_size", 93 | "sequence_length", 94 | "backend.name", 95 | "openmp.backend", 96 | "malloc", 97 | "backend.num_threads", 98 | "use_huge_page" 99 | ] 100 | 101 | 102 | def flatten_yaml(path: Path, loader: Type[yaml.Loader] = yaml.SafeLoader) -> pd.DataFrame: 103 | with open(path, "r") as yaml_f: 104 | content = yaml.load(yaml_f, Loader=loader) 105 | 106 | return pd.json_normalize(content) 107 | 108 | 109 | def gather_results(folder: Path, is_multi_instances: bool) -> Tuple[pd.DataFrame, List[str]]: 110 | # List all csv results 111 | results_f = [(f, f.parent.joinpath(".hydra/config.yaml")) for f in folder.glob("**/results.csv")] 112 | results_df = pd.concat([ 113 | # This will concatenate columns from the benchmarks along with config columns 114 | pd.concat((pd.read_csv(results, index_col=0), flatten_yaml(config)), axis="columns") 115 | for results, config in results_f 116 | ], axis="index") 117 | 118 | existing_columns = list(set(FINAL_COLUMNS_ORDERING).intersection(results_df.columns)) 119 | results_df = results_df.sort_values(existing_columns) 120 | 121 | # Ensure the number of instances (according to the sum of instance_sum) matchs num_instances field 122 | if is_multi_instances: 123 | results_df["is_valid"] = results_df.groupby(MULTI_INSTANCES_VALIDATION_COLUMNS)["instance_id"].transform("count") 124 | results_df["is_valid"] = results_df["is_valid"] == results_df["num_instances"] 125 | else: 126 | results_df["is_valid"] = True 127 | 128 | results_df.fillna("N/A", inplace=True) 129 | if len(results_df) == 0: 130 | raise ValueError(f"No results.csv file were found in {folder}") 131 | 132 | return results_df, existing_columns 133 | 134 | 135 | def aggregate_multi_instances_results(results_df: pd.DataFrame, grouping_columns: List[str], mode: str): 136 | agg_df = results_df.copy() 137 | agg_df = agg_df.groupby(grouping_columns) 138 | transforms = { 139 | "latency_mean": ["min", "max", "mean"], 140 | "throughput": ["sum"], 141 | "instance_id": ["sum"], 142 | "is_valid": ["all"] 143 | } 144 | 145 | # How to aggregate cores and batch 146 | if mode == "batch-size-scaling": 147 | transforms["batch_size"] = "sum" 148 | 149 | return agg_df.agg(transforms) 150 | 151 | 152 | def show_results_in_console(df: pd.DataFrame, sorting_columns: List[str]): 153 | console = Console(width=200) 154 | table = Table( 155 | show_header=True, header_style="bold", 156 | title="Latency & Throughput for each framework (latencies given in ms)", 157 | ) 158 | 159 | # Create copy 160 | local_df = df.copy() 161 | local_df = local_df.assign(**local_df[LATENCY_COLUMNS].apply(lambda x: round((x * 1e-6), 2))) 162 | 163 | # Filter out columns 164 | displayed_columns = { 165 | column_id: column_title 166 | for column_id, column_title in RICH_DISPLAYED_COLUMNS.items() 167 | if column_id in local_df.columns 168 | } 169 | 170 | for column_name in displayed_columns.values(): 171 | table.add_column(column_name, justify="center") 172 | table.add_column("Instance ID", justify="center") 173 | 174 | # Add rows 175 | for _, item_columns in local_df.sort_values(sorting_columns, ascending=True).iterrows(): 176 | table.add_row(*[str(item_columns[c]) for c in chain(displayed_columns.keys(), ["instance_id"])]) 177 | 178 | # Display the table 179 | console.print(table) 180 | 181 | 182 | if __name__ == '__main__': 183 | parser = ArgumentParser("Hugging Face Model Benchmark") 184 | 
parser.add_argument("--results-folder", type=Path, help="Where the benchmark results have been saved") 185 | parser.add_argument("--multi-instances-scaling", choices=SCALING_CHOICES, help=SCALING_HELP) 186 | parser.add_argument("--format", choices=SUPPORTED_EXPORT_FORMAT.keys(), default="csv", help="Export file format") 187 | parser.add_argument("output_folder", type=Path, help="Where the resulting report will be saved") 188 | 189 | # Parse command line arguments 190 | args = parser.parse_args() 191 | args.now = datetime.now(timezone.utc).astimezone() 192 | args.experiment_id = path.split(args.results_folder)[-1] 193 | args.format_ext = SUPPORTED_EXPORT_FORMAT[args.format.lower()] 194 | 195 | for name in {"aggregated", "consolidated"}: 196 | value = f"{name}_{args.experiment_id}_" \ 197 | f"{args.now.date().isoformat()}T{args.now.time().strftime('%H-%M')}" \ 198 | f".{args.format_ext}" 199 | setattr(args, f"{name}_filename", value) 200 | 201 | # Ensure everything looks right 202 | if not args.results_folder.exists(): 203 | print(f"Folder {args.results_folder} doesn't exist") 204 | exit(1) 205 | 206 | try: 207 | # Detect folder run type from folder structure 208 | instances_folder = glob(f"{args.results_folder.as_posix()}/*") 209 | 210 | args.is_multi_instances = len(instances_folder) > 1 211 | args.instances = {path.split(instance_folder)[-1] for instance_folder in instances_folder} 212 | args.is_multirun = { 213 | path.split(instance_folder)[-1]: path.exists(path.join(instance_folder, "multirun.yaml")) 214 | for instance_folder in instances_folder 215 | } 216 | 217 | print( 218 | f"Detected following structure:" 219 | f"\n\t- Multi Instance: {args.is_multi_instances} ({len(args.instances)} instances)" 220 | f"\n\t- Multirun: {args.is_multirun}" 221 | ) 222 | 223 | # If we detect multi instance and no scaling mode is provided, ask for a value 224 | if args.is_multi_instances and args.multi_instances_scaling is None: 225 | print( 226 | "Warning:\n\tNo mode for handling multi-instances aggregation was provided. 
" 227 | "Only individual runs will be saved.\n" 228 | "\tTo include multi-instances aggregation results, " 229 | f"please use --multi-instance-scaling={SCALING_CHOICES}\n" 230 | ) 231 | 232 | # Ensure output folder exists 233 | args.output_folder.mkdir(exist_ok=True, parents=True) 234 | 235 | # Gather the results to manipulate 236 | consolidated_df, sorting_columns = gather_results(args.results_folder, args.is_multi_instances) 237 | 238 | if args.is_multi_instances and args.multi_instances_scaling is not None: 239 | agg_df = aggregate_multi_instances_results(consolidated_df, sorting_columns, args.multi_instances_scaling) 240 | 241 | if args.format == "csv": 242 | consolidated_df.to_csv(args.output_folder.joinpath(args.consolidated_filename)) 243 | if args.is_multi_instances and args.multi_instances_scaling is not None: 244 | agg_df.to_csv(args.output_folder.joinpath(args.aggregated_filename)) 245 | else: 246 | with ExcelWriter(args.output_folder.joinpath(args.consolidated_filename)) as excel_writer: 247 | consolidated_df.to_excel(excel_writer, sheet_name="individuals") 248 | if args.is_multi_instances and args.multi_instances_scaling is not None: 249 | agg_df.to_excel(excel_writer, sheet_name="aggregated_multi_instances", merge_cells=False) 250 | 251 | show_results_in_console(consolidated_df, sorting_columns) 252 | except ValueError as ve: 253 | print(ve) 254 | -------------------------------------------------------------------------------- /docker/.tf_configure.bazelrc: -------------------------------------------------------------------------------- 1 | build --action_env PYTHON_BIN_PATH="/usr/bin/python" 2 | build --action_env PYTHON_LIB_PATH="/usr/lib/python3.8" 3 | build --python_path="/usr/bin/python" 4 | build --config=xla 5 | build --action_env CUDA_TOOLKIT_PATH="/usr/local/cuda" 6 | build --action_env TF_CUDA_COMPUTE_CAPABILITIES="7.0,7.5,8.0,8.6" 7 | build --action_env LD_LIBRARY_PATH="/usr/local/nvidia/lib:/usr/local/nvidia/lib64" 8 | build --action_env GCC_HOST_COMPILER_PATH="/usr/bin/gcc" 9 | build --config=cuda 10 | build:opt --copt=-march=native 11 | build:opt --copt=-Wno-sign-compare 12 | build:opt --host_copt=-march=native 13 | build:opt --define with_default_optimizations=true 14 | test --flaky_test_attempts=3 15 | test --test_size_filters=small,medium 16 | test --test_env=LD_LIBRARY_PATH 17 | test:v1 --test_tag_filters=-benchmark-test,-no_oss,-no_gpu,-oss_serial 18 | test:v1 --build_tag_filters=-benchmark-test,-no_oss,-no_gpu 19 | test:v2 --test_tag_filters=-benchmark-test,-no_oss,-no_gpu,-oss_serial,-v1only 20 | test:v2 --build_tag_filters=-benchmark-test,-no_oss,-no_gpu,-v1only 21 | build --action_env TF_CONFIGURE_IOS="0" -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:20.04 2 | 3 | ARG TRANSFORMERS_VERSION=4.1.1 4 | ARG PYTORCH_VERSION=1.7.1 5 | ARG TENSORFLOW_VERSION=2.4.0 6 | ARG ONNXRUNTIME_VERSION=1.6.0 7 | ARG MKL_THREADING_LIBRARY=OMP 8 | 9 | RUN apt update && \ 10 | apt install -y \ 11 | git \ 12 | python3 \ 13 | python3-pip && \ 14 | rm -rf /var/lib/apt/lists/* 15 | 16 | # PyTorch 17 | RUN python3 -m pip install torch==1.7.1+cpu -f https://download.pytorch.org/whl/torch_stable.html 18 | 19 | # TensorFlow 20 | RUN python3 -m pip install tensorflow 21 | 22 | # ONNX Runtime 23 | RUN python3 -m pip install onnxruntime 24 | 25 | COPY . 
/opt/intel-benchmarks 26 | 27 | WORKDIR /opt/intel-benchmarks 28 | RUN python3 -m pip install -r requirements.txt 29 | 30 | -------------------------------------------------------------------------------- /docker/Dockerfile.compile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:11.2.0-cudnn8-devel-ubuntu20.04 as builder 2 | 3 | ARG TRANSFORMERS_VERSION=4.5.0 4 | ARG PYTORCH_VERSION=1.8.1 5 | ARG TENSORFLOW_VERSION=2.4.1 6 | ARG MKL_THREADING_LIBRARY=OMP 7 | ARG CUDA_ARCH_LIST=7.0;7.5;8.0;8.6+PTX 8 | 9 | # Ensure tzdata is set 10 | ENV TZ=America/New_York 11 | RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone 12 | 13 | RUN apt update && \ 14 | apt install -y \ 15 | curl \ 16 | cmake \ 17 | make \ 18 | ninja-build \ 19 | git \ 20 | gpg-agent \ 21 | wget \ 22 | python3 \ 23 | python3-dev \ 24 | python3-pip 25 | 26 | # Install oneAPI repo 27 | RUN wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB && \ 28 | apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB && \ 29 | rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB && \ 30 | echo "deb https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list 31 | 32 | RUN apt update && apt install -y \ 33 | intel-oneapi-mkl-devel \ 34 | intel-oneapi-runtime-openmp && \ 35 | rm -rf /var/lib/apt/lists/* 36 | 37 | ENV LD_LIBRARY_PATH='/opt/intel/oneapi/tbb/latest/env/lib/intel64/gcc4.8:/opt/intel/oneapi/mkl/latest/lib/intel64' 38 | ENV LIBRARY_PATH='/opt/intel/oneapi/tbb/latest/lib/intel64/gcc4.8:/opt/intel/oneapi/mkl/latest/lib/intel64' 39 | ENV MKLROOT='/opt/intel/oneapi/mkl/latest' 40 | 41 | # Create a folder to store all the compiled binaries 42 | ENV FRAMEWORK_BINARIES_FOLDER /opt/bin 43 | RUN mkdir ${FRAMEWORK_BINARIES_FOLDER} 44 | 45 | # Bazel for TensorFlow 46 | ENV BAZEL_VERSION 4.0.0 47 | RUN cd "/usr/bin" && curl -fLO https://releases.bazel.build/${BAZEL_VERSION}/release/bazel-${BAZEL_VERSION}-linux-x86_64 && \ 48 | chmod +x bazel-${BAZEL_VERSION}-linux-x86_64 && \ 49 | mv bazel-${BAZEL_VERSION}-linux-x86_64 bazel && \ 50 | ln -s /usr/bin/python3 /usr/bin/python 51 | 52 | # Enable MKL to be found by the compilation process 53 | ENV PATH=/opt/intel/oneapi/mkl/latest/include:$PATH 54 | ENV CMAKE_PREFIX_PATH=/opt/intel/oneapi/mkl/latest/lib/intel64:$CMAKE_PREFIX_PATH 55 | ENV CMAKE_INCLUDE_PATH=/opt/intel/oneapi/mkl/latest/include:$PATH:$CMAKE_INCLUDE_PATH 56 | 57 | # TODO: Merge with above when ready 58 | ENV BUILD_CAFFE2_OPS=OFF \ 59 | BUILD_CAFFE2=OFF \ 60 | BUILD_TEST=OFF \ 61 | USE_CUDA=ON \ 62 | USE_OPENCV=OFF \ 63 | USE_FFMPEG=OFF \ 64 | USE_LEVELDB=OFF \ 65 | USE_KINETO=OFF \ 66 | USE_REDIS=OFF \ 67 | USE_DISTRIBUTED=OFF \ 68 | USE_QNNPACK=ON \ 69 | USE_FBGEMM=ON \ 70 | USE_NNPACK=ON \ 71 | USE_MKLDNN=ON \ 72 | BLAS=MKL \ 73 | MKLDNN_CPU_RUNTIME=$MKL_THREADING_LIBRARY \ 74 | TORCH_CUDA_ARCH_LIST=$CUDA_ARCH_LIST 75 | 76 | # PyTorch 77 | RUN git clone https://github.com/pytorch/pytorch /opt/pytorch && \ 78 | cd /opt/pytorch && \ 79 | git checkout v${PYTORCH_VERSION} && \ 80 | git submodule update --init --recursive && \ 81 | python3 -m pip install -r requirements.txt && \ 82 | python3 setup.py bdist_wheel && \ 83 | ls dist/ | grep -i ".whl" | xargs -I % sh -c 'cp /opt/pytorch/dist/% ${FRAMEWORK_BINARIES_FOLDER}/' 84 | 85 | 86 | 87 | # TensorFlow 88 | RUN git clone https://github.com/tensorflow/tensorflow /opt/tensorflow && \ 89 | cd /opt/tensorflow && \ 90 | git checkout v${TENSORFLOW_VERSION} 91 | 92 | COPY 
docker/.tf_configure.bazelrc /opt/tensorflow/.tf_configure.bazelrc 93 | RUN cd /opt/tensorflow && \ 94 | python3 -m pip install -U --user pip numpy wheel && \ 95 | python3 -m pip install -U --user keras_preprocessing --no-deps && \ 96 | bazel build \ 97 | --config=cuda \ 98 | --config=v2 \ 99 | --config=opt \ 100 | --config=mkl \ 101 | --config=numa \ 102 | --config=noaws \ 103 | --config=nogcp \ 104 | --config=nohdfs \ 105 | --config=nonccl \ 106 | //tensorflow/tools/pip_package:build_pip_package 107 | 108 | RUN cd /opt/tensorflow && \ 109 | ./bazel-bin/tensorflow/tools/pip_package/build_pip_package /tmp/tensorflow_pkg && \ 110 | ls /tmp/tensorflow_pkg | grep -i ".whl" | xargs -I % sh -c 'cp /tmp/tensorflow_pkg/% ${FRAMEWORK_BINARIES_FOLDER}/' 111 | 112 | 113 | # ONNX Runtime 114 | RUN git clone https://github.com/microsoft/onnxruntime opt/onnxruntime && \ 115 | cd /opt/onnxruntime && \ 116 | ./build.sh --config=Release --parallel --cmake_generator=Ninja --enable_pybind --build_wheel --enable_lto --use_openmp --skip_tests --skip_onnx_tests && \ 117 | ls /opt/onnxruntime/build/Linux/Release/dist/ | grep -i ".whl" | xargs -I % sh -c 'cp /opt/onnxruntime/build/Linux/Release/dist/% ${FRAMEWORK_BINARIES_FOLDER}/' 118 | 119 | FROM nvidia/cuda:11.2.0-cudnn8-runtime-ubuntu20.04 120 | 121 | RUN apt update && \ 122 | apt install -y \ 123 | python3 \ 124 | python3-pip \ 125 | numactl \ 126 | libtcmalloc-minimal4 \ 127 | wget 128 | 129 | # Install oneAPI repo 130 | RUN wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB && \ 131 | apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB && \ 132 | rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB && \ 133 | echo "deb https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list 134 | 135 | RUN apt update && apt install -y \ 136 | intel-oneapi-mkl \ 137 | intel-oneapi-runtime-openmp && \ 138 | rm -rf /var/lib/apt/lists/* 139 | 140 | ENV LD_LIBRARY_PATH='/usr/local/cuda/compat:/opt/intel/oneapi/tbb/latest/env/lib/intel64/gcc4.8:/opt/intel/oneapi/mkl/latest/lib/intel64' 141 | ENV LIBRARY_PATH='/opt/intel/oneapi/tbb/latest/lib/intel64/gcc4.8:/opt/intel/oneapi/mkl/latest/lib/intel64' 142 | ENV MKLROOT='/opt/intel/oneapi/mkl/latest' 143 | 144 | # Copy 145 | COPY --from=builder /opt/bin /opt 146 | 147 | # Install frameworks 148 | RUN ls /opt/*whl | xargs python3 -m pip install 149 | 150 | # Copy tune 151 | COPY . 
/opt/tune 152 | 153 | WORKDIR /opt/tune 154 | RUN python3 -m pip install -r requirements.txt 155 | 156 | WORKDIR /opt/tune 157 | RUN python3 -m pip install -r requirements.txt -------------------------------------------------------------------------------- /docker/oneAPI.repo: -------------------------------------------------------------------------------- 1 | [oneAPI] 2 | name=Intel(R) oneAPI repository 3 | baseurl=https://yum.repos.intel.com/oneapi 4 | enabled=1 5 | gpgcheck=1 6 | repo_gpgcheck=1 7 | gpgkey=https://yum.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB -------------------------------------------------------------------------------- /intel-requirements.txt: -------------------------------------------------------------------------------- 1 | omegaconf>=2.1.0dev20 2 | hydra-core>=1.1.0.dev5 3 | torch 4 | intel-tensorflow 5 | onnxruntime 6 | psutil 7 | pandas 8 | rich 9 | transformers 10 | multiprocess 11 | sympy -------------------------------------------------------------------------------- /launcher.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Intel Corporation. 2 | # Copyright 2021 Hugging Face Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | from __future__ import absolute_import, division, print_function, unicode_literals 17 | 18 | from getpass import getpass 19 | from random import getrandbits 20 | 21 | from binascii import hexlify 22 | 23 | import sys 24 | import platform 25 | import subprocess 26 | import os 27 | from os.path import expanduser 28 | import re 29 | import glob 30 | from argparse import ArgumentParser, REMAINDER 31 | from argparse import RawTextHelpFormatter 32 | import logging 33 | import psutil 34 | 35 | from utils import CPUinfo 36 | 37 | logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') 38 | LOGGER = logging.getLogger(__name__) 39 | 40 | r""" 41 | This is a script for launching PyTorch training and inference on Intel Xeon CPU with optimal configurations. 42 | Now, single instance inference/training, multi-instance inference/training and distributed training 43 | with oneCCL backend is enabled. 44 | 45 | To get the peak performance on Intel Xeon CPU, the script optimizes the configuration of thread and memory 46 | management. For thread management, the script configures thread affinity and the preload of Intel OMP library. 47 | For memory management, it configures NUMA binding and preload optimized memory allocation library (e.g. tcmalloc, jemalloc). 48 | 49 | **How to use this module:** 50 | 51 | *** Single instance inference/training *** 52 | 53 | 1. Run single-instance inference or training on a single node with all CPU sockets. 54 | 55 | :: 56 | 57 | >>> python -m intel_pytorch_extension.launch script.py args 58 | 59 | 2. Run single-instance inference or training on a single CPU socket. 
60 | 
61 | ::
62 | 
63 |    >>> python -m intel_pytorch_extension.launch --socket_id 1 script.py args
64 | 
65 | *** Multi-instance inference ***
66 | 
67 | 1. Multi-instance
68 |    By default, one instance per socket. If you want to set the number of instances and cores per instance, --ninstances and --ncore_per_instance should be set.
69 | 
70 | 
71 |    >>> python -m intel_pytorch_extension.launch --multi_instance python_script args
72 | 
73 |    eg: on CLX8280 with 14 instances, 4 cores per instance
74 |    ::
75 | 
76 |    >>> python -m intel_pytorch_extension.launch --multi_instance --ninstances 14 --ncore_per_instance 4 python_script args
77 | 
78 | 
79 | *** Distributed Training ***
80 | 
81 | Spawns multiple distributed training processes on each of the training nodes. For intel_pytorch_extension, oneCCL
82 | is used as the communication backend and MPI is used to launch multi-proc. To get better
83 | performance, you should assign separate cores to the oneCCL communication process and to the computation
84 | process. This tool can automatically set these ENVs (such as I_MPI_PIN_DOMAIN) and launch
85 | multi-proc for you.
86 | 
87 | The utility can be used for single-node distributed training, in which one or
88 | more processes per node will be spawned. It can also be used in
89 | multi-node distributed training, by spawning multiple processes on each node
90 | for improved multi-node distributed training performance.
91 | 
92 | 
93 | 1. Single-Node multi-process distributed training
94 | 
95 | ::
96 | 
97 |    >>> python -m intel_pytorch_extension.launch --distributed python_script --arg1 --arg2 --arg3 and all other
98 |    arguments of your training script
99 | 
100 | 2. Multi-Node multi-process distributed training: (e.g. two nodes)
101 | 
102 | 
103 | rank 0: *(IP: 192.168.10.10, and has a free port: 295000)*
104 | 
105 | ::
106 | 
107 |    >>> python -m intel_pytorch_extension.launch --distributed --nproc_per_node=xxx
108 |    --nnodes=2 --hostfile hostfile python_script --arg1 --arg2 --arg3
109 |    and all other arguments of your training script
110 | 
111 | 
112 | 3. To look up what optional arguments this module offers:
113 | 
114 | ::
115 | 
116 |    >>> python -m intel_pytorch_extension.launch --help
117 | 
118 | *** Memory allocator ***
119 | 
120 | "--enable_tcmalloc" and "--enable_jemalloc" can be used to enable different memory allocators.
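
eg (illustrative): enable tcmalloc when launching the benchmark, mirroring the README usage of this repository's launcher

::

   >>> python launcher.py --enable_tcmalloc -- src/main.py model=bert-base-cased batch_size=1 sequence_length=32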
121 | 122 | """ 123 | 124 | SUDOER_PASSWORD = None 125 | THP_ALLOWED_VALUES = {'always', 'never', 'madvise'} 126 | THP_COMMON_LOCATION = "/sys/kernel/mm/transparent_hugepage/enabled" 127 | THP_REDHAT_LOCATION = "/sys/kernel/mm/redhat_transparent_hugepage/enabled" 128 | 129 | THP_LOCATION = THP_REDHAT_LOCATION if os.path.exists(THP_REDHAT_LOCATION) else THP_COMMON_LOCATION 130 | 131 | 132 | def get_transparent_huge_pages(): 133 | if os.path.exists(THP_LOCATION): 134 | with open(THP_LOCATION) as f: 135 | tbh_status = f.read().rstrip() # Remove newline 136 | tbh_value = re.search("\\[(.*)\\]", tbh_status) 137 | 138 | if tbh_value is not None and tbh_value.group(1) in THP_ALLOWED_VALUES: 139 | return tbh_value.group(1) 140 | return None 141 | 142 | 143 | def set_transparent_huge_pages(tbh_value, elevation_pwd=None): 144 | if not tbh_value or tbh_value not in THP_ALLOWED_VALUES: 145 | print(f"Provided TBH value to be set is not valid {tbh_value}") 146 | return 147 | 148 | if os.path.exists(THP_LOCATION): 149 | # Clear memory cache on kernel level 150 | print("Clearing kernel memory cache: 'echo 3 > /proc/sys/vm/drop_caches'") 151 | code = subprocess.call(f'echo {elevation_pwd} | sudo -S sh -c "sync;echo 3 > /proc/sys/vm/drop_caches"', shell=True) 152 | if code != 0: 153 | print(f"Unable to clear kernel memory cache, return code={code}") 154 | 155 | # Explicitly ask for huge pages 156 | print(f'Setting Transparent Huge Page to status: "echo {tbh_value} > {THP_LOCATION}"') 157 | code = subprocess.call(f'echo {elevation_pwd} | sudo -S sh -c "echo {tbh_value} > {THP_LOCATION}"', shell=True) 158 | if code != 0: 159 | print(f"Unable to set kernel transparent huge pages, return code={code}") 160 | else: 161 | print("Warning: Unable to enable Transparent HugePages.") 162 | 163 | 164 | def set_mpi_pin_domain(args): 165 | """ 166 | I_MPI_PIN_DOMAIN specify the cores used for every MPI process. 167 | The first ccl_worker_count cores of every rank for ccl communication 168 | and the other cores will be used to do computation. 169 | For example: on CascadeLake 8280 CPU, 2 ranks on one node. ccl_worker_count=4 170 | CCL_WORKER_COUNT=4 171 | CCL_WORKER_AFFINITY="0,1,2,3,28,29,30,31" 172 | I_MPI_PIN_DOMAIN=[0xffffff0, 0xffffff0000000] 173 | """ 174 | cpuinfo = CPUinfo() 175 | ppn = args.nproc_per_node 176 | total_cores = cpuinfo.physical_core_nums 177 | 178 | if args.use_logical_core: 179 | total_cores = cpuinfo.logical_core_nums 180 | cores_per_rank = total_cores // ppn 181 | pin_domain = "[" 182 | 183 | for proc in range(ppn): 184 | domain_binary = 0 185 | begin = proc * cores_per_rank + args.ccl_worker_count 186 | end = proc * cores_per_rank + cores_per_rank - 1 187 | for i in range(begin, end + 1): 188 | domain_binary |= (1 << i) 189 | pin_domain += hex(domain_binary) + "," 190 | return pin_domain + "]" 191 | 192 | 193 | def set_ccl_worker_affinity(args): 194 | """ 195 | computation and communication use different cores when using oneCCL 196 | backend for distributed training. 
we use first ccl_worker_count cores of 197 | every rank for ccl communication 198 | """ 199 | cpuinfo = CPUinfo() 200 | ppn = args.nproc_per_node 201 | total_cores = cpuinfo.physical_core_nums 202 | if args.use_logical_core: 203 | total_cores = cpuinfo.logical_core_nums 204 | cores_per_rank = total_cores // ppn 205 | affinity = '' 206 | 207 | for proc in range(ppn): 208 | for ccl_worker in range(args.ccl_worker_count): 209 | affinity += str(proc * cores_per_rank + ccl_worker) + "," 210 | os.environ["CCL_WORKER_AFFINITY"] = affinity 211 | 212 | 213 | def add_lib_preload(lib_type=None): 214 | """ 215 | Enable TCMalloc/JeMalloc/iomp 216 | """ 217 | library_paths = [] 218 | 219 | # We export path library through $_LIBRARY_PATH 220 | if f"{lib_type.upper()}_LIBRARY_PATH" in os.environ: 221 | library_paths.append(os.environ[f"{lib_type.upper()}_LIBRARY_PATH"]) 222 | 223 | if "CONDA_PREFIX" in os.environ: 224 | library_paths.append(os.environ["CONDA_PREFIX"] + "/lib/") 225 | 226 | library_paths += [ 227 | f"{expanduser('~')}/.local/lib/", 228 | "/usr/local/lib/", 229 | "/usr/local/lib64/", 230 | "/usr/lib/", 231 | "/usr/lib64/" 232 | ] 233 | 234 | lib_find = False 235 | for lib_path in library_paths: 236 | if not lib_path.endswith("/"): 237 | lib_path += "/" 238 | library_file = lib_path + "lib" + lib_type + ".so" 239 | matches = glob.glob(library_file) 240 | if len(matches) > 0: 241 | if "LD_PRELOAD" in os.environ: 242 | os.environ["LD_PRELOAD"] = matches[0] + ":" + os.environ["LD_PRELOAD"] 243 | else: 244 | os.environ["LD_PRELOAD"] = matches[0] 245 | print(f"{lib_type} found at: {matches}") 246 | lib_find = True 247 | break 248 | return lib_find 249 | 250 | 251 | def set_memory_allocator(args): 252 | if args.enable_tcmalloc and args.enable_jemalloc: 253 | LOGGER.error("Unable to enable TCMalloc and JEMalloc at the same time") 254 | exit(-1) 255 | 256 | if args.enable_tcmalloc: 257 | find_tc = add_lib_preload(lib_type="tcmalloc") 258 | if not find_tc: 259 | LOGGER.warning( 260 | "Unable to find the {} library file lib{}.so in $CONDA_PREFIX/lib or /.local/lib/" 261 | " or /usr/local/lib/ or /usr/local/lib64/ or /usr/lib or /usr/lib64 or " 262 | "~/.local/lib/ so the LD_PRELOAD environment variable will not be set." 263 | .format("TCmalloc", "tcmalloc", expanduser("~")) 264 | ) 265 | args.additional_benchmark_args.append("+malloc=std") 266 | else: 267 | LOGGER.info("Use TCMalloc memory allocator") 268 | args.additional_benchmark_args.append("+malloc=tcmalloc") 269 | 270 | elif args.enable_jemalloc: 271 | find_je = add_lib_preload(lib_type="jemalloc") 272 | if not find_je: 273 | LOGGER.warning( 274 | "Unable to find the {} library file lib{}.so in $CONDA_PREFIX/lib or /.local/lib/" 275 | " or /usr/local/lib/ or /usr/local/lib64/ or /usr/lib or /usr/lib64 or " 276 | "~/.local/lib/ so the LD_PRELOAD environment variable will not be set." 
277 | .format("JeMalloc", "jemalloc", expanduser("~")) 278 | ) 279 | args.additional_benchmark_args.append("+malloc=std") 280 | else: 281 | LOGGER.info("Use JeMalloc memory allocator") 282 | args.additional_benchmark_args.append("+malloc=jemalloc") 283 | if "MALLOC_CONF" not in os.environ: 284 | os.environ["MALLOC_CONF"] = args.malloc_conf 285 | LOGGER.info("MALLOC_CONF={}".format(os.environ["MALLOC_CONF"])) 286 | 287 | elif args.use_default_allocator: 288 | args.additional_benchmark_args.append("+malloc=std") 289 | 290 | else: 291 | find_tc = add_lib_preload(lib_type="tcmalloc") 292 | if find_tc: 293 | LOGGER.info("Use TCMalloc memory allocator") 294 | args.additional_benchmark_args.append("+malloc=tcmalloc") 295 | if "MALLOC_CONF" not in os.environ: 296 | os.environ["MALLOC_CONF"] = args.malloc_conf 297 | LOGGER.info("MALLOC_CONF={}".format(os.environ["MALLOC_CONF"])) 298 | return 299 | 300 | find_je = add_lib_preload(lib_type="jemalloc") 301 | if find_je: 302 | LOGGER.info("Use JeMalloc memory allocator") 303 | args.additional_benchmark_args.append("+malloc=jemalloc") 304 | if "MALLOC_CONF" not in os.environ: 305 | os.environ["MALLOC_CONF"] = args.malloc_conf 306 | LOGGER.info("MALLOC_CONF={}".format(os.environ["MALLOC_CONF"])) 307 | return 308 | 309 | LOGGER.warning( 310 | "Both TCMalloc and JeMalloc are not fount in $CONDA_PREFIX/lib or /.local/lib/" 311 | " or /usr/local/lib/ or /usr/local/lib64/ or /usr/lib or /usr/lib64 or " 312 | "~/.local/lib/ so the LD_PRELOAD environment variable will not be set. " 313 | "This may drop the performance" 314 | .format(expanduser("~")) 315 | ) 316 | args.additional_benchmark_args.append(f"+malloc=std") 317 | 318 | 319 | def set_multi_thread_and_allocator(args): 320 | set_memory_allocator(args) 321 | 322 | if args.enable_thp: 323 | SUDOER_PASSWORD = getpass("Setting Transparent Huge Page requires elevated privileges.\nPassword:") 324 | set_transparent_huge_pages("always", SUDOER_PASSWORD) 325 | 326 | if "THP_STATUS" not in os.environ: 327 | os.environ["THP_STATUS"] = get_transparent_huge_pages() 328 | 329 | if "OMP_NUM_THREADS" not in os.environ: 330 | os.environ["OMP_NUM_THREADS"] = str(args.ncore_per_instance) 331 | elif "OMP_NUM_THREADS" in os.environ: 332 | args.ncore_per_instance = int(os.environ["OMP_NUM_THREADS"]) 333 | 334 | if "OMP_MAX_ACTIVE_LEVELS" not in os.environ: 335 | os.environ["OMP_MAX_ACTIVE_LEVELS"] = str(args.omp_max_active_levels) 336 | else: 337 | args.omp_max_active_levels = int(os.environ["OMP_MAX_ACTIVE_LEVELS"]) 338 | 339 | if "KMP_AFFINITY" not in os.environ: 340 | os.environ["KMP_AFFINITY"] = args.kmp_affinity 341 | 342 | if "KMP_BLOCKTIME" not in os.environ: 343 | os.environ["KMP_BLOCKTIME"] = args.kmp_blocktime 344 | 345 | if "DNNL_PRIMITIVE_CACHE_CAPACITY" not in os.environ: 346 | os.environ["DNNL_PRIMITIVE_CACHE_CAPACITY"] = '1024' 347 | 348 | LOGGER.info(f"OMP_NUM_THREADS={os.environ['OMP_NUM_THREADS']}") 349 | LOGGER.info(f"OMP_MAX_ACTIVE_LEVELS={os.environ['OMP_MAX_ACTIVE_LEVELS']}") 350 | LOGGER.info(f"KMP_AFFINITY={os.environ['KMP_AFFINITY']}") 351 | LOGGER.info(f"KMP_BLOCKTIME={os.environ['KMP_BLOCKTIME']}") 352 | LOGGER.info(f"DNNL_PRIMITIVE_CACHE_CAPACITY={os.environ['DNNL_PRIMITIVE_CACHE_CAPACITY']}") 353 | 354 | omp_backend = "default" 355 | if args.enable_iomp: 356 | find_iomp = add_lib_preload(lib_type="iomp5") 357 | if not find_iomp: 358 | LOGGER.warning("Unable to find the {} library file lib{}.so in $CONDA_PREFIX/lib or /.local/lib/" 359 | " or /usr/local/lib/ or /usr/local/lib64/ or /usr/lib or 
/usr/lib64 or " 360 | "~/.local/lib/ so the LD_PRELOAD environment variable will not be set." 361 | .format("iomp", "iomp", expanduser("~"))) 362 | else: 363 | omp_backend = "iomp" 364 | 365 | # Add any additional arguments for the benchmark script 366 | args.additional_benchmark_args.append(f"backend.num_threads={os.environ['OMP_NUM_THREADS']}") 367 | args.additional_benchmark_args.append(f"+openmp.backend={omp_backend}") 368 | args.additional_benchmark_args.append(f"+openmp.num_threads={os.environ['OMP_NUM_THREADS']}") 369 | args.additional_benchmark_args.append(f"+openmp.max_active_levels={os.environ['OMP_MAX_ACTIVE_LEVELS']}") 370 | args.additional_benchmark_args.append(f'+openmp.affinity="{os.environ["KMP_AFFINITY"]}"') 371 | args.additional_benchmark_args.append(f"+openmp.blocktime={os.environ['KMP_BLOCKTIME']}") 372 | args.additional_benchmark_args.append(f"use_huge_page={os.environ['THP_STATUS']}") 373 | 374 | 375 | def launch(args): 376 | """ 377 | single-instance / multi-instance launcher 378 | """ 379 | cores, processes = [], [] 380 | cpuinfo = CPUinfo() 381 | 382 | if args.core_list: # the user specified which cores to use via --core_list 383 | cores = args.core_list.strip().split(",") 384 | if args.ncore_per_instance == -1: 385 | LOGGER.error("please specify '--ncore_per_instance' when passing the '--core_list' parameter") 386 | exit(-1) 387 | elif args.ninstances > 1 and args.ncore_per_instance * args.ninstances < len(cores): 388 | LOGGER.warning("only the first {} cores will be used, although {} cores were specified in core_list".format 389 | (args.ncore_per_instance * args.ninstances, len(cores))) 390 | else: 391 | args.ninstances = len(cores) // args.ncore_per_instance 392 | else: 393 | if args.use_logical_core: 394 | if args.socket_id != -1: 395 | cores = cpuinfo.get_socket_logical_cores(args.socket_id) 396 | else: 397 | cores = cpuinfo.get_all_logical_cores 398 | else: 399 | if args.socket_id != -1: 400 | cores = cpuinfo.get_socket_physical_cores(args.socket_id) 401 | else: 402 | cores = cpuinfo.get_all_physical_cores 403 | 404 | if not args.multi_instance and args.ninstances == -1 and args.ncore_per_instance == -1: 405 | args.ninstances = 1 406 | args.ncore_per_instance = len(cores) 407 | elif args.multi_instance and args.ninstances == -1 and args.ncore_per_instance == -1: 408 | args.throughput_performance = True 409 | elif args.ncore_per_instance == -1 and args.ninstances != -1: 410 | args.ncore_per_instance = len(cores) // args.ninstances 411 | elif args.ncore_per_instance != -1 and args.ninstances == -1: 412 | args.ninstances = len(cores) // args.ncore_per_instance 413 | else: 414 | if args.ninstances * args.ncore_per_instance > len(cores): 415 | LOGGER.error("Please make sure ninstances * ncore_per_instance <= total_cores") 416 | exit(-1) 417 | 418 | if args.latency_performance: 419 | if args.ncore_per_instance != 4: 420 | LOGGER.warning("latency_performance is a special mode; ncore_per_instance is forced to 4") 421 | args.ncore_per_instance = 4 422 | cores = cpuinfo.get_all_physical_cores 423 | args.ninstances = len(cores) // args.ncore_per_instance 424 | 425 | if args.throughput_performance: 426 | args.ninstances = cpuinfo.socket_nums 427 | cores = cpuinfo.get_all_physical_cores 428 | args.ncore_per_instance = len(cores) // args.ninstances 429 | 430 | os.environ["LAUNCH_CMD"] = "#" 431 | os.environ["LAUNCH_THP"] = get_transparent_huge_pages() 432 | os.environ["EXPERIMENT_ID"] = hexlify(getrandbits(32).to_bytes(4, 'big')).decode('ascii') 433 | 434 | 
set_multi_thread_and_allocator(args) 435 | args.additional_benchmark_args.append(f"num_instances={args.ninstances}") 436 | args.additional_benchmark_args.append(f"num_core_per_instance={args.ncore_per_instance}") 437 | args.additional_benchmark_args.append(f"experiment_id={os.environ['EXPERIMENT_ID']}") 438 | 439 | for i in range(args.ninstances): 440 | cmd, instance_specific_args = [], [] 441 | instance_specific_args.append(f"instance_id={i}") 442 | if not args.disable_numactl: 443 | instance_cores = cores[i * args.ncore_per_instance:(i + 1) * args.ncore_per_instance] 444 | instance_sockets = cpuinfo.get_sockets_for_cores(instance_cores) 445 | 446 | # Convert to numactl string argument 447 | instance_cores_str = ",".join(instance_cores) 448 | instance_sockets_str = ",".join(instance_sockets) 449 | 450 | # Generate numactl call 451 | cmd = ["numactl"] 452 | numa_params = "-C {} ".format(instance_cores_str) 453 | numa_params += "-m {}".format(instance_sockets_str) 454 | cmd.extend(numa_params.split()) 455 | 456 | instance_specific_args.append(f"+numactl.enabled=true") 457 | instance_specific_args.append(f"+numactl.cores=\"{instance_cores_str}\"") 458 | instance_specific_args.append(f"+numactl.membind=\"{instance_sockets_str}\"") 459 | else: 460 | instance_specific_args.append(f"+numactl.enabled=false") 461 | 462 | with_python = not args.no_python 463 | if with_python: 464 | cmd.append(sys.executable) 465 | 466 | if args.module: 467 | cmd.append("-m") 468 | 469 | if "LD_PRELOAD" in os.environ: 470 | instance_specific_args.append("+ld_preload=\"" + os.environ["LD_PRELOAD"] + "\"") 471 | else: 472 | instance_specific_args.append("+ld_preload=\"\"") 473 | 474 | cmd.append(args.program) 475 | cmd.extend(args.program_args) 476 | cmd.extend(args.additional_benchmark_args) 477 | cmd.extend(instance_specific_args) 478 | 479 | os.environ["LAUNCH_CMD"] += " ".join(cmd) + ",#" 480 | 481 | process = subprocess.Popen(cmd, env=os.environ) 482 | processes.append(process) 483 | 484 | os.environ["LAUNCH_CMD"] = os.environ["LAUNCH_CMD"][:-2] 485 | for process in processes: 486 | process.wait() 487 | if process.returncode != 0: 488 | raise subprocess.CalledProcessError(returncode=process.returncode, cmd=cmd) 489 | 490 | if args.enable_thp: 491 | # reset to existing val 492 | set_transparent_huge_pages(os.environ["LAUNCH_THP"], SUDOER_PASSWORD) 493 | 494 | print(f"Experiment results saved at: {os.path.join('outputs', os.environ['EXPERIMENT_ID'])}") 495 | 496 | 497 | def mpi_dist_launch(args): 498 | """ 499 | Set ENVs and launch MPI process for distributed training. 
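    When nnodes > 1, the hostfile is first validated (IPv4 entries, password-less SSH
    reachability from the master node) and its first entry becomes MASTER_ADDR. The function
    then exports MASTER_ADDR/MASTER_PORT, I_MPI_PIN_DOMAIN, OMP_NUM_THREADS and the CCL worker
    count/affinity/transport before building and running an mpiexec.hydra command line, e.g.
    (illustrative values only): mpiexec.hydra -l -np 4 -ppn 2 -genv I_MPI_PIN_DOMAIN=<domain>
    -genv OMP_NUM_THREADS=<n> -hostfile hostfile python -u your_script.py <args>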
500 | """ 501 | if args.nnodes > 1 and not os.path.exists(args.hostfile): 502 | raise ValueError("A hostfile is required for multi-node distributed training. " 503 | "Please create a hostfile listing the IP addresses of the nodes used for the run.") 504 | elif args.nnodes > 1: 505 | ipv4_addr_pattern = r"^(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$" 506 | ip_list = [] 507 | with open(args.hostfile) as f: 508 | for line in f: 509 | line = line.strip().strip("\n") 510 | is_valid = re.match(ipv4_addr_pattern, line) 511 | if not is_valid: 512 | LOGGER.error(f"{line} is not a valid IPv4 address") 513 | exit(-1) 514 | else: 515 | ip_list.append(line) 516 | if len(ip_list) < args.nnodes: 517 | LOGGER.error(f"The number of IPs in the hostfile ({len(ip_list)}) must be at least nnodes ({args.nnodes})") 518 | exit(-1) 519 | master_check = False 520 | dic = psutil.net_if_addrs() 521 | for adapter in dic: 522 | snicList = dic[adapter] 523 | for snic in snicList: 524 | if snic.address == ip_list[0]: 525 | master_check = True 526 | if not master_check: 527 | LOGGER.error( 528 | f"MASTER_ADDR is incorrect. Please make sure the first IP {ip_list[0]} " 529 | f"in your hostfile belongs to the current node" 530 | ) 531 | exit(-1) 532 | 533 | LOGGER.info("Validating SSH connectivity to the other nodes") 534 | args.master_addr = ip_list[0] 535 | for ip in ip_list[1:]: 536 | completed_process = subprocess.run("ssh -o PasswordAuthentication=no {} ':'".format(ip), shell=True) 537 | if completed_process.returncode != 0: 538 | LOGGER.error( 539 | f"Password-less SSH login to {ip} failed, " 540 | f"please make sure your SSH public keys are set up correctly" 541 | ) 542 | exit(-1) 543 | else: 544 | LOGGER.info("connection from master node {} to worker node {} is OK".format(args.master_addr, ip)) 545 | 546 | set_memory_allocator(args) 547 | 548 | # Set distributed-training-related environment variables 549 | os.environ["MASTER_ADDR"] = args.master_addr 550 | os.environ["MASTER_PORT"] = str(args.master_port) 551 | 552 | if "I_MPI_PIN_DOMAIN" not in os.environ: 553 | mpi_pin_domain = set_mpi_pin_domain(args) 554 | else: 555 | mpi_pin_domain = os.environ["I_MPI_PIN_DOMAIN"] 556 | 557 | cpuinfo = CPUinfo() 558 | ppn = args.nproc_per_node 559 | total_cores = len(cpuinfo.get_all_physical_cores) 560 | cores_per_rank = total_cores // ppn 561 | 562 | if "OMP_NUM_THREADS" not in os.environ: 563 | omp_num_threads = cores_per_rank - args.ccl_worker_count 564 | else: 565 | omp_num_threads = os.environ["OMP_NUM_THREADS"] 566 | 567 | os.environ["CCL_WORKER_COUNT"] = str(args.ccl_worker_count) 568 | 569 | if "CCL_WORKER_AFFINITY" not in os.environ: 570 | set_ccl_worker_affinity(args) 571 | 572 | if "CCL_ATL_TRANSPORT" not in os.environ: 573 | os.environ["CCL_ATL_TRANSPORT"] = "ofi" 574 | 575 | if args.enable_iomp: 576 | find_iomp = add_lib_preload(lib_type="iomp5") 577 | if not find_iomp: 578 | LOGGER.warning("Unable to find the {} library file lib{}.so in $CONDA_PREFIX/lib or /.local/lib/" 579 | " or /usr/local/lib/ or /usr/local/lib64/ or /usr/lib or /usr/lib64 or " 580 | "~/.local/lib/ so the LD_PRELOAD environment variable will not be set."
581 | .format("iomp", "iomp", expanduser("~"))) 582 | else: 583 | LOGGER.info("Enabled iomp via LD_PRELOAD") 584 | 585 | LOGGER.info("MASTER_ADDR={}".format(args.master_addr)) 586 | LOGGER.info("MASTER_PORT={}".format(args.master_port)) 587 | LOGGER.info("I_MPI_PIN_DOMAIN={}".format(mpi_pin_domain)) 588 | LOGGER.info("OMP_NUM_THREADS={} ".format(omp_num_threads)) 589 | LOGGER.info("CCL_WORKER_COUNT={}".format(args.ccl_worker_count)) 590 | LOGGER.info("CCL_WORKER_AFFINITY={}".format(os.environ["CCL_WORKER_AFFINITY"])) 591 | 592 | os.environ["LAUNCH_CMD"] = "#" 593 | cmd = ['mpiexec.hydra'] 594 | mpi_config = "-l -np {} -ppn {} -genv I_MPI_PIN_DOMAIN={} -genv OMP_NUM_THREADS={} ".format( 595 | args.nnodes*args.nproc_per_node, args.nproc_per_node, mpi_pin_domain, omp_num_threads 596 | ) 597 | mpi_config += args.more_mpi_parms 598 | 599 | if args.nnodes > 1: 600 | mpi_config += " -hostfile {}".format(args.hostfile) 601 | cmd.extend(mpi_config.split()) 602 | with_python = not args.no_python 603 | 604 | if with_python: 605 | cmd.append(sys.executable) 606 | cmd.append("-u") 607 | 608 | if args.module: 609 | cmd.append("-m") 610 | 611 | cmd.append(args.program) 612 | cmd.extend(args.program_args) 613 | process = subprocess.Popen(cmd, env=os.environ) 614 | process.wait() 615 | os.environ["LAUNCH_CMD"] += " ".join(cmd) + ",#" 616 | os.environ["LAUNCH_CMD"] = os.environ["LAUNCH_CMD"][:-2] 617 | 618 | 619 | def add_distributed_training_params(parser): 620 | 621 | cpuinfo = CPUinfo() 622 | socket_nums = cpuinfo.socket_nums 623 | 624 | group = parser.add_argument_group("Distributed Training Parameters With oneCCL backend") 625 | group.add_argument("--nnodes", metavar='\b', type=int, default=1, 626 | help="The number of nodes to use for distributed " 627 | "training") 628 | group.add_argument("--nproc_per_node", metavar='\b', type=int, default=socket_nums, 629 | help="The number of processes to launch on each node") 630 | 631 | # ccl control 632 | group.add_argument("--ccl_worker_count", metavar='\b', default=4, type=int, 633 | help="Number of cores per rank reserved for CCL communication") 634 | 635 | # mpi control 636 | group.add_argument("--master_addr", metavar='\b', default="127.0.0.1", type=str, 637 | help="Master node (rank 0)'s address; it should be either " 638 | "the IP address or the hostname of node 0. For " 639 | "single-node multi-process training, " 640 | "--master_addr can simply be 127.0.0.1") 641 | group.add_argument("--master_port", metavar='\b', default=29500, type=int, 642 | help="Master node (rank 0)'s free port that needs to " 643 | "be used for communication during distributed " 644 | "training") 645 | group.add_argument("--hostfile", metavar='\b', default="hostfile", type=str, 646 | help="A hostfile is required for multi-node multi-process " 647 | "training. 
It lists the node addresses, " 648 | "one per line, given either as an IP address " 649 | "or a hostname.") 650 | group.add_argument("--more_mpi_parms", metavar='\b', default="", type=str, 651 | help="Additional parameters to pass to mpiexec.hydra, " 652 | "except for -np, -ppn, -hostfile and -genv I_MPI_PIN_DOMAIN") 653 | 654 | 655 | def add_memory_allocator_params(parser): 656 | 657 | group = parser.add_argument_group("Memory Allocator Parameters") 658 | 659 | # allocator control 660 | group.add_argument("--enable_tcmalloc", action='store_true', default=False, 661 | help="Enable the TCMalloc allocator") 662 | group.add_argument("--enable_jemalloc", action='store_true', default=False, 663 | help="Enable the JeMalloc allocator") 664 | group.add_argument("--use_default_allocator", action='store_true', default=False, 665 | help="Use the default memory allocator") 666 | group.add_argument("--malloc_conf", metavar='\b', default="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000", type=str, 667 | help="MALLOC_CONF setup, for JeMalloc only; the environment variable has higher priority than this argument. " 668 | "Default value: oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000") 669 | 670 | # transparent huge pages 671 | group.add_argument("--enable_thp", action="store_true", default=False, help="Enable Transparent Huge Pages") 672 | 673 | 674 | def add_multi_instance_params(parser): 675 | 676 | group = parser.add_argument_group("Multi-instance Parameters") 677 | 678 | # multi-instance control 679 | group.add_argument("--ncore_per_instance", metavar='\b', default=-1, type=int, 680 | help="Number of cores used by each instance") 681 | group.add_argument("--ninstances", metavar='\b', default=-1, type=int, 682 | help="Number of instances to run in multi-instance mode") 683 | group.add_argument("--latency_performance", action='store_true', default=False, 684 | help="By default 4 cores per instance, using all physical cores") 685 | group.add_argument("--throughput_performance", action='store_true', default=False, 686 | help="By default one instance per socket, using all physical cores") 687 | group.add_argument("--socket_id", metavar='\b', default=-1, type=int, 688 | help="Socket id for multi-instance; by default all sockets will be used") 689 | group.add_argument("--use_logical_core", action='store_true', default=False, 690 | help="Use logical cores (including hyper-threads) instead of only physical cores") 691 | group.add_argument("--disable_numactl", action='store_true', default=False, 692 | help="Disable numactl") 693 | group.add_argument("--core_list", metavar='\b', default=None, type=str, 694 | help="Specify the core list as 'core_id,core_id,...'; otherwise, all the cores will be used.") 695 | 696 | 697 | def add_kmp_iomp_params(parser): 698 | 699 | group = parser.add_argument_group("KMP/IOMP Affinity Parameters") 700 | group.add_argument("--kmp_affinity", metavar='\b', default="granularity=fine,compact,1,0", type=str, 701 | help="KMP_AFFINITY setup; the environment variable has higher priority than this argument. " 702 | "Default value: granularity=fine,compact,1,0") 703 | group.add_argument("--kmp_blocktime", metavar='\b', default="1", type=str, 704 | help="KMP_BLOCKTIME setup; the environment variable has higher priority than this argument. "
705 | "Default value: 1") 706 | group.add_argument("--omp_max_active_levels", type=int, default=1, help="Set the OMP_MAX_ACTIVE_LEVELS environment variable.") 707 | group.add_argument("--enable_iomp", action='store_true', default=False, 708 | help="Enable iomp; libiomp5.so will be added to LD_PRELOAD") 709 | 710 | 711 | def parse_system_info(args): 712 | from platform import libc_ver, uname 713 | 714 | uname_info = uname() 715 | args.additional_benchmark_args.append(f"+system.name={uname_info.system}") 716 | args.additional_benchmark_args.append(f"+system.arch={uname_info.machine}") 717 | args.additional_benchmark_args.append(f"+system.kernel={uname_info.release}") 718 | args.additional_benchmark_args.append(f"+system.libc={libc_ver()[-1]}") 719 | 720 | 721 | def parse_args(): 722 | """ 723 | Helper function parsing the command line options 724 | @retval Namespace with the parsed arguments 725 | """ 726 | parser = ArgumentParser(description="This is a script for launching PyTorch training and inference on Intel Xeon CPU " 727 | "with optimal configurations. Single-instance inference/training, multi-instance " 728 | "inference/training and distributed training with the oneCCL backend are enabled. " 729 | "To get the peak performance on Intel Xeon CPU, the script optimizes the configuration " 730 | "of thread and memory management. For thread management, the script configures thread " 731 | "affinity and the preloading of the Intel OpenMP library. For memory management, it configures " 732 | "NUMA binding and preloads an optimized memory allocation library (e.g. tcmalloc, jemalloc). " 733 | "\n################################# Basic usage ############################# \n" 734 | "\n 1. single instance\n" 735 | "\n >>> python -m intel_pytorch_extension.launch python_script args \n" 736 | "\n2. multi-instance \n" 737 | "\n >>> python -m intel_pytorch_extension.launch --multi_instance python_script args\n" 738 | "\n3. Single-Node multi-process distributed training\n" 739 | "\n >>> python -m intel_pytorch_extension.launch --distributed python_script args\n" 740 | "\n4. Multi-Node multi-process distributed training: (e.g. two nodes)\n" 741 | "\n rank 0: *(IP: 192.168.10.10, and has a free port: 29500)*\n" 742 | "\n >>> python -m intel_pytorch_extension.launch --distributed --nproc_per_node=2\n" 743 | "\n --nnodes=2 --hostfile hostfile python_script args\n", 744 | formatter_class=RawTextHelpFormatter) 745 | 746 | parser.add_argument("--multi_instance", action='store_true', default=False, 747 | help="Enable multi-instance; by default one instance per socket") 748 | 749 | parser.add_argument('--distributed', action='store_true', default=False, 750 | help='Enable distributed training.') 751 | parser.add_argument("-m", "--module", default=False, action="store_true", 752 | help="Changes each process to interpret the launch script " 753 | "as a python module, executing with the same behavior as " 754 | "'python -m'.") 755 | 756 | parser.add_argument("--no_python", default=False, action="store_true", 757 | help="Do not prepend the --program script with \"python\" - just exec " 758 | "it directly. Useful when the script is not a Python script.") 759 | 760 | add_memory_allocator_params(parser) 761 | add_kmp_iomp_params(parser) 762 | 763 | add_distributed_training_params(parser) 764 | add_multi_instance_params(parser) 765 | 766 | # positional 767 | parser.add_argument("program", type=str, 768 | help="The full path to the program/script to be launched, 
" 769 | "followed by all the arguments for the script") 770 | 771 | # remaining arguments are forwarded to the launched program 772 | parser.add_argument('program_args', nargs=REMAINDER) 773 | return parser.parse_args() 774 | 775 | 776 | def main(): 777 | 778 | env_before = set(os.environ.keys()) 779 | if platform.system() == "Windows": 780 | raise RuntimeError("Windows platform is not supported") 781 | 782 | args = parse_args() 783 | args.additional_benchmark_args = [] 784 | 785 | parse_system_info(args) 786 | 787 | if args.distributed and args.multi_instance: 788 | raise RuntimeError("--distributed and --multi_instance cannot both be set") 789 | 790 | if args.latency_performance and args.throughput_performance: 791 | raise RuntimeError("--latency_performance and --throughput_performance cannot both be set") 792 | 793 | if args.nnodes > 1: 794 | args.distributed = True 795 | 796 | if args.distributed: 797 | mpi_dist_launch(args) 798 | else: 799 | launch(args) 800 | 801 | for x in sorted(set(os.environ.keys()) - env_before): 802 | LOGGER.debug(f'{x}={os.environ[x]}') 803 | 804 | 805 | if __name__ == "__main__": 806 | main() 807 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | omegaconf>=2.1.0dev20 2 | hydra-core>=1.1.0.dev5 3 | torch 4 | tensorflow 5 | onnxruntime 6 | psutil 7 | pandas 8 | rich 9 | transformers 10 | multiprocess 11 | sympy 12 | -------------------------------------------------------------------------------- /src/backends/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Hugging Face Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from abc import ABC, abstractmethod 16 | from dataclasses import dataclass 17 | from logging import getLogger 18 | from typing import Generic, TypeVar, ClassVar, List, Optional, Set, Tuple 19 | 20 | import numpy as np 21 | from hydra.types import TargetConf 22 | from omegaconf import MISSING 23 | from psutil import cpu_count 24 | from transformers import AutoTokenizer 25 | 26 | from benchmark import Benchmark 27 | 28 | LOGGER = getLogger("backends") 29 | 30 | 31 | @dataclass 32 | class BackendConfig(TargetConf): 33 | name: str = MISSING 34 | version: str = MISSING 35 | num_threads: Optional[int] = None 36 | num_interops_threads: Optional[int] = None 37 | 38 | @staticmethod 39 | @abstractmethod 40 | def version(): 41 | raise NotImplementedError() 42 | 43 | @staticmethod 44 | def supported_keys() -> Set[str]: 45 | return {"name", "version", "num_threads", "num_interops_threads"} 46 | 47 | 48 | BackendConfigT = TypeVar("BackendConfigT", bound=BackendConfig) 49 | class Backend(Generic[BackendConfigT], ABC): 50 | NAME: ClassVar[str] 51 | 52 | def __init__(self, model: str): 53 | self.model = model 54 | self.tokenizer = AutoTokenizer.from_pretrained(model) 55 | 56 | @classmethod 57 | @abstractmethod 58 | def allocate(cls, config: 'BenchmarkConfig'): 59 | raise NotImplementedError() 60 | 61 | def configure(self, config: BackendConfigT): 62 | if config.num_interops_threads is not None: 63 | if config.num_interops_threads == -1: 64 | config.num_interops_threads = cpu_count() 65 | 66 | if config.num_threads is not None: 67 | if config.num_threads == -1: 68 | config.num_threads = cpu_count() 69 | 70 | @abstractmethod 71 | def execute(self, config: 'BenchmarkConfig', is_reference: bool = False) -> Tuple[Benchmark, np.ndarray]: 72 | raise NotImplementedError() 73 | 74 | def clean(self, config: 'BenchmarkConfig'): 75 | pass 76 | 77 | def _get_dummy_token(self) -> str: 78 | if self.tokenizer.unk_token is not None: 79 | return self.tokenizer.unk_token 80 | else: 81 | return self.tokenizer.convert_tokens_to_string([1]) 82 | 83 | def _get_dummy_inputs(self, batch_size: int, seq_len: int) -> List[List[str]]: 84 | return [[self._get_dummy_token()] * seq_len] * batch_size 85 | 86 | -------------------------------------------------------------------------------- /src/backends/ort.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Hugging Face Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
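#
# ONNX Runtime backend: the Hugging Face model is first exported to an ONNX graph
# (transformers.convert_graph_to_onnx), optionally optimized with the onnxruntime
# transformers optimizer, and then executed through an InferenceSession whose
# SessionOptions (intra/inter op threads, execution mode, graph optimization level)
# are filled from the Hydra backend config defined below.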
14 | 15 | from dataclasses import dataclass 16 | from logging import getLogger 17 | from os import getpid 18 | from pathlib import Path 19 | from typing import Set, Optional, Tuple 20 | 21 | import numpy as np 22 | from onnxruntime import InferenceSession, SessionOptions, GraphOptimizationLevel, ExecutionMode, __version__ as ort_version 23 | from onnxruntime.transformers.optimizer import optimize_model 24 | from tqdm import trange 25 | from transformers import TensorType 26 | from transformers.convert_graph_to_onnx import convert as onnx_convert 27 | 28 | from backends import BackendConfig, Backend 29 | from benchmark import Benchmark 30 | from utils import SEC_TO_NS_SCALE 31 | 32 | 33 | ALL_GRAPH_OPTIMIZATION_LEVELS = { 34 | GraphOptimizationLevel.ORT_ENABLE_ALL, 35 | GraphOptimizationLevel.ORT_ENABLE_EXTENDED, 36 | GraphOptimizationLevel.ORT_ENABLE_BASIC, 37 | GraphOptimizationLevel.ORT_DISABLE_ALL 38 | } 39 | ALL_GRAPH_OPTIMIZATION_LEVELS_FROM_STR = { 40 | level.name: level 41 | for level in ALL_GRAPH_OPTIMIZATION_LEVELS 42 | } 43 | 44 | ALL_EXECUTION_MODE = { 45 | ExecutionMode.ORT_PARALLEL, 46 | ExecutionMode.ORT_SEQUENTIAL 47 | } 48 | 49 | ALL_EXECUTION_MODE_FROM_STR = { 50 | level.name: level 51 | for level in ALL_EXECUTION_MODE 52 | } 53 | 54 | 55 | @dataclass 56 | class OnnxRuntimeConfig(BackendConfig): 57 | name: str = "onnxruntime" 58 | opset: int = 12 59 | graph_optimisation_level: str = "ORT_ENABLE_ALL" 60 | execution_mode: str = "ORT_PARALLEL" 61 | 62 | @staticmethod 63 | def version() -> str: 64 | return ort_version 65 | 66 | @staticmethod 67 | def supported_keys() -> Set[str]: 68 | return BackendConfig.supported_keys().union({"opset", "graph_optimisation_level", "execution_mode"}) 69 | 70 | 71 | BACKEND_NAME = "onnxruntime" 72 | LOGGER = getLogger(BACKEND_NAME) 73 | ONNX_GRAPHS_FOLDER = "onnx_graphs" 74 | 75 | 76 | class OnnxRuntimeBackend(Backend[OnnxRuntimeConfig]): 77 | 78 | def __init__(self, model: str, onnx_path: str): 79 | super().__init__(model) 80 | 81 | self.onnx_path = onnx_path 82 | self.optimized_onnx_graph = None 83 | self.session_opts = SessionOptions() 84 | 85 | @staticmethod 86 | def convert(model: str, output: Path, opset: int = 12) -> Path: 87 | if output.exists(): 88 | return output 89 | 90 | onnx_convert("pt", model, output, opset=opset) 91 | 92 | @classmethod 93 | def allocate(cls, config: 'BenchmarkConfig'): 94 | onnx_model_path = Path(f"{ONNX_GRAPHS_FOLDER}/{config.model}.onnx.{getpid()}") 95 | OnnxRuntimeBackend.convert(config.model, onnx_model_path, config.backend.opset) 96 | 97 | backend = OnnxRuntimeBackend(config.model, onnx_model_path.absolute().as_posix()) 98 | backend.configure(config.backend) 99 | return backend 100 | 101 | def configure(self, config: OnnxRuntimeConfig): 102 | assert config.graph_optimisation_level in ALL_GRAPH_OPTIMIZATION_LEVELS_FROM_STR, f"Unknown {config.graph_optimisation_level}" 103 | assert config.execution_mode in ALL_EXECUTION_MODE_FROM_STR, f"Unknown {config.execution_mode}" 104 | 105 | super().configure(config) 106 | 107 | LOGGER.info("Configuring ONNX Runtime Benchmark:") 108 | 109 | self.session_opts.execution_mode = ALL_EXECUTION_MODE_FROM_STR[config.execution_mode] 110 | LOGGER.info(f"\t- Setting Execution Mode: {self.session_opts.execution_mode}") 111 | 112 | self.session_opts.graph_optimization_level = ALL_GRAPH_OPTIMIZATION_LEVELS_FROM_STR[config.graph_optimisation_level] 113 | LOGGER.info(f"\t- Setting Graph Optimization Level: {self.session_opts.graph_optimization_level}") 114 | 115 | if 
config.num_threads is not None: 116 | if self.session_opts.intra_op_num_threads != config.num_threads: 117 | self.session_opts.intra_op_num_threads = config.num_threads 118 | 119 | LOGGER.info(f"\t- Setting intra_op_num_threads({self.session_opts.intra_op_num_threads})") 120 | 121 | if config.num_interops_threads is not None: 122 | if self.session_opts.inter_op_num_threads != config.num_interops_threads: 123 | self.session_opts.inter_op_num_threads = config.num_interops_threads 124 | 125 | LOGGER.info(f"\t- Setting inter_op_num_threads({self.session_opts.inter_op_num_threads})") 126 | 127 | def execute(self, config: 'BenchmarkConfig', is_reference: bool = False) -> Tuple[Benchmark, np.ndarray]: 128 | benchmark = Benchmark() 129 | 130 | try: 131 | model_opt_path = Path(self.onnx_path) 132 | opt_onnx_path = model_opt_path.with_suffix(".opt" + model_opt_path.suffix) 133 | 134 | model_opt = optimize_model( 135 | self.onnx_path, 136 | model_type="bert", 137 | opt_level=int(self.session_opts.graph_optimization_level) 138 | ) 139 | model_opt.save_model_to_file(opt_onnx_path.absolute().as_posix()) 140 | self.optimized_onnx_graph = opt_onnx_path.absolute().as_posix() 141 | except Exception as e: 142 | LOGGER.error(f"Unable to optimize ONNX BERT model: {e}") 143 | 144 | session = InferenceSession(self.optimized_onnx_graph or self.onnx_path, self.session_opts) 145 | 146 | dummy_inputs = self._get_dummy_inputs( 147 | batch_size=config.batch_size, 148 | seq_len=(config.sequence_length - self.tokenizer.num_special_tokens_to_add(pair=False)) 149 | ) 150 | 151 | inputs = self.tokenizer( 152 | dummy_inputs, 153 | is_split_into_words=True, 154 | return_tensors=TensorType.NUMPY, 155 | ) 156 | inputs = {k: v.astype("i8") for k, v in inputs.items()} 157 | 158 | # Warmup 159 | outputs = [] 160 | for _ in trange(config.warmup_runs, desc="Warming up"): 161 | output = session.run(None, inputs) 162 | outputs.append(output[0]) 163 | 164 | # Let's not run the benchmark for the reference backend, 165 | # as we are more interested in the output tensors. 166 | if not is_reference: 167 | 168 | # Run benchmark 169 | benchmark_duration_ns = config.benchmark_duration * SEC_TO_NS_SCALE 170 | while sum(benchmark.latencies) < benchmark_duration_ns: 171 | with benchmark.track(): 172 | session.run(None, inputs) 173 | 174 | benchmark.finalize(benchmark_duration_ns) 175 | return benchmark, np.stack(outputs) 176 | 177 | def clean(self, config: 'BenchmarkConfig'): 178 | onnx_path = Path(ONNX_GRAPHS_FOLDER) 179 | 180 | if onnx_path.exists(): 181 | for file in onnx_path.iterdir(): 182 | LOGGER.debug(f"Cleaning ONNX model: {file}") 183 | file.unlink() 184 | 185 | # if Path(onnx_path).exists(): 186 | # # Care for external data format (multiple file) if exporting bigger model 187 | # LOGGER.debug(f"Cleaning ONNX model: {self.onnx_path}") 188 | # onnx_path.unlink() 189 | # 190 | # if self.optimized_onnx_graph is not None and Path(self.optimized_onnx_graph).exists(): 191 | # LOGGER.debug(f"Cleaning optimized ONNX model: {self.optimized_onnx_graph}") 192 | # Path(self.optimized_onnx_graph).unlink() 193 | -------------------------------------------------------------------------------- /src/backends/pytorch.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Hugging Face Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from collections import OrderedDict 16 | from contextlib import contextmanager 17 | from dataclasses import dataclass 18 | from logging import getLogger 19 | from typing import Set, Optional, Tuple 20 | 21 | import numpy as np 22 | import torch 23 | from tqdm import trange 24 | from transformers import AutoModel, TensorType 25 | 26 | from backends import Backend, BackendConfig 27 | from benchmark import Benchmark 28 | from config import BenchmarkConfig 29 | from utils import SEC_TO_NS_SCALE 30 | 31 | 32 | BACKEND_NAME = "pytorch" 33 | LOGGER = getLogger(BACKEND_NAME) 34 | 35 | 36 | class CUDABenchmark(Benchmark): 37 | def __init__(self): 38 | super().__init__() 39 | 40 | if not torch.cuda.is_available(): 41 | raise RuntimeError("CUDA is not available") 42 | 43 | @contextmanager 44 | def track(self): 45 | start_event = torch.cuda.Event(enable_timing=True) 46 | end_event = torch.cuda.Event(enable_timing=True) 47 | 48 | start_event.record() 49 | yield 50 | 51 | end_event.record() 52 | torch.cuda.synchronize() # Wait for the events to be recorded! 53 | 54 | # Get timing events 55 | latency_ms = start_event.elapsed_time(end_event) 56 | 57 | # Convert to nanoseconds to match Benchmark.track() 58 | latency_ns = latency_ms * 1_000_000 59 | 60 | # Append the time to the buffer 61 | self.latencies.append(latency_ns) 62 | 63 | LOGGER.debug(f"Tracked function took: {latency_ns}ns ({latency_ms:.3f}ms)") 64 | 65 | 66 | @dataclass 67 | class PyTorchConfig(BackendConfig): 68 | name: str = "pytorch" 69 | use_torchscript: bool = False 70 | use_tf32: bool = False 71 | 72 | @staticmethod 73 | def version() -> str: 74 | return torch.__version__ 75 | 76 | @staticmethod 77 | def supported_keys() -> Set[str]: 78 | return BackendConfig.supported_keys().union({"use_torchscript", "use_tf32"}) 79 | 80 | 81 | class PyTorchBackend(Backend[PyTorchConfig]): 82 | NAME = BACKEND_NAME 83 | 84 | def __init__(self, model: str): 85 | super().__init__(model) 86 | self.model = AutoModel.from_pretrained(model) 87 | 88 | LOGGER.info(f"Allocated PyTorch Backend for model: {model}") 89 | 90 | @classmethod 91 | def allocate(cls, config: BenchmarkConfig): 92 | backend = cls(config.model) 93 | backend.configure(config.backend) 94 | 95 | return backend 96 | 97 | def configure(self, config: PyTorchConfig): 98 | super().configure(config) 99 | 100 | LOGGER.info("Configuring PyTorch Benchmark:") 101 | 102 | # Disable gradients 103 | torch.set_grad_enabled(False) 104 | LOGGER.info("\t+ Disabled gradients") 105 | 106 | # Tune Nvidia's TF32 support 107 | if torch.has_cuda and torch.cuda.is_available(): 108 | if hasattr(torch.backends.cuda, "matmul") and hasattr(torch.backends.cuda.matmul, "allow_tf32"): 109 | torch.backends.cuda.matmul.allow_tf32 = config.use_tf32 110 | LOGGER.info(f"\t+ CUDA allows Nvidia's TF32: { torch.backends.cuda.matmul.allow_tf32 }") 111 | 112 | if torch.has_cudnn and torch.backends.cudnn.is_available(): 113 | 114 | if hasattr(torch.backends.cudnn, "allow_tf32"): 115 | # The flag below controls whether to allow TF32 on cuDNN. 
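                    # TF32 trades mantissa precision for higher matmul/conv throughput on
                    # Ampere-class (and newer) GPUs, so it is only enabled when the benchmark
                    # config explicitly requests it.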
116 | torch.backends.cudnn.allow_tf32 = config.use_tf32 117 | LOGGER.info(f"\t+ CuDNN allows Nvidia's TF32: { torch.backends.cudnn.allow_tf32 }") 118 | 119 | self.model.eval() 120 | LOGGER.info("\t+ Turning eval mode on Module (model.eval())") 121 | 122 | if config.num_threads is not None: 123 | # if torch.get_num_threads() != config.num_threads: 124 | torch.set_num_threads(config.num_threads) 125 | 126 | LOGGER.info(f"\t+ Number of threads (torch.set_num_threads({config.num_threads}))") 127 | 128 | if config.num_interops_threads is not None: 129 | # TODO: Setting this value multiple times between PyTorch & TorchScript runs raise a C error 130 | 131 | # if torch.get_num_interop_threads() != config.num_interops_threads: 132 | torch.set_num_interop_threads(config.num_interops_threads) 133 | 134 | LOGGER.info( 135 | f"\t+ Number of interop threads (torch.set_num_interop_threads({config.num_interops_threads}))" 136 | ) 137 | 138 | if config.use_torchscript: 139 | self.model.config.return_dict = False 140 | LOGGER.info("\t+ Disabling dictionary output for TorchScript") 141 | 142 | def execute(self, config: BenchmarkConfig, is_reference: bool = False) -> Tuple[Benchmark, np.ndarray]: 143 | if config.backend.use_torchscript: 144 | return self._run_torchscript(config, is_reference) 145 | else: 146 | return self._run_pytorch(config, is_reference) 147 | 148 | def _run_pytorch(self, config: BenchmarkConfig, is_reference: bool) -> Tuple[Benchmark, np.ndarray]: 149 | """ 150 | :return: 151 | """ 152 | LOGGER.info("Running PyTorch Eager benchmark") 153 | benchmark = CUDABenchmark() if config.device == "cuda" else Benchmark() 154 | 155 | dummy_inputs = self._get_dummy_inputs( 156 | batch_size=config.batch_size, 157 | seq_len=(config.sequence_length - self.tokenizer.num_special_tokens_to_add(pair=False)) 158 | ) 159 | 160 | inputs = self.tokenizer( 161 | dummy_inputs, 162 | is_split_into_words=True, 163 | return_tensors=TensorType.PYTORCH, 164 | ) 165 | 166 | inputs = inputs.to(config.device) 167 | self.model = self.model.to(config.device) 168 | 169 | # Warmup 170 | outputs = [] 171 | with torch.cuda.amp.autocast(config.precision == "float16"): 172 | for _ in trange(config.warmup_runs, desc="Warming up"): 173 | output = self.model(**inputs) 174 | outputs.append(output.last_hidden_state.cpu().numpy()) 175 | 176 | # Let's not run the benchmark for the reference backend, 177 | # as we are more interested in the output tensors. 
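            # (The warmup outputs gathered above are still returned, so main.py can compute
            # the absolute drift between this backend and the reference one.)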
178 | if not is_reference: 179 | 180 | # Run benchmark 181 | benchmark_duration_ns = config.benchmark_duration * SEC_TO_NS_SCALE 182 | with torch.cuda.amp.autocast(config.precision == "float16"): 183 | while sum(benchmark.latencies) < benchmark_duration_ns: 184 | with benchmark.track(): 185 | self.model(**inputs) 186 | 187 | benchmark.finalize(benchmark_duration_ns) 188 | 189 | return benchmark, np.stack(outputs) 190 | 191 | def _run_torchscript(self, config: BenchmarkConfig, is_reference: bool) -> Tuple[Benchmark, np.ndarray]: 192 | """ 193 | :return: 194 | """ 195 | LOGGER.info("Running TorchScript benchmark") 196 | benchmark = CUDABenchmark() if config.device == "cuda" else Benchmark() 197 | 198 | dummy_inputs = self._get_dummy_inputs( 199 | batch_size=config.batch_size, 200 | seq_len=(config.sequence_length - self.tokenizer.num_special_tokens_to_add(pair=False)) 201 | ) 202 | 203 | inputs = self.tokenizer( 204 | dummy_inputs, 205 | is_split_into_words=True, 206 | return_tensors=TensorType.PYTORCH, 207 | ) 208 | 209 | inputs.to(config.device) 210 | self.model = self.model.to(config.device) 211 | 212 | # To be sure inputs will be presented with the right prototype 213 | ordered_inputs = OrderedDict({ 214 | "input_ids": inputs.input_ids, 215 | "attention_mask": inputs.attention_mask, 216 | "token_type_ids": inputs.token_type_ids, 217 | }) 218 | 219 | LOGGER.debug("Calling torch JIT on model (optimize=True)") 220 | model_scripted = torch.jit.trace(self.model, tuple(ordered_inputs.values())) 221 | 222 | outputs = [] 223 | with torch.jit.optimized_execution(True): 224 | with torch.cuda.amp.autocast(config.precision == "float16"): 225 | for _ in trange(config.warmup_runs, desc="Warming up"): 226 | output = model_scripted(*ordered_inputs.values()) 227 | outputs.append(output[0].cpu().numpy()) 228 | 229 | # Let's not run the benchmark for the reference backend, 230 | # as we are more interested in the output tensors. 231 | if not is_reference: 232 | 233 | # Run benchmark 234 | benchmark_duration_ns = config.benchmark_duration * SEC_TO_NS_SCALE 235 | with torch.cuda.amp.autocast(config.precision == "float16"): 236 | while sum(benchmark.latencies) < benchmark_duration_ns: 237 | with benchmark.track(): 238 | model_scripted(*ordered_inputs.values()) 239 | 240 | benchmark.finalize(benchmark_duration_ns) 241 | return benchmark, np.stack(outputs) 242 | 243 | -------------------------------------------------------------------------------- /src/backends/tensorflow.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Hugging Face Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
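#
# TensorFlow backend: supports eager execution, tf.function graph mode (optionally
# compiled with XLA), and an optional SavedModel export/reload path. Intra/inter op
# parallelism is configured from the Hydra backend config before any model is allocated.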
14 | 15 | import contextlib 16 | from dataclasses import dataclass 17 | from logging import getLogger 18 | from pathlib import Path 19 | from shutil import rmtree 20 | from typing import Optional, Tuple, Callable, List, Set 21 | 22 | import numpy as np 23 | import tensorflow as tf 24 | from tensorflow.python.keras import Input 25 | from tqdm import trange 26 | from transformers import PreTrainedTokenizer, TFAutoModel, TFPreTrainedModel, TensorType 27 | 28 | from backends import Backend, BackendConfig 29 | from benchmark import Benchmark 30 | from config import BenchmarkConfig 31 | from utils import SEC_TO_NS_SCALE 32 | 33 | BACKEND_NAME = "tensorflow" 34 | 35 | SAVED_MODEL_PATH = "saved_model" 36 | SAVED_MODEL_TUNE_FLAG = "tune" 37 | 38 | LOGGER = getLogger("tensorflow") 39 | 40 | 41 | def get_tf_device(device: str) -> str: 42 | if device == "cuda": 43 | if len(tf.config.experimental.list_physical_devices('GPU')) == 0: 44 | raise ValueError(f"No GPU detected, cannot move data to {device}") 45 | return tf.DeviceSpec(device_type="GPU") 46 | else: 47 | return tf.DeviceSpec(device_type="CPU") 48 | 49 | 50 | def as_saved_model(tokenizer: PreTrainedTokenizer, model: TFPreTrainedModel, inputs: List, saved_model_path: Path, flag: str = SAVED_MODEL_TUNE_FLAG) -> Path: 51 | encodings = tokenizer(inputs, is_split_into_words=True, return_tensors="tf") 52 | 53 | # Generate symbolic trace 54 | tf_inputs = {name: Input((None, ), batch_size=None, dtype=tf.int32, name=name) for name, value in encodings.items()} 55 | tf_outputs = model(tf_inputs) 56 | tf_model = tf.keras.models.Model(inputs=tf_inputs, outputs={"output": tf_outputs[0]}) 57 | 58 | # Saved SavedModel 59 | tf.saved_model.save(tf_model, saved_model_path.as_posix()) 60 | 61 | # Generate a flag file indicating this folder was generated from the tune framework 62 | saved_model_path.joinpath(flag).touch() 63 | return saved_model_path 64 | 65 | 66 | @contextlib.contextmanager 67 | def options(options): 68 | old_opts = tf.config.optimizer.get_experimental_options() 69 | tf.config.optimizer.set_experimental_options(options) 70 | try: 71 | yield 72 | finally: 73 | tf.config.optimizer.set_experimental_options(old_opts) 74 | 75 | 76 | @dataclass 77 | class TensorflowConfig(BackendConfig): 78 | name: str = "tensorflow" 79 | use_xla: bool = False 80 | use_saved_model_format: bool = False 81 | eager_mode: bool = False 82 | experimental_compiler: Optional[bool] = None 83 | 84 | @staticmethod 85 | def version() -> str: 86 | return tf.__version__ 87 | 88 | @staticmethod 89 | def supported_keys() -> Set[str]: 90 | return BackendConfig.supported_keys().union({ 91 | "use_xla", 92 | "eager_mode", 93 | "experimental_compiler", 94 | "use_saved_model_format", 95 | }) 96 | 97 | 98 | class TensorflowBackend(Backend[TensorflowConfig]): 99 | NAME = BACKEND_NAME 100 | 101 | def __init__(self, model: str, local_model_path: str = None): 102 | super().__init__(model) 103 | self.model = model 104 | self.model_info = None # Only used when working with SavedModel 105 | self.local_model_path = local_model_path # Local model path if using pre-exported SavedModel file 106 | 107 | LOGGER.info(f"Allocated TensorFlow Backend for model: {model}") 108 | 109 | @classmethod 110 | def allocate(cls, config: BenchmarkConfig): 111 | # Check if we are using a local SavedModel file 112 | # => (format bert-base-case@/path/to/savedmodel) 113 | if config.backend.use_saved_model_format and "@" in config.model: 114 | model_name, model_path = config.model.split("@") 115 | LOGGER.info(f"Local 
SavedModel format detected: model={model_name}, path={model_path}") 116 | 117 | backend = TensorflowBackend(model_name, model_path) 118 | else: 119 | backend = TensorflowBackend(config.model) 120 | 121 | backend.configure(config.backend) 122 | 123 | return backend 124 | 125 | def clean(self, config: 'BenchmarkConfig'): 126 | saved_model_path = Path(SAVED_MODEL_PATH) 127 | if saved_model_path.exists() and saved_model_path.joinpath(SAVED_MODEL_TUNE_FLAG): 128 | LOGGER.debug(f"Cleaning SavedModel folder at {saved_model_path}") 129 | rmtree(saved_model_path) 130 | # saved_model_path.rmdir() 131 | 132 | def configure(self, config: TensorflowConfig): 133 | super().configure(config) 134 | 135 | LOGGER.info("Configuring TensorFlow Benchmark:") 136 | 137 | if config.num_threads is not None: 138 | if tf.config.threading.get_intra_op_parallelism_threads() != config.num_threads: 139 | tf.config.threading.set_intra_op_parallelism_threads(config.num_threads) 140 | 141 | LOGGER.info( 142 | f"\t+ Number of intra op threads (" 143 | f"tf.config.threading.set_intra_op_parallelism_threads(" 144 | f"{tf.config.threading.get_intra_op_parallelism_threads()}" 145 | f"))" 146 | ) 147 | 148 | if config.num_interops_threads is not None: 149 | if tf.config.threading.get_inter_op_parallelism_threads() != config.num_interops_threads: 150 | tf.config.threading.set_inter_op_parallelism_threads(config.num_interops_threads) 151 | 152 | LOGGER.info( 153 | f"\t+ Number of inter op threads (" 154 | f"tf.config.threading.set_inter_op_parallelism_threads(" 155 | f"{tf.config.threading.get_inter_op_parallelism_threads()}" 156 | f"))" 157 | ) 158 | 159 | # If we need to use the model as SavedModel format 160 | if config.use_saved_model_format: 161 | 162 | # Local model support 163 | if self.local_model_path is None: 164 | LOGGER.info(f"Converting model: {self.model} to SavedModel format") 165 | with options({ 166 | "constant_folding": True, 167 | "shape_optimization": True, 168 | "disable_model_pruning": False, 169 | "arithmetic_optimization": True, 170 | "function_optimization": True 171 | }): 172 | with tf.device("CPU"): 173 | model = TFAutoModel.from_pretrained(self.model) 174 | self.local_model_path = as_saved_model( 175 | tokenizer=self.tokenizer, 176 | model=model, 177 | inputs=self._get_dummy_inputs( 178 | 1, model.config.max_position_embeddings - self.tokenizer.num_special_tokens_to_add() 179 | ), 180 | saved_model_path=Path(SAVED_MODEL_PATH), 181 | flag=SAVED_MODEL_TUNE_FLAG 182 | ) 183 | 184 | LOGGER.debug(f"Converted SavedModel stored at {self.local_model_path}") 185 | 186 | # Load the model 187 | saved_model_path = Path(self.local_model_path) 188 | LOGGER.info(f"Loading SavedModel from {saved_model_path}") 189 | self.model_info = tf.saved_model.load(saved_model_path.as_posix()) 190 | self.model = self.model_info.signatures["serving_default"] 191 | else: 192 | # Postponing model allocation to tune intra/inter ops before executing any other TF related code. 
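                # (TensorFlow locks the intra/inter op thread settings once the runtime has
                # executed any op, so from_pretrained must run after the threading setup above.)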
193 | self.model = TFAutoModel.from_pretrained(self.model) 194 | 195 | def execute(self, config: BenchmarkConfig, is_reference: bool = False) -> Tuple[Benchmark, np.ndarray]: 196 | if config.backend.eager_mode: 197 | return self._run_tf(config, is_reference) 198 | else: 199 | return self._run_tf_graph(config, is_reference) 200 | 201 | def _run_tf(self, config: BenchmarkConfig, is_reference: bool) -> Tuple[Benchmark, np.ndarray]: 202 | LOGGER.info("Running TensorFlow Eager benchmark") 203 | benchmark = Benchmark() 204 | 205 | dummy_inputs = self._get_dummy_inputs( 206 | batch_size=config.batch_size, 207 | seq_len=(config.sequence_length - self.tokenizer.num_special_tokens_to_add(pair=False)) 208 | ) 209 | 210 | with tf.device(get_tf_device(config.device)): 211 | inputs = self.tokenizer( 212 | dummy_inputs, 213 | is_split_into_words=True, 214 | return_tensors=TensorType.TENSORFLOW, 215 | ) 216 | 217 | # Move tf.constants to GPU ... https://github.com/tensorflow/tensorflow/issues/42242#issuecomment-675590057 218 | inputs = {name: tf.identity(t) for name, t in inputs.items()} 219 | 220 | # SavedModel concrete function needs unwrapped arguments ... 221 | # model_f = lambda x: self.model(**x).popitem()[1] \ 222 | # if config.backend.use_saved_model_format else \ 223 | # lambda x: self.model(x).last_hidden_state 224 | def model_f(inputs): 225 | # SavedModel concrete function needs unwrapped arguments ... 226 | if config.backend.use_saved_model_format: 227 | LOGGER.info("Please note that saved model format will enable graph mode test!!") 228 | return self.model(**inputs).popitem()[1] 229 | else: 230 | return self.model(inputs).last_hidden_state 231 | 232 | # Warmup 233 | outputs = [] 234 | for _ in trange(config.warmup_runs, desc="Warming up"): 235 | output = model_f(inputs) 236 | outputs.append(output.numpy()) 237 | 238 | # Let's not run the benchmark for the reference backend, 239 | # as we are more interested in the output tensors. 240 | if not is_reference: 241 | 242 | # Run benchmark 243 | benchmark_duration_ns = config.benchmark_duration * SEC_TO_NS_SCALE 244 | while sum(benchmark.latencies) < benchmark_duration_ns: 245 | with benchmark.track(): 246 | model_f(inputs) 247 | 248 | benchmark.finalize(benchmark_duration_ns) 249 | 250 | return benchmark, np.stack(outputs) 251 | 252 | def _run_tf_graph(self, config: BenchmarkConfig, is_reference: bool) -> Tuple[Benchmark, np.ndarray]: 253 | if not config.backend.use_xla: 254 | LOGGER.info("Running TensorFlow Graph benchmark") 255 | @tf.function 256 | def model_f(inputs): 257 | # SavedModel concrete function needs unwrapped arguments ... 258 | if config.backend.use_saved_model_format: 259 | return self.model(**inputs).popitem()[1] 260 | else: 261 | return self.model(inputs).last_hidden_state 262 | else: 263 | LOGGER.info("Running TensorFlow Graph with XLA benchmark") 264 | @tf.function(jit_compile=True) 265 | def model_f(inputs): 266 | # SavedModel concrete function needs unwrapped arguments ... 
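                # (With jit_compile=True the function is compiled by XLA on its first call,
                # so the warmup loop further below also absorbs the compilation cost.)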
267 | if config.backend.use_saved_model_format: 268 | return self.model(**inputs).popitem()[1] 269 | else: 270 | return self.model(inputs).last_hidden_state 271 | 272 | benchmark = Benchmark() 273 | 274 | dummy_inputs = self._get_dummy_inputs( 275 | batch_size=config.batch_size, 276 | seq_len=(config.sequence_length - self.tokenizer.num_special_tokens_to_add(pair=False)) 277 | ) 278 | 279 | with tf.device(get_tf_device(config.device)): 280 | with options({ 281 | "constant_folding": True, 282 | "shape_optimization": True, 283 | "disable_model_pruning": False, 284 | "arithmetic_optimization": True, 285 | "function_optimization": True 286 | }): 287 | inputs = self.tokenizer( 288 | dummy_inputs, 289 | is_split_into_words=True, 290 | return_tensors=TensorType.TENSORFLOW, 291 | ) 292 | 293 | # Move tf.constants to GPU ... 294 | # https://github.com/tensorflow/tensorflow/issues/42242#issuecomment-675590057 295 | inputs = {name: tf.identity(t) for name, t in inputs.items()} 296 | 297 | # Warmup 298 | outputs = [] 299 | for _ in trange(config.warmup_runs, desc="Warming up"): 300 | output = model_f(inputs) 301 | outputs.append(output.numpy()) 302 | 303 | # Let's not run the benchmark for the reference backend, 304 | # as we are more interested in the output tensors. 305 | if not is_reference: 306 | 307 | # Run benchmark 308 | benchmark_duration_ns = config.benchmark_duration * SEC_TO_NS_SCALE 309 | while sum(benchmark.latencies) < benchmark_duration_ns: 310 | with benchmark.track(): 311 | model_f(inputs) 312 | 313 | benchmark.finalize(benchmark_duration_ns) 314 | return benchmark, np.stack(outputs) 315 | -------------------------------------------------------------------------------- /src/benchmark.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Hugging Face Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
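#
# Latency/throughput bookkeeping shared by all backends: Benchmark.track() measures each
# forward pass with perf_counter_ns(), and finalize() derives throughput as
# round(num_runs / duration_ns * SEC_TO_NS_SCALE, 2). As a rough worked example, assuming
# SEC_TO_NS_SCALE is 1e9, 500 tracked runs against a 20 s budget (2e10 ns) yield a reported
# throughput of 25.0 inferences/s.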
14 | import numpy as np 15 | 16 | from contextlib import contextmanager 17 | from dataclasses import dataclass, field 18 | from logging import getLogger 19 | from time import perf_counter_ns 20 | from typing import List 21 | 22 | from pandas import DataFrame 23 | 24 | from utils import SEC_TO_NS_SCALE 25 | 26 | LOGGER = getLogger("benchmark") 27 | 28 | 29 | @dataclass 30 | class Benchmark: 31 | outputs_diff: List[np.ndarray] = None 32 | latencies: List[float] = field(default_factory=list) 33 | throughput: float = float("-inf") 34 | 35 | @property 36 | def num_runs(self) -> int: 37 | return len(self.latencies) 38 | 39 | @staticmethod 40 | def merge(benchmarks: List['Benchmark']) -> 'Benchmark': 41 | latencies, throughputs = [], [] 42 | for b in benchmarks: 43 | 44 | assert len(b.latencies) > 0, "Empty benchmark (0 latency measurements recorded)" 45 | assert b.throughput > 0., f"Benchmark has not been finalized, throughput < 0 ({b.throughput})" 46 | 47 | latencies += b.latencies 48 | throughputs.append(b.throughput) 49 | 50 | # Return all the latencies measured and the mean throughput over all instances 51 | return Benchmark( 52 | latencies, 53 | sum(throughputs) / len(throughputs) 54 | ) 55 | 56 | @contextmanager 57 | def track(self): 58 | start = perf_counter_ns() 59 | yield 60 | end = perf_counter_ns() 61 | 62 | # Append the time to the buffer 63 | self.latencies.append(end - start) 64 | 65 | LOGGER.debug(f"Tracked function took: {(end - start)}ns ({(end - start) / 1e6:.3f}ms)") 66 | 67 | def record_outputs(self, output: np.ndarray, reference: np.ndarray): 68 | self.outputs_diff = np.abs(reference - output) 69 | 70 | def finalize(self, duration_ns: int): 71 | self.throughput = round((len(self.latencies) / duration_ns) * SEC_TO_NS_SCALE, 2) 72 | 73 | def to_pandas(self) -> DataFrame: 74 | # Compute stats 75 | benchmarks_stats = { 76 | "nb_forwards": len(self.latencies), 77 | "throughput": self.throughput, 78 | "latency_mean": np.mean(self.latencies), 79 | "latency_std": np.std(self.latencies), 80 | "latency_50": np.quantile(self.latencies, 0.5), 81 | "latency_90": np.quantile(self.latencies, 0.9), 82 | "latency_95": np.quantile(self.latencies, 0.95), 83 | "latency_99": np.quantile(self.latencies, 0.99), 84 | "latency_999": np.quantile(self.latencies, 0.999), 85 | } 86 | 87 | if self.outputs_diff is not None: 88 | benchmarks_stats["drift_mean"] = np.mean(self.outputs_diff) 89 | benchmarks_stats["drift_std"] = np.std(self.outputs_diff) 90 | 91 | return DataFrame.from_dict(benchmarks_stats, orient="index").transpose() 92 | -------------------------------------------------------------------------------- /src/config.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Hugging Face Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
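#
# BenchmarkConfig is the top-level Hydra schema: most fields are MISSING and are resolved
# from configs/benchmark.yaml together with the overrides appended by launcher.py
# (num_instances, num_core_per_instance, experiment_id, instance-specific settings, ...).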
14 | from binascii import hexlify 15 | from dataclasses import dataclass 16 | from logging import getLogger 17 | from random import getrandbits 18 | 19 | from typing import Dict, Optional 20 | 21 | from omegaconf import MISSING 22 | from transformers import __version__ as transformers_version 23 | 24 | from backends import BackendConfig 25 | 26 | 27 | LOGGER = getLogger("benchmark") 28 | 29 | 30 | @dataclass() 31 | class BenchmarkConfig: 32 | # Python interpreter version 33 | python_version: str = MISSING 34 | 35 | # Store the transformers version used during the benchmark 36 | transformers_version: str = transformers_version 37 | 38 | # Number of forward pass to run before recording any performance counters. 39 | warmup_runs: int = MISSING 40 | 41 | # Duration in seconds the benchmark will collect performance counters 42 | benchmark_duration: int = MISSING 43 | 44 | # The backend to use for recording timing (pytorch, torchscript, tensorflow, xla, onnxruntime) 45 | backend: BackendConfig = MISSING 46 | 47 | # Name of the model used for the benchmark 48 | model: str = MISSING 49 | 50 | # CPU or CUDA device to run inference on 51 | device: str = MISSING 52 | 53 | # The dtype of the model to run inference with (float32, float16, int8, bfloat16) 54 | precision: str = MISSING 55 | 56 | # Use Transparent Huge Page mechanism to increase CPU cache hit probability 57 | use_huge_page: str = MISSING 58 | 59 | # Number of sample given to the model at each forward 60 | batch_size: int = MISSING 61 | 62 | # The length of the sequence (in tokens) given to the model 63 | sequence_length: int = MISSING 64 | 65 | # Multi instances settings # 66 | num_instances: int = MISSING 67 | 68 | # Number of core per instances 69 | num_core_per_instance: int = MISSING 70 | 71 | # Experiment identifier 72 | experiment_id: str = hexlify(getrandbits(32).to_bytes(4, 'big')).decode('ascii') 73 | 74 | # Experiment name 75 | experiment_name: str = "default" 76 | 77 | # Identifier for the current instance. Allow to create specific instance config folder 78 | instance_id: int = 0 79 | 80 | # Reference backend implementation that will be used to generate reference (output tensors) 81 | reference: Optional[str] = None 82 | -------------------------------------------------------------------------------- /src/main.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Hugging Face Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
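#
# Benchmark entry point: registers the backend configs in Hydra's ConfigStore, allocates the
# requested backend, optionally allocates a reference backend to measure output drift, and
# writes the per-run statistics to results.csv in the Hydra output directory.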
14 | from logging import getLogger 15 | from typing import Type, get_args, Union 16 | 17 | import hydra 18 | import numpy as np 19 | from hydra.core.config_store import ConfigStore 20 | from hydra.experimental import compose 21 | from hydra.utils import get_class 22 | from omegaconf import OmegaConf, DictConfig 23 | 24 | from backends import Backend, BackendConfig 25 | from backends.ort import OnnxRuntimeConfig 26 | from backends.pytorch import PyTorchConfig 27 | from backends.tensorflow import TensorflowConfig 28 | from config import BenchmarkConfig 29 | 30 | 31 | # Register resolvers 32 | OmegaConf.register_new_resolver("pytorch_version", PyTorchConfig.version) 33 | OmegaConf.register_new_resolver("tensorflow_version", TensorflowConfig.version) 34 | OmegaConf.register_new_resolver("ort_version", OnnxRuntimeConfig.version) 35 | 36 | # Register configurations 37 | cs = ConfigStore.instance() 38 | cs.store(name="benchmark", node=BenchmarkConfig) 39 | cs.store(group="backend", name="pytorch_backend", node=PyTorchConfig) 40 | cs.store(group="backend", name="torchscript_backend", node=PyTorchConfig) 41 | cs.store(group="backend", name="tensorflow_backend", node=TensorflowConfig) 42 | cs.store(group="backend", name="tensorflow_graph_backend", node=TensorflowConfig) 43 | cs.store(group="backend", name="ort_backend", node=OnnxRuntimeConfig) 44 | 45 | 46 | LOGGER = getLogger("benchmark") 47 | 48 | 49 | def get_overrided_backend_config(original_config: Union[DictConfig, BackendConfig], override: str) -> DictConfig: 50 | # Copy the initial config and pop the backend 51 | update_config = original_config.copy() 52 | OmegaConf.set_struct(update_config, False) 53 | update_config.pop("backend") 54 | 55 | # Retrieve the original backend factory 56 | backend_factory: Type[Backend] = get_class(original_config.backend._target_) 57 | 58 | # Compose the two configs (reference <- original @backend==config.reference) 59 | reference_config = compose(config_name="benchmark", overrides=[f"backend={override}"]) 60 | reference_config.merge_with(update_config) 61 | reference_backend_factory: Type[Backend] = get_class(reference_config.backend._target_) 62 | 63 | # Retrieve each original & reference BackendConfig instance type 64 | reference_backend_config_type: Type[BackendConfig] = get_args(reference_backend_factory.__orig_bases__[0])[0] 65 | original_backend_config_type: Type[BackendConfig] = get_args(backend_factory.__orig_bases__[0])[0] 66 | 67 | # Filter out to rely only on the common subset of supported config elements 68 | reference_backend_keys = reference_backend_config_type.supported_keys() 69 | original_backend_keys = original_backend_config_type.supported_keys() 70 | 71 | # (A - B) union (A inter B) 72 | overlapping_backend_config_keys = \ 73 | (reference_backend_keys.intersection(original_backend_keys)) - {"name", "_target_", "version"} 74 | 75 | LOGGER.debug(f"Keys to override from original config in the new one: {overlapping_backend_config_keys}") 76 | 77 | # Get a masked configuration copy 78 | original_overlapping_backend_config = OmegaConf.masked_copy( 79 | original_config, 80 | list(overlapping_backend_config_keys) 81 | ) 82 | 83 | # Override the properties 84 | reference_config["backend"].merge_with(original_overlapping_backend_config) 85 | 86 | return reference_config 87 | 88 | 89 | @hydra.main(config_path="../configs", config_name="benchmark") 90 | def run(config: BenchmarkConfig) -> None: 91 | # We need to allocate the reference backend (used to compare backend output against) 92 | if 
config.reference is not None and config.reference != config.backend: 93 | LOGGER.info(f"Using {config.reference} as reference backend") 94 | reference_config = get_overrided_backend_config(config, override=config.reference) 95 | else: 96 | reference_config = None 97 | 98 | # Allocate requested target backend 99 | backend_factory: Type[Backend] = get_class(config.backend._target_) 100 | backend = backend_factory.allocate(config) 101 | 102 | # Run benchmark and reference 103 | benchmark, outputs = backend.execute(config, is_reference=False) 104 | backend.clean(config) 105 | 106 | if reference_config is not None: 107 | reference_backend_factory = get_class(reference_config.backend._target_) 108 | reference_backend = reference_backend_factory.allocate(reference_config) 109 | _, ref_outputs = reference_backend.execute(reference_config, is_reference=True) 110 | 111 | # Record the outputs to compare with the target backend 112 | benchmark.record_outputs(outputs, ref_outputs) 113 | reference_backend.clean(reference_config) 114 | 115 | LOGGER.info( 116 | f"Reference backend ({config.reference}) against target backend ({config.backend.name})" 117 | f" absolute difference:" 118 | f" {np.mean(benchmark.outputs_diff)} (+/- {np.std(benchmark.outputs_diff)})" 119 | f" over {len(benchmark.outputs_diff)} sample(s)" 120 | ) 121 | 122 | # Save the resolved config 123 | OmegaConf.save(config, ".hydra/config.yaml", resolve=True) 124 | 125 | df = benchmark.to_pandas() 126 | df.to_csv("results.csv", index_label="id") 127 | 128 | 129 | if __name__ == '__main__': 130 | run() 131 | -------------------------------------------------------------------------------- /src/reports.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Hugging Face Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
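# --- Illustrative sketch for src/main.py above; not part of the repository. ---
# The key-filtering step of get_overrided_backend_config() reduced to plain set
# operations on dictionaries, so the intent is easier to follow. The backend fields used
# here (num_threads, num_interops_threads) are invented examples, not necessarily the
# real BackendConfig fields; in the real flow these are OmegaConf nodes, not plain dicts.
original_backend = {"name": "pytorch", "_target_": "backends.pytorch.PyTorchBackend",
                    "version": "1.9.0", "num_threads": 8, "num_interops_threads": 1}
reference_backend = {"name": "tensorflow", "_target_": "backends.tensorflow.TensorflowBackend",
                     "version": "2.5.0", "num_threads": -1}

# Keep only the keys both backends support, minus the identity fields that must not be
# copied from the target backend onto the reference backend.
overlapping_keys = (set(reference_backend) & set(original_backend)) - {"name", "_target_", "version"}

# Copy the overlapping values so both backends run with the same tuning knobs
# (here, the same intra-op thread count).
for key in overlapping_keys:
    reference_backend[key] = original_backend[key]

print(reference_backend)  # num_threads is now 8; name/_target_/version are untouched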
14 | 15 | from collections import defaultdict 16 | from pathlib import Path 17 | 18 | import pandas as pd 19 | from argparse import ArgumentParser 20 | 21 | from rich.console import Console 22 | from rich.table import Table 23 | 24 | 25 | def gather_results(folder: Path): 26 | # List all csv results 27 | results_f = [f for f in folder.glob("**/*.csv")] 28 | results_csv = { 29 | f.relative_to(folder).parent.as_posix(): pd.read_csv(f, index_col=0) 30 | for f in results_f 31 | } 32 | 33 | if len(results_csv) == 0: 34 | raise ValueError(f"No results.csv file were found in {folder}") 35 | 36 | # Merge dataframe wrt to framework 37 | dfs = defaultdict(list) 38 | for path, df in results_csv.items(): 39 | framework, device, arguments = path.split("/") 40 | arguments = dict(arg.split("_") for arg in arguments.split("-")) 41 | 42 | # Add columns to the dataframe 43 | for col_name, col_value in arguments.items(): 44 | df[col_name] = int(col_value) 45 | 46 | dfs[framework].append(df) 47 | 48 | # Concat the dataframes 49 | dfs = {f: pd.concat(a) for f, a in dfs.items()} 50 | 51 | for framework, df in dfs.items(): 52 | df["framework"] = framework 53 | 54 | return pd.concat(dfs.values()) 55 | 56 | 57 | def show_results_in_console(df): 58 | grouped_df = df.groupby(["framework", "batch", "seqlen"]) 59 | (grouped_df["inference_time_secs"].mean() * 1000).reset_index() 60 | 61 | console = Console() 62 | table = Table( 63 | show_header=True, header_style="bold", 64 | title="Inference Time per Framework, Batch Size & Sequence Length" 65 | ) 66 | 67 | columns = ( 68 | ("Framework", "framework"), 69 | ("Batch Size", "batch"), 70 | ("Seq Length", "seqlen"), 71 | ("Inference Time (ms)", "inference_time_secs") 72 | ) 73 | 74 | # Define the columns 75 | for (column, _) in columns: 76 | table.add_column(column, justify="center") 77 | 78 | # Add rows 79 | for name, group in grouped_df: 80 | items = name + (round(group.mean()["inference_time_secs"] * 1000, 2), ) 81 | table.add_row(*[str(item) for item in items]) 82 | 83 | # Display the table 84 | console.print(table) 85 | 86 | 87 | if __name__ == '__main__': 88 | parser = ArgumentParser("Hugging Face Model Benchmark") 89 | parser.add_argument("--results-folder", type=Path, help="Where the benchmark results have been saved") 90 | parser.add_argument("output_folder", type=Path, help="Where the resulting report will be saved") 91 | 92 | # Parse command line arguments 93 | args = parser.parse_args() 94 | 95 | if not args.results_folder.exists(): 96 | print(f"Folder {args.results_folder} doesn't exist") 97 | 98 | try: 99 | # Ensure output folder exists 100 | args.output_folder.mkdir(exist_ok=True, parents=True) 101 | 102 | # Gather the results to manipulate 103 | df_by_framework = gather_results(args.results_folder) 104 | 105 | # Generate reports 106 | df_by_framework.to_csv(args.output_folder.joinpath("final_results.csv")) 107 | 108 | show_results_in_console(df_by_framework) 109 | except ValueError as ve: 110 | print(ve) -------------------------------------------------------------------------------- /src/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Hugging Face Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
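# --- Illustrative sketch for src/reports.py above; not part of the repository. ---
# Shows the results-folder convention gather_results() expects, reduced to the
# path-parsing step. The concrete path is an invented example of the
# <framework>/<device>/<name_value pairs joined by '-'> layout.
relative_path = "pytorch/cpu/batch_1-seqlen_128"

framework, device, arguments = relative_path.split("/")

# "batch_1-seqlen_128" -> {"batch": "1", "seqlen": "128"}; gather_results() casts the
# values to int and attaches them as extra columns on each per-run dataframe.
arguments = dict(arg.split("_") for arg in arguments.split("-"))

print(framework, device, {name: int(value) for name, value in arguments.items()})
# pytorch cpu {'batch': 1, 'seqlen': 128}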
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from .env import MANAGED_ENV_VARIABLES, ENV_VAR_TCMALLOC_LIBRARY_PATH, ENV_VAR_INTEL_OPENMP_LIBRARY_PATH,\ 16 | check_tcmalloc, check_intel_openmp, set_ld_preload_hook 17 | from .cpu import CPUinfo, cpu_count_physical, configure_numa, get_procfs_path, get_instances_with_cpu_binding 18 | 19 | SEC_TO_NS_SCALE = 1000000000 20 | -------------------------------------------------------------------------------- /src/utils/cpu.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Hugging Face Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # Copied from FastFormer code: https://github.com/microsoft/fastformers/blob/main/examples/fastformers/run_superglue.py 16 | import numpy as np 17 | import platform 18 | import re 19 | import subprocess 20 | import sys 21 | from logging import getLogger 22 | from os import getpid 23 | from typing import List, Tuple 24 | 25 | LOGGER = getLogger("cpu") 26 | 27 | 28 | class CPUinfo: 29 | def __init__(self): 30 | self.cpuinfo = [] 31 | 32 | if platform.system() == "Windows": 33 | raise RuntimeError("Windows platform is not supported!!!") 34 | elif platform.system() == "Linux": 35 | args = ["lscpu", "--parse=CPU,Core,Socket,Node"] 36 | lscpu_info = subprocess.check_output(args, universal_newlines=True).split("\n") 37 | 38 | # Get information about cpu, core, socket and node 39 | for line in lscpu_info: 40 | pattern = r"^([\d]+,[\d]+,[\d]+,[\d]?)" 41 | regex_out = re.search(pattern, line) 42 | if regex_out: 43 | self.cpuinfo.append(regex_out.group(1).strip().split(",")) 44 | 45 | self._get_socket_info() 46 | 47 | def _get_socket_info(self): 48 | 49 | self.socket_physical_cores = [] # socket_id is index 50 | self.socket_logical_cores = [] # socket_id is index 51 | self.sockets = int(max([line[2] for line in self.cpuinfo])) + 1 52 | self.core_to_sockets = {} 53 | 54 | for socket_id in range(self.sockets): 55 | cur_socket_physical_core = [] 56 | cur_socket_logical_core = [] 57 | 58 | for line in self.cpuinfo: 59 | if socket_id == int(line[2]): 60 | if line[1] not in cur_socket_physical_core: 61 | cur_socket_physical_core.append(line[1]) 62 | 63 | cur_socket_logical_core.append(line[0]) 64 | 65 | self.core_to_sockets[line[0]] = line[2] 66 | 67 | self.socket_physical_cores.append(cur_socket_physical_core) 68 | self.socket_logical_cores.append(cur_socket_logical_core) 69 | 70 | @property 71 | def socket_nums(self): 72 | return self.sockets 73 | 74 | @property 75 | def physical_core_nums(self): 76 | return 
len(self.socket_physical_cores) * len(self.socket_physical_cores[0]) 77 | 78 | @property 79 | def logical_core_nums(self): 80 | return len(self.socket_logical_cores) * len(self.socket_logical_cores[0]) 81 | 82 | @property 83 | def get_all_physical_cores(self): 84 | return np.array(self.socket_physical_cores).flatten().tolist() 85 | 86 | @property 87 | def get_all_logical_cores(self): 88 | return np.array(self.socket_logical_cores).flatten().tolist() 89 | 90 | def get_socket_physical_cores(self, socket_id): 91 | if socket_id < 0 or socket_id > self.sockets - 1: 92 | LOGGER.error(f"Invalid socket id {socket_id}") 93 | return self.socket_physical_cores[socket_id] 94 | 95 | def get_socket_logical_cores(self, socket_id): 96 | if socket_id < 0 or socket_id > self.sockets - 1: 97 | LOGGER.error(f"Invalid socket id {socket_id}") 98 | return self.socket_logical_cores[socket_id] 99 | 100 | def get_sockets_for_cores(self, core_ids): 101 | return {self.core_to_sockets[core] for core in core_ids} 102 | 103 | 104 | def get_procfs_path(): 105 | """Return updated psutil.PROCFS_PATH constant.""" 106 | """Copied from psutil code, and modified to fix an error.""" 107 | return sys.modules['psutil'].PROCFS_PATH 108 | 109 | 110 | def cpu_count_physical(): 111 | """Return the number of physical cores in the system.""" 112 | """Copied from psutil code, and modified to fix an error.""" 113 | 114 | physical_logical_mapping = {} 115 | cores_per_socket = {} 116 | current_info = {} 117 | with open(f'{get_procfs_path()}/cpuinfo', "rb") as f: 118 | for line in f: 119 | line = line.strip().lower() 120 | if not line: 121 | # print(current_info) 122 | # new section 123 | if b'physical id' in current_info and b'cpu cores' in current_info: 124 | cores_per_socket[current_info[b'physical id']] = current_info[b'cpu cores'] 125 | 126 | if b'physical id' in current_info and b'core id' in current_info and b'processor' in current_info: 127 | # print(current_info[b'physical id'] * 1000 + current_info[b'core id']) 128 | if current_info[b'physical id'] * 1000 + current_info[b'core id'] not in physical_logical_mapping: 129 | physical_logical_mapping[ 130 | current_info[b'physical id'] * 1000 + current_info[b'core id'] 131 | ] = current_info[b'processor'] 132 | current_info = {} 133 | else: 134 | # ongoing section 135 | if (line.startswith(b'physical id') or 136 | line.startswith(b'cpu cores') or 137 | line.startswith(b'core id') or 138 | line.startswith(b'processor')): 139 | key, value = line.split(b'\t:', 1) 140 | current_info[key.rstrip()] = int(value.rstrip()) 141 | 142 | total_num_cores = sum(cores_per_socket.values()) 143 | core_to_socket_mapping = {} 144 | for physical, logical in physical_logical_mapping.items(): 145 | physical_id = physical // 1000 146 | 147 | if physical_id not in core_to_socket_mapping: 148 | core_to_socket_mapping[physical_id] = set() 149 | 150 | core_to_socket_mapping[physical_id].add(logical) 151 | 152 | return total_num_cores, cores_per_socket, core_to_socket_mapping 153 | 154 | 155 | def get_instances_with_cpu_binding(num_core_per_instance: int = -1, num_instances: int = 1) -> List[Tuple[List[int], List[int]]]: 156 | """ 157 | :param num_core_per_instance: Number of cores to use per instances, -1 means "use all the CPU cores" 158 | :param num_instances: Number of model instances to distribute CPU cores for 159 | :return: List[List[int]] Per instance list of CPU core affinity 160 | """ 161 | total_num_cores, cores_per_socket, core_to_socket_mapping = cpu_count_physical() 162 | instance_binding = [] 163 | 
164 | # Debug override matching a 2-socket Ice Lake (ICX) machine, kept for reference but disabled so the topology detected by cpu_count_physical() is used: 165 | # total_num_cores = 64 166 | # cores_per_socket = {0: 32, 1: 32} 167 | # core_to_socket_mapping = {0: set(range(32)), 1: set(range(32, 64))} 168 | 169 | # 64 170 | # {0: 32, 1: 32} 171 | # { 172 | # 0: {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}, 173 | # 1: {32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63} 174 | # } 175 | 176 | # Set values are unique: more than one distinct value means the sockets expose different numbers of cores. 177 | assert len(set(cores_per_socket.values())) == 1, "CPU cores are not equal across sockets" 178 | 179 | # No restriction given on the number of cores -> use all the cores of a single socket 180 | if num_core_per_instance < 0: 181 | # Set the number of cores per instance to the core count of a single socket. 182 | num_core_per_instance = cores_per_socket[0] 183 | need_multiple_socket_per_instance = False 184 | need_socket_overcommit = num_instances > 1 # Asking for more than one instance with all the cores 185 | 186 | # The requested core count spans more than a single socket 187 | elif num_core_per_instance > cores_per_socket[0]: 188 | num_core_per_instance = min(num_core_per_instance, total_num_cores) 189 | need_multiple_socket_per_instance = len(cores_per_socket) > 1 # Ensure we actually have multiple sockets 190 | need_socket_overcommit = num_instances > 1 191 | 192 | # The instance spans a single socket 193 | else: 194 | need_multiple_socket_per_instance = False 195 | need_socket_overcommit = num_core_per_instance > cores_per_socket[0] 196 | 197 | for instance in range(num_instances): 198 | # On which socket to allocate the instance 199 | if need_multiple_socket_per_instance: 200 | socket = list(core_to_socket_mapping.keys()) 201 | cores = {c for s in socket for c in core_to_socket_mapping[s]} 202 | 203 | else: 204 | # {socket_id -> [cores]} 205 | socket = [instance % len(cores_per_socket.keys())] 206 | 207 | # Get the list of available cores (unallocated) on the target socket 208 | cores = core_to_socket_mapping[socket[0]] 209 | 210 | # Pop allocated cores out of the pool 211 | # When overcommitting, cores are not popped out, because instances are expected to overlap 212 | # Overcommitting does not try to be smart about limiting the resulting overhead.
213 | if need_socket_overcommit: 214 | cores_it = iter(cores) 215 | bindings = [next(cores_it) for i in range(num_core_per_instance)] 216 | else: 217 | bindings = [cores.pop() for _ in range(num_core_per_instance)] 218 | 219 | instance_binding.append((socket, bindings)) 220 | 221 | return instance_binding 222 | 223 | 224 | def configure_numa(socket_binding: List[int], core_binding: List[int]): 225 | from numa import available as is_numa_available, set_membind, get_membind, set_affinity, get_affinity 226 | if is_numa_available(): 227 | LOGGER.info("Configuring NUMA:") 228 | 229 | pid = getpid() 230 | 231 | # Set core binding affinity 232 | set_affinity(pid, set(core_binding)) 233 | LOGGER.info(f"\tScheduler affinity set to: {get_affinity(pid)}") 234 | 235 | # Set memory allocation affinity 236 | set_membind(set(socket_binding)) 237 | LOGGER.info(f"\tBinding memory allocation on {get_membind()}") 238 | else: 239 | LOGGER.info("NUMA not available on the system, skipping configuration") 240 | 241 | # Configure taskset 242 | # TODO: Check with @Sangeeta if this is still needed as we set CPU scheduler affinity above 243 | # system(f"taskset -p -c {','.join(map(str, core_binding))} {getpid()}") 244 | # LOGGER.info(f"[TASKSET] Set CPU affinity to: {core_binding} (pid={getpid()})") -------------------------------------------------------------------------------- /src/utils/env.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Hugging Face Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
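# --- Illustrative usage sketch for src/utils/cpu.py above; not part of the repository. ---
# Shows how a launcher could consume the per-instance CPU bindings computed by
# get_instances_with_cpu_binding(). It assumes a Linux host (the topology is read through
# lscpu and /proc/cpuinfo), src/ on PYTHONPATH, and enough physical cores for the
# arbitrary instance counts chosen below.
from utils.cpu import get_instances_with_cpu_binding

# Two instances, each pinned to 4 physical cores.
bindings = get_instances_with_cpu_binding(num_core_per_instance=4, num_instances=2)

for instance_id, (socket_binding, core_binding) in enumerate(bindings):
    print(f"instance {instance_id}: sockets={socket_binding} cores={core_binding}")

# Inside each worker process the binding would then be applied before loading the model,
# e.g. utils.cpu.configure_numa(socket_binding, core_binding); that call needs the
# py-libnuma package, which configure_numa() imports lazily.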
14 | 15 | from os import environ 16 | from pathlib import Path 17 | 18 | # Environment variables constants 19 | ENV_VAR_TCMALLOC_LIBRARY_PATH = "TCMALLOC_LIBRARY_PATH" 20 | ENV_VAR_INTEL_OPENMP_LIBRARY_PATH = "INTEL_OPENMP_LIBRARY_PATH" 21 | 22 | MANAGED_ENV_VARIABLES = { 23 | "LD_PRELOAD", 24 | "KMP_AFFINITY", 25 | "KMP_BLOCKTIME", 26 | "KMP_BLOCKTIME", 27 | "OMP_MAX_ACTIVE_LEVELS", 28 | "OMP_NUM_THREADS", 29 | } 30 | 31 | 32 | def check_tcmalloc() -> Path: 33 | """ 34 | Ensure tcmalloc library is correctly detected and found 35 | """ 36 | if ENV_VAR_TCMALLOC_LIBRARY_PATH not in environ: 37 | raise ValueError(f"Env var {ENV_VAR_TCMALLOC_LIBRARY_PATH} has to be set to location of libtcmalloc.so") 38 | 39 | if len(environ[ENV_VAR_TCMALLOC_LIBRARY_PATH]) == 0: 40 | raise ValueError(f"Env var {ENV_VAR_TCMALLOC_LIBRARY_PATH} cannot be empty") 41 | 42 | tcmalloc_path = Path(environ[ENV_VAR_TCMALLOC_LIBRARY_PATH]) 43 | if not tcmalloc_path.exists(): 44 | raise ValueError( 45 | f"Path {tcmalloc_path.as_posix()} pointed by " 46 | f"env var {ENV_VAR_TCMALLOC_LIBRARY_PATH} doesn't exist" 47 | ) 48 | 49 | return tcmalloc_path 50 | 51 | 52 | def check_intel_openmp() -> Path: 53 | """ 54 | Ensure Intel OpenMP library is correctly detected and found 55 | """ 56 | if ENV_VAR_INTEL_OPENMP_LIBRARY_PATH not in environ: 57 | raise ValueError(f"Env var {ENV_VAR_INTEL_OPENMP_LIBRARY_PATH} has to be set to location of libomp.so") 58 | 59 | if len(environ[ENV_VAR_INTEL_OPENMP_LIBRARY_PATH]) == 0: 60 | raise ValueError(f"Env var {ENV_VAR_INTEL_OPENMP_LIBRARY_PATH} cannot be empty") 61 | 62 | intel_openmp_path = Path(environ[ENV_VAR_INTEL_OPENMP_LIBRARY_PATH]) 63 | if not intel_openmp_path.exists(): 64 | raise ValueError( 65 | f"Path {intel_openmp_path.as_posix()} pointed by " 66 | f"env var {ENV_VAR_INTEL_OPENMP_LIBRARY_PATH} doesn't exist" 67 | ) 68 | 69 | return intel_openmp_path 70 | 71 | 72 | def set_ld_preload_hook(config): 73 | ld_preload = [] 74 | if hasattr(config, "malloc") and "tcmalloc" == config.malloc.name: 75 | from utils import check_tcmalloc 76 | tcmalloc_path = check_tcmalloc() 77 | ld_preload.append(tcmalloc_path.as_posix()) 78 | 79 | if hasattr(config, "openmp_backend") and "intel" == config.openmp_backend.name: 80 | from utils import check_intel_openmp 81 | intel_omp_path = check_intel_openmp() 82 | ld_preload.append(intel_omp_path.as_posix()) 83 | 84 | ld_preload_str = " ".join(ld_preload) 85 | if "LD_PRELOAD" in environ: 86 | ld_preload_str += " " + environ.get("LD_PRELOAD", default="") 87 | 88 | environ["LD_PRELOAD"] = ld_preload_str --------------------------------------------------------------------------------
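# --- Illustrative usage sketch for src/utils/env.py above; not part of the repository. ---
# Exercises check_tcmalloc() and set_ld_preload_hook() with a minimal stand-in config.
# The library path below is an example and must point at a real libtcmalloc on the host,
# and the SimpleNamespace config only mimics the attributes the hook looks for; in the
# real tool this comes from the Hydra-composed backend configuration.
from os import environ
from types import SimpleNamespace

from utils.env import ENV_VAR_TCMALLOC_LIBRARY_PATH, set_ld_preload_hook

# Point the managed env var at the allocator library (check_tcmalloc() verifies the path exists).
environ[ENV_VAR_TCMALLOC_LIBRARY_PATH] = "/usr/lib/x86_64-linux-gnu/libtcmalloc.so.4"

# Minimal object exposing config.malloc.name, which set_ld_preload_hook() inspects.
config = SimpleNamespace(malloc=SimpleNamespace(name="tcmalloc"))

set_ld_preload_hook(config)
# LD_PRELOAD now starts with the tcmalloc path; note it only affects processes spawned
# after this point, which is why it is typically applied from a launcher before workers start.
print(environ["LD_PRELOAD"])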