├── .dockerignore ├── LICENSE ├── README.md ├── configs ├── backend │ ├── ort.yaml │ ├── pytorch.yaml │ ├── tensorflow.yaml │ ├── tensorflow_graph.yaml │ └── torchscript.yaml └── benchmark.yaml ├── consolidate.py ├── docker ├── .tf_configure.bazelrc ├── Dockerfile ├── Dockerfile.compile └── oneAPI.repo ├── intel-requirements.txt ├── launcher.py ├── requirements.txt └── src ├── backends ├── __init__.py ├── ort.py ├── pytorch.py └── tensorflow.py ├── benchmark.py ├── config.py ├── main.py ├── reports.py └── utils ├── __init__.py ├── cpu.py └── env.py /.dockerignore: -------------------------------------------------------------------------------- 1 | results/ 2 | reports/ 3 | outputs/ 4 | *.iml 5 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Transformers performance & evaluation framework 2 | 3 | The benchmarking repository provides an easy and flexible testbed to generate, run and save multiple configurations 4 | in order to compare Transformers based Neural Network models. 
5 | 
6 | The overall benchmarking project leverages the Hydra framework from Facebook AI Research, which generates
7 | all the requested sweeps from configuration files. Currently, we provide benchmarks for 5 of the most widely
8 | used Deep Learning frameworks:
9 | 
10 | - PyTorch (Eager mode)
11 | - TorchScript (Static Graph mode)
12 | - TensorFlow 2 (Eager mode)
13 | - TensorFlow 2 Graph (Static Graph mode)
14 | - ONNX Runtime for Inference (Static Graph mode + Graph Optimizations)
15 | 
16 | The repository is divided into 2 principal sections:
17 | - `configs/` stores all the configuration files for the supported backends.
18 | - `src/backends/` stores the actual logic to generate textual inputs and execute a forward pass for the targeted backend.
19 | 
20 | ## Getting Started
21 | 
22 | **Instructions presented here have been tested on Ubuntu 20.04**
23 | 
24 | ```bash
25 | apt update && apt -y install python3 python3-pip python3-dev libnuma-dev
26 | cd 
27 | pip install -r requirements.txt
28 | ```
29 | 
30 | 
31 | ## Benchmarking framework
32 | ### How to use this repository to benchmark with a specific configuration
33 | 
34 | Hydra, the configuration framework used in this project, provides a simple command-line interface to specify and
35 | override the configuration to be run.
36 | 
37 | For instance, in order to run a benchmark for ONNX Runtime on CPU with:
38 | - **Backend = ORT**
39 | - **Model = bert-base-cased**
40 | - **Device = CPU**
41 | - **Batch Size = 1**
42 | - **Sequence Length = 32**
43 | 
44 | ```bash
45 | python3 src/main.py model=bert-base-cased batch_size=1 sequence_length=32 backend=ort device=cpu
46 | ```
47 | 
48 | ### Automatically let Hydra generate all the permutations to cover multiple configurations
49 | 
50 | Hydra integrates a very powerful sweep generation utility which is exposed through the `--multirun` command-line flag
51 | when invoking the benchmark script.
52 | 
53 | For instance, in order to run benchmarks for PyTorch on CPU over the following specs:
54 | - **Model = bert-base-cased**
55 | - **Device = CPU**
56 | - **Batch Size = 1**
57 | - **Sequence Length = 128, 512**
58 | 
59 | ```bash
60 | python3 src/main.py --multirun model=bert-base-cased batch_size=1 sequence_length=128,512 backend=pytorch device=cpu
61 | ```
62 | 
63 | ### Overridable configuration properties
64 | 
65 | - `backend`: Specify the backend(s) to use to run the benchmark `{"pytorch", "torchscript", "tensorflow", "xla", "ort"}`
66 | - `device`: Specify on which device to run the benchmark `{"cpu", "cuda"}`
67 | - `precision`: Specify the model's parameters data format. For now, only `float32` (_i.e. full precision_) is supported.
68 | - `num_threads`: Number of threads to use for intra-operation parallelism (`-1` detects the number of CPU cores and uses that value)
69 | - `num_interops_threads`: Number of threads to use for inter-operation parallelism (`-1` detects the number of CPU cores and uses that value)
70 | - `warmup_runs`: Number of warmup forward passes to execute before recording any benchmarking results (especially useful to preallocate memory buffers).
71 | - `benchmark_duration`: Duration (in seconds) of the benchmark; as many forward calls as possible are executed within the specified duration. These runs are executed after `warmup_runs`.
72 | 
73 | ## Backend specific configuration properties
74 | 
75 | Each framework exposes different features which can be enabled to tune the execution of the model on the underlying hardware.
76 | This repository exposes the most common ones, described below; they are overridden with the `backend.` prefix, as shown in the example that follows.
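
For example, a backend-specific property can be overridden from the command line with the `backend.` prefix (a minimal illustration; the property values shown here are arbitrary choices, not recommended settings):

```bash
python3 src/main.py model=bert-base-cased batch_size=1 sequence_length=32 backend=ort backend.num_threads=8 backend.graph_optimisation_level=ORT_ENABLE_BASIC
```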
77 | 
78 | ### PyTorch
79 | 
80 | - `use_torchscript` Boolean indicating if the runtime should trace the eager model to produce an optimized version.
81 | 
82 | This value is `False` when using backend `pytorch` and `True` when using backend `torchscript`.
83 | 
84 | ### TensorFlow
85 | 
86 | - `use_xla` Boolean indicating if the model should be wrapped in `tf.function(jit_compile=True)` in order to compile the underlying graph through XLA.
87 | 
88 | This value is `False` when using backend `tensorflow_graph` and can be enabled through the config file or the command line.
89 | 
90 | 
91 | ### ONNX Runtime (ORT)
92 | 
93 | - `opset` Integer setting which version of the ONNX Opset specification to use when exporting the model.
94 | 
95 | - `graph_optimisation_level` Which level of optimization to apply with ONNX Runtime when loading the model. Possible values are:
96 |   - `ORT_DISABLE_ALL` Use the raw ONNX graph without any further optimization.
97 |   - `ORT_ENABLE_BASIC` Use basic graph optimizations which are not platform dependent.
98 |   - `ORT_ENABLE_EXTENDED` Use more advanced techniques *(might include platform dependent optimizations)*.
99 |   - `ORT_ENABLE_ALL` Enable all the possible optimizations *(might include platform dependent optimizations)*.
100 | 
101 | - `execution_mode` Mode to execute the ONNX Graph. Can be either:
102 |   - `ORT_SEQUENTIAL` Execute the graph sequentially, without looking for subgraphs to execute in parallel.
103 |   - `ORT_PARALLEL` Execute the graph potentially in parallel, looking for independent subgraphs which can be run simultaneously.
104 | 
105 | 
106 | ## Launch utility tool
107 | The benchmarking framework comes with a launcher tool highly inspired by [the one made available by Intel](https://github.com/intel/intel-extension-for-pytorch/blob/master/intel_pytorch_extension_py/launch.py).
108 | The launcher tool helps you handle all the low-level details needed to configure experiments and get the best out of the platform you have.
109 | 
110 | More precisely, it is able to configure the following elements:
111 | 
112 | - Linux transparent huge pages mechanism
113 | - CPU core affinity for OpenMP threads on NUMA platforms
114 | - Memory affinity for OpenMP threads on NUMA platforms
115 | - OpenMP configuration (KMP_AFFINITY, KMP_BLOCKTIME, OMP_NUM_THREADS, OMP_MAX_ACTIVE_LEVELS, etc.)
116 | - Change the OpenMP library to be used at runtime (GNU / Intel)
117 | - Change the memory allocation library to be used (std, tcmalloc, jemalloc)
118 | - Set up multi-instance inference (multiple independent models executing in parallel) with per-instance CPU core/memory affinity
119 | 
120 | The launcher script `launcher.py` is located at the root of the transformers-benchmarks folder.
121 | You can run `python launcher.py --help` to get all the tuning options available.
122 | 
123 | ## Ready-to-use CLI commands
124 | 
125 | ### Benchmarking the out-of-the-box configuration for multiple backends
126 | ```shell
127 | python3 src/main.py --multirun model=bert-base-cased backend=pytorch,torchscript,tensorflow,xla,ort
128 | ```
129 | 
130 | ### Tuning the number of intra/inter ops for parallel sections (OMP_NUM_THREADS, MKL_NUM_THREADS, etc.)
131 | 
132 | ```shell
133 | python3 src/main.py --multirun model=bert-base-cased batch_size=1 sequence_length=32 backend.num_threads=2,4,8 backend.num_interops_threads=2,4,8
134 | ```
135 | 
136 | ### Tuning OpenMP thread affinity
137 | ```shell
138 | python launcher.py --kmp_affinity= -- src/main.py model=bert-base-cased batch_size=1 sequence_length=32 ...
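# Illustrative value only (an assumption, not a repository default): "granularity=fine,compact,1,0" is a commonly used Intel OpenMP affinity setting
python launcher.py --kmp_affinity="granularity=fine,compact,1,0" -- src/main.py model=bert-base-cased batch_size=1 sequence_length=32 ...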
139 | ``` 140 | 141 | ### Tuning number of model instances (multi-instance setup) along with intra/inter ops for parallel sections 142 | ```shell 143 | python launcher.py --ninstances=4 -- src/main.py model=bert-base-cased batch_size=1 sequence_length=32 ... 144 | ``` 145 | 146 | ### Tuning allocation library 147 | ```shell 148 | export TCMALLOC_LIBRARY_PATH= 149 | python launcher.py --enable_tcmalloc -- src/main.py model=bert-base-cased batch_size=1 sequence_length=32 ... 150 | ``` 151 | 152 | ### Tuning OpenMP implementation 153 | ```shell 154 | export INTEL_OPENMP_LIBRARY_PATH= 155 | python launcher.py --enable_iomp -- src/main.py model=bert-base-cased batch_size=1 sequence_length=32 ... 156 | ``` 157 | 158 | ### Enabling Transparent Huge Page 159 | ```shell 160 | python launcher.py --enable_thp -- src/main.py model=bert-base-cased batch_size=1 sequence_length=32 ... 161 | ``` 162 | 163 | ## Hydra FAQ 164 | 165 | ## Executing dry-run to highlight configuration 166 | ```shell 167 | python launcher.py --enable_tcmalloc --enable_iomp --ninstances=2 -- src/main.py --info config model=bert-base-cased batch_size=16 sequence_length=512 168 | ``` 169 | -------------------------------------------------------------------------------- /configs/backend/ort.yaml: -------------------------------------------------------------------------------- 1 | _target_: backends.ort.OnnxRuntimeBackend 2 | name: onnxruntime 3 | version: ${ort_version:} 4 | opset: 12 5 | num_threads: null 6 | num_interops_threads: null 7 | graph_optimisation_level: ORT_ENABLE_ALL 8 | execution_mode: ORT_PARALLEL -------------------------------------------------------------------------------- /configs/backend/pytorch.yaml: -------------------------------------------------------------------------------- 1 | _target_: backends.pytorch.PyTorchBackend 2 | name: pytorch 3 | version: ${pytorch_version:} 4 | use_torchscript: false 5 | use_tf32: false 6 | num_threads: null 7 | num_interops_threads: null 8 | -------------------------------------------------------------------------------- /configs/backend/tensorflow.yaml: -------------------------------------------------------------------------------- 1 | _target_: backends.tensorflow.TensorflowBackend 2 | name: tensorflow 3 | version: ${tensorflow_version:} 4 | use_xla: false 5 | use_saved_model_format: false 6 | eager_mode: true 7 | experimental_compiler: false 8 | num_threads: null 9 | num_interops_threads: null 10 | -------------------------------------------------------------------------------- /configs/backend/tensorflow_graph.yaml: -------------------------------------------------------------------------------- 1 | _target_: backends.tensorflow.TensorflowBackend 2 | name: tensorflow_graph 3 | version: ${tensorflow_version:} 4 | use_xla: false 5 | use_saved_model_format: false 6 | eager_mode: false 7 | experimental_compiler: false 8 | num_threads: null 9 | num_interops_threads: null 10 | -------------------------------------------------------------------------------- /configs/backend/torchscript.yaml: -------------------------------------------------------------------------------- 1 | _target_: backends.pytorch.PyTorchBackend 2 | name: torchscript 3 | version: ${pytorch_version:} 4 | use_torchscript: true 5 | use_tf32: false 6 | num_threads: null 7 | num_interops_threads: null 8 | -------------------------------------------------------------------------------- /configs/benchmark.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - 
backend: pytorch 3 | 4 | hydra: 5 | run: 6 | dir: 7 | outputs/${experiment_name}/${experiment_id}/${instance_id} 8 | sweep: 9 | dir: outputs/${experiment_name}/${experiment_id}/${instance_id} 10 | job: 11 | env_set: 12 | TOKENIZERS_PARALLELISM: "false" 13 | 14 | experiment_name: "default" 15 | python_version: ${python_version:} 16 | model: bert-base-cased 17 | batch_size: 1 18 | sequence_length: 128 19 | benchmark_duration: 5 20 | warmup_runs: 5 21 | device: cpu 22 | precision: float32 23 | num_instances: 1 24 | num_core_per_instance: -1 25 | reference: null -------------------------------------------------------------------------------- /consolidate.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Hugging Face Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | # Licensed under the Apache License, Version 2.0 (the "License"); 16 | # you may not use this file except in compliance with the License. 17 | # You may obtain a copy of the License at 18 | # 19 | # http://www.apache.org/licenses/LICENSE-2.0 20 | # 21 | # Unless required by applicable law or agreed to in writing, software 22 | # distributed under the License is distributed on an "AS IS" BASIS, 23 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 24 | # See the License for the specific language governing permissions and 25 | # limitations under the License. 
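#
# consolidate.py gathers the per-run results.csv files produced by the benchmark, joins each
# one with its Hydra config (.hydra/config.yaml), optionally aggregates multi-instance runs,
# and exports a consolidated CSV/Excel report together with a console summary table.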
26 | from datetime import datetime, timezone 27 | from glob import glob 28 | from itertools import chain 29 | from os import path 30 | from pathlib import Path 31 | from typing import Type, List, Tuple 32 | 33 | import pandas as pd 34 | from argparse import ArgumentParser 35 | 36 | import yaml 37 | from pandas import ExcelWriter 38 | from rich.console import Console 39 | from rich.table import Table 40 | 41 | # Format name -> extension 42 | SUPPORTED_EXPORT_FORMAT = { 43 | "csv": "csv", 44 | "excel": "xlsx" 45 | } 46 | 47 | 48 | SCALING_CHOICES = {"batch-size-scaling", "core-count-scaling"} 49 | SCALING_HELP = "Which scaling metodology was used:\n \ 50 | \t- batch-size-scaling: The total number of cores for the original batch size remains the same - \ 51 | we use all the cores for the given batch size but break up the problem into smaller problems \ 52 | with fewer cores for the smaller problem sizes\n" \ 53 | "\t core-count-scaling: We vary the number of cores for the given batch size" 54 | 55 | 56 | LATENCY_COLUMNS = { 57 | "latency_mean", 58 | "latency_std", 59 | "latency_50", 60 | "latency_90", 61 | "latency_95", 62 | "latency_99", 63 | "latency_999", 64 | } 65 | 66 | LATENCY_THROUGHPUT_COLUMNS = { 67 | "throughput", 68 | }.union(LATENCY_COLUMNS) 69 | 70 | 71 | SUMMARY_SUMMING_COLUMNS = { 72 | "nb_forwards", 73 | "throughput", 74 | "batch_size", 75 | } 76 | 77 | FINAL_COLUMNS_ORDERING = ["backend.name", "batch_size", "sequence_length", "openmp.backend", "malloc", "use_huge_page", "num_instances"] 78 | RICH_DISPLAYED_COLUMNS = { 79 | "backend.name": "Backend", 80 | "malloc": "Malloc", 81 | "openmp.backend": "OpenMP", 82 | "use_huge_page": "Huge Pages", 83 | "batch_size": "Batch", 84 | "sequence_length": "Sequence", 85 | "latency_mean": "Avg. Latency", 86 | "latency_std": "Std. 
Latency", 87 | "throughput": "Throughput", 88 | "num_core_per_instance": "Cores" 89 | } 90 | 91 | MULTI_INSTANCES_VALIDATION_COLUMNS = [ 92 | "batch_size", 93 | "sequence_length", 94 | "backend.name", 95 | "openmp.backend", 96 | "malloc", 97 | "backend.num_threads", 98 | "use_huge_page" 99 | ] 100 | 101 | 102 | def flatten_yaml(path: Path, loader: Type[yaml.Loader] = yaml.SafeLoader) -> pd.DataFrame: 103 | with open(path, "r") as yaml_f: 104 | content = yaml.load(yaml_f, Loader=loader) 105 | 106 | return pd.json_normalize(content) 107 | 108 | 109 | def gather_results(folder: Path, is_multi_instances: bool) -> Tuple[pd.DataFrame, List[str]]: 110 | # List all csv results 111 | results_f = [(f, f.parent.joinpath(".hydra/config.yaml")) for f in folder.glob("**/results.csv")] 112 | results_df = pd.concat([ 113 | # This will concatenate columns from the benchmarks along with config columns 114 | pd.concat((pd.read_csv(results, index_col=0), flatten_yaml(config)), axis="columns") 115 | for results, config in results_f 116 | ], axis="index") 117 | 118 | existing_columns = list(set(FINAL_COLUMNS_ORDERING).intersection(results_df.columns)) 119 | results_df = results_df.sort_values(existing_columns) 120 | 121 | # Ensure the number of instances (according to the sum of instance_sum) matchs num_instances field 122 | if is_multi_instances: 123 | results_df["is_valid"] = results_df.groupby(MULTI_INSTANCES_VALIDATION_COLUMNS)["instance_id"].transform("count") 124 | results_df["is_valid"] = results_df["is_valid"] == results_df["num_instances"] 125 | else: 126 | results_df["is_valid"] = True 127 | 128 | results_df.fillna("N/A", inplace=True) 129 | if len(results_df) == 0: 130 | raise ValueError(f"No results.csv file were found in {folder}") 131 | 132 | return results_df, existing_columns 133 | 134 | 135 | def aggregate_multi_instances_results(results_df: pd.DataFrame, grouping_columns: List[str], mode: str): 136 | agg_df = results_df.copy() 137 | agg_df = agg_df.groupby(grouping_columns) 138 | transforms = { 139 | "latency_mean": ["min", "max", "mean"], 140 | "throughput": ["sum"], 141 | "instance_id": ["sum"], 142 | "is_valid": ["all"] 143 | } 144 | 145 | # How to aggregate cores and batch 146 | if mode == "batch-size-scaling": 147 | transforms["batch_size"] = "sum" 148 | 149 | return agg_df.agg(transforms) 150 | 151 | 152 | def show_results_in_console(df: pd.DataFrame, sorting_columns: List[str]): 153 | console = Console(width=200) 154 | table = Table( 155 | show_header=True, header_style="bold", 156 | title="Latency & Throughput for each framework (latencies given in ms)", 157 | ) 158 | 159 | # Create copy 160 | local_df = df.copy() 161 | local_df = local_df.assign(**local_df[LATENCY_COLUMNS].apply(lambda x: round((x * 1e-6), 2))) 162 | 163 | # Filter out columns 164 | displayed_columns = { 165 | column_id: column_title 166 | for column_id, column_title in RICH_DISPLAYED_COLUMNS.items() 167 | if column_id in local_df.columns 168 | } 169 | 170 | for column_name in displayed_columns.values(): 171 | table.add_column(column_name, justify="center") 172 | table.add_column("Instance ID", justify="center") 173 | 174 | # Add rows 175 | for _, item_columns in local_df.sort_values(sorting_columns, ascending=True).iterrows(): 176 | table.add_row(*[str(item_columns[c]) for c in chain(displayed_columns.keys(), ["instance_id"])]) 177 | 178 | # Display the table 179 | console.print(table) 180 | 181 | 182 | if __name__ == '__main__': 183 | parser = ArgumentParser("Hugging Face Model Benchmark") 184 | 
parser.add_argument("--results-folder", type=Path, help="Where the benchmark results have been saved") 185 | parser.add_argument("--multi-instances-scaling", choices=SCALING_CHOICES, help=SCALING_HELP) 186 | parser.add_argument("--format", choices=SUPPORTED_EXPORT_FORMAT.keys(), default="csv", help="Export file format") 187 | parser.add_argument("output_folder", type=Path, help="Where the resulting report will be saved") 188 | 189 | # Parse command line arguments 190 | args = parser.parse_args() 191 | args.now = datetime.now(timezone.utc).astimezone() 192 | args.experiment_id = path.split(args.results_folder)[-1] 193 | args.format_ext = SUPPORTED_EXPORT_FORMAT[args.format.lower()] 194 | 195 | for name in {"aggregated", "consolidated"}: 196 | value = f"{name}_{args.experiment_id}_" \ 197 | f"{args.now.date().isoformat()}T{args.now.time().strftime('%H-%M')}" \ 198 | f".{args.format_ext}" 199 | setattr(args, f"{name}_filename", value) 200 | 201 | # Ensure everything looks right 202 | if not args.results_folder.exists(): 203 | print(f"Folder {args.results_folder} doesn't exist") 204 | exit(1) 205 | 206 | try: 207 | # Detect folder run type from folder structure 208 | instances_folder = glob(f"{args.results_folder.as_posix()}/*") 209 | 210 | args.is_multi_instances = len(instances_folder) > 1 211 | args.instances = {path.split(instance_folder)[-1] for instance_folder in instances_folder} 212 | args.is_multirun = { 213 | path.split(instance_folder)[-1]: path.exists(path.join(instance_folder, "multirun.yaml")) 214 | for instance_folder in instances_folder 215 | } 216 | 217 | print( 218 | f"Detected following structure:" 219 | f"\n\t- Multi Instance: {args.is_multi_instances} ({len(args.instances)} instances)" 220 | f"\n\t- Multirun: {args.is_multirun}" 221 | ) 222 | 223 | # If we detect multi instance and no scaling mode is provided, ask for a value 224 | if args.is_multi_instances and args.multi_instances_scaling is None: 225 | print( 226 | "Warning:\n\tNo mode for handling multi-instances aggregation was provided. 
" 227 | "Only individual runs will be saved.\n" 228 | "\tTo include multi-instances aggregation results, " 229 | f"please use --multi-instance-scaling={SCALING_CHOICES}\n" 230 | ) 231 | 232 | # Ensure output folder exists 233 | args.output_folder.mkdir(exist_ok=True, parents=True) 234 | 235 | # Gather the results to manipulate 236 | consolidated_df, sorting_columns = gather_results(args.results_folder, args.is_multi_instances) 237 | 238 | if args.is_multi_instances and args.multi_instances_scaling is not None: 239 | agg_df = aggregate_multi_instances_results(consolidated_df, sorting_columns, args.multi_instances_scaling) 240 | 241 | if args.format == "csv": 242 | consolidated_df.to_csv(args.output_folder.joinpath(args.consolidated_filename)) 243 | if args.is_multi_instances and args.multi_instances_scaling is not None: 244 | agg_df.to_csv(args.output_folder.joinpath(args.aggregated_filename)) 245 | else: 246 | with ExcelWriter(args.output_folder.joinpath(args.consolidated_filename)) as excel_writer: 247 | consolidated_df.to_excel(excel_writer, sheet_name="individuals") 248 | if args.is_multi_instances and args.multi_instances_scaling is not None: 249 | agg_df.to_excel(excel_writer, sheet_name="aggregated_multi_instances", merge_cells=False) 250 | 251 | show_results_in_console(consolidated_df, sorting_columns) 252 | except ValueError as ve: 253 | print(ve) 254 | -------------------------------------------------------------------------------- /docker/.tf_configure.bazelrc: -------------------------------------------------------------------------------- 1 | build --action_env PYTHON_BIN_PATH="/usr/bin/python" 2 | build --action_env PYTHON_LIB_PATH="/usr/lib/python3.8" 3 | build --python_path="/usr/bin/python" 4 | build --config=xla 5 | build --action_env CUDA_TOOLKIT_PATH="/usr/local/cuda" 6 | build --action_env TF_CUDA_COMPUTE_CAPABILITIES="7.0,7.5,8.0,8.6" 7 | build --action_env LD_LIBRARY_PATH="/usr/local/nvidia/lib:/usr/local/nvidia/lib64" 8 | build --action_env GCC_HOST_COMPILER_PATH="/usr/bin/gcc" 9 | build --config=cuda 10 | build:opt --copt=-march=native 11 | build:opt --copt=-Wno-sign-compare 12 | build:opt --host_copt=-march=native 13 | build:opt --define with_default_optimizations=true 14 | test --flaky_test_attempts=3 15 | test --test_size_filters=small,medium 16 | test --test_env=LD_LIBRARY_PATH 17 | test:v1 --test_tag_filters=-benchmark-test,-no_oss,-no_gpu,-oss_serial 18 | test:v1 --build_tag_filters=-benchmark-test,-no_oss,-no_gpu 19 | test:v2 --test_tag_filters=-benchmark-test,-no_oss,-no_gpu,-oss_serial,-v1only 20 | test:v2 --build_tag_filters=-benchmark-test,-no_oss,-no_gpu,-v1only 21 | build --action_env TF_CONFIGURE_IOS="0" -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:20.04 2 | 3 | ARG TRANSFORMERS_VERSION=4.1.1 4 | ARG PYTORCH_VERSION=1.7.1 5 | ARG TENSORFLOW_VERSION=2.4.0 6 | ARG ONNXRUNTIME_VERSION=1.6.0 7 | ARG MKL_THREADING_LIBRARY=OMP 8 | 9 | RUN apt update && \ 10 | apt install -y \ 11 | git \ 12 | python3 \ 13 | python3-pip && \ 14 | rm -rf /var/lib/apt/lists/* 15 | 16 | # PyTorch 17 | RUN python3 -m pip install torch==1.7.1+cpu -f https://download.pytorch.org/whl/torch_stable.html 18 | 19 | # TensorFlow 20 | RUN python3 -m pip install tensorflow 21 | 22 | # ONNX Runtime 23 | RUN python3 -m pip install onnxruntime 24 | 25 | COPY . 
/opt/intel-benchmarks 26 | 27 | WORKDIR /opt/intel-benchmarks 28 | RUN python3 -m pip install -r requirements.txt 29 | 30 | -------------------------------------------------------------------------------- /docker/Dockerfile.compile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:11.2.0-cudnn8-devel-ubuntu20.04 as builder 2 | 3 | ARG TRANSFORMERS_VERSION=4.5.0 4 | ARG PYTORCH_VERSION=1.8.1 5 | ARG TENSORFLOW_VERSION=2.4.1 6 | ARG MKL_THREADING_LIBRARY=OMP 7 | ARG CUDA_ARCH_LIST=7.0;7.5;8.0;8.6+PTX 8 | 9 | # Ensure tzdata is set 10 | ENV TZ=America/New_York 11 | RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone 12 | 13 | RUN apt update && \ 14 | apt install -y \ 15 | curl \ 16 | cmake \ 17 | make \ 18 | ninja-build \ 19 | git \ 20 | gpg-agent \ 21 | wget \ 22 | python3 \ 23 | python3-dev \ 24 | python3-pip 25 | 26 | # Install oneAPI repo 27 | RUN wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB && \ 28 | apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB && \ 29 | rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB && \ 30 | echo "deb https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list 31 | 32 | RUN apt update && apt install -y \ 33 | intel-oneapi-mkl-devel \ 34 | intel-oneapi-runtime-openmp && \ 35 | rm -rf /var/lib/apt/lists/* 36 | 37 | ENV LD_LIBRARY_PATH='/opt/intel/oneapi/tbb/latest/env/lib/intel64/gcc4.8:/opt/intel/oneapi/mkl/latest/lib/intel64' 38 | ENV LIBRARY_PATH='/opt/intel/oneapi/tbb/latest/lib/intel64/gcc4.8:/opt/intel/oneapi/mkl/latest/lib/intel64' 39 | ENV MKLROOT='/opt/intel/oneapi/mkl/latest' 40 | 41 | # Create a folder to store all the compiled binaries 42 | ENV FRAMEWORK_BINARIES_FOLDER /opt/bin 43 | RUN mkdir ${FRAMEWORK_BINARIES_FOLDER} 44 | 45 | # Bazel for TensorFlow 46 | ENV BAZEL_VERSION 4.0.0 47 | RUN cd "/usr/bin" && curl -fLO https://releases.bazel.build/${BAZEL_VERSION}/release/bazel-${BAZEL_VERSION}-linux-x86_64 && \ 48 | chmod +x bazel-${BAZEL_VERSION}-linux-x86_64 && \ 49 | mv bazel-${BAZEL_VERSION}-linux-x86_64 bazel && \ 50 | ln -s /usr/bin/python3 /usr/bin/python 51 | 52 | # Enable MKL to be found by the compilation process 53 | ENV PATH=/opt/intel/oneapi/mkl/latest/include:$PATH 54 | ENV CMAKE_PREFIX_PATH=/opt/intel/oneapi/mkl/latest/lib/intel64:$CMAKE_PREFIX_PATH 55 | ENV CMAKE_INCLUDE_PATH=/opt/intel/oneapi/mkl/latest/include:$PATH:$CMAKE_INCLUDE_PATH 56 | 57 | # TODO: Merge with above when ready 58 | ENV BUILD_CAFFE2_OPS=OFF \ 59 | BUILD_CAFFE2=OFF \ 60 | BUILD_TEST=OFF \ 61 | USE_CUDA=ON \ 62 | USE_OPENCV=OFF \ 63 | USE_FFMPEG=OFF \ 64 | USE_LEVELDB=OFF \ 65 | USE_KINETO=OFF \ 66 | USE_REDIS=OFF \ 67 | USE_DISTRIBUTED=OFF \ 68 | USE_QNNPACK=ON \ 69 | USE_FBGEMM=ON \ 70 | USE_NNPACK=ON \ 71 | USE_MKLDNN=ON \ 72 | BLAS=MKL \ 73 | MKLDNN_CPU_RUNTIME=$MKL_THREADING_LIBRARY \ 74 | TORCH_CUDA_ARCH_LIST=$CUDA_ARCH_LIST 75 | 76 | # PyTorch 77 | RUN git clone https://github.com/pytorch/pytorch /opt/pytorch && \ 78 | cd /opt/pytorch && \ 79 | git checkout v${PYTORCH_VERSION} && \ 80 | git submodule update --init --recursive && \ 81 | python3 -m pip install -r requirements.txt && \ 82 | python3 setup.py bdist_wheel && \ 83 | ls dist/ | grep -i ".whl" | xargs -I % sh -c 'cp /opt/pytorch/dist/% ${FRAMEWORK_BINARIES_FOLDER}/' 84 | 85 | 86 | 87 | # TensorFlow 88 | RUN git clone https://github.com/tensorflow/tensorflow /opt/tensorflow && \ 89 | cd /opt/tensorflow && \ 90 | git checkout v${TENSORFLOW_VERSION} 91 | 92 | COPY 
docker/.tf_configure.bazelrc /opt/tensorflow/.tf_configure.bazelrc 93 | RUN cd /opt/tensorflow && \ 94 | python3 -m pip install -U --user pip numpy wheel && \ 95 | python3 -m pip install -U --user keras_preprocessing --no-deps && \ 96 | bazel build \ 97 | --config=cuda \ 98 | --config=v2 \ 99 | --config=opt \ 100 | --config=mkl \ 101 | --config=numa \ 102 | --config=noaws \ 103 | --config=nogcp \ 104 | --config=nohdfs \ 105 | --config=nonccl \ 106 | //tensorflow/tools/pip_package:build_pip_package 107 | 108 | RUN cd /opt/tensorflow && \ 109 | ./bazel-bin/tensorflow/tools/pip_package/build_pip_package /tmp/tensorflow_pkg && \ 110 | ls /tmp/tensorflow_pkg | grep -i ".whl" | xargs -I % sh -c 'cp /tmp/tensorflow_pkg/% ${FRAMEWORK_BINARIES_FOLDER}/' 111 | 112 | 113 | # ONNX Runtime 114 | RUN git clone https://github.com/microsoft/onnxruntime opt/onnxruntime && \ 115 | cd /opt/onnxruntime && \ 116 | ./build.sh --config=Release --parallel --cmake_generator=Ninja --enable_pybind --build_wheel --enable_lto --use_openmp --skip_tests --skip_onnx_tests && \ 117 | ls /opt/onnxruntime/build/Linux/Release/dist/ | grep -i ".whl" | xargs -I % sh -c 'cp /opt/onnxruntime/build/Linux/Release/dist/% ${FRAMEWORK_BINARIES_FOLDER}/' 118 | 119 | FROM nvidia/cuda:11.2.0-cudnn8-runtime-ubuntu20.04 120 | 121 | RUN apt update && \ 122 | apt install -y \ 123 | python3 \ 124 | python3-pip \ 125 | numactl \ 126 | libtcmalloc-minimal4 \ 127 | wget 128 | 129 | # Install oneAPI repo 130 | RUN wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB && \ 131 | apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB && \ 132 | rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB && \ 133 | echo "deb https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list 134 | 135 | RUN apt update && apt install -y \ 136 | intel-oneapi-mkl \ 137 | intel-oneapi-runtime-openmp && \ 138 | rm -rf /var/lib/apt/lists/* 139 | 140 | ENV LD_LIBRARY_PATH='/usr/local/cuda/compat:/opt/intel/oneapi/tbb/latest/env/lib/intel64/gcc4.8:/opt/intel/oneapi/mkl/latest/lib/intel64' 141 | ENV LIBRARY_PATH='/opt/intel/oneapi/tbb/latest/lib/intel64/gcc4.8:/opt/intel/oneapi/mkl/latest/lib/intel64' 142 | ENV MKLROOT='/opt/intel/oneapi/mkl/latest' 143 | 144 | # Copy 145 | COPY --from=builder /opt/bin /opt 146 | 147 | # Install frameworks 148 | RUN ls /opt/*whl | xargs python3 -m pip install 149 | 150 | # Copy tune 151 | COPY . 
/opt/tune 152 | 153 | WORKDIR /opt/tune 154 | RUN python3 -m pip install -r requirements.txt 155 | 156 | WORKDIR /opt/tune 157 | RUN python3 -m pip install -r requirements.txt -------------------------------------------------------------------------------- /docker/oneAPI.repo: -------------------------------------------------------------------------------- 1 | [oneAPI] 2 | name=Intel(R) oneAPI repository 3 | baseurl=https://yum.repos.intel.com/oneapi 4 | enabled=1 5 | gpgcheck=1 6 | repo_gpgcheck=1 7 | gpgkey=https://yum.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB -------------------------------------------------------------------------------- /intel-requirements.txt: -------------------------------------------------------------------------------- 1 | omegaconf>=2.1.0dev20 2 | hydra-core>=1.1.0.dev5 3 | torch 4 | intel-tensorflow 5 | onnxruntime 6 | psutil 7 | pandas 8 | rich 9 | transformers 10 | multiprocess 11 | sympy -------------------------------------------------------------------------------- /launcher.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Intel Corporation. 2 | # Copyright 2021 Hugging Face Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | from __future__ import absolute_import, division, print_function, unicode_literals 17 | 18 | from getpass import getpass 19 | from random import getrandbits 20 | 21 | from binascii import hexlify 22 | 23 | import sys 24 | import platform 25 | import subprocess 26 | import os 27 | from os.path import expanduser 28 | import re 29 | import glob 30 | from argparse import ArgumentParser, REMAINDER 31 | from argparse import RawTextHelpFormatter 32 | import logging 33 | import psutil 34 | 35 | from utils import CPUinfo 36 | 37 | logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') 38 | LOGGER = logging.getLogger(__name__) 39 | 40 | r""" 41 | This is a script for launching PyTorch training and inference on Intel Xeon CPU with optimal configurations. 42 | Now, single instance inference/training, multi-instance inference/training and distributed training 43 | with oneCCL backend is enabled. 44 | 45 | To get the peak performance on Intel Xeon CPU, the script optimizes the configuration of thread and memory 46 | management. For thread management, the script configures thread affinity and the preload of Intel OMP library. 47 | For memory management, it configures NUMA binding and preload optimized memory allocation library (e.g. tcmalloc, jemalloc). 48 | 49 | **How to use this module:** 50 | 51 | *** Single instance inference/training *** 52 | 53 | 1. Run single-instance inference or training on a single node with all CPU sockets. 54 | 55 | :: 56 | 57 | >>> python -m intel_pytorch_extension.launch script.py args 58 | 59 | 2. Run single-instance inference or training on a single CPU socket. 
60 | 
61 | ::
62 | 
63 |    >>> python -m intel_pytorch_extension.launch --socket_id 1 script.py args
64 | 
65 | *** Multi-instance inference ***
66 | 
67 | 1. Multi-instance
68 |    By default, one instance per socket. If you want to set the number of instances and cores per instance, --ninstances and --ncore_per_instance should be set.
69 | 
70 | 
71 |    >>> python -m intel_pytorch_extension.launch --multi_instance python_script args
72 | 
73 |    eg: on CLX8280 with 14 instances, 4 cores per instance
74 |    ::
75 | 
76 |    >>> python -m intel_pytorch_extension.launch --multi_instance --ninstances 14 --ncore_per_instance 4 python_script args
77 | 
78 | 
79 | *** Distributed Training ***
80 | 
81 | Spawns multiple distributed training processes on each of the training nodes. For intel_pytorch_extension, oneCCL
82 | is used as the communication backend and MPI is used to launch multi-proc. To get better
83 | performance, you should assign separate cores to the oneCCL communication process and to the computation
84 | process. This tool can automatically set these ENVs (such as I_MPI_PIN_DOMAIN) and launch
85 | multi-proc for you.
86 | 
87 | The utility can be used for single-node distributed training, in which one or
88 | more processes per node will be spawned. It can also be used in
89 | multi-node distributed training, by spawning multiple processes on each node
90 | for improved multi-node distributed training performance.
91 | 
92 | 
93 | 1. Single-Node multi-process distributed training
94 | 
95 | ::
96 | 
97 |    >>> python -m intel_pytorch_extension.launch --distributed python_script --arg1 --arg2 --arg3 and all other
98 |    arguments of your training script
99 | 
100 | 2. Multi-Node multi-process distributed training: (e.g. two nodes)
101 | 
102 | 
103 | rank 0: *(IP: 192.168.10.10, and has a free port: 295000)*
104 | 
105 | ::
106 | 
107 |    >>> python -m intel_pytorch_extension.launch --distributed --nproc_per_node=xxx
108 |    --nnodes=2 --hostfile hostfile python_script --arg1 --arg2 --arg3
109 |    and all other arguments of your training script
110 | 
111 | 
112 | 3. To look up what optional arguments this module offers:
113 | 
114 | ::
115 | 
116 |    >>> python -m intel_pytorch_extension.launch --help
117 | 
118 | *** Memory allocator ***
119 | 
120 | "--enable_tcmalloc" and "--enable_jemalloc" can be used to enable different memory allocators.
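
eg (illustrative): enable tcmalloc when launching the benchmark, mirroring the README usage of this repository's launcher

::

   >>> python launcher.py --enable_tcmalloc -- src/main.py model=bert-base-cased batch_size=1 sequence_length=32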
121 | 122 | """ 123 | 124 | SUDOER_PASSWORD = None 125 | THP_ALLOWED_VALUES = {'always', 'never', 'madvise'} 126 | THP_COMMON_LOCATION = "/sys/kernel/mm/transparent_hugepage/enabled" 127 | THP_REDHAT_LOCATION = "/sys/kernel/mm/redhat_transparent_hugepage/enabled" 128 | 129 | THP_LOCATION = THP_REDHAT_LOCATION if os.path.exists(THP_REDHAT_LOCATION) else THP_COMMON_LOCATION 130 | 131 | 132 | def get_transparent_huge_pages(): 133 | if os.path.exists(THP_LOCATION): 134 | with open(THP_LOCATION) as f: 135 | tbh_status = f.read().rstrip() # Remove newline 136 | tbh_value = re.search("\\[(.*)\\]", tbh_status) 137 | 138 | if tbh_value is not None and tbh_value.group(1) in THP_ALLOWED_VALUES: 139 | return tbh_value.group(1) 140 | return None 141 | 142 | 143 | def set_transparent_huge_pages(tbh_value, elevation_pwd=None): 144 | if not tbh_value or tbh_value not in THP_ALLOWED_VALUES: 145 | print(f"Provided TBH value to be set is not valid {tbh_value}") 146 | return 147 | 148 | if os.path.exists(THP_LOCATION): 149 | # Clear memory cache on kernel level 150 | print("Clearing kernel memory cache: 'echo 3 > /proc/sys/vm/drop_caches'") 151 | code = subprocess.call(f'echo {elevation_pwd} | sudo -S sh -c "sync;echo 3 > /proc/sys/vm/drop_caches"', shell=True) 152 | if code != 0: 153 | print(f"Unable to clear kernel memory cache, return code={code}") 154 | 155 | # Explicitly ask for huge pages 156 | print(f'Setting Transparent Huge Page to status: "echo {tbh_value} > {THP_LOCATION}"') 157 | code = subprocess.call(f'echo {elevation_pwd} | sudo -S sh -c "echo {tbh_value} > {THP_LOCATION}"', shell=True) 158 | if code != 0: 159 | print(f"Unable to set kernel transparent huge pages, return code={code}") 160 | else: 161 | print("Warning: Unable to enable Transparent HugePages.") 162 | 163 | 164 | def set_mpi_pin_domain(args): 165 | """ 166 | I_MPI_PIN_DOMAIN specify the cores used for every MPI process. 167 | The first ccl_worker_count cores of every rank for ccl communication 168 | and the other cores will be used to do computation. 169 | For example: on CascadeLake 8280 CPU, 2 ranks on one node. ccl_worker_count=4 170 | CCL_WORKER_COUNT=4 171 | CCL_WORKER_AFFINITY="0,1,2,3,28,29,30,31" 172 | I_MPI_PIN_DOMAIN=[0xffffff0, 0xffffff0000000] 173 | """ 174 | cpuinfo = CPUinfo() 175 | ppn = args.nproc_per_node 176 | total_cores = cpuinfo.physical_core_nums 177 | 178 | if args.use_logical_core: 179 | total_cores = cpuinfo.logical_core_nums 180 | cores_per_rank = total_cores // ppn 181 | pin_domain = "[" 182 | 183 | for proc in range(ppn): 184 | domain_binary = 0 185 | begin = proc * cores_per_rank + args.ccl_worker_count 186 | end = proc * cores_per_rank + cores_per_rank - 1 187 | for i in range(begin, end + 1): 188 | domain_binary |= (1 << i) 189 | pin_domain += hex(domain_binary) + "," 190 | return pin_domain + "]" 191 | 192 | 193 | def set_ccl_worker_affinity(args): 194 | """ 195 | computation and communication use different cores when using oneCCL 196 | backend for distributed training. 
we use first ccl_worker_count cores of 197 | every rank for ccl communication 198 | """ 199 | cpuinfo = CPUinfo() 200 | ppn = args.nproc_per_node 201 | total_cores = cpuinfo.physical_core_nums 202 | if args.use_logical_core: 203 | total_cores = cpuinfo.logical_core_nums 204 | cores_per_rank = total_cores // ppn 205 | affinity = '' 206 | 207 | for proc in range(ppn): 208 | for ccl_worker in range(args.ccl_worker_count): 209 | affinity += str(proc * cores_per_rank + ccl_worker) + "," 210 | os.environ["CCL_WORKER_AFFINITY"] = affinity 211 | 212 | 213 | def add_lib_preload(lib_type=None): 214 | """ 215 | Enable TCMalloc/JeMalloc/iomp 216 | """ 217 | library_paths = [] 218 | 219 | # We export path library through $_LIBRARY_PATH 220 | if f"{lib_type.upper()}_LIBRARY_PATH" in os.environ: 221 | library_paths.append(os.environ[f"{lib_type.upper()}_LIBRARY_PATH"]) 222 | 223 | if "CONDA_PREFIX" in os.environ: 224 | library_paths.append(os.environ["CONDA_PREFIX"] + "/lib/") 225 | 226 | library_paths += [ 227 | f"{expanduser('~')}/.local/lib/", 228 | "/usr/local/lib/", 229 | "/usr/local/lib64/", 230 | "/usr/lib/", 231 | "/usr/lib64/" 232 | ] 233 | 234 | lib_find = False 235 | for lib_path in library_paths: 236 | if not lib_path.endswith("/"): 237 | lib_path += "/" 238 | library_file = lib_path + "lib" + lib_type + ".so" 239 | matches = glob.glob(library_file) 240 | if len(matches) > 0: 241 | if "LD_PRELOAD" in os.environ: 242 | os.environ["LD_PRELOAD"] = matches[0] + ":" + os.environ["LD_PRELOAD"] 243 | else: 244 | os.environ["LD_PRELOAD"] = matches[0] 245 | print(f"{lib_type} found at: {matches}") 246 | lib_find = True 247 | break 248 | return lib_find 249 | 250 | 251 | def set_memory_allocator(args): 252 | if args.enable_tcmalloc and args.enable_jemalloc: 253 | LOGGER.error("Unable to enable TCMalloc and JEMalloc at the same time") 254 | exit(-1) 255 | 256 | if args.enable_tcmalloc: 257 | find_tc = add_lib_preload(lib_type="tcmalloc") 258 | if not find_tc: 259 | LOGGER.warning( 260 | "Unable to find the {} library file lib{}.so in $CONDA_PREFIX/lib or /.local/lib/" 261 | " or /usr/local/lib/ or /usr/local/lib64/ or /usr/lib or /usr/lib64 or " 262 | "~/.local/lib/ so the LD_PRELOAD environment variable will not be set." 263 | .format("TCmalloc", "tcmalloc", expanduser("~")) 264 | ) 265 | args.additional_benchmark_args.append("+malloc=std") 266 | else: 267 | LOGGER.info("Use TCMalloc memory allocator") 268 | args.additional_benchmark_args.append("+malloc=tcmalloc") 269 | 270 | elif args.enable_jemalloc: 271 | find_je = add_lib_preload(lib_type="jemalloc") 272 | if not find_je: 273 | LOGGER.warning( 274 | "Unable to find the {} library file lib{}.so in $CONDA_PREFIX/lib or /.local/lib/" 275 | " or /usr/local/lib/ or /usr/local/lib64/ or /usr/lib or /usr/lib64 or " 276 | "~/.local/lib/ so the LD_PRELOAD environment variable will not be set." 
277 | .format("JeMalloc", "jemalloc", expanduser("~")) 278 | ) 279 | args.additional_benchmark_args.append("+malloc=std") 280 | else: 281 | LOGGER.info("Use JeMalloc memory allocator") 282 | args.additional_benchmark_args.append("+malloc=jemalloc") 283 | if "MALLOC_CONF" not in os.environ: 284 | os.environ["MALLOC_CONF"] = args.malloc_conf 285 | LOGGER.info("MALLOC_CONF={}".format(os.environ["MALLOC_CONF"])) 286 | 287 | elif args.use_default_allocator: 288 | args.additional_benchmark_args.append("+malloc=std") 289 | 290 | else: 291 | find_tc = add_lib_preload(lib_type="tcmalloc") 292 | if find_tc: 293 | LOGGER.info("Use TCMalloc memory allocator") 294 | args.additional_benchmark_args.append("+malloc=tcmalloc") 295 | if "MALLOC_CONF" not in os.environ: 296 | os.environ["MALLOC_CONF"] = args.malloc_conf 297 | LOGGER.info("MALLOC_CONF={}".format(os.environ["MALLOC_CONF"])) 298 | return 299 | 300 | find_je = add_lib_preload(lib_type="jemalloc") 301 | if find_je: 302 | LOGGER.info("Use JeMalloc memory allocator") 303 | args.additional_benchmark_args.append("+malloc=jemalloc") 304 | if "MALLOC_CONF" not in os.environ: 305 | os.environ["MALLOC_CONF"] = args.malloc_conf 306 | LOGGER.info("MALLOC_CONF={}".format(os.environ["MALLOC_CONF"])) 307 | return 308 | 309 | LOGGER.warning( 310 | "Both TCMalloc and JeMalloc are not fount in $CONDA_PREFIX/lib or /.local/lib/" 311 | " or /usr/local/lib/ or /usr/local/lib64/ or /usr/lib or /usr/lib64 or " 312 | "~/.local/lib/ so the LD_PRELOAD environment variable will not be set. " 313 | "This may drop the performance" 314 | .format(expanduser("~")) 315 | ) 316 | args.additional_benchmark_args.append(f"+malloc=std") 317 | 318 | 319 | def set_multi_thread_and_allocator(args): 320 | set_memory_allocator(args) 321 | 322 | if args.enable_thp: 323 | SUDOER_PASSWORD = getpass("Setting Transparent Huge Page requires elevated privileges.\nPassword:") 324 | set_transparent_huge_pages("always", SUDOER_PASSWORD) 325 | 326 | if "THP_STATUS" not in os.environ: 327 | os.environ["THP_STATUS"] = get_transparent_huge_pages() 328 | 329 | if "OMP_NUM_THREADS" not in os.environ: 330 | os.environ["OMP_NUM_THREADS"] = str(args.ncore_per_instance) 331 | elif "OMP_NUM_THREADS" in os.environ: 332 | args.ncore_per_instance = int(os.environ["OMP_NUM_THREADS"]) 333 | 334 | if "OMP_MAX_ACTIVE_LEVELS" not in os.environ: 335 | os.environ["OMP_MAX_ACTIVE_LEVELS"] = str(args.omp_max_active_levels) 336 | else: 337 | args.omp_max_active_levels = int(os.environ["OMP_MAX_ACTIVE_LEVELS"]) 338 | 339 | if "KMP_AFFINITY" not in os.environ: 340 | os.environ["KMP_AFFINITY"] = args.kmp_affinity 341 | 342 | if "KMP_BLOCKTIME" not in os.environ: 343 | os.environ["KMP_BLOCKTIME"] = args.kmp_blocktime 344 | 345 | if "DNNL_PRIMITIVE_CACHE_CAPACITY" not in os.environ: 346 | os.environ["DNNL_PRIMITIVE_CACHE_CAPACITY"] = '1024' 347 | 348 | LOGGER.info(f"OMP_NUM_THREADS={os.environ['OMP_NUM_THREADS']}") 349 | LOGGER.info(f"OMP_MAX_ACTIVE_LEVELS={os.environ['OMP_MAX_ACTIVE_LEVELS']}") 350 | LOGGER.info(f"KMP_AFFINITY={os.environ['KMP_AFFINITY']}") 351 | LOGGER.info(f"KMP_BLOCKTIME={os.environ['KMP_BLOCKTIME']}") 352 | LOGGER.info(f"DNNL_PRIMITIVE_CACHE_CAPACITY={os.environ['DNNL_PRIMITIVE_CACHE_CAPACITY']}") 353 | 354 | omp_backend = "default" 355 | if args.enable_iomp: 356 | find_iomp = add_lib_preload(lib_type="iomp5") 357 | if not find_iomp: 358 | LOGGER.warning("Unable to find the {} library file lib{}.so in $CONDA_PREFIX/lib or /.local/lib/" 359 | " or /usr/local/lib/ or /usr/local/lib64/ or /usr/lib or 
/usr/lib64 or " 360 | "~/.local/lib/ so the LD_PRELOAD environment variable will not be set." 361 | .format("iomp", "iomp", expanduser("~"))) 362 | else: 363 | omp_backend = "iomp" 364 | 365 | # Add any additional arguments for the benchmark script 366 | args.additional_benchmark_args.append(f"backend.num_threads={os.environ['OMP_NUM_THREADS']}") 367 | args.additional_benchmark_args.append(f"+openmp.backend={omp_backend}") 368 | args.additional_benchmark_args.append(f"+openmp.num_threads={os.environ['OMP_NUM_THREADS']}") 369 | args.additional_benchmark_args.append(f"+openmp.max_active_levels={os.environ['OMP_MAX_ACTIVE_LEVELS']}") 370 | args.additional_benchmark_args.append(f'+openmp.affinity="{os.environ["KMP_AFFINITY"]}"') 371 | args.additional_benchmark_args.append(f"+openmp.blocktime={os.environ['KMP_BLOCKTIME']}") 372 | args.additional_benchmark_args.append(f"use_huge_page={os.environ['THP_STATUS']}") 373 | 374 | 375 | def launch(args): 376 | """ 377 | single-instance / multi-instance launcher 378 | """ 379 | cores, processes = [], [] 380 | cpuinfo = CPUinfo() 381 | 382 | if args.core_list: # the user specified which cores to use via --core_list 383 | cores = args.core_list.strip().split(",") 384 | if args.ncore_per_instance == -1: 385 | LOGGER.error("please specify '--ncore_per_instance' when passing the '--core_list' parameter") 386 | exit(-1) 387 | elif args.ninstances > 1 and args.ncore_per_instance * args.ninstances < len(cores): 388 | LOGGER.warning("only the first {} cores will be used, although {} cores were specified in core_list".format 389 | (args.ncore_per_instance * args.ninstances, len(cores))) 390 | else: 391 | args.ninstances = len(cores) // args.ncore_per_instance 392 | else: 393 | if args.use_logical_core: 394 | if args.socket_id != -1: 395 | cores = cpuinfo.get_socket_logical_cores(args.socket_id) 396 | else: 397 | cores = cpuinfo.get_all_logical_cores 398 | else: 399 | if args.socket_id != -1: 400 | cores = cpuinfo.get_socket_physical_cores(args.socket_id) 401 | else: 402 | cores = cpuinfo.get_all_physical_cores 403 | 404 | if not args.multi_instance and args.ninstances == -1 and args.ncore_per_instance == -1: 405 | args.ninstances = 1 406 | args.ncore_per_instance = len(cores) 407 | elif args.multi_instance and args.ninstances == -1 and args.ncore_per_instance == -1: 408 | args.throughput_performance = True 409 | elif args.ncore_per_instance == -1 and args.ninstances != -1: 410 | args.ncore_per_instance = len(cores) // args.ninstances 411 | elif args.ncore_per_instance != -1 and args.ninstances == -1: 412 | args.ninstances = len(cores) // args.ncore_per_instance 413 | else: 414 | if args.ninstances * args.ncore_per_instance > len(cores): 415 | LOGGER.error("Please make sure ninstances * ncore_per_instance <= total_cores") 416 | exit(-1) 417 | 418 | if args.latency_performance: 419 | if args.ncore_per_instance != 4: 420 | LOGGER.warning("latency_performance is a special mode; ncore_per_instance is forced to 4") 421 | args.ncore_per_instance = 4 422 | cores = cpuinfo.get_all_physical_cores 423 | args.ninstances = len(cores) // args.ncore_per_instance 424 | 425 | if args.throughput_performance: 426 | args.ninstances = cpuinfo.socket_nums 427 | cores = cpuinfo.get_all_physical_cores 428 | args.ncore_per_instance = len(cores) // args.ninstances 429 | 430 | os.environ["LAUNCH_CMD"] = "#" 431 | os.environ["LAUNCH_THP"] = get_transparent_huge_pages() 432 | os.environ["EXPERIMENT_ID"] = hexlify(getrandbits(32).to_bytes(4, 'big')).decode('ascii') 433 | 434 | 
set_multi_thread_and_allocator(args) 435 | args.additional_benchmark_args.append(f"num_instances={args.ninstances}") 436 | args.additional_benchmark_args.append(f"num_core_per_instance={args.ncore_per_instance}") 437 | args.additional_benchmark_args.append(f"experiment_id={os.environ['EXPERIMENT_ID']}") 438 | 439 | for i in range(args.ninstances): 440 | cmd, instance_specific_args = [], [] 441 | instance_specific_args.append(f"instance_id={i}") 442 | if not args.disable_numactl: 443 | instance_cores = cores[i * args.ncore_per_instance:(i + 1) * args.ncore_per_instance] 444 | instance_sockets = cpuinfo.get_sockets_for_cores(instance_cores) 445 | 446 | # Convert to numactl string argument 447 | instance_cores_str = ",".join(instance_cores) 448 | instance_sockets_str = ",".join(instance_sockets) 449 | 450 | # Generate numactl call 451 | cmd = ["numactl"] 452 | numa_params = "-C {} ".format(instance_cores_str) 453 | numa_params += "-m {}".format(instance_sockets_str) 454 | cmd.extend(numa_params.split()) 455 | 456 | instance_specific_args.append(f"+numactl.enabled=true") 457 | instance_specific_args.append(f"+numactl.cores=\"{instance_cores_str}\"") 458 | instance_specific_args.append(f"+numactl.membind=\"{instance_sockets_str}\"") 459 | else: 460 | instance_specific_args.append(f"+numactl.enabled=false") 461 | 462 | with_python = not args.no_python 463 | if with_python: 464 | cmd.append(sys.executable) 465 | 466 | if args.module: 467 | cmd.append("-m") 468 | 469 | if "LD_PRELOAD" in os.environ: 470 | instance_specific_args.append("+ld_preload=\"" + os.environ["LD_PRELOAD"] + "\"") 471 | else: 472 | instance_specific_args.append("+ld_preload=\"\"") 473 | 474 | cmd.append(args.program) 475 | cmd.extend(args.program_args) 476 | cmd.extend(args.additional_benchmark_args) 477 | cmd.extend(instance_specific_args) 478 | 479 | os.environ["LAUNCH_CMD"] += " ".join(cmd) + ",#" 480 | 481 | process = subprocess.Popen(cmd, env=os.environ) 482 | processes.append(process) 483 | 484 | os.environ["LAUNCH_CMD"] = os.environ["LAUNCH_CMD"][:-2] 485 | for process in processes: 486 | process.wait() 487 | if process.returncode != 0: 488 | raise subprocess.CalledProcessError(returncode=process.returncode, cmd=cmd) 489 | 490 | if args.enable_thp: 491 | # reset to existing val 492 | set_transparent_huge_pages(os.environ["LAUNCH_THP"], SUDOER_PASSWORD) 493 | 494 | print(f"Experiment results saved at: {os.path.join('outputs', os.environ['EXPERIMENT_ID'])}") 495 | 496 | 497 | def mpi_dist_launch(args): 498 | """ 499 | Set ENVs and launch MPI process for distributed training. 
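    When nnodes > 1, the hostfile is first validated (IPv4 entries, password-less SSH
    reachability from the master node) and its first entry becomes MASTER_ADDR. The function
    then exports MASTER_ADDR/MASTER_PORT, I_MPI_PIN_DOMAIN, OMP_NUM_THREADS and the CCL worker
    count/affinity/transport before building and running an mpiexec.hydra command line, e.g.
    (illustrative values only): mpiexec.hydra -l -np 4 -ppn 2 -genv I_MPI_PIN_DOMAIN=<domain>
    -genv OMP_NUM_THREADS=<n> -hostfile hostfile python -u your_script.py <args>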
500 | """ 501 | if args.nnodes > 1 and not os.path.exists(args.hostfile): 502 | raise ValueError("A hostfile is required for multi-node distributed training. " 503 | "Please create a hostfile listing the IP addresses of the nodes used for the run.") 504 | elif args.nnodes > 1: 505 | ipv4_addr_pattern = r"^(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$" 506 | ip_list = [] 507 | with open(args.hostfile) as f: 508 | for line in f: 509 | line = line.strip().strip("\n") 510 | is_valid = re.match(ipv4_addr_pattern, line) 511 | if not is_valid: 512 | LOGGER.error(f"{line} is not a valid IPv4 address") 513 | exit(-1) 514 | else: 515 | ip_list.append(line) 516 | if len(ip_list) < args.nnodes: 517 | LOGGER.error(f"The number of IPs in the hostfile ({len(ip_list)}) must be at least nnodes ({args.nnodes})") 518 | exit(-1) 519 | master_check = False 520 | dic = psutil.net_if_addrs() 521 | for adapter in dic: 522 | snicList = dic[adapter] 523 | for snic in snicList: 524 | if snic.address == ip_list[0]: 525 | master_check = True 526 | if not master_check: 527 | LOGGER.error( 528 | f"MASTER_ADDR is incorrect. Please make sure the first IP {ip_list[0]} " 529 | f"in your hostfile belongs to the current node" 530 | ) 531 | exit(-1) 532 | 533 | LOGGER.info("Validating SSH connectivity to the other nodes") 534 | args.master_addr = ip_list[0] 535 | for ip in ip_list[1:]: 536 | completed_process = subprocess.run("ssh -o PasswordAuthentication=no {} ':'".format(ip), shell=True) 537 | if completed_process.returncode != 0: 538 | LOGGER.error( 539 | f"Password-less SSH login to {ip} failed, " 540 | f"please make sure your SSH public keys are set up correctly" 541 | ) 542 | exit(-1) 543 | else: 544 | LOGGER.info("connection from master node {} to worker node {} is OK".format(args.master_addr, ip)) 545 | 546 | set_memory_allocator(args) 547 | 548 | # Set distributed-training-related environment variables 549 | os.environ["MASTER_ADDR"] = args.master_addr 550 | os.environ["MASTER_PORT"] = str(args.master_port) 551 | 552 | if "I_MPI_PIN_DOMAIN" not in os.environ: 553 | mpi_pin_domain = set_mpi_pin_domain(args) 554 | else: 555 | mpi_pin_domain = os.environ["I_MPI_PIN_DOMAIN"] 556 | 557 | cpuinfo = CPUinfo() 558 | ppn = args.nproc_per_node 559 | total_cores = len(cpuinfo.get_all_physical_cores) 560 | cores_per_rank = total_cores // ppn 561 | 562 | if "OMP_NUM_THREADS" not in os.environ: 563 | omp_num_threads = cores_per_rank - args.ccl_worker_count 564 | else: 565 | omp_num_threads = os.environ["OMP_NUM_THREADS"] 566 | 567 | os.environ["CCL_WORKER_COUNT"] = str(args.ccl_worker_count) 568 | 569 | if "CCL_WORKER_AFFINITY" not in os.environ: 570 | set_ccl_worker_affinity(args) 571 | 572 | if "CCL_ATL_TRANSPORT" not in os.environ: 573 | os.environ["CCL_ATL_TRANSPORT"] = "ofi" 574 | 575 | if args.enable_iomp: 576 | find_iomp = add_lib_preload(lib_type="iomp5") 577 | if not find_iomp: 578 | LOGGER.warning("Unable to find the {} library file lib{}.so in $CONDA_PREFIX/lib or /.local/lib/" 579 | " or /usr/local/lib/ or /usr/local/lib64/ or /usr/lib or /usr/lib64 or " 580 | "~/.local/lib/ so the LD_PRELOAD environment variable will not be set."
581 | .format("iomp", "iomp", expanduser("~"))) 582 | else: 583 | LOGGER.info("Enabled iomp via LD_PRELOAD") 584 | 585 | LOGGER.info("MASTER_ADDR={}".format(args.master_addr)) 586 | LOGGER.info("MASTER_PORT={}".format(args.master_port)) 587 | LOGGER.info("I_MPI_PIN_DOMAIN={}".format(mpi_pin_domain)) 588 | LOGGER.info("OMP_NUM_THREADS={} ".format(omp_num_threads)) 589 | LOGGER.info("CCL_WORKER_COUNT={}".format(args.ccl_worker_count)) 590 | LOGGER.info("CCL_WORKER_AFFINITY={}".format(os.environ["CCL_WORKER_AFFINITY"])) 591 | 592 | os.environ["LAUNCH_CMD"] = "#" 593 | cmd = ['mpiexec.hydra'] 594 | mpi_config = "-l -np {} -ppn {} -genv I_MPI_PIN_DOMAIN={} -genv OMP_NUM_THREADS={} ".format( 595 | args.nnodes*args.nproc_per_node, args.nproc_per_node, mpi_pin_domain, omp_num_threads 596 | ) 597 | mpi_config += args.more_mpi_parms 598 | 599 | if args.nnodes > 1: 600 | mpi_config += " -hostfile {}".format(args.hostfile) 601 | cmd.extend(mpi_config.split()) 602 | with_python = not args.no_python 603 | 604 | if with_python: 605 | cmd.append(sys.executable) 606 | cmd.append("-u") 607 | 608 | if args.module: 609 | cmd.append("-m") 610 | 611 | cmd.append(args.program) 612 | cmd.extend(args.program_args) 613 | process = subprocess.Popen(cmd, env=os.environ) 614 | process.wait() 615 | os.environ["LAUNCH_CMD"] += " ".join(cmd) + ",#" 616 | os.environ["LAUNCH_CMD"] = os.environ["LAUNCH_CMD"][:-2] 617 | 618 | 619 | def add_distributed_training_params(parser): 620 | 621 | cpuinfo = CPUinfo() 622 | socket_nums = cpuinfo.socket_nums 623 | 624 | group = parser.add_argument_group("Distributed Training Parameters With oneCCL backend") 625 | group.add_argument("--nnodes", metavar='\b', type=int, default=1, 626 | help="The number of nodes to use for distributed " 627 | "training") 628 | group.add_argument("--nproc_per_node", metavar='\b', type=int, default=socket_nums, 629 | help="The number of processes to launch on each node") 630 | 631 | # ccl control 632 | group.add_argument("--ccl_worker_count", metavar='\b', default=4, type=int, 633 | help="Number of cores per rank reserved for CCL communication") 634 | 635 | # mpi control 636 | group.add_argument("--master_addr", metavar='\b', default="127.0.0.1", type=str, 637 | help="Master node (rank 0)'s address; it should be either " 638 | "the IP address or the hostname of node 0. For " 639 | "single-node multi-process training, " 640 | "--master_addr can simply be 127.0.0.1") 641 | group.add_argument("--master_port", metavar='\b', default=29500, type=int, 642 | help="Master node (rank 0)'s free port that needs to " 643 | "be used for communication during distributed " 644 | "training") 645 | group.add_argument("--hostfile", metavar='\b', default="hostfile", type=str, 646 | help="A hostfile is required for multi-node multi-process " 647 | "training. 
It lists the node addresses, " 648 | "one per line, given either as an IP address " 649 | "or a hostname.") 650 | group.add_argument("--more_mpi_parms", metavar='\b', default="", type=str, 651 | help="Additional parameters to pass to mpiexec.hydra, " 652 | "except for -np, -ppn, -hostfile and -genv I_MPI_PIN_DOMAIN") 653 | 654 | 655 | def add_memory_allocator_params(parser): 656 | 657 | group = parser.add_argument_group("Memory Allocator Parameters") 658 | 659 | # allocator control 660 | group.add_argument("--enable_tcmalloc", action='store_true', default=False, 661 | help="Enable the TCMalloc allocator") 662 | group.add_argument("--enable_jemalloc", action='store_true', default=False, 663 | help="Enable the JeMalloc allocator") 664 | group.add_argument("--use_default_allocator", action='store_true', default=False, 665 | help="Use the default memory allocator") 666 | group.add_argument("--malloc_conf", metavar='\b', default="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000", type=str, 667 | help="MALLOC_CONF setup, for JeMalloc only; the environment variable has higher priority than this argument. " 668 | "Default value: oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000") 669 | 670 | # transparent huge pages 671 | group.add_argument("--enable_thp", action="store_true", default=False, help="Enable Transparent Huge Pages") 672 | 673 | 674 | def add_multi_instance_params(parser): 675 | 676 | group = parser.add_argument_group("Multi-instance Parameters") 677 | 678 | # multi-instance control 679 | group.add_argument("--ncore_per_instance", metavar='\b', default=-1, type=int, 680 | help="Number of cores used by each instance") 681 | group.add_argument("--ninstances", metavar='\b', default=-1, type=int, 682 | help="Number of instances to run in multi-instance mode") 683 | group.add_argument("--latency_performance", action='store_true', default=False, 684 | help="By default 4 cores per instance, using all physical cores") 685 | group.add_argument("--throughput_performance", action='store_true', default=False, 686 | help="By default one instance per socket, using all physical cores") 687 | group.add_argument("--socket_id", metavar='\b', default=-1, type=int, 688 | help="Socket id for multi-instance; by default all sockets will be used") 689 | group.add_argument("--use_logical_core", action='store_true', default=False, 690 | help="Use logical cores (including hyper-threads) instead of only physical cores") 691 | group.add_argument("--disable_numactl", action='store_true', default=False, 692 | help="Disable numactl") 693 | group.add_argument("--core_list", metavar='\b', default=None, type=str, 694 | help="Specify the core list as 'core_id,core_id,...'; otherwise, all the cores will be used.") 695 | 696 | 697 | def add_kmp_iomp_params(parser): 698 | 699 | group = parser.add_argument_group("KMP/IOMP Affinity Parameters") 700 | group.add_argument("--kmp_affinity", metavar='\b', default="granularity=fine,compact,1,0", type=str, 701 | help="KMP_AFFINITY setup; the environment variable has higher priority than this argument. " 702 | "Default value: granularity=fine,compact,1,0") 703 | group.add_argument("--kmp_blocktime", metavar='\b', default="1", type=str, 704 | help="KMP_BLOCKTIME setup; the environment variable has higher priority than this argument. "
705 | "Default value: 1") 706 | group.add_argument("--omp_max_active_levels", type=int, default=1, help="Set the OMP_MAX_ACTIVE_LEVELS environment variable.") 707 | group.add_argument("--enable_iomp", action='store_true', default=False, 708 | help="Enable iomp; libiomp5.so will be added to LD_PRELOAD") 709 | 710 | 711 | def parse_system_info(args): 712 | from platform import libc_ver, uname 713 | 714 | uname_info = uname() 715 | args.additional_benchmark_args.append(f"+system.name={uname_info.system}") 716 | args.additional_benchmark_args.append(f"+system.arch={uname_info.machine}") 717 | args.additional_benchmark_args.append(f"+system.kernel={uname_info.release}") 718 | args.additional_benchmark_args.append(f"+system.libc={libc_ver()[-1]}") 719 | 720 | 721 | def parse_args(): 722 | """ 723 | Helper function parsing the command line options 724 | @retval Namespace with the parsed arguments 725 | """ 726 | parser = ArgumentParser(description="This is a script for launching PyTorch training and inference on Intel Xeon CPU " 727 | "with optimal configurations. Single-instance inference/training, multi-instance " 728 | "inference/training and distributed training with the oneCCL backend are enabled. " 729 | "To get the peak performance on Intel Xeon CPU, the script optimizes the configuration " 730 | "of thread and memory management. For thread management, the script configures thread " 731 | "affinity and the preloading of the Intel OpenMP library. For memory management, it configures " 732 | "NUMA binding and preloads an optimized memory allocation library (e.g. tcmalloc, jemalloc). " 733 | "\n################################# Basic usage ############################# \n" 734 | "\n 1. single instance\n" 735 | "\n >>> python -m intel_pytorch_extension.launch python_script args \n" 736 | "\n2. multi-instance \n" 737 | "\n >>> python -m intel_pytorch_extension.launch --multi_instance python_script args\n" 738 | "\n3. Single-Node multi-process distributed training\n" 739 | "\n >>> python -m intel_pytorch_extension.launch --distributed python_script args\n" 740 | "\n4. Multi-Node multi-process distributed training: (e.g. two nodes)\n" 741 | "\n rank 0: *(IP: 192.168.10.10, and has a free port: 29500)*\n" 742 | "\n >>> python -m intel_pytorch_extension.launch --distributed --nproc_per_node=2\n" 743 | "\n --nnodes=2 --hostfile hostfile python_script args\n", 744 | formatter_class=RawTextHelpFormatter) 745 | 746 | parser.add_argument("--multi_instance", action='store_true', default=False, 747 | help="Enable multi-instance; by default one instance per socket") 748 | 749 | parser.add_argument('--distributed', action='store_true', default=False, 750 | help='Enable distributed training.') 751 | parser.add_argument("-m", "--module", default=False, action="store_true", 752 | help="Changes each process to interpret the launch script " 753 | "as a python module, executing with the same behavior as " 754 | "'python -m'.") 755 | 756 | parser.add_argument("--no_python", default=False, action="store_true", 757 | help="Do not prepend the --program script with \"python\" - just exec " 758 | "it directly. Useful when the script is not a Python script.") 759 | 760 | add_memory_allocator_params(parser) 761 | add_kmp_iomp_params(parser) 762 | 763 | add_distributed_training_params(parser) 764 | add_multi_instance_params(parser) 765 | 766 | # positional 767 | parser.add_argument("program", type=str, 768 | help="The full path to the program/script to be launched, 
" 769 | "followed by all the arguments for the script") 770 | 771 | # remaining arguments are forwarded to the launched program 772 | parser.add_argument('program_args', nargs=REMAINDER) 773 | return parser.parse_args() 774 | 775 | 776 | def main(): 777 | 778 | env_before = set(os.environ.keys()) 779 | if platform.system() == "Windows": 780 | raise RuntimeError("Windows platform is not supported") 781 | 782 | args = parse_args() 783 | args.additional_benchmark_args = [] 784 | 785 | parse_system_info(args) 786 | 787 | if args.distributed and args.multi_instance: 788 | raise RuntimeError("--distributed and --multi_instance cannot both be set") 789 | 790 | if args.latency_performance and args.throughput_performance: 791 | raise RuntimeError("--latency_performance and --throughput_performance cannot both be set") 792 | 793 | if args.nnodes > 1: 794 | args.distributed = True 795 | 796 | if args.distributed: 797 | mpi_dist_launch(args) 798 | else: 799 | launch(args) 800 | 801 | for x in sorted(set(os.environ.keys()) - env_before): 802 | LOGGER.debug(f'{x}={os.environ[x]}') 803 | 804 | 805 | if __name__ == "__main__": 806 | main() 807 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | omegaconf>=2.1.0dev20 2 | hydra-core>=1.1.0.dev5 3 | torch 4 | tensorflow 5 | onnxruntime 6 | psutil 7 | pandas 8 | rich 9 | transformers 10 | multiprocess 11 | sympy 12 | -------------------------------------------------------------------------------- /src/backends/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Hugging Face Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from abc import ABC, abstractmethod 16 | from dataclasses import dataclass 17 | from logging import getLogger 18 | from typing import Generic, TypeVar, ClassVar, List, Optional, Set, Tuple 19 | 20 | import numpy as np 21 | from hydra.types import TargetConf 22 | from omegaconf import MISSING 23 | from psutil import cpu_count 24 | from transformers import AutoTokenizer 25 | 26 | from benchmark import Benchmark 27 | 28 | LOGGER = getLogger("backends") 29 | 30 | 31 | @dataclass 32 | class BackendConfig(TargetConf): 33 | name: str = MISSING 34 | version: str = MISSING 35 | num_threads: Optional[int] = None 36 | num_interops_threads: Optional[int] = None 37 | 38 | @staticmethod 39 | @abstractmethod 40 | def version(): 41 | raise NotImplementedError() 42 | 43 | @staticmethod 44 | def supported_keys() -> Set[str]: 45 | return {"name", "version", "num_threads", "num_interops_threads"} 46 | 47 | 48 | BackendConfigT = TypeVar("BackendConfigT", bound=BackendConfig) 49 | class Backend(Generic[BackendConfigT], ABC): 50 | NAME: ClassVar[str] 51 | 52 | def __init__(self, model: str): 53 | self.model = model 54 | self.tokenizer = AutoTokenizer.from_pretrained(model) 55 | 56 | @classmethod 57 | @abstractmethod 58 | def allocate(cls, config: 'BenchmarkConfig'): 59 | raise NotImplementedError() 60 | 61 | def configure(self, config: BackendConfigT): 62 | if config.num_interops_threads is not None: 63 | if config.num_interops_threads == -1: 64 | config.num_interops_threads = cpu_count() 65 | 66 | if config.num_threads is not None: 67 | if config.num_threads == -1: 68 | config.num_threads = cpu_count() 69 | 70 | @abstractmethod 71 | def execute(self, config: 'BenchmarkConfig', is_reference: bool = False) -> Tuple[Benchmark, np.ndarray]: 72 | raise NotImplementedError() 73 | 74 | def clean(self, config: 'BenchmarkConfig'): 75 | pass 76 | 77 | def _get_dummy_token(self) -> str: 78 | if self.tokenizer.unk_token is not None: 79 | return self.tokenizer.unk_token 80 | else: 81 | return self.tokenizer.convert_tokens_to_string([1]) 82 | 83 | def _get_dummy_inputs(self, batch_size: int, seq_len: int) -> List[List[str]]: 84 | return [[self._get_dummy_token()] * seq_len] * batch_size 85 | 86 | -------------------------------------------------------------------------------- /src/backends/ort.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Hugging Face Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
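#
# ONNX Runtime backend: the Hugging Face model is first exported to an ONNX graph
# (transformers.convert_graph_to_onnx), optionally optimized with the onnxruntime
# transformers optimizer, and then executed through an InferenceSession whose
# SessionOptions (intra/inter op threads, execution mode, graph optimization level)
# are filled from the Hydra backend config defined below.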
14 | 15 | from dataclasses import dataclass 16 | from logging import getLogger 17 | from os import getpid 18 | from pathlib import Path 19 | from typing import Set, Optional, Tuple 20 | 21 | import numpy as np 22 | from onnxruntime import InferenceSession, SessionOptions, GraphOptimizationLevel, ExecutionMode, __version__ as ort_version 23 | from onnxruntime.transformers.optimizer import optimize_model 24 | from tqdm import trange 25 | from transformers import TensorType 26 | from transformers.convert_graph_to_onnx import convert as onnx_convert 27 | 28 | from backends import BackendConfig, Backend 29 | from benchmark import Benchmark 30 | from utils import SEC_TO_NS_SCALE 31 | 32 | 33 | ALL_GRAPH_OPTIMIZATION_LEVELS = { 34 | GraphOptimizationLevel.ORT_ENABLE_ALL, 35 | GraphOptimizationLevel.ORT_ENABLE_EXTENDED, 36 | GraphOptimizationLevel.ORT_ENABLE_BASIC, 37 | GraphOptimizationLevel.ORT_DISABLE_ALL 38 | } 39 | ALL_GRAPH_OPTIMIZATION_LEVELS_FROM_STR = { 40 | level.name: level 41 | for level in ALL_GRAPH_OPTIMIZATION_LEVELS 42 | } 43 | 44 | ALL_EXECUTION_MODE = { 45 | ExecutionMode.ORT_PARALLEL, 46 | ExecutionMode.ORT_SEQUENTIAL 47 | } 48 | 49 | ALL_EXECUTION_MODE_FROM_STR = { 50 | level.name: level 51 | for level in ALL_EXECUTION_MODE 52 | } 53 | 54 | 55 | @dataclass 56 | class OnnxRuntimeConfig(BackendConfig): 57 | name: str = "onnxruntime" 58 | opset: int = 12 59 | graph_optimisation_level: str = "ORT_ENABLE_ALL" 60 | execution_mode: str = "ORT_PARALLEL" 61 | 62 | @staticmethod 63 | def version() -> str: 64 | return ort_version 65 | 66 | @staticmethod 67 | def supported_keys() -> Set[str]: 68 | return BackendConfig.supported_keys().union({"opset", "graph_optimisation_level", "execution_mode"}) 69 | 70 | 71 | BACKEND_NAME = "onnxruntime" 72 | LOGGER = getLogger(BACKEND_NAME) 73 | ONNX_GRAPHS_FOLDER = "onnx_graphs" 74 | 75 | 76 | class OnnxRuntimeBackend(Backend[OnnxRuntimeConfig]): 77 | 78 | def __init__(self, model: str, onnx_path: str): 79 | super().__init__(model) 80 | 81 | self.onnx_path = onnx_path 82 | self.optimized_onnx_graph = None 83 | self.session_opts = SessionOptions() 84 | 85 | @staticmethod 86 | def convert(model: str, output: Path, opset: int = 12) -> Path: 87 | if output.exists(): 88 | return output 89 | 90 | onnx_convert("pt", model, output, opset=opset) 91 | 92 | @classmethod 93 | def allocate(cls, config: 'BenchmarkConfig'): 94 | onnx_model_path = Path(f"{ONNX_GRAPHS_FOLDER}/{config.model}.onnx.{getpid()}") 95 | OnnxRuntimeBackend.convert(config.model, onnx_model_path, config.backend.opset) 96 | 97 | backend = OnnxRuntimeBackend(config.model, onnx_model_path.absolute().as_posix()) 98 | backend.configure(config.backend) 99 | return backend 100 | 101 | def configure(self, config: OnnxRuntimeConfig): 102 | assert config.graph_optimisation_level in ALL_GRAPH_OPTIMIZATION_LEVELS_FROM_STR, f"Unknown {config.graph_optimisation_level}" 103 | assert config.execution_mode in ALL_EXECUTION_MODE_FROM_STR, f"Unknown {config.execution_mode}" 104 | 105 | super().configure(config) 106 | 107 | LOGGER.info("Configuring ONNX Runtime Benchmark:") 108 | 109 | self.session_opts.execution_mode = ALL_EXECUTION_MODE_FROM_STR[config.execution_mode] 110 | LOGGER.info(f"\t- Setting Execution Mode: {self.session_opts.execution_mode}") 111 | 112 | self.session_opts.graph_optimization_level = ALL_GRAPH_OPTIMIZATION_LEVELS_FROM_STR[config.graph_optimisation_level] 113 | LOGGER.info(f"\t- Setting Graph Optimization Level: {self.session_opts.graph_optimization_level}") 114 | 115 | if 
config.num_threads is not None: 116 | if self.session_opts.intra_op_num_threads != config.num_threads: 117 | self.session_opts.intra_op_num_threads = config.num_threads 118 | 119 | LOGGER.info(f"\t- Setting intra_op_num_threads({self.session_opts.intra_op_num_threads})") 120 | 121 | if config.num_interops_threads is not None: 122 | if self.session_opts.inter_op_num_threads != config.num_interops_threads: 123 | self.session_opts.inter_op_num_threads = config.num_interops_threads 124 | 125 | LOGGER.info(f"\t- Setting inter_op_num_threads({self.session_opts.inter_op_num_threads})") 126 | 127 | def execute(self, config: 'BenchmarkConfig', is_reference: bool = False) -> Tuple[Benchmark, np.ndarray]: 128 | benchmark = Benchmark() 129 | 130 | try: 131 | model_opt_path = Path(self.onnx_path) 132 | opt_onnx_path = model_opt_path.with_suffix(".opt" + model_opt_path.suffix) 133 | 134 | model_opt = optimize_model( 135 | self.onnx_path, 136 | model_type="bert", 137 | opt_level=int(self.session_opts.graph_optimization_level) 138 | ) 139 | model_opt.save_model_to_file(opt_onnx_path.absolute().as_posix()) 140 | self.optimized_onnx_graph = opt_onnx_path.absolute().as_posix() 141 | except Exception as e: 142 | LOGGER.error(f"Unable to optimize ONNX BERT model: {e}") 143 | 144 | session = InferenceSession(self.optimized_onnx_graph or self.onnx_path, self.session_opts) 145 | 146 | dummy_inputs = self._get_dummy_inputs( 147 | batch_size=config.batch_size, 148 | seq_len=(config.sequence_length - self.tokenizer.num_special_tokens_to_add(pair=False)) 149 | ) 150 | 151 | inputs = self.tokenizer( 152 | dummy_inputs, 153 | is_split_into_words=True, 154 | return_tensors=TensorType.NUMPY, 155 | ) 156 | inputs = {k: v.astype("i8") for k, v in inputs.items()} 157 | 158 | # Warmup 159 | outputs = [] 160 | for _ in trange(config.warmup_runs, desc="Warming up"): 161 | output = session.run(None, inputs) 162 | outputs.append(output[0]) 163 | 164 | # Let's not run the benchmark for the reference backend, 165 | # as we are more interested in the output tensors. 166 | if not is_reference: 167 | 168 | # Run benchmark 169 | benchmark_duration_ns = config.benchmark_duration * SEC_TO_NS_SCALE 170 | while sum(benchmark.latencies) < benchmark_duration_ns: 171 | with benchmark.track(): 172 | session.run(None, inputs) 173 | 174 | benchmark.finalize(benchmark_duration_ns) 175 | return benchmark, np.stack(outputs) 176 | 177 | def clean(self, config: 'BenchmarkConfig'): 178 | onnx_path = Path(ONNX_GRAPHS_FOLDER) 179 | 180 | if onnx_path.exists(): 181 | for file in onnx_path.iterdir(): 182 | LOGGER.debug(f"Cleaning ONNX model: {file}") 183 | file.unlink() 184 | 185 | # if Path(onnx_path).exists(): 186 | # # Care for external data format (multiple file) if exporting bigger model 187 | # LOGGER.debug(f"Cleaning ONNX model: {self.onnx_path}") 188 | # onnx_path.unlink() 189 | # 190 | # if self.optimized_onnx_graph is not None and Path(self.optimized_onnx_graph).exists(): 191 | # LOGGER.debug(f"Cleaning optimized ONNX model: {self.optimized_onnx_graph}") 192 | # Path(self.optimized_onnx_graph).unlink() 193 | -------------------------------------------------------------------------------- /src/backends/pytorch.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Hugging Face Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from collections import OrderedDict 16 | from contextlib import contextmanager 17 | from dataclasses import dataclass 18 | from logging import getLogger 19 | from typing import Set, Optional, Tuple 20 | 21 | import numpy as np 22 | import torch 23 | from tqdm import trange 24 | from transformers import AutoModel, TensorType 25 | 26 | from backends import Backend, BackendConfig 27 | from benchmark import Benchmark 28 | from config import BenchmarkConfig 29 | from utils import SEC_TO_NS_SCALE 30 | 31 | 32 | BACKEND_NAME = "pytorch" 33 | LOGGER = getLogger(BACKEND_NAME) 34 | 35 | 36 | class CUDABenchmark(Benchmark): 37 | def __init__(self): 38 | super().__init__() 39 | 40 | if not torch.cuda.is_available(): 41 | raise RuntimeError("CUDA is not available") 42 | 43 | @contextmanager 44 | def track(self): 45 | start_event = torch.cuda.Event(enable_timing=True) 46 | end_event = torch.cuda.Event(enable_timing=True) 47 | 48 | start_event.record() 49 | yield 50 | 51 | end_event.record() 52 | torch.cuda.synchronize() # Wait for the events to be recorded! 53 | 54 | # Get timing events 55 | latency_ms = start_event.elapsed_time(end_event) 56 | 57 | # Convert to nanoseconds to match Benchmark.track() 58 | latency_ns = latency_ms * 1_000_000 59 | 60 | # Append the time to the buffer 61 | self.latencies.append(latency_ns) 62 | 63 | LOGGER.debug(f"Tracked function took: {latency_ns}ns ({latency_ms:.3f}ms)") 64 | 65 | 66 | @dataclass 67 | class PyTorchConfig(BackendConfig): 68 | name: str = "pytorch" 69 | use_torchscript: bool = False 70 | use_tf32: bool = False 71 | 72 | @staticmethod 73 | def version() -> str: 74 | return torch.__version__ 75 | 76 | @staticmethod 77 | def supported_keys() -> Set[str]: 78 | return BackendConfig.supported_keys().union({"use_torchscript", "use_tf32"}) 79 | 80 | 81 | class PyTorchBackend(Backend[PyTorchConfig]): 82 | NAME = BACKEND_NAME 83 | 84 | def __init__(self, model: str): 85 | super().__init__(model) 86 | self.model = AutoModel.from_pretrained(model) 87 | 88 | LOGGER.info(f"Allocated PyTorch Backend for model: {model}") 89 | 90 | @classmethod 91 | def allocate(cls, config: BenchmarkConfig): 92 | backend = cls(config.model) 93 | backend.configure(config.backend) 94 | 95 | return backend 96 | 97 | def configure(self, config: PyTorchConfig): 98 | super().configure(config) 99 | 100 | LOGGER.info("Configuring PyTorch Benchmark:") 101 | 102 | # Disable gradients 103 | torch.set_grad_enabled(False) 104 | LOGGER.info("\t+ Disabled gradients") 105 | 106 | # Tune Nvidia's TF32 support 107 | if torch.has_cuda and torch.cuda.is_available(): 108 | if hasattr(torch.backends.cuda, "matmul") and hasattr(torch.backends.cuda.matmul, "allow_tf32"): 109 | torch.backends.cuda.matmul.allow_tf32 = config.use_tf32 110 | LOGGER.info(f"\t+ CUDA allows Nvidia's TF32: { torch.backends.cuda.matmul.allow_tf32 }") 111 | 112 | if torch.has_cudnn and torch.backends.cudnn.is_available(): 113 | 114 | if hasattr(torch.backends.cudnn, "allow_tf32"): 115 | # The flag below controls whether to allow TF32 on cuDNN. 
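                    # TF32 trades mantissa precision for higher matmul/conv throughput on
                    # Ampere-class (and newer) GPUs, so it is only enabled when the benchmark
                    # config explicitly requests it.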
116 | torch.backends.cudnn.allow_tf32 = config.use_tf32 117 | LOGGER.info(f"\t+ CuDNN allows Nvidia's TF32: { torch.backends.cudnn.allow_tf32 }") 118 | 119 | self.model.eval() 120 | LOGGER.info("\t+ Turning eval mode on Module (model.eval())") 121 | 122 | if config.num_threads is not None: 123 | # if torch.get_num_threads() != config.num_threads: 124 | torch.set_num_threads(config.num_threads) 125 | 126 | LOGGER.info(f"\t+ Number of threads (torch.set_num_threads({config.num_threads}))") 127 | 128 | if config.num_interops_threads is not None: 129 | # TODO: Setting this value multiple times between PyTorch & TorchScript runs raise a C error 130 | 131 | # if torch.get_num_interop_threads() != config.num_interops_threads: 132 | torch.set_num_interop_threads(config.num_interops_threads) 133 | 134 | LOGGER.info( 135 | f"\t+ Number of interop threads (torch.set_num_interop_threads({config.num_interops_threads}))" 136 | ) 137 | 138 | if config.use_torchscript: 139 | self.model.config.return_dict = False 140 | LOGGER.info("\t+ Disabling dictionary output for TorchScript") 141 | 142 | def execute(self, config: BenchmarkConfig, is_reference: bool = False) -> Tuple[Benchmark, np.ndarray]: 143 | if config.backend.use_torchscript: 144 | return self._run_torchscript(config, is_reference) 145 | else: 146 | return self._run_pytorch(config, is_reference) 147 | 148 | def _run_pytorch(self, config: BenchmarkConfig, is_reference: bool) -> Tuple[Benchmark, np.ndarray]: 149 | """ 150 | :return: 151 | """ 152 | LOGGER.info("Running PyTorch Eager benchmark") 153 | benchmark = CUDABenchmark() if config.device == "cuda" else Benchmark() 154 | 155 | dummy_inputs = self._get_dummy_inputs( 156 | batch_size=config.batch_size, 157 | seq_len=(config.sequence_length - self.tokenizer.num_special_tokens_to_add(pair=False)) 158 | ) 159 | 160 | inputs = self.tokenizer( 161 | dummy_inputs, 162 | is_split_into_words=True, 163 | return_tensors=TensorType.PYTORCH, 164 | ) 165 | 166 | inputs = inputs.to(config.device) 167 | self.model = self.model.to(config.device) 168 | 169 | # Warmup 170 | outputs = [] 171 | with torch.cuda.amp.autocast(config.precision == "float16"): 172 | for _ in trange(config.warmup_runs, desc="Warming up"): 173 | output = self.model(**inputs) 174 | outputs.append(output.last_hidden_state.cpu().numpy()) 175 | 176 | # Let's not run the benchmark for the reference backend, 177 | # as we are more interested in the output tensors. 
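            # (The warmup outputs gathered above are still returned, so main.py can compute
            # the absolute drift between this backend and the reference one.)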
178 | if not is_reference: 179 | 180 | # Run benchmark 181 | benchmark_duration_ns = config.benchmark_duration * SEC_TO_NS_SCALE 182 | with torch.cuda.amp.autocast(config.precision == "float16"): 183 | while sum(benchmark.latencies) < benchmark_duration_ns: 184 | with benchmark.track(): 185 | self.model(**inputs) 186 | 187 | benchmark.finalize(benchmark_duration_ns) 188 | 189 | return benchmark, np.stack(outputs) 190 | 191 | def _run_torchscript(self, config: BenchmarkConfig, is_reference: bool) -> Tuple[Benchmark, np.ndarray]: 192 | """ 193 | :return: 194 | """ 195 | LOGGER.info("Running TorchScript benchmark") 196 | benchmark = CUDABenchmark() if config.device == "cuda" else Benchmark() 197 | 198 | dummy_inputs = self._get_dummy_inputs( 199 | batch_size=config.batch_size, 200 | seq_len=(config.sequence_length - self.tokenizer.num_special_tokens_to_add(pair=False)) 201 | ) 202 | 203 | inputs = self.tokenizer( 204 | dummy_inputs, 205 | is_split_into_words=True, 206 | return_tensors=TensorType.PYTORCH, 207 | ) 208 | 209 | inputs.to(config.device) 210 | self.model = self.model.to(config.device) 211 | 212 | # To be sure inputs will be presented with the right prototype 213 | ordered_inputs = OrderedDict({ 214 | "input_ids": inputs.input_ids, 215 | "attention_mask": inputs.attention_mask, 216 | "token_type_ids": inputs.token_type_ids, 217 | }) 218 | 219 | LOGGER.debug("Calling torch JIT on model (optimize=True)") 220 | model_scripted = torch.jit.trace(self.model, tuple(ordered_inputs.values())) 221 | 222 | outputs = [] 223 | with torch.jit.optimized_execution(True): 224 | with torch.cuda.amp.autocast(config.precision == "float16"): 225 | for _ in trange(config.warmup_runs, desc="Warming up"): 226 | output = model_scripted(*ordered_inputs.values()) 227 | outputs.append(output[0].cpu().numpy()) 228 | 229 | # Let's not run the benchmark for the reference backend, 230 | # as we are more interested in the output tensors. 231 | if not is_reference: 232 | 233 | # Run benchmark 234 | benchmark_duration_ns = config.benchmark_duration * SEC_TO_NS_SCALE 235 | with torch.cuda.amp.autocast(config.precision == "float16"): 236 | while sum(benchmark.latencies) < benchmark_duration_ns: 237 | with benchmark.track(): 238 | model_scripted(*ordered_inputs.values()) 239 | 240 | benchmark.finalize(benchmark_duration_ns) 241 | return benchmark, np.stack(outputs) 242 | 243 | -------------------------------------------------------------------------------- /src/backends/tensorflow.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Hugging Face Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
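#
# TensorFlow backend: supports eager execution, tf.function graph mode (optionally
# compiled with XLA), and an optional SavedModel export/reload path. Intra/inter op
# parallelism is configured from the Hydra backend config before any model is allocated.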
14 | 15 | import contextlib 16 | from dataclasses import dataclass 17 | from logging import getLogger 18 | from pathlib import Path 19 | from shutil import rmtree 20 | from typing import Optional, Tuple, Callable, List, Set 21 | 22 | import numpy as np 23 | import tensorflow as tf 24 | from tensorflow.python.keras import Input 25 | from tqdm import trange 26 | from transformers import PreTrainedTokenizer, TFAutoModel, TFPreTrainedModel, TensorType 27 | 28 | from backends import Backend, BackendConfig 29 | from benchmark import Benchmark 30 | from config import BenchmarkConfig 31 | from utils import SEC_TO_NS_SCALE 32 | 33 | BACKEND_NAME = "tensorflow" 34 | 35 | SAVED_MODEL_PATH = "saved_model" 36 | SAVED_MODEL_TUNE_FLAG = "tune" 37 | 38 | LOGGER = getLogger("tensorflow") 39 | 40 | 41 | def get_tf_device(device: str) -> str: 42 | if device == "cuda": 43 | if len(tf.config.experimental.list_physical_devices('GPU')) == 0: 44 | raise ValueError(f"No GPU detected, cannot move data to {device}") 45 | return tf.DeviceSpec(device_type="GPU") 46 | else: 47 | return tf.DeviceSpec(device_type="CPU") 48 | 49 | 50 | def as_saved_model(tokenizer: PreTrainedTokenizer, model: TFPreTrainedModel, inputs: List, saved_model_path: Path, flag: str = SAVED_MODEL_TUNE_FLAG) -> Path: 51 | encodings = tokenizer(inputs, is_split_into_words=True, return_tensors="tf") 52 | 53 | # Generate symbolic trace 54 | tf_inputs = {name: Input((None, ), batch_size=None, dtype=tf.int32, name=name) for name, value in encodings.items()} 55 | tf_outputs = model(tf_inputs) 56 | tf_model = tf.keras.models.Model(inputs=tf_inputs, outputs={"output": tf_outputs[0]}) 57 | 58 | # Saved SavedModel 59 | tf.saved_model.save(tf_model, saved_model_path.as_posix()) 60 | 61 | # Generate a flag file indicating this folder was generated from the tune framework 62 | saved_model_path.joinpath(flag).touch() 63 | return saved_model_path 64 | 65 | 66 | @contextlib.contextmanager 67 | def options(options): 68 | old_opts = tf.config.optimizer.get_experimental_options() 69 | tf.config.optimizer.set_experimental_options(options) 70 | try: 71 | yield 72 | finally: 73 | tf.config.optimizer.set_experimental_options(old_opts) 74 | 75 | 76 | @dataclass 77 | class TensorflowConfig(BackendConfig): 78 | name: str = "tensorflow" 79 | use_xla: bool = False 80 | use_saved_model_format: bool = False 81 | eager_mode: bool = False 82 | experimental_compiler: Optional[bool] = None 83 | 84 | @staticmethod 85 | def version() -> str: 86 | return tf.__version__ 87 | 88 | @staticmethod 89 | def supported_keys() -> Set[str]: 90 | return BackendConfig.supported_keys().union({ 91 | "use_xla", 92 | "eager_mode", 93 | "experimental_compiler", 94 | "use_saved_model_format", 95 | }) 96 | 97 | 98 | class TensorflowBackend(Backend[TensorflowConfig]): 99 | NAME = BACKEND_NAME 100 | 101 | def __init__(self, model: str, local_model_path: str = None): 102 | super().__init__(model) 103 | self.model = model 104 | self.model_info = None # Only used when working with SavedModel 105 | self.local_model_path = local_model_path # Local model path if using pre-exported SavedModel file 106 | 107 | LOGGER.info(f"Allocated TensorFlow Backend for model: {model}") 108 | 109 | @classmethod 110 | def allocate(cls, config: BenchmarkConfig): 111 | # Check if we are using a local SavedModel file 112 | # => (format bert-base-case@/path/to/savedmodel) 113 | if config.backend.use_saved_model_format and "@" in config.model: 114 | model_name, model_path = config.model.split("@") 115 | LOGGER.info(f"Local 
SavedModel format detected: model={model_name}, path={model_path}") 116 | 117 | backend = TensorflowBackend(model_name, model_path) 118 | else: 119 | backend = TensorflowBackend(config.model) 120 | 121 | backend.configure(config.backend) 122 | 123 | return backend 124 | 125 | def clean(self, config: 'BenchmarkConfig'): 126 | saved_model_path = Path(SAVED_MODEL_PATH) 127 | if saved_model_path.exists() and saved_model_path.joinpath(SAVED_MODEL_TUNE_FLAG): 128 | LOGGER.debug(f"Cleaning SavedModel folder at {saved_model_path}") 129 | rmtree(saved_model_path) 130 | # saved_model_path.rmdir() 131 | 132 | def configure(self, config: TensorflowConfig): 133 | super().configure(config) 134 | 135 | LOGGER.info("Configuring TensorFlow Benchmark:") 136 | 137 | if config.num_threads is not None: 138 | if tf.config.threading.get_intra_op_parallelism_threads() != config.num_threads: 139 | tf.config.threading.set_intra_op_parallelism_threads(config.num_threads) 140 | 141 | LOGGER.info( 142 | f"\t+ Number of intra op threads (" 143 | f"tf.config.threading.set_intra_op_parallelism_threads(" 144 | f"{tf.config.threading.get_intra_op_parallelism_threads()}" 145 | f"))" 146 | ) 147 | 148 | if config.num_interops_threads is not None: 149 | if tf.config.threading.get_inter_op_parallelism_threads() != config.num_interops_threads: 150 | tf.config.threading.set_inter_op_parallelism_threads(config.num_interops_threads) 151 | 152 | LOGGER.info( 153 | f"\t+ Number of inter op threads (" 154 | f"tf.config.threading.set_inter_op_parallelism_threads(" 155 | f"{tf.config.threading.get_inter_op_parallelism_threads()}" 156 | f"))" 157 | ) 158 | 159 | # If we need to use the model as SavedModel format 160 | if config.use_saved_model_format: 161 | 162 | # Local model support 163 | if self.local_model_path is None: 164 | LOGGER.info(f"Converting model: {self.model} to SavedModel format") 165 | with options({ 166 | "constant_folding": True, 167 | "shape_optimization": True, 168 | "disable_model_pruning": False, 169 | "arithmetic_optimization": True, 170 | "function_optimization": True 171 | }): 172 | with tf.device("CPU"): 173 | model = TFAutoModel.from_pretrained(self.model) 174 | self.local_model_path = as_saved_model( 175 | tokenizer=self.tokenizer, 176 | model=model, 177 | inputs=self._get_dummy_inputs( 178 | 1, model.config.max_position_embeddings - self.tokenizer.num_special_tokens_to_add() 179 | ), 180 | saved_model_path=Path(SAVED_MODEL_PATH), 181 | flag=SAVED_MODEL_TUNE_FLAG 182 | ) 183 | 184 | LOGGER.debug(f"Converted SavedModel stored at {self.local_model_path}") 185 | 186 | # Load the model 187 | saved_model_path = Path(self.local_model_path) 188 | LOGGER.info(f"Loading SavedModel from {saved_model_path}") 189 | self.model_info = tf.saved_model.load(saved_model_path.as_posix()) 190 | self.model = self.model_info.signatures["serving_default"] 191 | else: 192 | # Postponing model allocation to tune intra/inter ops before executing any other TF related code. 
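                # (TensorFlow locks the intra/inter op thread settings once the runtime has
                # executed any op, so from_pretrained must run after the threading setup above.)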
193 | self.model = TFAutoModel.from_pretrained(self.model) 194 | 195 | def execute(self, config: BenchmarkConfig, is_reference: bool = False) -> Tuple[Benchmark, np.ndarray]: 196 | if config.backend.eager_mode: 197 | return self._run_tf(config, is_reference) 198 | else: 199 | return self._run_tf_graph(config, is_reference) 200 | 201 | def _run_tf(self, config: BenchmarkConfig, is_reference: bool) -> Tuple[Benchmark, np.ndarray]: 202 | LOGGER.info("Running TensorFlow Eager benchmark") 203 | benchmark = Benchmark() 204 | 205 | dummy_inputs = self._get_dummy_inputs( 206 | batch_size=config.batch_size, 207 | seq_len=(config.sequence_length - self.tokenizer.num_special_tokens_to_add(pair=False)) 208 | ) 209 | 210 | with tf.device(get_tf_device(config.device)): 211 | inputs = self.tokenizer( 212 | dummy_inputs, 213 | is_split_into_words=True, 214 | return_tensors=TensorType.TENSORFLOW, 215 | ) 216 | 217 | # Move tf.constants to GPU ... https://github.com/tensorflow/tensorflow/issues/42242#issuecomment-675590057 218 | inputs = {name: tf.identity(t) for name, t in inputs.items()} 219 | 220 | # SavedModel concrete function needs unwrapped arguments ... 221 | # model_f = lambda x: self.model(**x).popitem()[1] \ 222 | # if config.backend.use_saved_model_format else \ 223 | # lambda x: self.model(x).last_hidden_state 224 | def model_f(inputs): 225 | # SavedModel concrete function needs unwrapped arguments ... 226 | if config.backend.use_saved_model_format: 227 | LOGGER.info("Please note that saved model format will enable graph mode test!!") 228 | return self.model(**inputs).popitem()[1] 229 | else: 230 | return self.model(inputs).last_hidden_state 231 | 232 | # Warmup 233 | outputs = [] 234 | for _ in trange(config.warmup_runs, desc="Warming up"): 235 | output = model_f(inputs) 236 | outputs.append(output.numpy()) 237 | 238 | # Let's not run the benchmark for the reference backend, 239 | # as we are more interested in the output tensors. 240 | if not is_reference: 241 | 242 | # Run benchmark 243 | benchmark_duration_ns = config.benchmark_duration * SEC_TO_NS_SCALE 244 | while sum(benchmark.latencies) < benchmark_duration_ns: 245 | with benchmark.track(): 246 | model_f(inputs) 247 | 248 | benchmark.finalize(benchmark_duration_ns) 249 | 250 | return benchmark, np.stack(outputs) 251 | 252 | def _run_tf_graph(self, config: BenchmarkConfig, is_reference: bool) -> Tuple[Benchmark, np.ndarray]: 253 | if not config.backend.use_xla: 254 | LOGGER.info("Running TensorFlow Graph benchmark") 255 | @tf.function 256 | def model_f(inputs): 257 | # SavedModel concrete function needs unwrapped arguments ... 258 | if config.backend.use_saved_model_format: 259 | return self.model(**inputs).popitem()[1] 260 | else: 261 | return self.model(inputs).last_hidden_state 262 | else: 263 | LOGGER.info("Running TensorFlow Graph with XLA benchmark") 264 | @tf.function(jit_compile=True) 265 | def model_f(inputs): 266 | # SavedModel concrete function needs unwrapped arguments ... 
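                # (With jit_compile=True the function is compiled by XLA on its first call,
                # so the warmup loop further below also absorbs the compilation cost.)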
267 | if config.backend.use_saved_model_format: 268 | return self.model(**inputs).popitem()[1] 269 | else: 270 | return self.model(inputs).last_hidden_state 271 | 272 | benchmark = Benchmark() 273 | 274 | dummy_inputs = self._get_dummy_inputs( 275 | batch_size=config.batch_size, 276 | seq_len=(config.sequence_length - self.tokenizer.num_special_tokens_to_add(pair=False)) 277 | ) 278 | 279 | with tf.device(get_tf_device(config.device)): 280 | with options({ 281 | "constant_folding": True, 282 | "shape_optimization": True, 283 | "disable_model_pruning": False, 284 | "arithmetic_optimization": True, 285 | "function_optimization": True 286 | }): 287 | inputs = self.tokenizer( 288 | dummy_inputs, 289 | is_split_into_words=True, 290 | return_tensors=TensorType.TENSORFLOW, 291 | ) 292 | 293 | # Move tf.constants to GPU ... 294 | # https://github.com/tensorflow/tensorflow/issues/42242#issuecomment-675590057 295 | inputs = {name: tf.identity(t) for name, t in inputs.items()} 296 | 297 | # Warmup 298 | outputs = [] 299 | for _ in trange(config.warmup_runs, desc="Warming up"): 300 | output = model_f(inputs) 301 | outputs.append(output.numpy()) 302 | 303 | # Let's not run the benchmark for the reference backend, 304 | # as we are more interested in the output tensors. 305 | if not is_reference: 306 | 307 | # Run benchmark 308 | benchmark_duration_ns = config.benchmark_duration * SEC_TO_NS_SCALE 309 | while sum(benchmark.latencies) < benchmark_duration_ns: 310 | with benchmark.track(): 311 | model_f(inputs) 312 | 313 | benchmark.finalize(benchmark_duration_ns) 314 | return benchmark, np.stack(outputs) 315 | -------------------------------------------------------------------------------- /src/benchmark.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Hugging Face Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
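#
# Latency/throughput bookkeeping shared by all backends: Benchmark.track() measures each
# forward pass with perf_counter_ns(), and finalize() derives throughput as
# round(num_runs / duration_ns * SEC_TO_NS_SCALE, 2). As a rough worked example, assuming
# SEC_TO_NS_SCALE is 1e9, 500 tracked runs against a 20 s budget (2e10 ns) yield a reported
# throughput of 25.0 inferences/s.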
14 | import numpy as np 15 | 16 | from contextlib import contextmanager 17 | from dataclasses import dataclass, field 18 | from logging import getLogger 19 | from time import perf_counter_ns 20 | from typing import List 21 | 22 | from pandas import DataFrame 23 | 24 | from utils import SEC_TO_NS_SCALE 25 | 26 | LOGGER = getLogger("benchmark") 27 | 28 | 29 | @dataclass 30 | class Benchmark: 31 | outputs_diff: List[np.ndarray] = None 32 | latencies: List[float] = field(default_factory=list) 33 | throughput: float = float("-inf") 34 | 35 | @property 36 | def num_runs(self) -> int: 37 | return len(self.latencies) 38 | 39 | @staticmethod 40 | def merge(benchmarks: List['Benchmark']) -> 'Benchmark': 41 | latencies, throughputs = [], [] 42 | for b in benchmarks: 43 | 44 | assert len(b.latencies) > 0, "Empty benchmark (0 latency measurements recorded)" 45 | assert b.throughput > 0., f"Benchmark has not been finalized, throughput < 0 ({b.throughput})" 46 | 47 | latencies += b.latencies 48 | throughputs.append(b.throughput) 49 | 50 | # Return all the latencies measured and the mean throughput over all instances 51 | return Benchmark( 52 | latencies, 53 | sum(throughputs) / len(throughputs) 54 | ) 55 | 56 | @contextmanager 57 | def track(self): 58 | start = perf_counter_ns() 59 | yield 60 | end = perf_counter_ns() 61 | 62 | # Append the time to the buffer 63 | self.latencies.append(end - start) 64 | 65 | LOGGER.debug(f"Tracked function took: {(end - start)}ns ({(end - start) / 1e6:.3f}ms)") 66 | 67 | def record_outputs(self, output: np.ndarray, reference: np.ndarray): 68 | self.outputs_diff = np.abs(reference - output) 69 | 70 | def finalize(self, duration_ns: int): 71 | self.throughput = round((len(self.latencies) / duration_ns) * SEC_TO_NS_SCALE, 2) 72 | 73 | def to_pandas(self) -> DataFrame: 74 | # Compute stats 75 | benchmarks_stats = { 76 | "nb_forwards": len(self.latencies), 77 | "throughput": self.throughput, 78 | "latency_mean": np.mean(self.latencies), 79 | "latency_std": np.std(self.latencies), 80 | "latency_50": np.quantile(self.latencies, 0.5), 81 | "latency_90": np.quantile(self.latencies, 0.9), 82 | "latency_95": np.quantile(self.latencies, 0.95), 83 | "latency_99": np.quantile(self.latencies, 0.99), 84 | "latency_999": np.quantile(self.latencies, 0.999), 85 | } 86 | 87 | if self.outputs_diff is not None: 88 | benchmarks_stats["drift_mean"] = np.mean(self.outputs_diff) 89 | benchmarks_stats["drift_std"] = np.std(self.outputs_diff) 90 | 91 | return DataFrame.from_dict(benchmarks_stats, orient="index").transpose() 92 | -------------------------------------------------------------------------------- /src/config.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Hugging Face Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
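#
# BenchmarkConfig is the top-level Hydra schema: most fields are MISSING and are resolved
# from configs/benchmark.yaml together with the overrides appended by launcher.py
# (num_instances, num_core_per_instance, experiment_id, instance-specific settings, ...).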
14 | from binascii import hexlify 15 | from dataclasses import dataclass 16 | from logging import getLogger 17 | from random import getrandbits 18 | 19 | from typing import Dict, Optional 20 | 21 | from omegaconf import MISSING 22 | from transformers import __version__ as transformers_version 23 | 24 | from backends import BackendConfig 25 | 26 | 27 | LOGGER = getLogger("benchmark") 28 | 29 | 30 | @dataclass() 31 | class BenchmarkConfig: 32 | # Python interpreter version 33 | python_version: str = MISSING 34 | 35 | # Store the transformers version used during the benchmark 36 | transformers_version: str = transformers_version 37 | 38 | # Number of forward pass to run before recording any performance counters. 39 | warmup_runs: int = MISSING 40 | 41 | # Duration in seconds the benchmark will collect performance counters 42 | benchmark_duration: int = MISSING 43 | 44 | # The backend to use for recording timing (pytorch, torchscript, tensorflow, xla, onnxruntime) 45 | backend: BackendConfig = MISSING 46 | 47 | # Name of the model used for the benchmark 48 | model: str = MISSING 49 | 50 | # CPU or CUDA device to run inference on 51 | device: str = MISSING 52 | 53 | # The dtype of the model to run inference with (float32, float16, int8, bfloat16) 54 | precision: str = MISSING 55 | 56 | # Use Transparent Huge Page mechanism to increase CPU cache hit probability 57 | use_huge_page: str = MISSING 58 | 59 | # Number of sample given to the model at each forward 60 | batch_size: int = MISSING 61 | 62 | # The length of the sequence (in tokens) given to the model 63 | sequence_length: int = MISSING 64 | 65 | # Multi instances settings # 66 | num_instances: int = MISSING 67 | 68 | # Number of core per instances 69 | num_core_per_instance: int = MISSING 70 | 71 | # Experiment identifier 72 | experiment_id: str = hexlify(getrandbits(32).to_bytes(4, 'big')).decode('ascii') 73 | 74 | # Experiment name 75 | experiment_name: str = "default" 76 | 77 | # Identifier for the current instance. Allow to create specific instance config folder 78 | instance_id: int = 0 79 | 80 | # Reference backend implementation that will be used to generate reference (output tensors) 81 | reference: Optional[str] = None 82 | -------------------------------------------------------------------------------- /src/main.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Hugging Face Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
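#
# Benchmark entry point: registers the backend configs in Hydra's ConfigStore, allocates the
# requested backend, optionally allocates a reference backend to measure output drift, and
# writes the per-run statistics to results.csv in the Hydra output directory.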
14 | from logging import getLogger 15 | from typing import Type, get_args, Union 16 | 17 | import hydra 18 | import numpy as np 19 | from hydra.core.config_store import ConfigStore 20 | from hydra.experimental import compose 21 | from hydra.utils import get_class 22 | from omegaconf import OmegaConf, DictConfig 23 | 24 | from backends import Backend, BackendConfig 25 | from backends.ort import OnnxRuntimeConfig 26 | from backends.pytorch import PyTorchConfig 27 | from backends.tensorflow import TensorflowConfig 28 | from config import BenchmarkConfig 29 | 30 | 31 | # Register resolvers 32 | OmegaConf.register_new_resolver("pytorch_version", PyTorchConfig.version) 33 | OmegaConf.register_new_resolver("tensorflow_version", TensorflowConfig.version) 34 | OmegaConf.register_new_resolver("ort_version", OnnxRuntimeConfig.version) 35 | 36 | # Register configurations 37 | cs = ConfigStore.instance() 38 | cs.store(name="benchmark", node=BenchmarkConfig) 39 | cs.store(group="backend", name="pytorch_backend", node=PyTorchConfig) 40 | cs.store(group="backend", name="torchscript_backend", node=PyTorchConfig) 41 | cs.store(group="backend", name="tensorflow_backend", node=TensorflowConfig) 42 | cs.store(group="backend", name="tensorflow_graph_backend", node=TensorflowConfig) 43 | cs.store(group="backend", name="ort_backend", node=OnnxRuntimeConfig) 44 | 45 | 46 | LOGGER = getLogger("benchmark") 47 | 48 | 49 | def get_overrided_backend_config(original_config: Union[DictConfig, BackendConfig], override: str) -> DictConfig: 50 | # Copy the initial config and pop the backend 51 | update_config = original_config.copy() 52 | OmegaConf.set_struct(update_config, False) 53 | update_config.pop("backend") 54 | 55 | # Retrieve the original backend factory 56 | backend_factory: Type[Backend] = get_class(original_config.backend._target_) 57 | 58 | # Compose the two configs (reference <- original @backend==config.reference) 59 | reference_config = compose(config_name="benchmark", overrides=[f"backend={override}"]) 60 | reference_config.merge_with(update_config) 61 | reference_backend_factory: Type[Backend] = get_class(reference_config.backend._target_) 62 | 63 | # Retrieve each original & reference BackendConfig instance type 64 | reference_backend_config_type: Type[BackendConfig] = get_args(reference_backend_factory.__orig_bases__[0])[0] 65 | original_backend_config_type: Type[BackendConfig] = get_args(backend_factory.__orig_bases__[0])[0] 66 | 67 | # Filter out to rely only on the common subset of supported config elements 68 | reference_backend_keys = reference_backend_config_type.supported_keys() 69 | original_backend_keys = original_backend_config_type.supported_keys() 70 | 71 | # (A - B) union (A inter B) 72 | overlapping_backend_config_keys = \ 73 | (reference_backend_keys.intersection(original_backend_keys)) - {"name", "_target_", "version"} 74 | 75 | LOGGER.debug(f"Keys to override from original config in the new one: {overlapping_backend_config_keys}") 76 | 77 | # Get a masked configuration copy 78 | original_overlapping_backend_config = OmegaConf.masked_copy( 79 | original_config, 80 | list(overlapping_backend_config_keys) 81 | ) 82 | 83 | # Override the properties 84 | reference_config["backend"].merge_with(original_overlapping_backend_config) 85 | 86 | return reference_config 87 | 88 | 89 | @hydra.main(config_path="../configs", config_name="benchmark") 90 | def run(config: BenchmarkConfig) -> None: 91 | # We need to allocate the reference backend (used to compare backend output against) 92 | if 
config.reference is not None and config.reference != config.backend: 93 | LOGGER.info(f"Using {config.reference} as reference backend") 94 | reference_config = get_overrided_backend_config(config, override=config.reference) 95 | else: 96 | reference_config = None 97 | 98 | # Allocate requested target backend 99 | backend_factory: Type[Backend] = get_class(config.backend._target_) 100 | backend = backend_factory.allocate(config) 101 | 102 | # Run benchmark and reference 103 | benchmark, outputs = backend.execute(config, is_reference=False) 104 | backend.clean(config) 105 | 106 | if reference_config is not None: 107 | reference_backend_factory = get_class(reference_config.backend._target_) 108 | reference_backend = reference_backend_factory.allocate(reference_config) 109 | _, ref_outputs = reference_backend.execute(reference_config, is_reference=True) 110 | 111 | # Record the outputs to compare with the target backend 112 | benchmark.record_outputs(outputs, ref_outputs) 113 | reference_backend.clean(reference_config) 114 | 115 | LOGGER.info( 116 | f"Reference backend ({config.reference}) against target backend ({config.backend.name})" 117 | f" absolute difference:" 118 | f" {np.mean(benchmark.outputs_diff)} (+/- {np.std(benchmark.outputs_diff)})" 119 | f" over {len(benchmark.outputs_diff)} sample(s)" 120 | ) 121 | 122 | # Save the resolved config 123 | OmegaConf.save(config, ".hydra/config.yaml", resolve=True) 124 | 125 | df = benchmark.to_pandas() 126 | df.to_csv("results.csv", index_label="id") 127 | 128 | 129 | if __name__ == '__main__': 130 | run() 131 | -------------------------------------------------------------------------------- /src/reports.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Hugging Face Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
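# --- Illustrative sketch for src/main.py above; not part of the repository. ---
# The key-filtering step of get_overrided_backend_config() reduced to plain set
# operations on dictionaries, so the intent is easier to follow. The backend fields used
# here (num_threads, num_interops_threads) are invented examples, not necessarily the
# real BackendConfig fields; in the real flow these are OmegaConf nodes, not plain dicts.
original_backend = {"name": "pytorch", "_target_": "backends.pytorch.PyTorchBackend",
                    "version": "1.9.0", "num_threads": 8, "num_interops_threads": 1}
reference_backend = {"name": "tensorflow", "_target_": "backends.tensorflow.TensorflowBackend",
                     "version": "2.5.0", "num_threads": -1}

# Keep only the keys both backends support, minus the identity fields that must not be
# copied from the target backend onto the reference backend.
overlapping_keys = (set(reference_backend) & set(original_backend)) - {"name", "_target_", "version"}

# Copy the overlapping values so both backends run with the same tuning knobs
# (here, the same intra-op thread count).
for key in overlapping_keys:
    reference_backend[key] = original_backend[key]

print(reference_backend)  # num_threads is now 8; name/_target_/version are untouched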
14 | 15 | from collections import defaultdict 16 | from pathlib import Path 17 | 18 | import pandas as pd 19 | from argparse import ArgumentParser 20 | 21 | from rich.console import Console 22 | from rich.table import Table 23 | 24 | 25 | def gather_results(folder: Path): 26 | # List all csv results 27 | results_f = [f for f in folder.glob("**/*.csv")] 28 | results_csv = { 29 | f.relative_to(folder).parent.as_posix(): pd.read_csv(f, index_col=0) 30 | for f in results_f 31 | } 32 | 33 | if len(results_csv) == 0: 34 | raise ValueError(f"No results.csv file were found in {folder}") 35 | 36 | # Merge dataframe wrt to framework 37 | dfs = defaultdict(list) 38 | for path, df in results_csv.items(): 39 | framework, device, arguments = path.split("/") 40 | arguments = dict(arg.split("_") for arg in arguments.split("-")) 41 | 42 | # Add columns to the dataframe 43 | for col_name, col_value in arguments.items(): 44 | df[col_name] = int(col_value) 45 | 46 | dfs[framework].append(df) 47 | 48 | # Concat the dataframes 49 | dfs = {f: pd.concat(a) for f, a in dfs.items()} 50 | 51 | for framework, df in dfs.items(): 52 | df["framework"] = framework 53 | 54 | return pd.concat(dfs.values()) 55 | 56 | 57 | def show_results_in_console(df): 58 | grouped_df = df.groupby(["framework", "batch", "seqlen"]) 59 | (grouped_df["inference_time_secs"].mean() * 1000).reset_index() 60 | 61 | console = Console() 62 | table = Table( 63 | show_header=True, header_style="bold", 64 | title="Inference Time per Framework, Batch Size & Sequence Length" 65 | ) 66 | 67 | columns = ( 68 | ("Framework", "framework"), 69 | ("Batch Size", "batch"), 70 | ("Seq Length", "seqlen"), 71 | ("Inference Time (ms)", "inference_time_secs") 72 | ) 73 | 74 | # Define the columns 75 | for (column, _) in columns: 76 | table.add_column(column, justify="center") 77 | 78 | # Add rows 79 | for name, group in grouped_df: 80 | items = name + (round(group.mean()["inference_time_secs"] * 1000, 2), ) 81 | table.add_row(*[str(item) for item in items]) 82 | 83 | # Display the table 84 | console.print(table) 85 | 86 | 87 | if __name__ == '__main__': 88 | parser = ArgumentParser("Hugging Face Model Benchmark") 89 | parser.add_argument("--results-folder", type=Path, help="Where the benchmark results have been saved") 90 | parser.add_argument("output_folder", type=Path, help="Where the resulting report will be saved") 91 | 92 | # Parse command line arguments 93 | args = parser.parse_args() 94 | 95 | if not args.results_folder.exists(): 96 | print(f"Folder {args.results_folder} doesn't exist") 97 | 98 | try: 99 | # Ensure output folder exists 100 | args.output_folder.mkdir(exist_ok=True, parents=True) 101 | 102 | # Gather the results to manipulate 103 | df_by_framework = gather_results(args.results_folder) 104 | 105 | # Generate reports 106 | df_by_framework.to_csv(args.output_folder.joinpath("final_results.csv")) 107 | 108 | show_results_in_console(df_by_framework) 109 | except ValueError as ve: 110 | print(ve) -------------------------------------------------------------------------------- /src/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Hugging Face Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
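# --- Illustrative sketch for src/reports.py above; not part of the repository. ---
# Shows the results-folder convention gather_results() expects, reduced to the
# path-parsing step. The concrete path is an invented example of the
# <framework>/<device>/<name_value pairs joined by '-'> layout.
relative_path = "pytorch/cpu/batch_1-seqlen_128"

framework, device, arguments = relative_path.split("/")

# "batch_1-seqlen_128" -> {"batch": "1", "seqlen": "128"}; gather_results() casts the
# values to int and attaches them as extra columns on each per-run dataframe.
arguments = dict(arg.split("_") for arg in arguments.split("-"))

print(framework, device, {name: int(value) for name, value in arguments.items()})
# pytorch cpu {'batch': 1, 'seqlen': 128}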
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from .env import MANAGED_ENV_VARIABLES, ENV_VAR_TCMALLOC_LIBRARY_PATH, ENV_VAR_INTEL_OPENMP_LIBRARY_PATH,\ 16 | check_tcmalloc, check_intel_openmp, set_ld_preload_hook 17 | from .cpu import CPUinfo, cpu_count_physical, configure_numa, get_procfs_path, get_instances_with_cpu_binding 18 | 19 | SEC_TO_NS_SCALE = 1000000000 20 | -------------------------------------------------------------------------------- /src/utils/cpu.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Hugging Face Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # Copied from FastFormer code: https://github.com/microsoft/fastformers/blob/main/examples/fastformers/run_superglue.py 16 | import numpy as np 17 | import platform 18 | import re 19 | import subprocess 20 | import sys 21 | from logging import getLogger 22 | from os import getpid 23 | from typing import List, Tuple 24 | 25 | LOGGER = getLogger("cpu") 26 | 27 | 28 | class CPUinfo: 29 | def __init__(self): 30 | self.cpuinfo = [] 31 | 32 | if platform.system() == "Windows": 33 | raise RuntimeError("Windows platform is not supported!!!") 34 | elif platform.system() == "Linux": 35 | args = ["lscpu", "--parse=CPU,Core,Socket,Node"] 36 | lscpu_info = subprocess.check_output(args, universal_newlines=True).split("\n") 37 | 38 | # Get information about cpu, core, socket and node 39 | for line in lscpu_info: 40 | pattern = r"^([\d]+,[\d]+,[\d]+,[\d]?)" 41 | regex_out = re.search(pattern, line) 42 | if regex_out: 43 | self.cpuinfo.append(regex_out.group(1).strip().split(",")) 44 | 45 | self._get_socket_info() 46 | 47 | def _get_socket_info(self): 48 | 49 | self.socket_physical_cores = [] # socket_id is index 50 | self.socket_logical_cores = [] # socket_id is index 51 | self.sockets = int(max([line[2] for line in self.cpuinfo])) + 1 52 | self.core_to_sockets = {} 53 | 54 | for socket_id in range(self.sockets): 55 | cur_socket_physical_core = [] 56 | cur_socket_logical_core = [] 57 | 58 | for line in self.cpuinfo: 59 | if socket_id == int(line[2]): 60 | if line[1] not in cur_socket_physical_core: 61 | cur_socket_physical_core.append(line[1]) 62 | 63 | cur_socket_logical_core.append(line[0]) 64 | 65 | self.core_to_sockets[line[0]] = line[2] 66 | 67 | self.socket_physical_cores.append(cur_socket_physical_core) 68 | self.socket_logical_cores.append(cur_socket_logical_core) 69 | 70 | @property 71 | def socket_nums(self): 72 | return self.sockets 73 | 74 | @property 75 | def physical_core_nums(self): 76 | return 
len(self.socket_physical_cores) * len(self.socket_physical_cores[0]) 77 | 78 | @property 79 | def logical_core_nums(self): 80 | return len(self.socket_logical_cores) * len(self.socket_logical_cores[0]) 81 | 82 | @property 83 | def get_all_physical_cores(self): 84 | return np.array(self.socket_physical_cores).flatten().tolist() 85 | 86 | @property 87 | def get_all_logical_cores(self): 88 | return np.array(self.socket_logical_cores).flatten().tolist() 89 | 90 | def get_socket_physical_cores(self, socket_id): 91 | if socket_id < 0 or socket_id > self.sockets - 1: 92 | LOGGER.error(f"Invalid socket id {socket_id}") 93 | return self.socket_physical_cores[socket_id] 94 | 95 | def get_socket_logical_cores(self, socket_id): 96 | if socket_id < 0 or socket_id > self.sockets - 1: 97 | LOGGER.error(f"Invalid socket id {socket_id}") 98 | return self.socket_logical_cores[socket_id] 99 | 100 | def get_sockets_for_cores(self, core_ids): 101 | return {self.core_to_sockets[core] for core in core_ids} 102 | 103 | 104 | def get_procfs_path(): 105 | """Return updated psutil.PROCFS_PATH constant.""" 106 | """Copied from psutil code, and modified to fix an error.""" 107 | return sys.modules['psutil'].PROCFS_PATH 108 | 109 | 110 | def cpu_count_physical(): 111 | """Return the number of physical cores in the system.""" 112 | """Copied from psutil code, and modified to fix an error.""" 113 | 114 | physical_logical_mapping = {} 115 | cores_per_socket = {} 116 | current_info = {} 117 | with open(f'{get_procfs_path()}/cpuinfo', "rb") as f: 118 | for line in f: 119 | line = line.strip().lower() 120 | if not line: 121 | # print(current_info) 122 | # new section 123 | if b'physical id' in current_info and b'cpu cores' in current_info: 124 | cores_per_socket[current_info[b'physical id']] = current_info[b'cpu cores'] 125 | 126 | if b'physical id' in current_info and b'core id' in current_info and b'processor' in current_info: 127 | # print(current_info[b'physical id'] * 1000 + current_info[b'core id']) 128 | if current_info[b'physical id'] * 1000 + current_info[b'core id'] not in physical_logical_mapping: 129 | physical_logical_mapping[ 130 | current_info[b'physical id'] * 1000 + current_info[b'core id'] 131 | ] = current_info[b'processor'] 132 | current_info = {} 133 | else: 134 | # ongoing section 135 | if (line.startswith(b'physical id') or 136 | line.startswith(b'cpu cores') or 137 | line.startswith(b'core id') or 138 | line.startswith(b'processor')): 139 | key, value = line.split(b'\t:', 1) 140 | current_info[key.rstrip()] = int(value.rstrip()) 141 | 142 | total_num_cores = sum(cores_per_socket.values()) 143 | core_to_socket_mapping = {} 144 | for physical, logical in physical_logical_mapping.items(): 145 | physical_id = physical // 1000 146 | 147 | if physical_id not in core_to_socket_mapping: 148 | core_to_socket_mapping[physical_id] = set() 149 | 150 | core_to_socket_mapping[physical_id].add(logical) 151 | 152 | return total_num_cores, cores_per_socket, core_to_socket_mapping 153 | 154 | 155 | def get_instances_with_cpu_binding(num_core_per_instance: int = -1, num_instances: int = 1) -> List[Tuple[List[int], List[int]]]: 156 | """ 157 | :param num_core_per_instance: Number of cores to use per instances, -1 means "use all the CPU cores" 158 | :param num_instances: Number of model instances to distribute CPU cores for 159 | :return: List[List[int]] Per instance list of CPU core affinity 160 | """ 161 | total_num_cores, cores_per_socket, core_to_socket_mapping = cpu_count_physical() 162 | instance_binding = [] 163 | 
164 | # Debug override matching a 2-socket Ice Lake (ICX) machine, kept for reference but disabled so the topology detected by cpu_count_physical() is used: 165 | # total_num_cores = 64 166 | # cores_per_socket = {0: 32, 1: 32} 167 | # core_to_socket_mapping = {0: set(range(32)), 1: set(range(32, 64))} 168 | 169 | # 64 170 | # {0: 32, 1: 32} 171 | # { 172 | # 0: {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}, 173 | # 1: {32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63} 174 | # } 175 | 176 | # Set values are unique: more than one distinct value means the sockets expose different numbers of cores. 177 | assert len(set(cores_per_socket.values())) == 1, "CPU cores are not equal across sockets" 178 | 179 | # No restriction given on the number of cores -> use all the cores of a single socket 180 | if num_core_per_instance < 0: 181 | # Set the number of cores per instance to the core count of a single socket. 182 | num_core_per_instance = cores_per_socket[0] 183 | need_multiple_socket_per_instance = False 184 | need_socket_overcommit = num_instances > 1 # Asking for more than one instance with all the cores 185 | 186 | # The requested core count spans more than a single socket 187 | elif num_core_per_instance > cores_per_socket[0]: 188 | num_core_per_instance = min(num_core_per_instance, total_num_cores) 189 | need_multiple_socket_per_instance = len(cores_per_socket) > 1 # Ensure we actually have multiple sockets 190 | need_socket_overcommit = num_instances > 1 191 | 192 | # The instance spans a single socket 193 | else: 194 | need_multiple_socket_per_instance = False 195 | need_socket_overcommit = num_core_per_instance > cores_per_socket[0] 196 | 197 | for instance in range(num_instances): 198 | # On which socket to allocate the instance 199 | if need_multiple_socket_per_instance: 200 | socket = list(core_to_socket_mapping.keys()) 201 | cores = {c for s in socket for c in core_to_socket_mapping[s]} 202 | 203 | else: 204 | # {socket_id -> [cores]} 205 | socket = [instance % len(cores_per_socket.keys())] 206 | 207 | # Get the list of available cores (unallocated) on the target socket 208 | cores = core_to_socket_mapping[socket[0]] 209 | 210 | # Pop allocated cores out of the pool 211 | # When overcommitting, cores are not popped out, because instances are expected to overlap 212 | # Overcommitting does not try to be smart about limiting the resulting overhead.
213 | if need_socket_overcommit: 214 | cores_it = iter(cores) 215 | bindings = [next(cores_it) for i in range(num_core_per_instance)] 216 | else: 217 | bindings = [cores.pop() for _ in range(num_core_per_instance)] 218 | 219 | instance_binding.append((socket, bindings)) 220 | 221 | return instance_binding 222 | 223 | 224 | def configure_numa(socket_binding: List[int], core_binding: List[int]): 225 | from numa import available as is_numa_available, set_membind, get_membind, set_affinity, get_affinity 226 | if is_numa_available(): 227 | LOGGER.info("Configuring NUMA:") 228 | 229 | pid = getpid() 230 | 231 | # Set core binding affinity 232 | set_affinity(pid, set(core_binding)) 233 | LOGGER.info(f"\tScheduler affinity set to: {get_affinity(pid)}") 234 | 235 | # Set memory allocation affinity 236 | set_membind(set(socket_binding)) 237 | LOGGER.info(f"\tBinding memory allocation on {get_membind()}") 238 | else: 239 | LOGGER.info("NUMA not available on the system, skipping configuration") 240 | 241 | # Configure taskset 242 | # TODO: Check with @Sangeeta if this is still needed as we set CPU scheduler affinity above 243 | # system(f"taskset -p -c {','.join(map(str, core_binding))} {getpid()}") 244 | # LOGGER.info(f"[TASKSET] Set CPU affinity to: {core_binding} (pid={getpid()})") -------------------------------------------------------------------------------- /src/utils/env.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Hugging Face Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
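# --- Illustrative usage sketch for src/utils/cpu.py above; not part of the repository. ---
# Shows how a launcher could consume the per-instance CPU bindings computed by
# get_instances_with_cpu_binding(). It assumes a Linux host (the topology is read through
# lscpu and /proc/cpuinfo), src/ on PYTHONPATH, and enough physical cores for the
# arbitrary instance counts chosen below.
from utils.cpu import get_instances_with_cpu_binding

# Two instances, each pinned to 4 physical cores.
bindings = get_instances_with_cpu_binding(num_core_per_instance=4, num_instances=2)

for instance_id, (socket_binding, core_binding) in enumerate(bindings):
    print(f"instance {instance_id}: sockets={socket_binding} cores={core_binding}")

# Inside each worker process the binding would then be applied before loading the model,
# e.g. utils.cpu.configure_numa(socket_binding, core_binding); that call needs the
# py-libnuma package, which configure_numa() imports lazily.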
14 | 15 | from os import environ 16 | from pathlib import Path 17 | 18 | # Environment variables constants 19 | ENV_VAR_TCMALLOC_LIBRARY_PATH = "TCMALLOC_LIBRARY_PATH" 20 | ENV_VAR_INTEL_OPENMP_LIBRARY_PATH = "INTEL_OPENMP_LIBRARY_PATH" 21 | 22 | MANAGED_ENV_VARIABLES = { 23 | "LD_PRELOAD", 24 | "KMP_AFFINITY", 25 | "KMP_BLOCKTIME", 26 | "KMP_BLOCKTIME", 27 | "OMP_MAX_ACTIVE_LEVELS", 28 | "OMP_NUM_THREADS", 29 | } 30 | 31 | 32 | def check_tcmalloc() -> Path: 33 | """ 34 | Ensure tcmalloc library is correctly detected and found 35 | """ 36 | if ENV_VAR_TCMALLOC_LIBRARY_PATH not in environ: 37 | raise ValueError(f"Env var {ENV_VAR_TCMALLOC_LIBRARY_PATH} has to be set to location of libtcmalloc.so") 38 | 39 | if len(environ[ENV_VAR_TCMALLOC_LIBRARY_PATH]) == 0: 40 | raise ValueError(f"Env var {ENV_VAR_TCMALLOC_LIBRARY_PATH} cannot be empty") 41 | 42 | tcmalloc_path = Path(environ[ENV_VAR_TCMALLOC_LIBRARY_PATH]) 43 | if not tcmalloc_path.exists(): 44 | raise ValueError( 45 | f"Path {tcmalloc_path.as_posix()} pointed by " 46 | f"env var {ENV_VAR_TCMALLOC_LIBRARY_PATH} doesn't exist" 47 | ) 48 | 49 | return tcmalloc_path 50 | 51 | 52 | def check_intel_openmp() -> Path: 53 | """ 54 | Ensure Intel OpenMP library is correctly detected and found 55 | """ 56 | if ENV_VAR_INTEL_OPENMP_LIBRARY_PATH not in environ: 57 | raise ValueError(f"Env var {ENV_VAR_INTEL_OPENMP_LIBRARY_PATH} has to be set to location of libomp.so") 58 | 59 | if len(environ[ENV_VAR_INTEL_OPENMP_LIBRARY_PATH]) == 0: 60 | raise ValueError(f"Env var {ENV_VAR_INTEL_OPENMP_LIBRARY_PATH} cannot be empty") 61 | 62 | intel_openmp_path = Path(environ[ENV_VAR_INTEL_OPENMP_LIBRARY_PATH]) 63 | if not intel_openmp_path.exists(): 64 | raise ValueError( 65 | f"Path {intel_openmp_path.as_posix()} pointed by " 66 | f"env var {ENV_VAR_INTEL_OPENMP_LIBRARY_PATH} doesn't exist" 67 | ) 68 | 69 | return intel_openmp_path 70 | 71 | 72 | def set_ld_preload_hook(config): 73 | ld_preload = [] 74 | if hasattr(config, "malloc") and "tcmalloc" == config.malloc.name: 75 | from utils import check_tcmalloc 76 | tcmalloc_path = check_tcmalloc() 77 | ld_preload.append(tcmalloc_path.as_posix()) 78 | 79 | if hasattr(config, "openmp_backend") and "intel" == config.openmp_backend.name: 80 | from utils import check_intel_openmp 81 | intel_omp_path = check_intel_openmp() 82 | ld_preload.append(intel_omp_path.as_posix()) 83 | 84 | ld_preload_str = " ".join(ld_preload) 85 | if "LD_PRELOAD" in environ: 86 | ld_preload_str += " " + environ.get("LD_PRELOAD", default="") 87 | 88 | environ["LD_PRELOAD"] = ld_preload_str --------------------------------------------------------------------------------
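# --- Illustrative usage sketch for src/utils/env.py above; not part of the repository. ---
# Exercises check_tcmalloc() and set_ld_preload_hook() with a minimal stand-in config.
# The library path below is an example and must point at a real libtcmalloc on the host,
# and the SimpleNamespace config only mimics the attributes the hook looks for; in the
# real tool this comes from the Hydra-composed backend configuration.
from os import environ
from types import SimpleNamespace

from utils.env import ENV_VAR_TCMALLOC_LIBRARY_PATH, set_ld_preload_hook

# Point the managed env var at the allocator library (check_tcmalloc() verifies the path exists).
environ[ENV_VAR_TCMALLOC_LIBRARY_PATH] = "/usr/lib/x86_64-linux-gnu/libtcmalloc.so.4"

# Minimal object exposing config.malloc.name, which set_ld_preload_hook() inspects.
config = SimpleNamespace(malloc=SimpleNamespace(name="tcmalloc"))

set_ld_preload_hook(config)
# LD_PRELOAD now starts with the tcmalloc path; note it only affects processes spawned
# after this point, which is why it is typically applied from a launcher before workers start.
print(environ["LD_PRELOAD"])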