├── .gitignore ├── LICENSE ├── README.md ├── env.sh ├── notebooks ├── benchmark │ ├── benchmark.ipynb │ ├── benchmark.py │ ├── gists │ │ └── benchmark_gist_1.ipynb │ ├── memory │ │ ├── README.md │ │ ├── generate.py │ │ ├── intent.py │ │ ├── qa.py │ │ └── sentiment.py │ ├── outputs │ │ ├── .gitignore │ │ ├── baseline_agg.csv │ │ ├── ray_baseline.csv │ │ ├── ray_baseline_agg.csv │ │ └── zerocopy_agg.csv │ ├── ray_baseline.ipynb │ ├── ray_deploy.ipynb │ ├── torchserve.ipynb │ └── torchserve │ │ ├── .gitignore │ │ ├── README.md │ │ ├── handler_generate.py │ │ ├── handler_intent.py │ │ ├── handler_qa.py │ │ ├── handler_sentiment.py │ │ └── torchserve.properties ├── h5_poc.ipynb ├── images │ ├── before_after.png │ ├── before_after.pptx │ ├── bert_load_times.png │ ├── models_table.png │ ├── torch_serve_arch.jpg │ └── zerocopy_perf.png ├── outputs │ └── .gitignore └── zero_copy_loading.ipynb ├── package ├── README.md ├── package.md ├── requirements.txt ├── setup.py └── zerocopy │ ├── __init__.py │ ├── invoke.py │ └── rewrite.py └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- 1 | # Vim swap files 2 | **/*.swp 3 | 4 | # Anaconda environment created by ./env.sh 5 | env 6 | 7 | # JupyterLab stuff 8 | .virtual_documents 9 | 10 | # Byte-compiled / optimized / DLL files 11 | __pycache__/ 12 | *.py[cod] 13 | *$py.class 14 | 15 | # C extensions 16 | *.so 17 | 18 | # Distribution / packaging 19 | .Python 20 | build/ 21 | develop-eggs/ 22 | dist/ 23 | downloads/ 24 | eggs/ 25 | .eggs/ 26 | lib/ 27 | lib64/ 28 | parts/ 29 | sdist/ 30 | var/ 31 | wheels/ 32 | pip-wheel-metadata/ 33 | share/python-wheels/ 34 | *.egg-info/ 35 | .installed.cfg 36 | *.egg 37 | MANIFEST 38 | 39 | # PyInstaller 40 | # Usually these files are written by a python script from a template 41 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 42 | *.manifest 43 | *.spec 44 | 45 | # Installer logs 46 | pip-log.txt 47 | pip-delete-this-directory.txt 48 | 49 | # Unit test / coverage reports 50 | htmlcov/ 51 | .tox/ 52 | .nox/ 53 | .coverage 54 | .coverage.* 55 | .cache 56 | nosetests.xml 57 | coverage.xml 58 | *.cover 59 | *.py,cover 60 | .hypothesis/ 61 | .pytest_cache/ 62 | 63 | # Translations 64 | *.mo 65 | *.pot 66 | 67 | # Django stuff: 68 | *.log 69 | local_settings.py 70 | db.sqlite3 71 | db.sqlite3-journal 72 | 73 | # Flask stuff: 74 | instance/ 75 | .webassets-cache 76 | 77 | # Scrapy stuff: 78 | .scrapy 79 | 80 | # Sphinx documentation 81 | docs/_build/ 82 | 83 | # PyBuilder 84 | target/ 85 | 86 | # Jupyter Notebook 87 | .ipynb_checkpoints 88 | 89 | # IPython 90 | profile_default/ 91 | ipython_config.py 92 | 93 | # pyenv 94 | .python-version 95 | 96 | # pipenv 97 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 98 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 99 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 100 | # install all needed dependencies. 101 | #Pipfile.lock 102 | 103 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 104 | __pypackages__/ 105 | 106 | # Celery stuff 107 | celerybeat-schedule 108 | celerybeat.pid 109 | 110 | # SageMath parsed files 111 | *.sage.py 112 | 113 | # Environments 114 | .env 115 | .venv 116 | env/ 117 | venv/ 118 | ENV/ 119 | env.bak/ 120 | venv.bak/ 121 | 122 | # Spyder project settings 123 | .spyderproject 124 | .spyproject 125 | 126 | # Rope project settings 127 | .ropeproject 128 | 129 | # mkdocs documentation 130 | /site 131 | 132 | # mypy 133 | .mypy_cache/ 134 | .dmypy.json 135 | dmypy.json 136 | 137 | # Pyre type checker 138 | .pyre/ 139 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # zero-copy-model-loading 2 | 3 | Code to accompany the Medium post, ["How to Load PyTorch Models 340 Times Faster 4 | with 5 | Ray"](https://medium.com/ibm-data-ai/how-to-load-pytorch-models-340-times-faster-with-ray-8be751a6944c). 
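The core trick: load a model once, stash its weights in Ray's shared-memory object store, and let every Ray task map those weights directly instead of deserializing its own copy. The snippet below, adapted from `notebooks/benchmark/gists/benchmark_gist_1.ipynb`, sketches what an inference call then looks like. It is illustrative rather than runnable as-is: `model_ref` (a Ray object reference to a model prepared for zero-copy loading) and `model_input` (keyword arguments for the model's `generate()` method) must first be constructed as shown in the notebooks.

```
import ray
import zerocopy

# Assumes ray.init() has already been called and that model_ref refers to
# a model prepared for zero-copy loading (see the notebooks for setup).
# The remote task rebuilds the model around the shared weight tensors and
# invokes the named method ('generate') with the given arguments.
result = ray.get(
    zerocopy.call_model.remote(model_ref, [], model_input, 'generate'))
```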
6 | 7 | ## Notebooks 8 | 9 | Notebooks can be found in the `notebooks` directory: 10 | * `zero_copy_loading.ipynb`: The notebook that was used when authoring the 11 | original blog post. 12 | * `benchmark/benchmark.ipynb`: The notebook that was used when authoring the 13 | second post in the series. 14 | 15 | Instructions to run notebooks: 16 | 1. Install `bash` and either `anaconda` or `miniconda`. 17 | 1. Check out a copy of this repository and navigate to the root directory of 18 | your local copy. 19 | 1. Run the script `./env.sh`, which creates an Anaconda environment under 20 | `./env`. 21 | ``` 22 | ./env.sh 23 | ``` 24 | 1. Activate the Anaconda environment: 25 | ``` 26 | conda activate ./env 27 | ``` 28 | 1. Run JupyterLab: 29 | ``` 30 | jupyter lab 31 | ``` 32 | 1. Navigate to the `notebooks` directory and open the Jupyter notebook of your choice. 33 | 34 | 35 | ## Python Package 36 | 37 | This repository also contains the source code for the `zerocopy` library. 38 | `zerocopy` is a Python package that provides functions for implementing 39 | zero-copy model loading of PyTorch models on Ray. 40 | 41 | You can find the source code for the package inside the `package` directory. 42 | 43 | -------------------------------------------------------------------------------- /env.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | 4 | # Create conda environment to run the notebooks in this directory. 5 | # 6 | # By default, the environment will be located in the directory "env" 7 | # immediately under this one. To override that setting, 8 | # pass the -d option to this script, i.e. 9 | # 10 | # $ ./env.sh -d my_dir_name 11 | 12 | PYTHON_VERSION=3.8 13 | 14 | ############################ 15 | # HACK ALERT *** HACK ALERT 16 | # The friendly folks at Anaconda thought it would be a good idea to make the 17 | # "conda" command a shell function. 18 | # See https://github.com/conda/conda/issues/7126 19 | # The following workaround will probably be fragile. 20 | if [ -z "$CONDA_HOME" ] 21 | then 22 | echo "Error: CONDA_HOME not set." 23 | exit 1 24 | fi 25 | if [ -e "${CONDA_HOME}/etc/profile.d/conda.sh" ] 26 | then 27 | # shellcheck disable=SC1090 28 | . "${CONDA_HOME}/etc/profile.d/conda.sh" 29 | else 30 | echo "Error: CONDA_HOME (${CONDA_HOME}) does not appear to be set up." 31 | exit 1 32 | fi 33 | # END HACK 34 | ############################ 35 | 36 | Usage() 37 | { 38 | echo "Usage: ./env.sh [-d dir_name] [-p] [-h]" 39 | echo "Where:" 40 | echo " -d dir_name specifies the location of the environment." 41 | echo " (default is ./env)" 42 | echo " -p means to install the zerocopy module from PyPI" 43 | echo " (default is to install from local source)" 44 | echo " -h prints this message" 45 | } 46 | 47 | INSTALL_FROM_PYPI=false 48 | ENV_DIR="env" 49 | 50 | while getopts ":hpd:" option; do 51 | case $option in 52 | h) # display Help 53 | Usage 54 | exit;; 55 | d) # Specify directory 56 | ENV_DIR=$OPTARG;; 57 | p) # Install from PyPI 58 | INSTALL_FROM_PYPI=true;; 59 | \?)
# Invalid option 60 | Usage 61 | exit 1;; 62 | esac 63 | done 64 | 65 | echo "Creating an Anaconda environment at ./${ENV_DIR}" 66 | if [ "$INSTALL_FROM_PYPI" = true ] ; then 67 | echo "Will install zerocopy package from PyPI" 68 | else 69 | echo "Will install zerocopy package from local source tree" 70 | fi 71 | 72 | # Remove the detritus of any previous runs of this script 73 | rm -rf ./${ENV_DIR} 74 | 75 | # Note how we explicitly install pip on the line that follows. THIS IS VERY 76 | # IMPORTANT! 77 | conda create -y -p ${ENV_DIR} python=${PYTHON_VERSION} pip 78 | conda activate ./${ENV_DIR} 79 | 80 | ################################################################################ 81 | # Install packages with conda 82 | 83 | # We currently install JupyterLab from conda because the pip packages are 84 | # broken for Anaconda environments with Python 3.6 and 3.8 on Mac, as of 85 | # April 2022. 86 | # Make sure that we install everything so that some pip dependency doesn't 87 | # pull in incompatible PyPI versions of a Jupyter package. 88 | conda install -y -c conda-forge jupyterlab \ 89 | ipywidgets \ 90 | jupyterlab-git \ 91 | jupyter-lsp \ 92 | jupyterlab-lsp \ 93 | jupyter-packaging \ 94 | jupyter-resource-usage 95 | conda install -y -c conda-forge/label/main nodejs 96 | 97 | # Rebuild local JupyterLab resources, because sometimes the conda-forge 98 | # packages don't come properly configured. 99 | jupyter lab build 100 | 101 | ################################################################################ 102 | # Install packages with pip 103 | 104 | # Pip dependencies are all in requirements.txt 105 | pip install -r requirements.txt 106 | 107 | if [ "$INSTALL_FROM_PYPI" = true ] ; then 108 | pip install zerocopy 109 | else 110 | # Install the local source tree for the `zerocopy` package in editable mode 111 | pip install --editable ./package 112 | fi 113 | 114 | ################################################################################ 115 | # Custom install steps 116 | 117 | # Elyra extensions to JupyterLab (enables git integration, debugger, workflow 118 | # editor, outlines, and other features) 119 | pip install --upgrade --use-deprecated=legacy-resolver elyra 120 | 121 | # Rebuild JupyterLab environment 122 | jupyter lab build 123 | 124 | jupyter --version 125 | echo " " 126 | jupyter serverextension list 127 | echo " " 128 | jupyter labextension list 129 | echo " " 130 | 131 | conda deactivate 132 | 133 | echo "Anaconda environment at ./${ENV_DIR} successfully created." 134 | echo "To use, type 'conda activate ./${ENV_DIR}'." 135 | 136 | -------------------------------------------------------------------------------- /notebooks/benchmark/benchmark.py: -------------------------------------------------------------------------------- 1 | # Benchmark script. Run with no arguments for usage info. 2 | 3 | # Imports go here 4 | import concurrent.futures 5 | import json 6 | import requests 7 | import sys 8 | import time 9 | 10 | from typing import Tuple, Callable, List, Dict 11 | 12 | import numpy as np 13 | import pandas as pd 14 | 15 | # Constants go here 16 | USAGE = ''' 17 | Microbenchmark script for zero-copy model loading. To use, first deploy the 18 | service to test, then run this script. 19 | 20 | Usage: 21 | python benchmark.py <port> <output_file> 22 | Where: 23 | * <port> is the local network port on which to connect 24 | * <output_file> is the location where the output CSV file should go.
25 | ''' 26 | 27 | INTENT_INPUT = { 28 | 'context': 29 | ("I came here to eat chips and beat you up, " 30 | "and I'm all out of chips.") 31 | } 32 | QA_INPUT = { 33 | 'question': 'What is 1 + 1?', 34 | 'context': 35 | """Addition (usually signified by the plus symbol +) is one of the four basic 36 | operations of arithmetic, the other three being subtraction, multiplication 37 | and division. The addition of two whole numbers results in the total amount 38 | or sum of those values combined. The example in the adjacent image shows a 39 | combination of three apples and two apples, making a total of five apples. 40 | This observation is equivalent to the mathematical expression "3 + 2 = 5" 41 | (that is, "3 plus 2 is equal to 5"). 42 | """ 43 | } 44 | SENTIMENT_INPUT = { 45 | 'context': "We're not happy unless you're not happy." 46 | } 47 | GENERATE_INPUT = { 48 | 'prompt_text': 'All your base are' 49 | } 50 | 51 | # For now, we have a single canned input for each model type. 52 | MODEL_INPUTS = { 53 | 'intent': INTENT_INPUT, 54 | 'sentiment': SENTIMENT_INPUT, 55 | 'qa': QA_INPUT, 56 | 'generate': GENERATE_INPUT 57 | } 58 | 59 | LANGUAGES = ['en', 'es', 'zh'] 60 | MODEL_TYPES = list(MODEL_INPUTS.keys()) 61 | 62 | # Map the integer model IDs from the trace to pairs of language code and 63 | # model type. 64 | MODEL_ID_TO_PARAMS = [ 65 | (lang_code, model_name) 66 | for lang_code in LANGUAGES 67 | for model_name in MODEL_TYPES 68 | ] 69 | 70 | ############################################################################### 71 | # SUBROUTINES 72 | 73 | 74 | def call_model_rest( 75 | model_type: str, language: str, port_: int, 76 | timeout_sec: float = 5.0) \ 77 | -> Tuple[int, float, float]: 78 | ''' 79 | Callback function that calls a model deployed as a REST web service, 80 | retrieves the result, and returns timing information. 81 | 82 | :param model_type: Type of model to call; must be one of 83 | 'intent', 'sentiment', 'qa', or 'generate' 84 | :param language: Two-letter language code; must be one of 85 | 'en', 'es', 'zh' 86 | :param port_: Port on which the local REST API is listening. 87 | :param timeout_sec: Request timeout, in seconds. 88 | 89 | :returns: Tuple of HTTP result code and start and end times 90 | of the web service call. If a client-side timeout 91 | happens, the result code will be 408 (request timeout) 92 | ''' 93 | if model_type not in MODEL_TYPES: 94 | raise ValueError(f'Unexpected model type "{model_type}" ' 95 | f'(expected {MODEL_TYPES})') 96 | if language not in LANGUAGES: 97 | raise ValueError(f'Unexpected language code "{language}" ' 98 | f'(expected {LANGUAGES})') 99 | 100 | # For now, use the same input every time 101 | model_input = MODEL_INPUTS[model_type] 102 | 103 | start_time = time.time() 104 | try: 105 | result = requests.put( 106 | f'http://127.0.0.1:{port_}/predictions/{model_type}_{language}', 107 | json.dumps(model_input), 108 | timeout=timeout_sec) 109 | end_time = time.time() 110 | status_code = result.status_code 111 | except requests.exceptions.Timeout: 112 | end_time = start_time + timeout_sec 113 | status_code = 408 # HTTP/408 Request Timeout 114 | 115 | return status_code, start_time, end_time 116 | 117 | 118 | def gen_start_times(num_users: int, num_sec: int, mean_think_time_sec: float, 119 | seed: int) -> np.ndarray: 120 | ''' 121 | Generate a trace of inference request start times. Divides the trace 122 | into 1-second intervals. Runs a discrete event simulation to determine 123 | how many users send a request during each 1-second interval.
Spreads 124 | these requests evenly across the second. 125 | 126 | :param num_users: Number of users to simulate 127 | :param num_sec: Number of seconds of trace to generate 128 | :param mean_think_time_sec: Average of the Poisson-distributed wait times 129 | for the simulated users. 130 | :param seed: Seed for the random number generator 131 | 132 | :returns: Numpy array of timestamps (starting from 0) for the requests 133 | in the trace 134 | ''' 135 | trace = [] 136 | rng = np.random.default_rng(seed) 137 | 138 | # Compute the number of requests in each 1-second window. 139 | req_per_window = np.zeros(num_sec, dtype=int) 140 | for _ in range(num_users): 141 | cur_time = rng.poisson(mean_think_time_sec) 142 | while cur_time < num_sec: 143 | req_per_window[cur_time] += 1 144 | # Move forward by a random think time. 145 | cur_time += rng.poisson(mean_think_time_sec) 146 | 147 | print(f'Requests per window:\n{req_per_window.tolist()}') 148 | 149 | # Spread each of the requests in each window evenly across the window 150 | for window_num in range(num_sec): 151 | num_requests = req_per_window[window_num] 152 | if num_requests > 0: 153 | request_interval = 1.0 / num_requests 154 | for i in range(num_requests): 155 | trace.append(window_num + request_interval * i) 156 | 157 | return np.array(trace) 158 | 159 | 160 | def gen_start_times_old(requests_per_sec: float, num_sec: int, 161 | seed: int) -> np.ndarray: 162 | ''' 163 | Old code to generate start times trace. 164 | 165 | Generate a trace of inference request start times. Divides the trace 166 | into 1-second intervals. Each interval gets a number of requests drawn 167 | from a Poisson distribution. These requests are evenly spread through the 168 | interval. 169 | 170 | :param requests_per_sec: Average requests per second overall 171 | :param num_sec: Number of seconds of trace to generate 172 | :param seed: Seed for the random number generator 173 | 174 | :returns: Numpy array of timestamps (starting from 0) for the requests 175 | in the trace 176 | ''' 177 | trace = [] 178 | rng = np.random.default_rng(seed) 179 | 180 | # Compute the number of requests in each 1-second window. 181 | req_per_window = rng.poisson(requests_per_sec, size=num_sec) 182 | 183 | for window_num in range(num_sec): 184 | num_requests = req_per_window[window_num] 185 | if num_requests > 0: 186 | request_interval = 1.0 / num_requests 187 | for i in range(num_requests): 188 | trace.append(window_num + request_interval * i) 189 | 190 | return np.array(trace) 191 | 192 | 193 | def gen_model_ids(lambda_: float, num_models: int, num_points: int, 194 | seed: int) -> np.ndarray: 195 | ''' 196 | Draw integer model IDs at random from a truncated Poisson distribution. 197 | 198 | :param lambda_: Primary parameter of the distribution, which is also 199 | the mean value of the (untruncated) distribution. 200 | :param num_models: Number of models; generated IDs will range from 0 to 201 | `num_models - 1`, inclusive. 202 | :param num_points: Number of random model IDs to return. 203 | :param seed: Seed for the random number generator 204 | 205 | :returns: Randomly generated model IDs for a series of requests, as a 206 | 1D Numpy array of integers. 207 | ''' 208 | rng = np.random.default_rng(seed) 209 | # Draw integers from a truncated Poisson distribution. Start with a 210 | # non-truncated distribution, then resample for 211 | # any values that went over the limit.
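# For intuition (numbers computed here, not taken from the source): with the
# default lambda_ = 0.3 used by run_single_benchmark() below, a Poisson draw
# equals 0 with probability e**-0.3 (about 74%) and 1 with probability
# 0.3 * e**-0.3 (about 22%), so most traffic lands on the first two models.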
212 | int_ids = rng.poisson(lambda_, size=num_points) 213 | while np.any(int_ids >= num_models): 214 | new_values = rng.poisson(lambda_, size=np.sum(int_ids >= num_models)) 215 | int_ids[int_ids >= num_models] = new_values 216 | return int_ids 217 | 218 | 219 | def run_single_benchmark( 220 | model_callback: Callable, 221 | num_users: int, 222 | num_sec: int, 223 | model_id_to_params: List[Tuple[str, str]], 224 | model_lambda: float = 0.3, 225 | mean_think_time_sec: float = 10.0, 226 | seed: int = 42) -> pd.DataFrame: 227 | ''' 228 | A single run of the benchmark. 229 | 230 | Sends a stream of requests to multiple models, with the rate varying 231 | according to a Poisson distribution and division of traffic among models 232 | following a truncated Poisson distribution. 233 | 234 | :param model_callback: Thread-safe callback function that makes a 235 | single request and returns a tuple of 236 | ``(result code, start time, end time)``. 237 | Should have the signature 238 | `f(model_type: str, language: str)` 239 | :param num_users: Number of users to simulate. 240 | :param num_sec: Seconds of traffic to generate. 241 | The actual session will extend past this window until all open requests 242 | have finished. 243 | :param model_id_to_params: List that maps integer model ID to a tuple of 244 | (language code, model name) for each of the models. 245 | :param model_lambda: Primary parameter of the truncated Poisson 246 | distribution used to split requests among models. Approximately 247 | equal to the mean of the distribution. The default value of 0.3 sends 248 | about 70% of traffic to model 0. 249 | :param mean_think_time_sec: Average of the Poisson-distributed wait times 250 | for the simulated users. 251 | :param seed: Seed for the random number generator 252 | 253 | :returns: DataFrame of benchmark results at per-request granularity 254 | ''' 255 | # Preallocate the trace as a set of lists. 256 | benchmark_start_time = time.time() 257 | desired_start_times = ( 258 | gen_start_times(num_users, num_sec, mean_think_time_sec, seed) 259 | + benchmark_start_time) 260 | num_requests = desired_start_times.shape[0] 261 | model_nums = gen_model_ids(model_lambda, len(model_id_to_params), 262 | num_requests, seed) 263 | language_codes = [model_id_to_params[num][0] for num in model_nums] 264 | model_types = [model_id_to_params[num][1] for num in model_nums] 265 | actual_start_times = [None] * num_requests 266 | end_times = [None] * num_requests 267 | result_codes = [None] * num_requests 268 | 269 | # Because some notebook servers (e.g. VS Code) don't play well with 270 | # asyncio, we use threads to manage concurrent requests. 271 | thread_pool = concurrent.futures.ThreadPoolExecutor(1000) 272 | 273 | # Map from request object to request number 274 | active_requests = {} # type: Dict[concurrent.futures.Future, int] 275 | 276 | # Main event loop: Spawn background requests, get their responses.
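# This is an open-loop load generator: each request fires at its
# precomputed start time whether or not earlier requests have returned,
# and completed requests are harvested whenever the loop would otherwise
# sit idle waiting for the next send time.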
277 | request_num = 0 278 | while request_num < num_requests or len(active_requests) > 0: 279 | sec_to_next = ( 280 | 1.0 if request_num >= num_requests 281 | else desired_start_times[request_num] - time.time() 282 | ) 283 | if sec_to_next <= 0: 284 | # Time to send the next request 285 | lang_code = language_codes[request_num] 286 | model_type = model_types[request_num] 287 | future = thread_pool.submit( 288 | model_callback, model_type, lang_code) 289 | active_requests[future] = request_num 290 | request_num += 1 291 | else: 292 | # Block until it's time to send the next request or a previous 293 | # request is done. 294 | ready_set, _ = concurrent.futures.wait( 295 | list(active_requests.keys()), 296 | timeout=sec_to_next) 297 | 298 | # Record timings from any open requests that have completed. 299 | for future in ready_set: 300 | request_id = active_requests.pop(future) 301 | result_code, start_time, end_time = future.result() 302 | actual_start_times[request_id] = start_time 303 | end_times[request_id] = end_time 304 | result_codes[request_id] = result_code 305 | 306 | # Collate results as a DataFrame 307 | result = pd.DataFrame({ 308 | 'request_id': range(num_requests), 309 | 'model_num': model_nums, 310 | 'lang_code': language_codes, 311 | 'model_type': model_types, 312 | 'desired_start': desired_start_times, 313 | 'actual_start': actual_start_times, 314 | 'end': end_times, 315 | 'result_code': result_codes 316 | }) 317 | 318 | # Make all times relative to start of the trace 319 | for key in ("desired_start", "actual_start", "end"): 320 | result[key] -= benchmark_start_time 321 | result["latency"] = result["end"] - result["actual_start"] 322 | 323 | return result 324 | 325 | 326 | def run_benchmarks( 327 | model_callback: Callable, 328 | num_sec: int = 60, 329 | min_num_users: int = 10, 330 | num_users_step: int = 5, 331 | max_failure_fraction: float = 0.6) -> pd.DataFrame: 332 | ''' 333 | Perform multiple runs of the benchmark, increasing the request 334 | rate gradually until requests start returning errors. 335 | 336 | :param model_callback: Thread-safe callback function that makes a 337 | single request and returns a tuple of 338 | ``(result code, start time, end time)``. 339 | Should have the signature 340 | `f(model_type: str, language: str)` 341 | :param num_sec: Seconds of traffic to generate for each run. 342 | The actual session will extend past this window 343 | until all open requests have finished. 344 | :param min_num_users: Minimum number of users to simulate 345 | :param num_users_step: Amount by which the number of simulated users 346 | increases with each subsequent run of the 347 | benchmark. 348 | :param max_failure_fraction: What fraction of failed web service calls 349 | the benchmark will tolerate per run before 350 | stopping the overall process. 351 | 352 | :returns: A Pandas DataFrame of detailed timings for all web service 353 | requests. The column ``num_users`` tells which run of the 354 | benchmark each request belongs to. 355 | ''' 356 | to_concat = [] 357 | num_users = min_num_users 358 | failure_fraction = 0.
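# Ramp up the load: each pass through the loop below runs one complete
# benchmark session. The ramp stops after the first run whose failure
# fraction exceeds the limit; that run's results are still included.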
359 | 360 | while failure_fraction <= max_failure_fraction: 361 | print(f'Running with {num_users} simulated users.') 362 | times = run_single_benchmark(model_callback, 363 | num_users, num_sec, 364 | MODEL_ID_TO_PARAMS) 365 | times.insert(0, 'num_users', num_users) 366 | to_concat.append(times) 367 | num_failures = sum(times['result_code'] != 200) 368 | num_requests = len(times.index) 369 | failure_fraction = num_failures / num_requests 370 | print(f' => {failure_fraction * 100.:0.1f}% failure rate') 371 | num_users += num_users_step 372 | 373 | print(f'Stopping due to fraction of failures ({failure_fraction}) ' 374 | f'exceeding allowable limit ({max_failure_fraction})') 375 | return pd.concat(to_concat) 376 | 377 | 378 | ############################################################################### 379 | # MAIN 380 | if __name__ == '__main__': 381 | if len(sys.argv) != 3: 382 | print(USAGE) 383 | sys.exit(1) 384 | port, output_file = sys.argv[1:] 385 | print(f'port is {port} and output CSV file is {output_file}') 386 | 387 | # For now, every model is a web service call and the only thing that 388 | # changes is the port. 389 | def callback_fn(model_type: str, language: str): 390 | return call_model_rest(model_type, language, int(port)) 391 | 392 | results = run_benchmarks(callback_fn) 393 | 394 | results.to_csv(output_file) 395 | 396 | 397 | 398 | 399 | 400 | 401 | -------------------------------------------------------------------------------- /notebooks/benchmark/gists/benchmark_gist_1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 8, 6 | "id": "4ae0615a-bc77-4c04-89f6-a35724dc2356", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "name": "stdout", 11 | "output_type": "stream", 12 | "text": [ 13 | " Time to run once locally: 516 ms ± 6.37 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n", 14 | " Time to run once with zero-copy: 534 ms ± 12.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n", 15 | " Time to run 100 times locally: 51.9 s ± 222 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)\n", 16 | "Time to run 100 times with zero-copy: 6.31 s ± 143 ms per loop (mean ± std. dev.
of 7 runs, 1 loop each)\n" 17 | ] 18 | } 19 | ], 20 | "source": [ 21 | "print(\" Time to run once locally: \", end=\"\")\n", 22 | "%timeit model.generate(**model_input)\n", 23 | "print(\" Time to run once with zero-copy: \", end=\"\")\n", 24 | "%timeit ray.get(zerocopy.call_model.remote(model_ref, [], model_input, 'generate'))\n", 25 | "\n", 26 | "NUM_REPEATS = 100\n", 27 | "print(f\" Time to run {NUM_REPEATS} times locally: \", end=\"\")\n", 28 | "%timeit -r 3 [model.generate(**model_input) for _ in range(NUM_REPEATS)]\n", 29 | "print(f\"Time to run {NUM_REPEATS} times with zero-copy: \", end=\"\")\n", 30 | "%timeit ray.get([zerocopy.call_model.remote(model_ref, [], model_input, 'generate') for _ in range(NUM_REPEATS)])" 31 | ] 32 | } 33 | ], 34 | "metadata": { 35 | "interpreter": { 36 | "hash": "afa7e0f34d224467fd24b0cfa9c212efa127bdf53fe1c4e3ddf54198f34a39e3" 37 | }, 38 | "kernelspec": { 39 | "display_name": "Python 3", 40 | "language": "python", 41 | "name": "python3" 42 | }, 43 | "language_info": { 44 | "codemirror_mode": { 45 | "name": "ipython", 46 | "version": 3 47 | }, 48 | "file_extension": ".py", 49 | "mimetype": "text/x-python", 50 | "name": "python", 51 | "nbconvert_exporter": "python", 52 | "pygments_lexer": "ipython3", 53 | "version": "3.8.13" 54 | } 55 | }, 56 | "nbformat": 4, 57 | "nbformat_minor": 5 58 | } 59 | -------------------------------------------------------------------------------- /notebooks/benchmark/memory/README.md: -------------------------------------------------------------------------------- 1 | # notebooks/benchmark/memory 2 | 3 | This directory contains short Python scripts that we used to test the memory footprint 4 | of loading the four models in our benchmark study into local process memory. 5 | 6 | To run these scripts and capture peak memory usage (maximum resident set size), use the `/usr/bin/time` command, 7 | which is *different* from your shell's built-in `time` command. 8 | 9 | On Linux: 10 | ``` 11 | /usr/bin/time -v python3