├── .gitignore ├── LICENSE ├── README.md ├── env.sh ├── notebooks ├── benchmark │ ├── benchmark.ipynb │ ├── benchmark.py │ ├── gists │ │ └── benchmark_gist_1.ipynb │ ├── memory │ │ ├── README.md │ │ ├── generate.py │ │ ├── intent.py │ │ ├── qa.py │ │ └── sentiment.py │ ├── outputs │ │ ├── .gitignore │ │ ├── baseline_agg.csv │ │ ├── ray_baseline.csv │ │ ├── ray_baseline_agg.csv │ │ └── zerocopy_agg.csv │ ├── ray_baseline.ipynb │ ├── ray_deploy.ipynb │ ├── torchserve.ipynb │ └── torchserve │ │ ├── .gitignore │ │ ├── README.md │ │ ├── handler_generate.py │ │ ├── handler_intent.py │ │ ├── handler_qa.py │ │ ├── handler_sentiment.py │ │ └── torchserve.properties ├── h5_poc.ipynb ├── images │ ├── before_after.png │ ├── before_after.pptx │ ├── bert_load_times.png │ ├── models_table.png │ ├── torch_serve_arch.jpg │ └── zerocopy_perf.png ├── outputs │ └── .gitignore └── zero_copy_loading.ipynb ├── package ├── README.md ├── package.md ├── requirements.txt ├── setup.py └── zerocopy │ ├── __init__.py │ ├── invoke.py │ └── rewrite.py └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- 1 | # Vim swap files 2 | **/*.swp 3 | 4 | # Anaconda environment created by ./env.sh 5 | env 6 | 7 | # JupyterLab stuff 8 | .virtual_documents 9 | 10 | # Byte-compiled / optimized / DLL files 11 | __pycache__/ 12 | *.py[cod] 13 | *$py.class 14 | 15 | # C extensions 16 | *.so 17 | 18 | # Distribution / packaging 19 | .Python 20 | build/ 21 | develop-eggs/ 22 | dist/ 23 | downloads/ 24 | eggs/ 25 | .eggs/ 26 | lib/ 27 | lib64/ 28 | parts/ 29 | sdist/ 30 | var/ 31 | wheels/ 32 | pip-wheel-metadata/ 33 | share/python-wheels/ 34 | *.egg-info/ 35 | .installed.cfg 36 | *.egg 37 | MANIFEST 38 | 39 | # PyInstaller 40 | # Usually these files are written by a python script from a template 41 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 42 | *.manifest 43 | *.spec 44 | 45 | # Installer logs 46 | pip-log.txt 47 | pip-delete-this-directory.txt 48 | 49 | # Unit test / coverage reports 50 | htmlcov/ 51 | .tox/ 52 | .nox/ 53 | .coverage 54 | .coverage.* 55 | .cache 56 | nosetests.xml 57 | coverage.xml 58 | *.cover 59 | *.py,cover 60 | .hypothesis/ 61 | .pytest_cache/ 62 | 63 | # Translations 64 | *.mo 65 | *.pot 66 | 67 | # Django stuff: 68 | *.log 69 | local_settings.py 70 | db.sqlite3 71 | db.sqlite3-journal 72 | 73 | # Flask stuff: 74 | instance/ 75 | .webassets-cache 76 | 77 | # Scrapy stuff: 78 | .scrapy 79 | 80 | # Sphinx documentation 81 | docs/_build/ 82 | 83 | # PyBuilder 84 | target/ 85 | 86 | # Jupyter Notebook 87 | .ipynb_checkpoints 88 | 89 | # IPython 90 | profile_default/ 91 | ipython_config.py 92 | 93 | # pyenv 94 | .python-version 95 | 96 | # pipenv 97 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 98 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 99 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 100 | # install all needed dependencies. 101 | #Pipfile.lock 102 | 103 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 104 | __pypackages__/ 105 | 106 | # Celery stuff 107 | celerybeat-schedule 108 | celerybeat.pid 109 | 110 | # SageMath parsed files 111 | *.sage.py 112 | 113 | # Environments 114 | .env 115 | .venv 116 | env/ 117 | venv/ 118 | ENV/ 119 | env.bak/ 120 | venv.bak/ 121 | 122 | # Spyder project settings 123 | .spyderproject 124 | .spyproject 125 | 126 | # Rope project settings 127 | .ropeproject 128 | 129 | # mkdocs documentation 130 | /site 131 | 132 | # mypy 133 | .mypy_cache/ 134 | .dmypy.json 135 | dmypy.json 136 | 137 | # Pyre type checker 138 | .pyre/ 139 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # zero-copy-model-loading 2 | 3 | Code to accompany the Medium post, ["How to Load PyTorch Models 340 Times Faster 4 | with 5 | Ray"](https://medium.com/ibm-data-ai/how-to-load-pytorch-models-340-times-faster-with-ray-8be751a6944c). 
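The core trick: load a model once, stash its weights in Ray's shared-memory object store, and let every Ray task map those weights directly instead of deserializing its own copy. The snippet below, adapted from `notebooks/benchmark/gists/benchmark_gist_1.ipynb`, sketches what an inference call then looks like. It is illustrative rather than runnable as-is: `model_ref` (a Ray object reference to a model prepared for zero-copy loading) and `model_input` (keyword arguments for the model's `generate()` method) must first be constructed as shown in the notebooks.

```
import ray
import zerocopy

# Assumes ray.init() has already been called and that model_ref refers to
# a model prepared for zero-copy loading (see the notebooks for setup).
# The remote task rebuilds the model around the shared weight tensors and
# invokes the named method ('generate') with the given arguments.
result = ray.get(
    zerocopy.call_model.remote(model_ref, [], model_input, 'generate'))
```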
6 | 7 | ## Notebooks 8 | 9 | Notebooks can be found in the `notebooks` directory: 10 | * `zero_copy_loading.ipynb`: The notebook that was used when authoring the 11 | original blog post. 12 | * `benchmark/benchmark.ipynb`: The notebook that was used when authoring the 13 | second post in the series. 14 | 15 | Instructions to run notebooks: 16 | 1. Install `bash` and either `anaconda` or `miniconda`. 17 | 1. Check out a copy of this repository and navigate to the root directory of 18 | your local copy. 19 | 1. Run the script `./env.sh`, which creates an Anaconda environment under 20 | `./env`. 21 | ``` 22 | ./env.sh 23 | ``` 24 | 1. Activate the Anaconda environment: 25 | ``` 26 | conda activate ./env 27 | ``` 28 | 1. Run JupyterLab: 29 | ``` 30 | jupyter lab 31 | ``` 32 | 1. Navigate to the `notebooks` directory and open the Jupyter notebook of your choice. 33 | 34 | 35 | ## Python Package 36 | 37 | This repository also contains the source code for the `zerocopy` library. 38 | `zerocopy` is a Python package that provides functions for implementing 39 | zero-copy model loading of PyTorch models on Ray. 40 | 41 | You can find the source code for the package inside the `package` directory. 42 | 43 | -------------------------------------------------------------------------------- /env.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | 4 | # Create conda environment to run the notebooks in this directory. 5 | # 6 | # By default, the environment will be located in the directory "env" 7 | # immediately under this one. To override that setting, 8 | # pass the -d option to this script, i.e. 9 | # 10 | # $ ./env.sh -d my_dir_name 11 | 12 | PYTHON_VERSION=3.8 13 | 14 | ############################ 15 | # HACK ALERT *** HACK ALERT 16 | # The friendly folks at Anaconda thought it would be a good idea to make the 17 | # "conda" command a shell function. 18 | # See https://github.com/conda/conda/issues/7126 19 | # The following workaround will probably be fragile. 20 | if [ -z "$CONDA_HOME" ] 21 | then 22 | echo "Error: CONDA_HOME not set." 23 | exit 1 24 | fi 25 | if [ -e "${CONDA_HOME}/etc/profile.d/conda.sh" ] 26 | then 27 | # shellcheck disable=SC1090 28 | . "${CONDA_HOME}/etc/profile.d/conda.sh" 29 | else 30 | echo "Error: CONDA_HOME (${CONDA_HOME}) does not appear to be set up." 31 | exit 1 32 | fi 33 | # END HACK 34 | ############################ 35 | 36 | Usage() 37 | { 38 | echo "Usage: ./env.sh [-d dir_name] [-p] [-h]" 39 | echo "Where:" 40 | echo " -d dir_name specifies the location of the environment." 41 | echo " (default is ./env)" 42 | echo " -p means to install the zerocopy module from PyPI" 43 | echo " (default is to install from local source)" 44 | echo " -h prints this message" 45 | } 46 | 47 | INSTALL_FROM_PYPI=false 48 | ENV_DIR="env" 49 | 50 | while getopts ":hpd:" option; do 51 | case $option in 52 | h) # display Help 53 | Usage 54 | exit;; 55 | d) # Specify directory 56 | ENV_DIR=$OPTARG;; 57 | p) # Install from PyPI 58 | INSTALL_FROM_PYPI=true;; 59 | \?)
# Invalid option 60 | Usage 61 | exit 1;; 62 | esac 63 | done 64 | 65 | echo "Creating an Anaconda environment at ./${ENV_DIR}" 66 | if [ "$INSTALL_FROM_PYPI" = true ] ; then 67 | echo "Will install zerocopy package from PyPI" 68 | else 69 | echo "Will install zerocopy package from local source tree" 70 | fi 71 | 72 | # Remove the detritus of any previous runs of this script 73 | rm -rf ./${ENV_DIR} 74 | 75 | # Note how we explicitly install pip on the line that follows. THIS IS VERY 76 | # IMPORTANT! 77 | conda create -y -p ${ENV_DIR} python=${PYTHON_VERSION} pip 78 | conda activate ./${ENV_DIR} 79 | 80 | ################################################################################ 81 | # Install packages with conda 82 | 83 | # We currently install JupyterLab from conda because the pip packages are 84 | # broken for Anaconda environments with Python 3.6 and 3.8 on Mac, as of 85 | # April 2022. 86 | # Make sure that we install everything so that some pip dependency doesn't 87 | # pull in incompatible PyPI versions of a Jupyter package. 88 | conda install -y -c conda-forge jupyterlab \ 89 | ipywidgets \ 90 | jupyterlab-git \ 91 | jupyter-lsp \ 92 | jupyterlab-lsp \ 93 | jupyter-packaging \ 94 | jupyter-resource-usage 95 | conda install -y -c conda-forge/label/main nodejs 96 | 97 | # Rebuild local JupyterLab resources, because sometimes the conda-forge 98 | # packages don't come properly configured. 99 | jupyter lab build 100 | 101 | ################################################################################ 102 | # Install packages with pip 103 | 104 | # Pip dependencies are all in requirements.txt 105 | pip install -r requirements.txt 106 | 107 | if [ "$INSTALL_FROM_PYPI" = true ] ; then 108 | pip install zerocopy 109 | else 110 | # Install the local source tree for the `zerocopy` package in editable mode 111 | pip install --editable ./package 112 | fi 113 | 114 | ################################################################################ 115 | # Custom install steps 116 | 117 | # Elyra extensions to JupyterLab (enables git integration, debugger, workflow 118 | # editor, outlines, and other features) 119 | pip install --upgrade --use-deprecated=legacy-resolver elyra 120 | 121 | # Rebuild JupyterLab environment 122 | jupyter lab build 123 | 124 | jupyter --version 125 | echo " " 126 | jupyter serverextension list 127 | echo " " 128 | jupyter labextension list 129 | echo " " 130 | 131 | conda deactivate 132 | 133 | echo "Anaconda environment at ./${ENV_DIR} successfully created." 134 | echo "To use, type 'conda activate ./${ENV_DIR}'." 135 | 136 | -------------------------------------------------------------------------------- /notebooks/benchmark/benchmark.py: -------------------------------------------------------------------------------- 1 | # Benchmark script. Run with no arguments for usage info. 2 | 3 | # Imports go here 4 | import concurrent.futures 5 | import json 6 | import requests 7 | import sys 8 | import time 9 | 10 | from typing import Tuple, Callable, List, Dict 11 | 12 | import numpy as np 13 | import pandas as pd 14 | 15 | # Constants go here 16 | USAGE = ''' 17 | Microbenchmark script for zero-copy model loading. To use, first deploy the 18 | service to test, then run this script. 19 | 20 | Usage: 21 | python benchmark.py <port> <output_file> 22 | Where: 23 | * <port> is the local network port on which to connect 24 | * <output_file> is the location where the output CSV file should go.
25 | ''' 26 | 27 | INTENT_INPUT = { 28 | 'context': 29 | ("I came here to eat chips and beat you up, " 30 | "and I'm all out of chips.") 31 | } 32 | QA_INPUT = { 33 | 'question': 'What is 1 + 1?', 34 | 'context': 35 | """Addition (usually signified by the plus symbol +) is one of the four basic 36 | operations of arithmetic, the other three being subtraction, multiplication 37 | and division. The addition of two whole numbers results in the total amount 38 | or sum of those values combined. The example in the adjacent image shows a 39 | combination of three apples and two apples, making a total of five apples. 40 | This observation is equivalent to the mathematical expression "3 + 2 = 5" 41 | (that is, "3 plus 2 is equal to 5"). 42 | """ 43 | } 44 | SENTIMENT_INPUT = { 45 | 'context': "We're not happy unless you're not happy." 46 | } 47 | GENERATE_INPUT = { 48 | 'prompt_text': 'All your base are' 49 | } 50 | 51 | # For now, we have a single canned input for each model type. 52 | MODEL_INPUTS = { 53 | 'intent': INTENT_INPUT, 54 | 'sentiment': SENTIMENT_INPUT, 55 | 'qa': QA_INPUT, 56 | 'generate': GENERATE_INPUT 57 | } 58 | 59 | LANGUAGES = ['en', 'es', 'zh'] 60 | MODEL_TYPES = list(MODEL_INPUTS.keys()) 61 | 62 | # Map the integer model IDs from the trace to pairs of language code and 63 | # model type. 64 | MODEL_ID_TO_PARAMS = [ 65 | (lang_code, model_name) 66 | for lang_code in LANGUAGES 67 | for model_name in MODEL_TYPES 68 | ] 69 | 70 | ############################################################################### 71 | # SUBROUTINES 72 | 73 | 74 | def call_model_rest( 75 | model_type: str, language: str, port_: int, 76 | timeout_sec: float = 5.0) \ 77 | -> Tuple[int, float, float]: 78 | ''' 79 | Callback function that calls a model deployed as a REST web service, 80 | retrieves the result, and returns timing information. 81 | 82 | :param model_type: Type of model to call; must be one of 83 | 'intent', 'sentiment', 'qa', or 'generate' 84 | :param language: Two-letter language code; must be one of 85 | 'en', 'es', 'zh' 86 | :param port_: Port on which the local REST API is listening. 87 | :param timeout_sec: Request timeout, in seconds. 88 | 89 | :returns: Tuple of HTTP result code and start and end times 90 | of the web service call. If a client-side timeout 91 | happens, the result code will be 408 (request timeout) 92 | ''' 93 | if model_type not in MODEL_TYPES: 94 | raise ValueError(f'Unexpected model type "{model_type}" ' 95 | f'(expected {MODEL_TYPES})') 96 | if language not in LANGUAGES: 97 | raise ValueError(f'Unexpected language code "{language}" ' 98 | f'(expected {LANGUAGES})') 99 | 100 | # For now, use the same input every time 101 | model_input = MODEL_INPUTS[model_type] 102 | 103 | start_time = time.time() 104 | try: 105 | result = requests.put( 106 | f'http://127.0.0.1:{port_}/predictions/{model_type}_{language}', 107 | json.dumps(model_input), 108 | timeout=timeout_sec) 109 | end_time = time.time() 110 | status_code = result.status_code 111 | except requests.exceptions.Timeout: 112 | end_time = start_time + timeout_sec 113 | status_code = 408 # HTTP/408 Request Timeout 114 | 115 | return status_code, start_time, end_time 116 | 117 | 118 | def gen_start_times(num_users: int, num_sec: int, mean_think_time_sec: float, 119 | seed: int) -> np.ndarray: 120 | ''' 121 | Generate a trace of inference request start times. Divides the trace 122 | into 1-second intervals. Runs a discrete event simulation to determine 123 | how many users send a request during each 1-second interval.
Spreads 124 | these requests evenly across the second. 125 | 126 | :param num_users: Number of users to simulate 127 | :param num_sec: Number of seconds of trace to generate 128 | :param mean_think_time_sec: Average of the Poisson-distributed wait times 129 | for the simulated users. 130 | :param seed: Seed for the random number generator 131 | 132 | :returns: Numpy array of timestamps (starting from 0) for the requests 133 | in the trace 134 | ''' 135 | trace = [] 136 | rng = np.random.default_rng(seed) 137 | 138 | # Compute the number of requests in each 1-second window. 139 | req_per_window = np.zeros(num_sec, dtype=int) 140 | for _ in range(num_users): 141 | cur_time = rng.poisson(mean_think_time_sec) 142 | while cur_time < num_sec: 143 | req_per_window[cur_time] += 1 144 | # Move forward by a random think time. 145 | cur_time += rng.poisson(mean_think_time_sec) 146 | 147 | print(f'Requests per window:\n{req_per_window.tolist()}') 148 | 149 | # Spread each of the requests in each window evenly across the window 150 | for window_num in range(num_sec): 151 | num_requests = req_per_window[window_num] 152 | if num_requests > 0: 153 | request_interval = 1.0 / num_requests 154 | for i in range(num_requests): 155 | trace.append(window_num + request_interval * i) 156 | 157 | return np.array(trace) 158 | 159 | 160 | def gen_start_times_old(requests_per_sec: float, num_sec: int, 161 | seed: int) -> np.ndarray: 162 | ''' 163 | Old code to generate start times trace. 164 | 165 | Generate a trace of inference request start times. Divides the trace 166 | into 1-second intervals. Each interval gets a number of requests drawn 167 | from a Poisson distribution. These requests are evenly spread through the 168 | interval. 169 | 170 | :param requests_per_sec: Average requests per second overall 171 | :param num_sec: Number of seconds of trace to generate 172 | :param seed: Seed for the random number generator 173 | 174 | :returns: Numpy array of timestamps (starting from 0) for the requests 175 | in the trace 176 | ''' 177 | trace = [] 178 | rng = np.random.default_rng(seed) 179 | 180 | # Compute the number of requests in each 1-second window. 181 | req_per_window = rng.poisson(requests_per_sec, size=num_sec) 182 | 183 | for window_num in range(num_sec): 184 | num_requests = req_per_window[window_num] 185 | if num_requests > 0: 186 | request_interval = 1.0 / num_requests 187 | for i in range(num_requests): 188 | trace.append(window_num + request_interval * i) 189 | 190 | return np.array(trace) 191 | 192 | 193 | def gen_model_ids(lambda_: float, num_models: int, num_points: int, 194 | seed: int) -> np.ndarray: 195 | ''' 196 | Draw integer model IDs at random from a truncated Poisson distribution. 197 | 198 | :param lambda_: Primary parameter of the distribution, which is also 199 | the mean value of the (untruncated) distribution. 200 | :param num_models: Number of models; generated IDs will range from 0 to 201 | `num_models - 1`, inclusive. 202 | :param num_points: Number of random model IDs to return. 203 | :param seed: Seed for the random number generator 204 | 205 | :returns: Randomly generated model IDs for a series of requests, as a 206 | 1D Numpy array of integers. 207 | ''' 208 | rng = np.random.default_rng(seed) 209 | # Draw integers from a truncated Poisson distribution. Start with a 210 | # non-truncated distribution, then resample for 211 | # any values that went over the limit.
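# For intuition (numbers computed here, not taken from the source): with the
# default lambda_ = 0.3 used by run_single_benchmark() below, a Poisson draw
# equals 0 with probability e**-0.3 (about 74%) and 1 with probability
# 0.3 * e**-0.3 (about 22%), so most traffic lands on the first two models.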
212 | int_ids = rng.poisson(lambda_, size=num_points) 213 | while np.any(int_ids >= num_models): 214 | new_values = rng.poisson(lambda_, size=np.sum(int_ids >= num_models)) 215 | int_ids[int_ids >= num_models] = new_values 216 | return int_ids 217 | 218 | 219 | def run_single_benchmark( 220 | model_callback: Callable, 221 | num_users: int, 222 | num_sec: int, 223 | model_id_to_params: List[Tuple[str, str]], 224 | model_lambda: float = 0.3, 225 | mean_think_time_sec: float = 10.0, 226 | seed: int = 42) -> pd.DataFrame: 227 | ''' 228 | A single run of the benchmark. 229 | 230 | Sends a stream of requests to multiple models, with the rate varying 231 | according to a Poisson distribution and division of traffic among models 232 | following a truncated Poisson distribution. 233 | 234 | :param model_callback: Thread-safe callback function that makes a 235 | single request and returns a tuple of 236 | ``(result code, start time, end time)``. 237 | Should have the signature 238 | `f(model_type: str, language: str)` 239 | :param num_users: Number of users to simulate. 240 | :param num_sec: Seconds of traffic to generate. 241 | The actual session will extend past this window until all open requests 242 | have finished. 243 | :param model_id_to_params: List that maps integer model ID to a tuple of 244 | (language code, model name) for each of the models. 245 | :param model_lambda: Primary parameter of the truncated Poisson 246 | distribution used to split requests among models. Approximately 247 | equal to the mean of the distribution. The default value of 0.3 sends 248 | about 70% of traffic to model 0. 249 | :param mean_think_time_sec: Average of the Poisson-distributed wait times 250 | for the simulated users. 251 | :param seed: Seed for the random number generator 252 | 253 | :returns: DataFrame of benchmark results at per-request granularity 254 | ''' 255 | # Preallocate the trace as a set of lists. 256 | benchmark_start_time = time.time() 257 | desired_start_times = ( 258 | gen_start_times(num_users, num_sec, mean_think_time_sec, seed) 259 | + benchmark_start_time) 260 | num_requests = desired_start_times.shape[0] 261 | model_nums = gen_model_ids(model_lambda, len(model_id_to_params), 262 | num_requests, seed) 263 | language_codes = [model_id_to_params[num][0] for num in model_nums] 264 | model_types = [model_id_to_params[num][1] for num in model_nums] 265 | actual_start_times = [None] * num_requests 266 | end_times = [None] * num_requests 267 | result_codes = [None] * num_requests 268 | 269 | # Because some notebook servers (e.g. VS Code) don't play well with 270 | # asyncio, we use threads to manage concurrent requests. 271 | thread_pool = concurrent.futures.ThreadPoolExecutor(1000) 272 | 273 | # Map from request object to request number 274 | active_requests = {} # type: Dict[concurrent.futures.Future, int] 275 | 276 | # Main event loop: Spawn background requests, get their responses.
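# This is an open-loop load generator: each request fires at its
# precomputed start time whether or not earlier requests have returned,
# and completed requests are harvested whenever the loop would otherwise
# sit idle waiting for the next send time.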
277 | request_num = 0 278 | while request_num < num_requests or len(active_requests) > 0: 279 | sec_to_next = ( 280 | 1.0 if request_num >= num_requests 281 | else desired_start_times[request_num] - time.time() 282 | ) 283 | if sec_to_next <= 0: 284 | # Time to send the next request 285 | lang_code = language_codes[request_num] 286 | model_type = model_types[request_num] 287 | future = thread_pool.submit( 288 | model_callback, model_type, lang_code) 289 | active_requests[future] = request_num 290 | request_num += 1 291 | else: 292 | # Block until it's time to send the next request or a previous 293 | # request is done. 294 | ready_set, _ = concurrent.futures.wait( 295 | list(active_requests.keys()), 296 | timeout=sec_to_next) 297 | 298 | # Record timings from any open requests that have completed. 299 | for future in ready_set: 300 | request_id = active_requests.pop(future) 301 | result_code, start_time, end_time = future.result() 302 | actual_start_times[request_id] = start_time 303 | end_times[request_id] = end_time 304 | result_codes[request_id] = result_code 305 | 306 | # Collate results as a DataFrame 307 | result = pd.DataFrame({ 308 | 'request_id': range(num_requests), 309 | 'model_num': model_nums, 310 | 'lang_code': language_codes, 311 | 'model_type': model_types, 312 | 'desired_start': desired_start_times, 313 | 'actual_start': actual_start_times, 314 | 'end': end_times, 315 | 'result_code': result_codes 316 | }) 317 | 318 | # Make all times relative to start of the trace 319 | for key in ("desired_start", "actual_start", "end"): 320 | result[key] -= benchmark_start_time 321 | result["latency"] = result["end"] - result["actual_start"] 322 | 323 | return result 324 | 325 | 326 | def run_benchmarks( 327 | model_callback: Callable, 328 | num_sec: int = 60, 329 | min_num_users: int = 10, 330 | num_users_step: int = 5, 331 | max_failure_fraction: float = 0.6) -> pd.DataFrame: 332 | ''' 333 | Perform multiple runs of the benchmark, increasing the request 334 | rate gradually until requests start returning errors. 335 | 336 | :param model_callback: Thread-safe callback function that makes a 337 | single request and returns a tuple of 338 | ``(result code, start time, end time)``. 339 | Should have the signature 340 | `f(model_type: str, language: str)` 341 | :param num_sec: Seconds of traffic to generate for each run. 342 | The actual session will extend past this window 343 | until all open requests have finished. 344 | :param min_num_users: Minimum number of users to simulate 345 | :param num_users_step: Amount by which the number of simulated users 346 | increases with each subsequent run of the 347 | benchmark. 348 | :param max_failure_fraction: What fraction of failed web service calls 349 | the benchmark will tolerate per run before 350 | stopping the overall process. 351 | 352 | :returns: A Pandas DataFrame of detailed timings for all web service 353 | requests. The column ``num_users`` tells which run of the 354 | benchmark each request belongs to. 355 | ''' 356 | to_concat = [] 357 | num_users = min_num_users 358 | failure_fraction = 0.
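# Ramp up the load: each pass through the loop below runs one complete
# benchmark session. The ramp stops after the first run whose failure
# fraction exceeds the limit; that run's results are still included.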
359 | 360 | while failure_fraction <= max_failure_fraction: 361 | print(f'Running with {num_users} simulated users.') 362 | times = run_single_benchmark(model_callback, 363 | num_users, num_sec, 364 | MODEL_ID_TO_PARAMS) 365 | times.insert(0, 'num_users', num_users) 366 | to_concat.append(times) 367 | num_failures = sum(times['result_code'] != 200) 368 | num_requests = len(times.index) 369 | failure_fraction = num_failures / num_requests 370 | print(f' => {failure_fraction * 100.:0.1f}% failure rate') 371 | num_users += num_users_step 372 | 373 | print(f'Stopping due to fraction of failures ({failure_fraction}) ' 374 | f'exceeding allowable limit ({max_failure_fraction})') 375 | return pd.concat(to_concat) 376 | 377 | 378 | ############################################################################### 379 | # MAIN 380 | if __name__ == '__main__': 381 | if len(sys.argv) != 3: 382 | print(USAGE) 383 | sys.exit(1) 384 | port, output_file = sys.argv[1:] 385 | print(f'port is {port} and output CSV file is {output_file}') 386 | 387 | # For now, every model is a web service call and the only thing that 388 | # changes is the port. 389 | def callback_fn(model_type: str, language: str): 390 | return call_model_rest(model_type, language, int(port)) 391 | 392 | results = run_benchmarks(callback_fn) 393 | 394 | results.to_csv(output_file) 395 | 396 | 397 | 398 | 399 | 400 | 401 | -------------------------------------------------------------------------------- /notebooks/benchmark/gists/benchmark_gist_1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 8, 6 | "id": "4ae0615a-bc77-4c04-89f6-a35724dc2356", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "name": "stdout", 11 | "output_type": "stream", 12 | "text": [ 13 | " Time to run once locally: 516 ms ± 6.37 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n", 14 | " Time to run once with zero-copy: 534 ms ± 12.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n", 15 | " Time to run 100 times locally: 51.9 s ± 222 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)\n", 16 | "Time to run 100 times with zero-copy: 6.31 s ± 143 ms per loop (mean ± std. dev.
of 7 runs, 1 loop each)\n" 17 | ] 18 | } 19 | ], 20 | "source": [ 21 | "print(\" Time to run once locally: \", end=\"\")\n", 22 | "%timeit model.generate(**model_input)\n", 23 | "print(\" Time to run once with zero-copy: \", end=\"\")\n", 24 | "%timeit ray.get(zerocopy.call_model.remote(model_ref, [], model_input, 'generate'))\n", 25 | "\n", 26 | "NUM_REPEATS = 100\n", 27 | "print(f\" Time to run {NUM_REPEATS} times locally: \", end=\"\")\n", 28 | "%timeit -r 3 [model.generate(**model_input) for _ in range(NUM_REPEATS)]\n", 29 | "print(f\"Time to run {NUM_REPEATS} times with zero-copy: \", end=\"\")\n", 30 | "%timeit ray.get([zerocopy.call_model.remote(model_ref, [], model_input, 'generate') for _ in range(NUM_REPEATS)])" 31 | ] 32 | } 33 | ], 34 | "metadata": { 35 | "interpreter": { 36 | "hash": "afa7e0f34d224467fd24b0cfa9c212efa127bdf53fe1c4e3ddf54198f34a39e3" 37 | }, 38 | "kernelspec": { 39 | "display_name": "Python 3", 40 | "language": "python", 41 | "name": "python3" 42 | }, 43 | "language_info": { 44 | "codemirror_mode": { 45 | "name": "ipython", 46 | "version": 3 47 | }, 48 | "file_extension": ".py", 49 | "mimetype": "text/x-python", 50 | "name": "python", 51 | "nbconvert_exporter": "python", 52 | "pygments_lexer": "ipython3", 53 | "version": "3.8.13" 54 | } 55 | }, 56 | "nbformat": 4, 57 | "nbformat_minor": 5 58 | } 59 | -------------------------------------------------------------------------------- /notebooks/benchmark/memory/README.md: -------------------------------------------------------------------------------- 1 | # notebooks/benchmark/memory 2 | 3 | This directory contains short Python scripts that we used to test the memory footprint 4 | of loading the four models in our benchmark study into local process memory. 5 | 6 | To run these scripts and capture peak memory usage (maximum resident set size), use the `/usr/bin/time` command, 7 | which is *different* from your shell's built-in `time` command. 8 | 9 | On Linux: 10 | ``` 11 | /usr/bin/time -v python3