├── .github └── workflows │ ├── main.yml │ └── python_client.yml ├── .gitignore ├── LICENSE ├── README.md ├── cli.md ├── clients ├── browser_side_js │ └── client.html └── python │ ├── README.md │ ├── fdclient │ ├── __init__.py │ └── client.py │ └── setup.py ├── fastdeploy ├── .gitignore ├── __init__.py ├── __main__.py ├── _infer.py ├── _loop.py ├── _rest.py ├── _utils.py └── monitor.sh ├── recipe.md ├── recipes ├── .gitignore ├── echo │ ├── .dockerignore │ ├── .gitignore │ ├── example.py │ ├── extra_prometheus_metrics.py │ ├── fastDeploy.auto_dockerfile │ ├── predictor.py │ └── requirements.txt ├── echo_chained │ ├── .dockerignore │ ├── .gitignore │ ├── example.py │ ├── predictor_1.py │ ├── predictor_2.py │ └── requirements.txt └── text_embeddings │ ├── example.py │ ├── predictor.py │ ├── requirements.txt │ └── words.txt ├── setup.py └── testing ├── README.md └── benchmark.py /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: Upload Python Package 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | paths: 7 | - 'setup.py' 8 | workflow_dispatch: 9 | 10 | jobs: 11 | pypi: 12 | runs-on: ubuntu-latest 13 | 14 | permissions: 15 | id-token: write 16 | 17 | steps: 18 | - uses: actions/checkout@v3 19 | - uses: actions/setup-python@v4 20 | with: 21 | python-version: '3.x' 22 | 23 | - name: Install dependencies 24 | run: python -m pip install -U build 25 | 26 | - name: Build 27 | run: python -m build 28 | 29 | - name: Publish 30 | uses: pypa/gh-action-pypi-publish@release/v1 31 | -------------------------------------------------------------------------------- /.github/workflows/python_client.yml: -------------------------------------------------------------------------------- 1 | name: Upload Python Package 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | paths: 7 | - 'clients/python/setup.py' 8 | workflow_dispatch: 9 | 10 | jobs: 11 | pypi: 12 | runs-on: ubuntu-latest 13 | 14 | permissions: 15 | id-token: write 16 | 17 | steps: 18 | - uses: actions/checkout@v3 19 | - uses: actions/setup-python@v4 20 | with: 21 | python-version: '3.x' 22 | 23 | - name: Install dependencies 24 | run: python -m pip install -U build 25 | 26 | - name: Build 27 | run: python -m build 28 | working-directory: clients/python 29 | 30 | - name: Move package 31 | run: mv clients/python/dist ./ 32 | 33 | - name: Publish 34 | uses: pypa/gh-action-pypi-publish@release/v1 35 | 36 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | .results_index/ 7 | .request_queue/ 8 | 9 | # C extensions 10 | *.so 11 | 12 | recipes/*/*index 13 | 14 | # Distribution / packaging 15 | .Python 16 | build/ 17 | develop-eggs/ 18 | dist/ 19 | downloads/ 20 | eggs/ 21 | .eggs/ 22 | lib/ 23 | lib64/ 24 | parts/ 25 | sdist/ 26 | var/ 27 | wheels/ 28 | pip-wheel-metadata/ 29 | share/python-wheels/ 30 | *.egg-info/ 31 | .installed.cfg 32 | *.egg 33 | MANIFEST 34 | 35 | # PyInstaller 36 | # Usually these files are written by a python script from a template 37 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
38 | *.manifest 39 | *.spec 40 | 41 | # Installer logs 42 | pip-log.txt 43 | pip-delete-this-directory.txt 44 | 45 | # Unit test / coverage reports 46 | htmlcov/ 47 | .tox/ 48 | .nox/ 49 | .coverage 50 | .coverage.* 51 | .cache 52 | nosetests.xml 53 | coverage.xml 54 | *.cover 55 | *.py,cover 56 | .hypothesis/ 57 | .pytest_cache/ 58 | 59 | # Translations 60 | *.mo 61 | *.pot 62 | 63 | # Django stuff: 64 | *.log 65 | local_settings.py 66 | db.sqlite3 67 | db.sqlite3-journal 68 | 69 | # Flask stuff: 70 | instance/ 71 | .webassets-cache 72 | 73 | # Scrapy stuff: 74 | .scrapy 75 | 76 | # Sphinx documentation 77 | docs/_build/ 78 | 79 | # PyBuilder 80 | target/ 81 | 82 | # Jupyter Notebook 83 | .ipynb_checkpoints 84 | 85 | # IPython 86 | profile_default/ 87 | ipython_config.py 88 | 89 | # pyenv 90 | .python-version 91 | 92 | # pipenv 93 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 94 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 95 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 96 | # install all needed dependencies. 97 | #Pipfile.lock 98 | 99 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 100 | __pypackages__/ 101 | 102 | # Celery stuff 103 | celerybeat-schedule 104 | celerybeat.pid 105 | 106 | # SageMath parsed files 107 | *.sage.py 108 | 109 | # Environments 110 | .env 111 | .venv 112 | env/ 113 | venv/ 114 | ENV/ 115 | env.bak/ 116 | venv.bak/ 117 | 118 | # Spyder project settings 119 | .spyderproject 120 | .spyproject 121 | 122 | # Rope project settings 123 | .ropeproject 124 | 125 | # mkdocs documentation 126 | /site 127 | 128 | # mypy 129 | .mypy_cache/ 130 | .dmypy.json 131 | dmypy.json 132 | 133 | # Pyre type checker 134 | .pyre/ 135 | 136 | # Mac DS_Store 137 | .DS_Store -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 notAI-tech 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## fastDeploy 2 | #### easy and performant micro-services for Python Deep Learning inference pipelines 3 | 4 | - Deploy any Python inference pipeline with minimal extra code 5 | - Auto batching of concurrent inputs is enabled out of the box 6 | - No changes to inference code (unlike tf-serving etc.); the entire pipeline runs as is 7 | - Prometheus metrics (OpenMetrics) are exposed for monitoring 8 | - Auto generates clean dockerfiles and Kubernetes health-check and scaling friendly APIs 9 | - Sequentially chained inference pipelines are supported out of the box 10 | - Can be queried from any language via easy-to-use REST APIs 11 | - Easy to understand (simple consumer-producer architecture) and simple code base 12 | 13 | 14 | #### Installation: 15 | ```bash 16 | pip install --upgrade fastdeploy fdclient 17 | # fdclient is optional, only needed if you want to use the python client 18 | ``` 19 | 20 | #### [CLI explained](https://github.com/notAI-tech/fastDeploy/blob/master/cli.md) 21 | 22 | #### Start fastDeploy server on a recipe: 23 | ```bash 24 | # Invoke fastdeploy 25 | python -m fastdeploy --help 26 | # or 27 | fastdeploy --help 28 | 29 | # Start prediction "loop" for recipe "echo" 30 | fastdeploy --loop --recipe recipes/echo 31 | 32 | # Start rest apis for recipe "echo" 33 | fastdeploy --rest --recipe recipes/echo 34 | ``` 35 | 36 | #### Send a request and get predictions: 37 | 38 | - [Python client usage](https://github.com/notAI-tech/fastDeploy/blob/master/clients/python/README.md) 39 | 40 | - [curl usage]() 41 | 42 | - [Nodejs client usage]() 43 | 44 | #### Auto generate dockerfile and build docker image: 45 | ```bash 46 | # Writes the dockerfile for recipe "echo" 47 | # and builds the docker image if docker is installed 48 | # base defaults to python:3.8-slim 49 | fastdeploy --build --recipe recipes/echo 50 | 51 | # Run docker image 52 | docker run -it -p8080:8080 fastdeploy_echo 53 | ``` 54 | 55 | #### Serving your model (recipe): 56 | 57 | - [Writing your model/pipeline's recipe](https://github.com/notAI-tech/fastDeploy/blob/master/recipe.md) 58 | 59 | 60 | ### Where to use fastDeploy? 61 | 62 | - To deploy any non-ultra-lightweight model, i.e.: most DL models with >50ms inference time per example 63 | - If the model/pipeline benefits from batch inference, fastDeploy is perfect for your use-case 64 | - If you are going to have individual inputs (for example, a user's search input that needs to be vectorized, or an image to be classified) 65 | - In the case of individual inputs, requests coming in at close intervals will be batched together and sent to the model as a batch 66 | - Perfect for creating internal micro-services that separate your model, pre- and post-processing from business logic 67 | - Since the prediction loop and inference endpoints are separate and connected via a sqlite-backed queue, they can be scaled independently 68 | 69 | 70 | ### Where not to use fastDeploy?
71 | - Non CPU/GPU-heavy models that are better off running in parallel rather than in batches 72 | - If your predictor calls some external API or uploads to S3 etc. in a blocking way 73 | - IO-heavy, non-batching use cases (e.g.: querying ES or a DB for each input) 74 | - For these cases it is better to serve directly from the REST API code (instead of the consumer-producer mechanism) so that high concurrency can be achieved 75 | -------------------------------------------------------------------------------- /cli.md: -------------------------------------------------------------------------------- 1 | 2 | ### fastDeploy CLI usage explained 3 | 4 | 5 | - Invoking the CLI 6 | ```bash 7 | fastdeploy --help 8 | # or 9 | python -m fastdeploy --help 10 | ``` 11 | 12 | 13 | #### Prediction loop 14 | - Start the prediction loop on your recipe 15 | ```bash 16 | fastdeploy --loop --recipe ./recipes/echo 17 | ``` 18 | 19 | - Optional config can be passed with the `--config` flag 20 | 21 | ```bash 22 | fastdeploy --loop --recipe ./recipes/echo --config "predictor_name=predictor.py,optimal_batch_size=0" 23 | ``` 24 | 25 | | Config | Description | Default | 26 | | --- | --- | --- | 27 | | predictor_name | predictor.py or predictor_N.py, name of the predictor run in the loop | predictor.py | 28 | | optimal_batch_size | integer max batch size for the predictor | 0 (auto determine) | 29 | 30 | - The same config can also be passed as env variables 31 | ```bash 32 | export PREDICTOR_NAME=predictor.py 33 | export OPTIMAL_BATCH_SIZE=0 34 | fastdeploy --loop --recipe ./recipes/echo 35 | ``` 36 | 37 | 38 | 39 | #### Start API server 40 | - Start the API server on your recipe 41 | ```bash 42 | fastdeploy --rest --recipe ./recipes/echo 43 | ``` 44 | 45 | - Optional config can be passed with the `--config` flag 46 | 47 | ```bash 48 | fastdeploy --rest --recipe ./recipes/echo --config "max_request_batch_size=0,workers=3,timeout=480,host=0.0.0.0,port=8080,only_async=false,allow_pickle=true,keep_alive=60" 49 | ``` 50 | 51 | - The same config can also be passed as env variables 52 | ```bash 53 | export MAX_REQUEST_BATCH_SIZE=0 54 | export WORKERS=3 55 | export TIMEOUT=480 56 | export HOST=0.0.0.0 57 | export PORT=8080 58 | export ONLY_ASYNC=false 59 | export ALLOW_PICKLE=true 60 | export KEEP_ALIVE=60 61 | fastdeploy --rest --recipe ./recipes/echo 62 | ``` 63 | 64 | #### --config options explained 65 | 66 | | Config | Description | Default | 67 | | --- | --- | --- | 68 | | max_request_batch_size | integer max number of inputs in a batch. useful when the API is exposed externally, to limit the max number of inputs allowed in a single request | 0 (None) | 69 | | workers | number of REST API gunicorn workers.
3 is more than enough generally | 3 | 70 | | timeout | seconds after which the request will fail | 480 | 71 | | host | host for the REST server | 0.0.0.0 | 72 | | port | port for the REST server | 8080 | 73 | | only_async | true/false | false | 74 | | allow_pickle | true/false - set to false to disallow the pickle protocol when expecting external inputs | true | 75 | | keep_alive | gunicorn gevent keep alive | 60 | 76 | 77 | 78 | #### Build docker image 79 | 80 | - Generate the dockerfile and build the docker image for your recipe 81 | ```bash 82 | fastdeploy --build --recipe ./recipes/echo 83 | ``` 84 | 85 | - Also supports optional config via the `--config` flag 86 | - Both rest and loop config options can be passed here in the same config string 87 | 88 | 89 | -------------------------------------------------------------------------------- /clients/browser_side_js/client.html: --------------------------------------------------------------------------------
[client.html's HTML markup and inline JavaScript did not survive text extraction; only the visible page text remains: the heading "FDClient Test with Multiple File Upload and Download", a multiple-file upload form, a "Download Processed Files:" section, and a "Result:" output area.]
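Since the original page script is not recoverable here, below is a minimal illustrative sketch of how a browser page can call the fastDeploy REST API with plain JSON, based on the `/infer` endpoint behaviour in `fastdeploy/_rest.py`; it is not the original client.html code, and the helper name `inferJSON` is only for illustration.

```javascript
// Illustrative sketch only (not the original client.html script).
// Sends an uncompressed JSON request to fastDeploy's /infer endpoint
// (input_type=json) and reads the JSON response, which has the shape
// {success, unique_id, prediction, reason} as built in fastdeploy/_infer.py.
async function inferJSON(inputs, serverUrl = "http://localhost:8080") {
  const uniqueId = crypto.randomUUID(); // optional; the server generates one if omitted
  const resp = await fetch(`${serverUrl}/infer?input_type=json&unique_id=${uniqueId}`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify(inputs), // must be a JSON array, e.g. ["some", "inputs"]
  });
  const result = await resp.json();
  if (!result.success) {
    throw new Error(result.reason || "prediction failed");
  }
  return result.prediction; // one output per input
}

// Example usage:
// inferJSON(["hello", "world"]).then(console.log).catch(console.error);
```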
-------------------------------------------------------------------------------- /clients/python/README.md: -------------------------------------------------------------------------------- 1 | ## fastDeploy python client 2 | 3 | ```python 4 | from fdclient import FDClient 5 | 6 | client = FDClient('http://localhost:8080') # optional compression=False to disable zstd compression 7 | 8 | # infer 9 | response = client.infer([obj_1, obj_2, ...]) # optional unique_id='some_id' to specify a unique id for the request 10 | 11 | # infer in background 12 | response_future = client.infer_background([obj_1, obj_2, ...]) # optional unique_id='some_id' to specify a unique id for the request 13 | response = response_future.result() # wait for the response and get it 14 | ``` 15 | 16 | - By default fdclient communicates with the fastDeploy server via pickles 17 | - pickle is very useful and makes sense when using the fastDeploy server as a micro service internally, i.e.: all requests to fastDeploy originate from code you have written 18 | - ***PICKLE is secure if all the inputs to fastDeploy originate from your code and not directly from external users' pickles*** 19 | - ***PICKLE is unsecure if you are passing external user inputs to fastDeploy directly without validation in between*** 20 | - start the fastDeploy server with `--config "allow_pickle=false"` if the fastDeploy APIs are exposed to the outside 21 | - the `allow_pickle=false` config on the server side makes fdclient use `msgpack` if available, or `json` if msgpack is not available. 22 | 23 | #### If pickle is unsecure, why use it at all? 24 | 25 | - pickle is great for sending or receiving arbitrary inputs and outputs 26 | - if `allow_pickle=true` (default) your inputs and outputs can be any python objects, eg: np arrays, pd dataframes, float32 anything .... 
27 | - pickle is only unsecure if you are unpickling objects pickled by others (since they can insert malicious code) 28 | - If fastDeploy is being used only for internal microservices, pickle is the best way so enabled by default 29 | -------------------------------------------------------------------------------- /clients/python/fdclient/__init__.py: -------------------------------------------------------------------------------- 1 | from .client import FDClient 2 | -------------------------------------------------------------------------------- /clients/python/fdclient/client.py: -------------------------------------------------------------------------------- 1 | try: 2 | import zstandard 3 | except: 4 | zstandard = None 5 | 6 | try: 7 | import msgpack 8 | except: 9 | msgpack = None 10 | 11 | import threading 12 | import requests 13 | import pickle 14 | import uuid 15 | import time 16 | import json 17 | 18 | 19 | class FDClient: 20 | def __init__(self, server_url, request_timeout, compression=True, use_requests_session=False): 21 | assert server_url.startswith("http://") or server_url.startswith( 22 | "https://" 23 | ), "Server URL must start with http:// or https://" 24 | 25 | self.server_url = server_url 26 | self.local_storage = threading.local() 27 | self.requests_session = requests.Session() if use_requests_session else requests 28 | self.compression = compression if zstandard is not None else False 29 | self.input_type = None 30 | self._set_input_type() 31 | 32 | self.request_timeout = request_timeout 33 | 34 | def _set_input_type(self): 35 | if self.input_type is None: 36 | try: 37 | self.input_type = ( 38 | "pickle" 39 | if self.requests_session.get( 40 | f"{self.server_url}/meta", params={"is_pickle_allowed": ""} 41 | ).json()["is_pickle_allowed"] 42 | else "msgpack" 43 | if msgpack is not None 44 | else "json" 45 | ) 46 | except Exception as e: 47 | self.input_type = None 48 | 49 | @property 50 | def _compressor(self): 51 | if self.compression is False: 52 | return None 53 | 54 | if ( 55 | not hasattr(self.local_storage, "compressor") 56 | or self.local_storage.compressor is None 57 | ): 58 | self.local_storage.compressor = zstandard.ZstdCompressor(level=-1) 59 | return self.local_storage.compressor 60 | 61 | @property 62 | def _decompressor(self): 63 | if self.compression is False: 64 | return None 65 | 66 | if ( 67 | not hasattr(self.local_storage, "decompressor") 68 | or self.local_storage.decompressor is None 69 | ): 70 | self.local_storage.decompressor = zstandard.ZstdDecompressor() 71 | return self.local_storage.decompressor 72 | 73 | @property 74 | def _decompressor(self): 75 | if self.compression is False: 76 | return None 77 | 78 | if ( 79 | not hasattr(self.local_storage, "decompressor") 80 | or self.local_storage.decompressor is None 81 | ): 82 | self.local_storage.decompressor = zstandard.ZstdDecompressor() 83 | return self.local_storage.decompressor 84 | 85 | def infer(self, data, unique_id=None, is_async=False): 86 | if self.input_type is None: 87 | self._set_input_type() 88 | if self.input_type is None: 89 | raise ValueError("Could not connect to server") 90 | 91 | assert isinstance(data, (list, tuple)), "Data must be of type list or tuple" 92 | 93 | unique_id = str(uuid.uuid4()) if not unique_id else unique_id 94 | 95 | if self.input_type == "pickle": 96 | data = pickle.dumps(data, protocol=pickle.HIGHEST_PROTOCOL) 97 | elif self.input_type == "msgpack": 98 | data = msgpack.packb(data, use_bin_type=True) 99 | else: 100 | data = json.dumps(data) 101 | 102 | 
response = self.requests_session.post( 103 | f"{self.server_url}/infer", 104 | params={ 105 | "unique_id": unique_id, 106 | "async": is_async, 107 | "input_type": self.input_type, 108 | "compressed": True if zstandard is not None else False, 109 | "timeout": self.request_timeout, 110 | }, 111 | data=self._compressor.compress(data) if zstandard is not None else data, 112 | headers={"Content-Type": "application/octet-stream"}, 113 | timeout=self.request_timeout * 1.1, 114 | ) 115 | 116 | if self.input_type == "pickle": 117 | return pickle.loads( 118 | self._decompressor.decompress(response.content) 119 | if zstandard is not None 120 | else response.content 121 | ) 122 | elif self.input_type == "msgpack": 123 | return msgpack.unpackb( 124 | self._decompressor.decompress(response.content) 125 | if zstandard is not None 126 | else response.content, 127 | raw=False, 128 | use_list=False, 129 | ) 130 | else: 131 | return json.loads( 132 | self._decompressor.decompress(response.content) 133 | if zstandard is not None 134 | else response.content 135 | ) 136 | 137 | def infer_async(self, data, unique_id=None): 138 | return self.infer(data, unique_id, is_async=True) 139 | 140 | 141 | if __name__ == "__main__": 142 | client = FDClient("http://localhost:8080") 143 | 144 | print(client.input_type) 145 | 146 | s = time.time() 147 | print("infer", client.infer(["this", "is", "some", b"data"]), time.time() - s) 148 | 149 | s = time.time() 150 | x = client.infer_background(["this", "is", b"some", "data"]) 151 | print("infer_background", x.result(), time.time() - s) 152 | 153 | s = time.time() 154 | 155 | print( 156 | "infer_background_multiple 40", 157 | [ 158 | _.result()["success"] 159 | for _ in client.infer_background_multiple( 160 | [["this", b"is", "some", "data"]] * 40 161 | ) 162 | ], 163 | time.time() - s, 164 | ) 165 | -------------------------------------------------------------------------------- /clients/python/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Note: To use the 'upload' functionality of this file, you must: 5 | # $ pip install twine 6 | 7 | import io 8 | import os 9 | import sys 10 | from shutil import rmtree 11 | 12 | from setuptools import find_packages, setup, Command 13 | 14 | # Package meta-data. 15 | NAME = "fdclient" 16 | DESCRIPTION = "fastDeploy python client" 17 | URL = "https://github.com/notAI-tech/fastDeploy" 18 | EMAIL = "praneeth@bpraneeth.com" 19 | AUTHOR = "BEDAPUDI PRANEETH" 20 | REQUIRES_PYTHON = ">=3.6.0" 21 | VERSION = "3.1.1" 22 | 23 | # What packages are required for this module to be executed? 24 | REQUIRED = ["zstandard", "requests", "msgpack"] 25 | 26 | # What packages are optional? 27 | EXTRAS = { 28 | # 'fancy feature': ['django'], 29 | } 30 | 31 | # The rest you shouldn't have to touch too much :) 32 | # ------------------------------------------------ 33 | # Except, perhaps the License and Trove Classifiers! 34 | # If you do change the License, remember to change the Trove Classifier for that! 35 | 36 | here = os.path.abspath(os.path.dirname(__file__)) 37 | 38 | # Import the README and use it as the long-description. 39 | # Note: this will only work if 'README.md' is present in your MANIFEST.in file! 
40 | try: 41 | with io.open(os.path.join(here, "README.md"), encoding="utf-8") as f: 42 | long_description = "\n" + f.read() 43 | except FileNotFoundError: 44 | long_description = DESCRIPTION 45 | 46 | # Load the package's __version__.py module as a dictionary. 47 | about = {} 48 | if not VERSION: 49 | with open(os.path.join(here, NAME, "__version__.py")) as f: 50 | exec(f.read(), about) 51 | else: 52 | about["__version__"] = VERSION 53 | 54 | 55 | class UploadCommand(Command): 56 | """Support setup.py upload.""" 57 | 58 | description = "Build and publish the package." 59 | user_options = [] 60 | 61 | @staticmethod 62 | def status(s): 63 | """Prints things in bold.""" 64 | print("\033[1m{0}\033[0m".format(s)) 65 | 66 | def initialize_options(self): 67 | pass 68 | 69 | def finalize_options(self): 70 | pass 71 | 72 | def run(self): 73 | try: 74 | self.status("Removing previous builds…") 75 | rmtree(os.path.join(here, "dist")) 76 | except OSError: 77 | pass 78 | 79 | self.status("Building Source and Wheel (universal) distribution…") 80 | os.system("{0} setup.py sdist bdist_wheel --universal".format(sys.executable)) 81 | 82 | self.status("Uploading the package to PyPI via Twine…") 83 | os.system("twine upload dist/*") 84 | 85 | self.status("Pushing git tags…") 86 | os.system("git tag v{0}".format(about["__version__"])) 87 | os.system("git push --tags") 88 | 89 | sys.exit() 90 | 91 | 92 | # Where the magic happens: 93 | setup( 94 | name=NAME, 95 | version=about["__version__"], 96 | description=DESCRIPTION, 97 | long_description=long_description, 98 | long_description_content_type="text/markdown", 99 | author=AUTHOR, 100 | author_email=EMAIL, 101 | python_requires=REQUIRES_PYTHON, 102 | url=URL, 103 | packages=find_packages(exclude=("tests",)), 104 | # If your package is a single module, use this instead of 'packages': 105 | # py_modules=['mypackage'], 106 | install_requires=REQUIRED, 107 | extras_require=EXTRAS, 108 | include_package_data=True, 109 | license="MIT", 110 | classifiers=[ 111 | # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers 112 | "License :: OSI Approved :: MIT License", 113 | "Programming Language :: Python", 114 | "Programming Language :: Python :: 3", 115 | "Programming Language :: Python :: 3.6", 116 | "Programming Language :: Python :: Implementation :: CPython", 117 | ], 118 | # $ setup.py publish support. 119 | cmdclass={ 120 | "upload": UploadCommand, 121 | }, 122 | ) 123 | -------------------------------------------------------------------------------- /fastdeploy/.gitignore: -------------------------------------------------------------------------------- 1 | fastdeploy-ui 2 | -------------------------------------------------------------------------------- /fastdeploy/__init__.py: -------------------------------------------------------------------------------- 1 | from . 
import __main__ 2 | -------------------------------------------------------------------------------- /fastdeploy/__main__.py: -------------------------------------------------------------------------------- 1 | import resource 2 | 3 | try: 4 | resource.setrlimit(resource.RLIMIT_NOFILE, (131072, 131072)) 5 | except: 6 | pass 7 | 8 | import os 9 | import sys 10 | import glob 11 | import argparse 12 | import subprocess 13 | 14 | parser = argparse.ArgumentParser( 15 | description="CLI for fastDeploy", formatter_class=argparse.RawTextHelpFormatter 16 | ) 17 | parser.add_argument( 18 | "--recipe", 19 | type=str, 20 | help="Path to recipe folder that contains predictor.py", 21 | required=False, 22 | ) 23 | 24 | parser.add_argument( 25 | "--loop", 26 | help=f"""Start prediction loop""", 27 | required=False, 28 | action="store_true", 29 | ) 30 | 31 | parser.add_argument( 32 | "--rest", 33 | help="""Start REST server""", 34 | required=False, 35 | action="store_true", 36 | ) 37 | 38 | parser.add_argument( 39 | "--build", 40 | help="""Build docker image""", 41 | required=False, 42 | action="store_true", 43 | ) 44 | 45 | parser.add_argument( 46 | "--config", 47 | type=str, 48 | help=""" 49 | example usage: --config "workers=3, timeout:480, allow_pickle=true" 50 | 51 | REST 52 | max_request_batch_size: integer max number of inputs in a batch, default=0 (None) 53 | workers: integer number of workers, default=3 54 | timeout: seconds after which request will fail, default=480 55 | host: host for the REST server, default=0.0.0.0 56 | port: port for the REST server, default=8080 57 | allow_pickle: true/false, default=true 58 | keep_alive: gunicorn gevent keep alive, default=60 59 | 60 | 61 | LOOP 62 | predictor_name: predictor.py or predictor_N.py, name of the predictor run in the loop, default: predictor.py 63 | optimal_batch_size: integer max batch size for the predictor, default=0 (auto) 64 | 65 | DOCKER 66 | base: base image for docker, default=python:3.8-slim 67 | """, 68 | required=False, 69 | default="max_request_batch_size=0,workers=3,timeout=480,host=0.0.0.0,port=8080,allow_pickle=true,predictor_name=predictor.py,optimal_batch_size=0,keep_alive=60,base=python:3.8-slim", 70 | ) 71 | 72 | args = parser.parse_args() 73 | 74 | CONFIG = { 75 | # rest config 76 | "max_request_batch_size": int(os.getenv("MAX_REQUEST_BATCH_SIZE", "0")), 77 | "workers": int(os.getenv("WORKERS", "3")), 78 | "timeout": int(os.getenv("TIMEOUT", "480")), 79 | "host": os.getenv("HOST", "0.0.0.0"), 80 | "port": int(os.getenv("PORT", "8080")), 81 | "allow_pickle": os.getenv("ALLOW_PICKLE", "true").lower() == "true", 82 | # predictor config 83 | "predictor_name": os.getenv("PREDICTOR_NAME", "predictor.py"), 84 | "optimal_batch_size": int(os.getenv("OPTIMAL_BATCH_SIZE", "0")), 85 | "keep_alive": int(os.getenv("KEEP_ALIVE", "60")), 86 | # building docker config 87 | "base": os.getenv("BASE", "python:3.8-slim"), 88 | } 89 | 90 | if args.config: 91 | for config in args.config.split(","): 92 | try: 93 | k, v = config.strip().split("=") 94 | except: 95 | continue 96 | 97 | if os.getenv(k.upper()) is not None: 98 | continue 99 | 100 | try: 101 | CONFIG[k.strip()] = int(v.strip()) 102 | except: 103 | CONFIG[k.strip()] = v.strip() 104 | 105 | for k, v in CONFIG.items(): 106 | os.environ[k.upper()] = str(v) 107 | 108 | sys.path.append(os.path.abspath(args.recipe)) 109 | os.chdir(os.path.abspath(args.recipe)) 110 | 111 | try: 112 | if not os.path.exists(os.path.join(args.recipe, ".gitignore")): 113 | _gitignore_f = 
open(os.path.join(args.recipe, ".gitignore"), "a") 114 | _gitignore_f.write("\nfastdeploy_dbs\nfastdeploy_dbs/*\n") 115 | _gitignore_f.flush() 116 | _gitignore_f.close() 117 | except: 118 | pass 119 | 120 | try: 121 | if not os.path.exists(os.path.join(args.recipe, ".dockerignore")): 122 | _dockerignore_f = open(os.path.join(args.recipe, ".dockerignore"), "w") 123 | _dockerignore_f.write("\nfastdeploy_dbs\nfastdeploy_dbs/*\n") 124 | _dockerignore_f.flush() 125 | _dockerignore_f.close() 126 | except: 127 | pass 128 | 129 | 130 | def loop(): 131 | from ._loop import start_loop 132 | 133 | start_loop() 134 | 135 | 136 | def rest(): 137 | from ._rest import app 138 | import gunicorn.app.base 139 | 140 | class StandaloneApplication(gunicorn.app.base.BaseApplication): 141 | def __init__(self, app, options=None): 142 | self.options = options or {} 143 | self.application = app 144 | super().__init__() 145 | 146 | def load_config(self): 147 | config = { 148 | key: value 149 | for key, value in self.options.items() 150 | if key in self.cfg.settings and value is not None 151 | } 152 | for key, value in config.items(): 153 | self.cfg.set(key.lower(), value) 154 | 155 | def load(self): 156 | return self.application 157 | 158 | options = { 159 | "preload": "", 160 | "bind": "%s:%s" % (CONFIG["host"], CONFIG["port"]), 161 | "workers": CONFIG["workers"], 162 | "worker_connections": 1000, 163 | "worker_class": "gevent", 164 | "timeout": CONFIG["timeout"], 165 | "allow_redirects": True, 166 | "keepalive": CONFIG["keep_alive"], 167 | "keep_alive": CONFIG["keep_alive"], 168 | } 169 | 170 | print( 171 | f"fastDeploy REST interface active at http://{CONFIG['host']}:{CONFIG['port']}" 172 | ) 173 | 174 | StandaloneApplication(app, options).run() 175 | 176 | 177 | def build_docker_image(): 178 | if not os.path.exists("requirements.txt"): 179 | raise Exception("requirements.txt not found") 180 | 181 | f = open("fastDeploy.auto_dockerfile", "w") 182 | f.write( 183 | f"""FROM {CONFIG['base']} 184 | RUN python3 -m pip install --upgrade --no-cache-dir pip fastdeploy 185 | 186 | ENV {' '.join([f"{k.upper()}={v}" for k, v in CONFIG.items()])} 187 | 188 | ADD . /recipe 189 | WORKDIR /recipe 190 | {'' if not os.path.exists("extras.sh") else 'RUN chmod +x /recipe/extras.sh && /recipe/extras.sh'} 191 | RUN python3 -m pip install --no-cache-dir -r /recipe/requirements.txt 192 | RUN cd /recipe && python3 -c "from predictor import predictor; from example import example; predictor(example)" 193 | 194 | ENTRYPOINT ["/bin/sh", "-c"] 195 | 196 | CMD ["ulimit -n 1000000 && python3 -m fastdeploy --recipe /recipe --rest & python3 -m fastdeploy --recipe /recipe --loop"] 197 | """ 198 | ) 199 | f.flush() 200 | f.close() 201 | 202 | print(f"Dockerfile generated at {os.path.abspath('fastDeploy.auto_dockerfile')}") 203 | 204 | print( 205 | f"Run `docker build -f fastDeploy.auto_dockerfile -t {os.path.abspath('.')}` to build the image" 206 | ) 207 | exit() 208 | 209 | 210 | if args.loop: 211 | loop() 212 | 213 | elif args.rest: 214 | rest() 215 | 216 | elif args.build: 217 | build_docker_image() 218 | -------------------------------------------------------------------------------- /fastdeploy/_infer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import json 4 | import pickle 5 | 6 | import msgpack 7 | import zstandard 8 | 9 | import threading 10 | 11 | from . 
import _utils 12 | 13 | started_at_time = time.time() 14 | 15 | # make sure all predictors are running before starting the inference server 16 | # if any are not yet started/ still loading then wait for them to start 17 | for predictor_file, predictor_sequence in _utils.PREDICTOR_FILE_TO_SEQUENCE.items(): 18 | log_printed = False 19 | while True: 20 | try: 21 | time_per_example = _utils.META_INDEX.get( 22 | f"{predictor_sequence}", select_keys=["time_per_example"] 23 | )[f"{predictor_sequence}"]["time_per_example"] 24 | started_at_time = time.time() 25 | break 26 | except: 27 | if not log_printed: 28 | _utils.logger.info(f"Waiting for {predictor_file} to start") 29 | log_printed = True 30 | time.sleep(1) 31 | 32 | 33 | _utils.logger.info(f"pids: {_utils.get_fd_pids()}") 34 | 35 | class Infer: 36 | started_at_time = started_at_time 37 | 38 | def __init__( 39 | self, 40 | allow_pickle=os.getenv("ALLOW_PICKLE", "true").lower() == "true", 41 | ): 42 | self.local_storage = threading.local() 43 | self.allow_pickle = allow_pickle 44 | 45 | @property 46 | def _compressor(self): 47 | if ( 48 | not hasattr(self.local_storage, "compressor") 49 | or self.local_storage.compressor is None 50 | ): 51 | self.local_storage.compressor = zstandard.ZstdCompressor(level=-1) 52 | return self.local_storage.compressor 53 | 54 | @property 55 | def _decompressor(self): 56 | if ( 57 | not hasattr(self.local_storage, "decompressor") 58 | or self.local_storage.decompressor is None 59 | ): 60 | self.local_storage.decompressor = zstandard.ZstdDecompressor() 61 | return self.local_storage.decompressor 62 | 63 | def read_inputs(self, unique_id, inputs, input_type, is_compressed): 64 | if input_type == "pickle": 65 | if not self.allow_pickle: 66 | _utils.logger.warning( 67 | f"{unique_id}: tried to use pickle input, but pickle is disallowed" 68 | ) 69 | raise Exception("pickle input disallowed, use msgpack or json") 70 | 71 | inputs = pickle.loads( 72 | inputs if not is_compressed else self._decompressor.decompress(inputs) 73 | ) 74 | _utils.logger.debug(f"pickle input read") 75 | 76 | elif input_type == "msgpack": 77 | inputs = msgpack.unpackb( 78 | inputs if not is_compressed else self._decompressor.decompress(inputs), 79 | use_list=False, 80 | raw=False, 81 | ) 82 | 83 | _utils.logger.debug(f"{unique_id}: msgpack input read") 84 | 85 | elif input_type == "json": 86 | inputs = json.loads( 87 | inputs if not is_compressed else self._decompressor.decompress(inputs) 88 | ) 89 | 90 | # for backward compatibility 91 | try: 92 | inputs = inputs["data"] 93 | except: 94 | pass 95 | 96 | _utils.logger.debug(f"{unique_id}: json input read") 97 | 98 | else: 99 | _utils.logger.warning(f"{unique_id}: input_type {input_type} not supported") 100 | raise Exception(f"input_type {input_type} not supported") 101 | 102 | return inputs 103 | 104 | def create_response(self, unique_id, response, is_compressed, input_type): 105 | success = response["success"] 106 | if input_type == "pickle": 107 | response = pickle.dumps(response) 108 | elif input_type == "msgpack": 109 | response = msgpack.packb(response, use_bin_type=True) 110 | elif input_type == "json": 111 | pass 112 | 113 | if is_compressed: 114 | response = self._compressor.compress(response) 115 | _utils.logger.debug(f"{unique_id}: response compressed") 116 | 117 | return success, response 118 | 119 | def get_timeout_response( 120 | self, unique_id, is_compressed, input_type, is_client_timeout=False 121 | ): 122 | if is_client_timeout: 123 | _utils.MAIN_INDEX.update( 124 | { 125 | 
unique_id: { 126 | "-1.predicted_at": time.time(), 127 | "timedout_in_queue": True, 128 | } 129 | } 130 | ) 131 | _utils.logger.warning(f"{unique_id}: client timeout") 132 | 133 | return self.create_response( 134 | unique_id, 135 | { 136 | "success": False, 137 | "reason": "timeout" if not is_client_timeout else "client_timeout", 138 | "unique_id": unique_id, 139 | "prediction": None, 140 | }, 141 | is_compressed, 142 | input_type, 143 | ) 144 | 145 | def add_to_infer_queue( 146 | self, inputs: bytes, unique_id: str, input_type: str, is_compressed: bool 147 | ): 148 | try: 149 | request_received_at = time.time() 150 | _utils.logger.debug(f"{unique_id}: reading inputs") 151 | 152 | inputs = self.read_inputs(unique_id, inputs, input_type, is_compressed) 153 | 154 | if inputs is None: 155 | _utils.logger.warning(f"{unique_id}: inputs are None") 156 | return self.create_response( 157 | unique_id, 158 | { 159 | "success": False, 160 | "reason": f"inputs have to be {'pickle,' if self.allow_pickle else ''} msgpack or json", 161 | "unique_id": unique_id, 162 | "prediction": None, 163 | }, 164 | is_compressed, 165 | input_type, 166 | ) 167 | 168 | if not isinstance(inputs, (list, tuple)): 169 | _utils.logger.warning(f"{unique_id}: inputs have to be a list or tuple") 170 | return self.create_response( 171 | unique_id, 172 | { 173 | "success": False, 174 | "reason": "inputs have to be a list or tuple", 175 | "unique_id": unique_id, 176 | "prediction": None, 177 | }, 178 | is_compressed, 179 | input_type, 180 | ) 181 | 182 | if not inputs: 183 | _utils.logger.debug(f"{unique_id}: empty inputs") 184 | return self.create_response( 185 | unique_id, 186 | { 187 | "success": True, 188 | "reason": "empty inputs", 189 | "unique_id": unique_id, 190 | "prediction": [], 191 | }, 192 | is_compressed, 193 | input_type, 194 | ) 195 | 196 | else: 197 | # -1 is the predictor sequence for the rest server, basically where the request originates 198 | _utils.MAIN_INDEX.update( 199 | { 200 | unique_id: { 201 | "-1.outputs": inputs, 202 | "-1.received_at": request_received_at, 203 | "-1.predicted_in_batch_of": len(inputs), 204 | "-1.predicted_at": 0, 205 | "last_predictor_sequence": -1, 206 | "last_predictor_success": True, 207 | "timedout_in_queue": None, 208 | } 209 | } 210 | ) 211 | 212 | _utils.logger.debug(f"{unique_id}: added to request queue") 213 | 214 | return True, None 215 | except Exception as ex: 216 | _utils.logger.exception(ex, exc_info=True) 217 | return self.create_response( 218 | unique_id, 219 | { 220 | "success": False, 221 | "reason": str(ex), 222 | "unique_id": unique_id, 223 | "prediction": None, 224 | }, 225 | is_compressed, 226 | input_type, 227 | ) 228 | 229 | def get_responses_for_unique_ids(self, unique_ids, is_compresseds, input_types): 230 | all_current_results = _utils.MAIN_INDEX.get( 231 | unique_ids, 232 | select_keys=[ 233 | f"{_utils.LAST_PREDICTOR_SEQUENCE}.outputs", 234 | "last_predictor_success", 235 | "last_predictor_sequence", 236 | "timedout_in_queue", 237 | ], 238 | ) 239 | 240 | all_responses = {} 241 | 242 | updations = {} 243 | still_processing = [] 244 | 245 | for unique_id, is_compressed, input_type in zip( 246 | unique_ids, is_compresseds, input_types 247 | ): 248 | current_results = all_current_results[unique_id] 249 | 250 | if current_results["timedout_in_queue"]: 251 | _utils.logger.warning(f"{unique_id}: timedout in queue") 252 | updations[unique_id] = { 253 | "-1.predicted_at": time.time(), 254 | } 255 | all_responses[unique_id] = self.get_timeout_response( 256 | 
unique_id, is_compressed, input_type 257 | ) 258 | _utils.logger.debug(f"{unique_id}: timedout in queue response created") 259 | 260 | elif ( 261 | current_results["last_predictor_success"] is True 262 | and current_results["last_predictor_sequence"] 263 | == _utils.LAST_PREDICTOR_SEQUENCE 264 | ): 265 | updations[unique_id] = { 266 | "-1.predicted_at": time.time(), 267 | } 268 | 269 | all_responses[unique_id] = self.create_response( 270 | unique_id, 271 | { 272 | "success": True, 273 | "unique_id": unique_id, 274 | "prediction": current_results[ 275 | f"{_utils.LAST_PREDICTOR_SEQUENCE}.outputs" 276 | ], 277 | "reason": None, 278 | }, 279 | is_compressed, 280 | input_type, 281 | ) 282 | _utils.logger.debug(f"{unique_id}: response created") 283 | elif current_results["last_predictor_success"] is False: 284 | _utils.logger.warning( 285 | f"{unique_id}: predictor failed at {current_results['last_predictor_sequence']}" 286 | ) 287 | updations[unique_id] = { 288 | "-1.predicted_at": time.time(), 289 | } 290 | all_responses[unique_id] = self.create_response( 291 | unique_id, 292 | { 293 | "success": False, 294 | "reason": f"prediction failed predictor {current_results['last_predictor_sequence']}", 295 | "unique_id": unique_id, 296 | "prediction": None, 297 | }, 298 | is_compressed, 299 | input_type, 300 | ) 301 | _utils.logger.debug(f"{unique_id}: failed response created") 302 | 303 | else: 304 | still_processing.append(unique_id) 305 | 306 | if updations: 307 | _utils.MAIN_INDEX.update(updations) 308 | 309 | if still_processing: 310 | _utils.logger.debug(f"Still processing: {still_processing}") 311 | 312 | return all_responses 313 | -------------------------------------------------------------------------------- /fastdeploy/_loop.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import importlib 4 | 5 | from . 
import _utils 6 | 7 | 8 | def load_predictor(predictor_name): 9 | predictor = importlib.import_module(os.path.splitext(predictor_name)[0]).predictor 10 | predictor_sequence = _utils.PREDICTOR_FILE_TO_SEQUENCE[predictor_name] 11 | _utils.logger.debug( 12 | f"{predictor_name}: predictor loaded with predictor_sequence {predictor_sequence}" 13 | ) 14 | return predictor, predictor_sequence 15 | 16 | 17 | def get_example(predictor_sequence): 18 | if predictor_sequence == 0: 19 | return _utils.example 20 | 21 | while True: 22 | _utils.logger.debug(f"Waiting for previous predictor to finish warmup") 23 | try: 24 | example = _utils.META_INDEX.get( 25 | f"{predictor_sequence - 1}", select_keys=["example_output"] 26 | )[f"{predictor_sequence - 1}"]["example_output"] 27 | if example is not None: 28 | return example 29 | except: 30 | time.sleep(1) 31 | 32 | 33 | def initialize_predictor( 34 | predictor, 35 | predictor_name, 36 | predictor_sequence, 37 | example, 38 | optimal_batch_size, 39 | ): 40 | example_output = _utils.warmup(predictor, example) 41 | _utils.logger.info(f"{predictor_name}: warmup done") 42 | 43 | optimal_batch_size, time_per_example = _utils.calculate_optimum_batch_sizes( 44 | predictor, predictor_sequence, example, optimal_batch_size 45 | ) 46 | 47 | return { 48 | "optimal_batch_size": optimal_batch_size, 49 | "time_per_example": time_per_example, 50 | "predictor_name": predictor_name, 51 | "predictor_sequence": predictor_sequence, 52 | "request_poll_time": 0.01, 53 | "example_output": example_output, 54 | "status": "running", 55 | } 56 | 57 | 58 | def process_batch(predictor, input_batch, optimal_batch_size): 59 | last_predictor_success = False 60 | received_at = time.time() 61 | try: 62 | results = predictor(input_batch, batch_size=optimal_batch_size) 63 | last_predictor_success = True 64 | except Exception as ex: 65 | _utils.logger.exception(ex, exc_info=True) 66 | results = [None] * len(input_batch) 67 | 68 | predicted_at = time.time() 69 | 70 | if len(results) != len(input_batch): 71 | raise Exception( 72 | f"Predictor returned {len(results)} results for {len(input_batch)} inputs" 73 | ) 74 | 75 | return results, last_predictor_success, received_at, predicted_at 76 | 77 | 78 | to_process = {} 79 | current_sum_of_to_process = 0 80 | 81 | 82 | def fetch_batch( 83 | main_index, 84 | predictor_sequence, 85 | optimal_batch_size, 86 | max_wait_time_for_batch_collection, 87 | ): 88 | global to_process 89 | global current_sum_of_to_process 90 | 91 | unique_id_wise_input_count = {} 92 | input_batch = [] 93 | current_batch_length = 0 94 | batch_collection_started_at = time.time() 95 | last_input_received_at = time.time() 96 | 97 | while current_batch_length < optimal_batch_size: 98 | if current_sum_of_to_process < optimal_batch_size: 99 | to_process.update( 100 | main_index.search( 101 | query={ 102 | "-1.predicted_at": 0, # prediction not yet done 103 | "last_predictor_success": True, # last predictor success 104 | "last_predictor_sequence": predictor_sequence 105 | - 1, # last predictor sequence 106 | "timedout_in_queue": {"$ne": True}, # not timedout in queue 107 | }, 108 | n=optimal_batch_size, 109 | select_keys=[f"{predictor_sequence - 1}.outputs"], 110 | update={ 111 | "last_predictor_sequence": predictor_sequence, # set last predictor sequence to current predictor sequence 112 | "last_predictor_success": None, # reset last predictor success 113 | f"{predictor_sequence}.received_at": time.time(), # set received at to current time 114 | }, 115 | ) 116 | ) 117 | 118 | for unique_id, 
data in to_process.items(): 119 | if current_batch_length > optimal_batch_size * 0.8: 120 | break 121 | outputs = data[f"{predictor_sequence - 1}.outputs"] 122 | input_count = len(outputs) 123 | unique_id_wise_input_count[unique_id] = input_count 124 | input_batch.extend(outputs) 125 | current_batch_length += input_count 126 | last_input_received_at = time.time() 127 | 128 | for unique_id in unique_id_wise_input_count.keys(): 129 | try: 130 | del to_process[unique_id] 131 | except: 132 | pass 133 | 134 | current_sum_of_to_process = sum( 135 | len(v[f"{predictor_sequence - 1}.outputs"]) for v in to_process.values() 136 | ) 137 | 138 | if current_batch_length == 0: 139 | if time.time() - last_input_received_at > 5: 140 | time.sleep(0.05) 141 | else: 142 | time.sleep(max_wait_time_for_batch_collection / 2) 143 | continue 144 | 145 | elif ( 146 | time.time() - batch_collection_started_at 147 | < max_wait_time_for_batch_collection 148 | and current_batch_length / optimal_batch_size < 0.9 149 | ): 150 | time.sleep(max_wait_time_for_batch_collection / 2) 151 | continue 152 | 153 | else: 154 | # finished collecting batch 155 | break 156 | 157 | _utils.logger.info( 158 | f"Fetched batch {unique_id_wise_input_count} with {current_sum_of_to_process} remaining in memory, to_process: {len(to_process)}" 159 | ) 160 | 161 | return unique_id_wise_input_count, input_batch 162 | 163 | 164 | def prepare_results( 165 | unique_id_wise_input_count, 166 | results, 167 | predictor_sequence, 168 | last_predictor_success, 169 | received_at, 170 | predicted_at, 171 | current_batch_length, 172 | ): 173 | """Prepare results for updating the main index.""" 174 | unique_id_wise_results = {} 175 | total_input_count_till_now = 0 176 | 177 | for unique_id, input_count in unique_id_wise_input_count.items(): 178 | unique_id_wise_results[unique_id] = { 179 | f"{predictor_sequence}.outputs": results[ 180 | total_input_count_till_now : total_input_count_till_now + input_count 181 | ], 182 | f"{predictor_sequence}.predicted_at": predicted_at, 183 | "last_predictor_success": last_predictor_success, 184 | f"{predictor_sequence}.received_at": received_at, 185 | f"{predictor_sequence}.predicted_in_batch_of": current_batch_length, 186 | } 187 | total_input_count_till_now += input_count 188 | 189 | return unique_id_wise_results 190 | 191 | 192 | def start_loop( 193 | predictor_name=os.getenv("PREDICTOR_NAME"), 194 | optimal_batch_size=int(os.getenv("OPTIMAL_BATCH_SIZE")), 195 | ): 196 | """Main loop for processing predictions.""" 197 | timeout_time = float(os.getenv("TIMEOUT", 0)) 198 | predictor, predictor_sequence = load_predictor(predictor_name) 199 | example = get_example(predictor_sequence) 200 | predictor_info = initialize_predictor( 201 | predictor, predictor_name, predictor_sequence, example, optimal_batch_size 202 | ) 203 | _utils.META_INDEX.update({f"{predictor_sequence}": predictor_info}) 204 | 205 | optimal_batch_size = predictor_info["optimal_batch_size"] 206 | time_per_example = predictor_info["time_per_example"] 207 | max_wait_time_for_batch_collection = max(0.003, time_per_example * 0.51) 208 | 209 | _utils.logger.info( 210 | f"""{predictor_name} 211 | optimal_batch_size: {optimal_batch_size} 212 | time_per_example: {time_per_example} 213 | predictor_sequence: {predictor_sequence} 214 | max_wait_time_for_batch_collection: {max_wait_time_for_batch_collection} 215 | """ 216 | ) 217 | 218 | prediction_loop_started_at = time.time() 219 | 220 | while True: 221 | """ 222 | Set timedout_in_queue to True for all the 
predictions that have been in the queue for more than timeout_time seconds 223 | and delete older than 30 seconds predictions that have finished prediction 224 | """ 225 | 226 | timedout_in_queue_unique_ids = _utils.MAIN_INDEX.search( 227 | query={ 228 | "-1.predicted_at": 0, 229 | "-1.received_at": {"$lt": time.time() - timeout_time}, 230 | "timedout_in_queue": {"$ne": True}, 231 | "last_predictor_sequence": {"$ne": _utils.LAST_PREDICTOR_SEQUENCE}, 232 | }, 233 | update={"timedout_in_queue": True}, 234 | select_keys=[], 235 | ) 236 | 237 | if timedout_in_queue_unique_ids: 238 | _utils.logger.warning( 239 | f"{_utils.MAIN_INDEX.count()} in queue, set timedout_in_queue to True for {list(timedout_in_queue_unique_ids)} unique_ids" 240 | ) 241 | 242 | _utils.MAIN_INDEX.delete( 243 | query={ 244 | "$and": [ 245 | {"-1.predicted_at": {"$gt": 0}}, 246 | {"-1.predicted_at": {"$lt": time.time() - 40}}, 247 | ] 248 | }, 249 | ) 250 | 251 | unique_id_wise_input_count, input_batch = fetch_batch( 252 | _utils.MAIN_INDEX, 253 | predictor_sequence, 254 | optimal_batch_size, 255 | max_wait_time_for_batch_collection, 256 | ) 257 | 258 | _utils.logger.debug(f"Processing batch {unique_id_wise_input_count}") 259 | 260 | process_batch_started_at = time.time() 261 | results, last_predictor_success, received_at, predicted_at = process_batch( 262 | predictor, input_batch, optimal_batch_size 263 | ) 264 | process_batch_ended_at = time.time() 265 | 266 | unique_id_wise_results = prepare_results( 267 | unique_id_wise_input_count, 268 | results, 269 | predictor_sequence, 270 | last_predictor_success, 271 | received_at, 272 | predicted_at, 273 | len(input_batch), 274 | ) 275 | _utils.MAIN_INDEX.update(unique_id_wise_results) 276 | 277 | _utils.logger.debug( 278 | f"Updated results predictor {predictor_sequence}: {list(unique_id_wise_results)}" 279 | ) 280 | 281 | _utils.GLOBAL_METRICS_INDEX.math( 282 | "total_predictor_run_for_hours", 283 | (process_batch_ended_at - process_batch_started_at) / 3600, 284 | "+=", 285 | ) 286 | 287 | _utils.GLOBAL_METRICS_INDEX["total_predictor_up_for_hours"] = ( 288 | time.time() - prediction_loop_started_at 289 | ) / 3600 290 | 291 | 292 | if __name__ == "__main__": 293 | import sys 294 | 295 | start_loop(sys.argv[1]) 296 | -------------------------------------------------------------------------------- /fastdeploy/_rest.py: -------------------------------------------------------------------------------- 1 | from gevent import monkey 2 | 3 | monkey.patch_all() 4 | 5 | import os 6 | import json 7 | import time 8 | import uuid 9 | import pickle 10 | import falcon 11 | import gevent 12 | import threading 13 | import importlib 14 | 15 | from . import _utils 16 | from . 
import _infer 17 | 18 | try: 19 | get_prometheus_metrics = importlib.import_module( 20 | "extra_prometheus_metrics" 21 | ).get_prometheus_metrics 22 | except ImportError: 23 | get_prometheus_metrics = None 24 | 25 | 26 | class AsyncResponseHandler: 27 | def __init__(self, check_interval=0.003): 28 | self.pending_requests = {} 29 | self.check_interval = check_interval 30 | self.lock = threading.Lock() 31 | self.infer = _infer.Infer() 32 | 33 | gevent.spawn(self._response_checker) 34 | 35 | def register_request_and_wait_for_response( 36 | self, unique_id, is_compressed, input_type, timeout 37 | ): 38 | event = gevent.event.Event() 39 | 40 | with self.lock: 41 | self.pending_requests[unique_id] = { 42 | "event": event, 43 | "is_compressed": is_compressed, 44 | "input_type": input_type, 45 | "timestamp": time.time(), 46 | } 47 | 48 | try: 49 | if event.wait(timeout=timeout): 50 | with self.lock: 51 | response = self.pending_requests[unique_id].get("response") 52 | return response 53 | else: 54 | return self.infer.get_timeout_response( 55 | unique_id, is_compressed, input_type, is_client_timeout=True 56 | ) 57 | except Exception as e: 58 | _utils.logger.exception(e, exc_info=True) 59 | _utils.logger.error(f"Error registering request and waiting for response: {e}") 60 | return self.infer.get_timeout_response( 61 | unique_id, is_compressed, input_type, is_client_timeout=True 62 | ) 63 | finally: 64 | with self.lock: 65 | self.pending_requests.pop(unique_id, None) 66 | 67 | def deregister_request(self, unique_id): 68 | with self.lock: 69 | self.pending_requests.pop(unique_id, None) 70 | 71 | def _response_checker(self): 72 | last_input_received_at = time.time() 73 | while True: 74 | try: 75 | unique_ids = [] 76 | is_compresseds = [] 77 | input_types = [] 78 | with self.lock: 79 | for uid, data in self.pending_requests.items(): 80 | unique_ids.append(uid) 81 | is_compresseds.append(data["is_compressed"]) 82 | input_types.append(data["input_type"]) 83 | last_input_received_at = data["timestamp"] 84 | 85 | if not unique_ids and (time.time() - last_input_received_at) > 5: 86 | time.sleep(0.05) 87 | continue 88 | 89 | if unique_ids: 90 | _utils.logger.debug( 91 | f"Checking responses for unique_ids: {unique_ids}" 92 | ) 93 | try: 94 | responses = self.infer.get_responses_for_unique_ids( 95 | unique_ids=unique_ids, 96 | is_compresseds=is_compresseds, 97 | input_types=input_types, 98 | ) 99 | 100 | for uid, response in responses.items(): 101 | if response is not None: 102 | with self.lock: 103 | if uid in self.pending_requests: 104 | request_data = self.pending_requests[uid] 105 | request_data["response"] = response 106 | request_data["event"].set() 107 | 108 | except Exception as e: 109 | _utils.logger.exception(e, exc_info=True) 110 | _utils.logger.error(f"Error checking responses: {e}") 111 | 112 | except Exception as e: 113 | _utils.logger.error(f"Error in response checker loop: {e}") 114 | 115 | finally: 116 | gevent.sleep(self.check_interval) 117 | 118 | 119 | class Infer(object): 120 | def __init__(self): 121 | self._infer = _infer.Infer() 122 | self._response_handler = AsyncResponseHandler() 123 | 124 | def on_post(self, req, resp): 125 | request_received_at = time.time() 126 | 127 | unique_id = str(req.params.get("unique_id", uuid.uuid4())) 128 | client_timeout = float(req.params.get("timeout", os.getenv("TIMEOUT", 480))) 129 | 130 | is_compressed = req.params.get("compressed", "f")[0].lower() == "t" 131 | input_type = req.params.get("input_type", "json") 132 | 133 | success, failure_response = 
self._infer.add_to_infer_queue( 134 | inputs=req.stream.read(), 135 | unique_id=unique_id, 136 | input_type=input_type, 137 | is_compressed=is_compressed, 138 | ) 139 | 140 | if is_compressed: 141 | resp.content_type = "application/octet-stream" 142 | elif input_type == "json": 143 | resp.content_type = "application/json" 144 | elif input_type == "pickle": 145 | resp.content_type = "application/pickle" 146 | elif input_type == "msgpack": 147 | resp.content_type = "application/msgpack" 148 | 149 | if success is not True: 150 | resp.status = falcon.HTTP_400 151 | if input_type == "json": 152 | resp.media = failure_response 153 | else: 154 | resp.data = failure_response 155 | 156 | else: 157 | ( 158 | success, 159 | response, 160 | ) = self._response_handler.register_request_and_wait_for_response( 161 | unique_id, is_compressed, input_type, client_timeout 162 | ) 163 | if success: 164 | resp.status = falcon.HTTP_200 165 | else: 166 | resp.status = falcon.HTTP_500 167 | 168 | if input_type == "json": 169 | resp.media = response 170 | else: 171 | resp.data = response 172 | 173 | 174 | class PrometheusMetrics(object): 175 | def on_get(self, req, resp): 176 | _LAST_X_SECONDS = int( 177 | req.params.get("last_x_seconds", int(os.getenv("LAST_X_SECONDS", 30))) 178 | ) 179 | CURRENT_TIME = time.time() 180 | LAST_X_SECONDS = time.time() - _LAST_X_SECONDS 181 | 182 | number_of_requests_timedout_in_last_x_seconds = _utils.MAIN_INDEX.count( 183 | query={ 184 | "-1.predicted_at": {"$gt": LAST_X_SECONDS, "$lt": CURRENT_TIME}, 185 | "timedout_in_queue": True, 186 | } 187 | ) 188 | 189 | requests_received_in_last_x_seconds = _utils.MAIN_INDEX.count( 190 | query={"-1.received_at": {"$gt": LAST_X_SECONDS, "$lt": CURRENT_TIME}} 191 | ) 192 | 193 | requests_processed_in_last_x_seconds = _utils.MAIN_INDEX.count( 194 | query={"-1.predicted_at": {"$gt": LAST_X_SECONDS, "$lt": CURRENT_TIME}} 195 | ) 196 | 197 | requests_received_in_last_x_seconds_that_failed = _utils.MAIN_INDEX.count( 198 | query={ 199 | "-1.received_at": {"$gt": LAST_X_SECONDS, "$lt": CURRENT_TIME}, 200 | "last_predictor_success": False, 201 | } 202 | ) 203 | 204 | requests_processed_in_last_x_seconds_that_failed = _utils.MAIN_INDEX.count( 205 | query={ 206 | "-1.predicted_at": {"$gt": LAST_X_SECONDS, "$lt": CURRENT_TIME}, 207 | "last_predictor_success": False, 208 | } 209 | ) 210 | 211 | requests_received_in_last_x_seconds_that_are_pending = _utils.MAIN_INDEX.count( 212 | query={ 213 | "-1.predicted_at": 0, 214 | "last_predictor_success": {"$ne": False}, 215 | "-1.received_at": {"$gt": LAST_X_SECONDS, "$lt": CURRENT_TIME}, 216 | } 217 | ) 218 | 219 | requests_received_in_last_x_seconds_that_are_successful = ( 220 | _utils.MAIN_INDEX.count( 221 | query={ 222 | "-1.predicted_at": {"$ne": 0}, 223 | "last_predictor_success": True, 224 | "-1.received_at": {"$gt": LAST_X_SECONDS, "$lt": CURRENT_TIME}, 225 | "timedout_in_queue": {"$ne": True}, 226 | } 227 | ) 228 | ) 229 | 230 | requests_processed_in_last_x_seconds_that_are_successful = ( 231 | _utils.MAIN_INDEX.count( 232 | query={ 233 | "-1.predicted_at": {"$gt": LAST_X_SECONDS, "$lt": CURRENT_TIME}, 234 | "last_predictor_success": True, 235 | "timedout_in_queue": {"$ne": True}, 236 | } 237 | ) 238 | ) 239 | 240 | avg_total_time_per_req_for_reqs_in_last_x_seconds = 0 241 | 242 | __sum_of_received_at = _utils.MAIN_INDEX.math( 243 | "-1.received_at", 244 | "sum", 245 | query={ 246 | "-1.received_at": {"$gt": LAST_X_SECONDS, "$lt": CURRENT_TIME}, 247 | "-1.predicted_at": {"$ne": 0}, 248 | 
"timedout_in_queue": {"$ne": True}, 249 | }, 250 | ) 251 | 252 | __sum_of_predicted_at = _utils.MAIN_INDEX.math( 253 | "-1.predicted_at", 254 | "sum", 255 | query={ 256 | "-1.received_at": {"$gt": LAST_X_SECONDS, "$lt": CURRENT_TIME}, 257 | "-1.predicted_at": {"$ne": 0}, 258 | "timedout_in_queue": {"$ne": True}, 259 | }, 260 | ) 261 | 262 | if __sum_of_received_at and __sum_of_predicted_at: 263 | avg_total_time_per_req_for_reqs_in_last_x_seconds = ( 264 | __sum_of_predicted_at - __sum_of_received_at 265 | ) / requests_received_in_last_x_seconds_that_are_successful 266 | 267 | avg_actual_total_time_per_req_for_reqs_in_last_x_seconds = 0 268 | 269 | for executor_n in [0]: 270 | _temp_sum_of_received_at = _utils.MAIN_INDEX.math( 271 | f"{executor_n}.received_at", 272 | "sum", 273 | query={ 274 | "-1.received_at": {"$gt": LAST_X_SECONDS, "$lt": CURRENT_TIME}, 275 | "-1.predicted_at": {"$ne": 0}, 276 | "timedout_in_queue": {"$ne": True}, 277 | }, 278 | ) 279 | 280 | _temp_sum_of_predicted_at = _utils.MAIN_INDEX.math( 281 | f"{executor_n}.predicted_at", 282 | "sum", 283 | query={ 284 | "-1.received_at": {"$gt": LAST_X_SECONDS, "$lt": CURRENT_TIME}, 285 | "-1.predicted_at": {"$ne": 0}, 286 | "timedout_in_queue": {"$ne": True}, 287 | }, 288 | ) 289 | 290 | if _temp_sum_of_received_at and _temp_sum_of_predicted_at: 291 | avg_actual_total_time_per_req_for_reqs_in_last_x_seconds = ( 292 | _temp_sum_of_predicted_at - _temp_sum_of_received_at 293 | ) / requests_received_in_last_x_seconds_that_are_successful 294 | 295 | prometheus_text = f""" 296 | # HELP requests_received_in_last_x_seconds The number of requests received in last {_LAST_X_SECONDS} seconds. 297 | # TYPE requests_received_in_last_x_seconds gauge 298 | requests_received_in_last_x_seconds {requests_received_in_last_x_seconds} 299 | 300 | # HELP requests_processed_in_last_x_seconds The number of requests processed in last {_LAST_X_SECONDS} seconds. 301 | # TYPE requests_processed_in_last_x_seconds gauge 302 | requests_processed_in_last_x_seconds {requests_processed_in_last_x_seconds} 303 | 304 | # HELP number_of_requests_timedout_in_last_x_seconds The number of requests timedout at predictor(s) in last {_LAST_X_SECONDS} seconds. 305 | # TYPE number_of_requests_timedout_in_last_x_seconds gauge 306 | number_of_requests_timedout_in_last_x_seconds {number_of_requests_timedout_in_last_x_seconds} 307 | 308 | # HELP requests_received_in_last_x_seconds_that_failed The number of requests received in last {_LAST_X_SECONDS} seconds that failed. 309 | # TYPE requests_received_in_last_x_seconds_that_failed gauge 310 | requests_received_in_last_x_seconds_that_failed {requests_received_in_last_x_seconds_that_failed} 311 | 312 | # HELP requests_processed_in_last_x_seconds_that_failed The number of requests processed in last {_LAST_X_SECONDS} seconds that failed. 313 | # TYPE requests_processed_in_last_x_seconds_that_failed gauge 314 | requests_processed_in_last_x_seconds_that_failed {requests_processed_in_last_x_seconds_that_failed} 315 | 316 | # HELP requests_received_in_last_x_seconds_that_are_pending The number of requests received in last {_LAST_X_SECONDS} seconds that are pending. 317 | # TYPE requests_received_in_last_x_seconds_that_are_pending gauge 318 | requests_received_in_last_x_seconds_that_are_pending {requests_received_in_last_x_seconds_that_are_pending} 319 | 320 | # HELP requests_received_in_last_x_seconds_that_are_successful The number of requests received in last {_LAST_X_SECONDS} seconds that are successful. 
321 | # TYPE requests_received_in_last_x_seconds_that_are_successful gauge 322 | requests_received_in_last_x_seconds_that_are_successful {requests_received_in_last_x_seconds_that_are_successful} 323 | 324 | # HELP requests_processed_in_last_x_seconds_that_are_successful The number of requests processed in last {_LAST_X_SECONDS} seconds that are successful. 325 | # TYPE requests_processed_in_last_x_seconds_that_are_successful gauge 326 | requests_processed_in_last_x_seconds_that_are_successful {requests_processed_in_last_x_seconds_that_are_successful} 327 | 328 | # HELP avg_total_time_per_req_for_reqs_in_last_x_seconds The average total time per request for requests in last {_LAST_X_SECONDS} seconds. 329 | # TYPE avg_total_time_per_req_for_reqs_in_last_x_seconds gauge 330 | avg_total_time_per_req_for_reqs_in_last_x_seconds {avg_total_time_per_req_for_reqs_in_last_x_seconds} 331 | 332 | # HELP avg_actual_total_time_per_req_for_reqs_in_last_x_seconds The average actual total time per request for requests in last {_LAST_X_SECONDS} seconds. 333 | # TYPE avg_actual_total_time_per_req_for_reqs_in_last_x_seconds gauge 334 | avg_actual_total_time_per_req_for_reqs_in_last_x_seconds {avg_actual_total_time_per_req_for_reqs_in_last_x_seconds} 335 | """.strip() 336 | 337 | if get_prometheus_metrics is not None: 338 | extra_prometheus_metrics_data = get_prometheus_metrics() 339 | 340 | if extra_prometheus_metrics_data: 341 | extra_prometheus_texts = [] 342 | for metric_name, metric_data in extra_prometheus_metrics_data.items(): 343 | extra_prometheus_texts.append( 344 | f""" 345 | # HELP {metric_name} {metric_data['help']} 346 | # TYPE {metric_name} {metric_data['type']} 347 | {metric_name} {metric_data['value']} 348 | """.strip() 349 | ) 350 | prometheus_text += "\n\n" + "\n\n".join(extra_prometheus_texts) 351 | 352 | resp.status = falcon.HTTP_200 353 | resp.content_type = "text/plain; version=0.0.4" 354 | resp.text = prometheus_text 355 | 356 | 357 | class Health(object): 358 | def on_get(self, req, resp): 359 | fail_if_percentage_of_requests_failed_in_last_x_seconds_is_more_than_y_param = req.params.get( 360 | "fail_if_percentage_of_requests_failed_in_last_x_seconds_is_more_than_y", 361 | None, 362 | ) 363 | 364 | fail_if_requests_older_than_x_seconds_pending_param = req.params.get( 365 | "fail_if_requests_older_than_x_seconds_pending", None 366 | ) 367 | 368 | fail_if_up_time_more_than_x_seconds_param = req.params.get( 369 | "fail_if_up_time_more_than_x_seconds", None 370 | ) 371 | 372 | fail_if_requests_timedout_in_last_x_seconds_is_more_than_y_param = ( 373 | req.params.get( 374 | "fail_if_requests_timedout_in_last_x_seconds_is_more_than_y", None 375 | ) 376 | ) 377 | 378 | is_predictor_is_up_param = req.params.get("is_predictor_is_up", None) 379 | 380 | if fail_if_percentage_of_requests_failed_in_last_x_seconds_is_more_than_y_param: 381 | ( 382 | x, 383 | y, 384 | ) = fail_if_percentage_of_requests_failed_in_last_x_seconds_is_more_than_y_param.split( 385 | "," 386 | ) 387 | x, y = int(x), int(y) 388 | if _utils.check_if_percentage_of_requests_failed_in_last_x_seconds_is_more_than_y( 389 | x, y 390 | ): 391 | resp.status = falcon.HTTP_503 392 | resp.media = { 393 | "reason": f"More than {y}% requests failed in last {x} seconds" 394 | } 395 | return 396 | 397 | if fail_if_requests_older_than_x_seconds_pending_param: 398 | if _utils.check_if_requests_older_than_x_seconds_pending( 399 | int(fail_if_requests_older_than_x_seconds_pending_param) 400 | ): 401 | resp.status = falcon.HTTP_503 402 | 
resp.media = { 403 | "reason": f"Requests older than {fail_if_requests_older_than_x_seconds_pending_param} seconds are pending" 404 | } 405 | return 406 | 407 | if fail_if_up_time_more_than_x_seconds_param: 408 | if time.time() - Infer.started_at_time > int( 409 | fail_if_up_time_more_than_x_seconds_param 410 | ): 411 | resp.status = falcon.HTTP_503 412 | resp.media = { 413 | "reason": f"Up time more than {fail_if_up_time_more_than_x_seconds_param} seconds" 414 | } 415 | return 416 | 417 | if fail_if_requests_timedout_in_last_x_seconds_is_more_than_y_param: 418 | ( 419 | x, 420 | y, 421 | ) = fail_if_requests_timedout_in_last_x_seconds_is_more_than_y_param.split( 422 | "," 423 | ) 424 | x, y = int(x), int(y) 425 | if _utils.check_if_requests_timedout_in_last_x_seconds_is_more_than_y(x, y): 426 | resp.status = falcon.HTTP_503 427 | return 428 | 429 | resp.status = falcon.HTTP_200 430 | resp.media = {"status": "ok"} 431 | 432 | 433 | class Meta(object): 434 | def on_get(self, req, resp): 435 | resp.status = falcon.HTTP_200 436 | 437 | if "is_pickle_allowed" in req.params: 438 | resp.media = { 439 | "is_pickle_allowed": os.getenv("ALLOW_PICKLE", "true").lower() == "true" 440 | } 441 | 442 | else: 443 | try: 444 | json.dumps(_utils.example) 445 | __example = _utils.example 446 | except: 447 | __example = None 448 | 449 | resp.media = { 450 | "name": _utils.recipe_name, 451 | "example": __example, 452 | "is_pickle_allowed": os.getenv("ALLOW_PICKLE", "true").lower() 453 | == "true", 454 | "timeout": int(os.getenv("TIMEOUT")), 455 | } 456 | 457 | 458 | class Die(object): 459 | def on_get(self, req, resp): 460 | if req.params.get("die", "false").lower()[0] == "t": 461 | resp.status = falcon.HTTP_200 462 | resp.media = {"status": "killed"} 463 | _utils.kill_fd(loop=True, rest=True) 464 | 465 | 466 | app = falcon.App( 467 | middleware=falcon.CORSMiddleware(allow_origins="*", allow_credentials="*"), 468 | ) 469 | 470 | infer_api = Infer() 471 | prometheus_metrics = PrometheusMetrics() 472 | health_api = Health() 473 | die_api = Die() 474 | 475 | app.add_route("/infer", infer_api) 476 | app.add_route("/sync", infer_api) 477 | app.add_route("/prometheus_metrics", prometheus_metrics) 478 | app.add_route("/health", health_api) 479 | app.add_route("/meta", Meta()) 480 | app.add_route("/die", die_api) 481 | -------------------------------------------------------------------------------- /fastdeploy/_utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | logging.basicConfig( 4 | format="%(asctime)s,%(msecs)d %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s", 5 | datefmt="%Y-%m-%d:%H:%M:%S", 6 | level=logging.INFO, 7 | ) 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | import os 13 | import glob 14 | import json 15 | import time 16 | import psutil 17 | from datetime import datetime 18 | from liteindex import DefinedIndex, KVIndex 19 | 20 | try: 21 | from example import example 22 | except: 23 | raise Exception("example.py not found. 
Please follow the instructions in README.md") 24 | 25 | try: 26 | from example import name as recipe_name 27 | except: 28 | recipe_name = os.path.basename(os.getcwd()).strip("/") 29 | 30 | 31 | PREDICTOR_SEQUENCE_TO_FILES = {} 32 | 33 | predictor_files = [ 34 | _ 35 | for _ in glob.glob("predictor*.py") 36 | if _ == "predictor.py" or _.split("predictor_")[1].split(".")[0].isdigit() 37 | ] 38 | 39 | for f in sorted( 40 | predictor_files, 41 | key=lambda x: int( 42 | x.split("predictor_")[1].split(".")[0] if x != "predictor.py" else 0 43 | ), 44 | ): 45 | if f == "predictor.py": 46 | PREDICTOR_SEQUENCE_TO_FILES[0] = f 47 | break 48 | else: 49 | PREDICTOR_SEQUENCE_TO_FILES[len(PREDICTOR_SEQUENCE_TO_FILES)] = f 50 | 51 | PREDICTOR_FILE_TO_SEQUENCE = {v: k for k, v in PREDICTOR_SEQUENCE_TO_FILES.items()} 52 | 53 | LAST_PREDICTOR_SEQUENCE = max(PREDICTOR_SEQUENCE_TO_FILES.keys()) 54 | FIRST_PREDICTOR_SEQUENCE = min(PREDICTOR_SEQUENCE_TO_FILES.keys()) 55 | 56 | META_INDEX = DefinedIndex( 57 | "meta_index", 58 | schema={ 59 | "optimal_batch_size": DefinedIndex.Type.number, 60 | "time_per_example": DefinedIndex.Type.number, 61 | "predictor_name": DefinedIndex.Type.string, 62 | "predictor_sequence": DefinedIndex.Type.number, 63 | "request_poll_time": DefinedIndex.Type.number, 64 | "example_output": DefinedIndex.Type.other, 65 | "status": DefinedIndex.Type.string, 66 | }, 67 | db_path=os.path.join("fastdeploy_dbs", f"main_index.db"), 68 | ) 69 | 70 | KV_STORE = KVIndex(os.path.join("fastdeploy_dbs", f"kv_store.db")) 71 | KV_STORE.clear() 72 | 73 | 74 | MAIN_INDEX = DefinedIndex( 75 | "main_index", 76 | schema={ 77 | **{ 78 | "last_predictor_sequence": DefinedIndex.Type.number, 79 | "last_predictor_success": DefinedIndex.Type.boolean, 80 | "-1.outputs": DefinedIndex.Type.other, 81 | "-1.predicted_at": DefinedIndex.Type.number, 82 | "-1.received_at": DefinedIndex.Type.number, 83 | "-1.predicted_in_batch_of": DefinedIndex.Type.number, 84 | "timedout_in_queue": DefinedIndex.Type.boolean, 85 | }, 86 | **{f"{_}.outputs": "other" for _ in PREDICTOR_SEQUENCE_TO_FILES}, 87 | **{f"{_}.predicted_at": "number" for _ in PREDICTOR_SEQUENCE_TO_FILES}, 88 | **{f"{_}.received_at": "number" for _ in PREDICTOR_SEQUENCE_TO_FILES}, 89 | **{f"{_}.predicted_in_batch_of": "number" for _ in PREDICTOR_SEQUENCE_TO_FILES}, 90 | }, 91 | db_path=os.path.join("fastdeploy_dbs", f"main_index.db"), 92 | auto_vacuum=False, 93 | ) 94 | 95 | # for setting timedout_in_queue 96 | # used in _loop.py start_loop to set timedout_in_queue to True for all the predictions that have been in the queue for more than timeout_time seconds 97 | MAIN_INDEX.optimize_for_query( 98 | ["-1.predicted_at", "-1.received_at", "timedout_in_queue"] 99 | ) 100 | 101 | # for getting next batch to process 102 | # used in _loop.py fetch_batch function 103 | MAIN_INDEX.optimize_for_query( 104 | [ 105 | "-1.predicted_at", 106 | "last_predictor_success", 107 | "last_predictor_sequence", 108 | "timedout_in_queue", 109 | ] 110 | ) 111 | 112 | # in general queries 113 | MAIN_INDEX.optimize_for_query(["-1.received_at"]) 114 | MAIN_INDEX.optimize_for_query(["last_predictor_success"]) 115 | MAIN_INDEX.optimize_for_query(["last_predictor_sequence"]) 116 | MAIN_INDEX.optimize_for_query(["timedout_in_queue"]) 117 | 118 | 119 | GLOBAL_METRICS_INDEX = KVIndex( 120 | os.path.join("fastdeploy_dbs", f"global_metrics_index.db") 121 | ) 122 | GLOBAL_METRICS_INDEX["total_predictor_run_for_hours"] = 0 123 | GLOBAL_METRICS_INDEX["total_predictor_up_for_hours"] = 0 124 | 125 | 126 | def 
get_fd_pids(): 127 | # get pids of processes with fastdeploy and rest or loop in their full cmdline 128 | pids = { 129 | "rest": [], 130 | "loop": [] 131 | } 132 | 133 | for proc in psutil.process_iter(): 134 | try: 135 | full_cmdline = " ".join(proc.cmdline()) 136 | if "fastdeploy" in full_cmdline and "--rest" in full_cmdline: 137 | pids["rest"].append(proc.pid) 138 | elif "fastdeploy" in full_cmdline and "--loop" in full_cmdline: 139 | pids["loop"].append(proc.pid) 140 | except Exception as e: 141 | pass 142 | 143 | return pids 144 | 145 | 146 | def kill_fd(loop=True, rest=True): 147 | pids = get_fd_pids() 148 | if loop and pids["loop"]: 149 | os.system(f"kill -9 {' '.join([str(pid) for pid in pids['loop']])}") 150 | if rest and pids["rest"]: 151 | os.system(f"kill -9 {' '.join([str(pid) for pid in pids['rest']])}") 152 | 153 | 154 | def warmup(predictor, example_input, n=3): 155 | """ 156 | Run warmup prediction on the model. 157 | 158 | :param n: number of warmup predictions to be run. defaults to 3 159 | """ 160 | logger.info("Warming up .. ") 161 | for _ in range(n - 1): 162 | predictor(example_input) 163 | 164 | return predictor(example_input) 165 | 166 | 167 | def calculate_optimum_batch_sizes( 168 | predictor, 169 | predictor_sequence, 170 | example_input, 171 | max_batch_size, 172 | max_batch_search_sec=10, 173 | ): 174 | search_over_batch_sizes = ( 175 | [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024] 176 | if max_batch_size == 0 177 | else [max_batch_size] 178 | ) 179 | 180 | time_per_example = 0 181 | max_batch_size = 0 182 | 183 | for batch_size in search_over_batch_sizes: 184 | logger.info(f"Trying batch size: {batch_size}") 185 | start = time.time() 186 | predictor((example_input * batch_size)[:batch_size], batch_size=batch_size) 187 | end = time.time() 188 | 189 | _time_per_example = (end - start) / batch_size 190 | 191 | logger.info(f"batch_size: {batch_size}, time_per_example: {_time_per_example}") 192 | 193 | if time_per_example == 0: 194 | time_per_example = _time_per_example 195 | max_batch_size = batch_size 196 | elif _time_per_example < time_per_example: 197 | time_per_example = _time_per_example 198 | max_batch_size = batch_size 199 | else: 200 | break 201 | 202 | logger.info( 203 | f"{PREDICTOR_SEQUENCE_TO_FILES[predictor_sequence]}: Optimum batch size: {max_batch_size}, time_per_example: {time_per_example}" 204 | ) 205 | 206 | return max_batch_size, time_per_example 207 | 208 | 209 | def check_if_requests_timedout_in_last_x_seconds_is_more_than_y( 210 | last_x_seconds, max_percentage_of_timedout_requests 211 | ): 212 | time_before_x_seconds = time.time() - last_x_seconds 213 | requests_received_in_last_x_seconds = MAIN_INDEX.count( 214 | query={"-1.predicted_at": {"$gte": time_before_x_seconds}} 215 | ) 216 | 217 | requests_timedout_in_last_x_seconds = MAIN_INDEX.count( 218 | query={ 219 | "-1.predicted_at": {"$gte": time_before_x_seconds}, 220 | "timedout_in_queue": True, 221 | } 222 | ) 223 | 224 | if requests_received_in_last_x_seconds == 0: 225 | return False 226 | 227 | logger.warning( 228 | f"Requests timedout in last {last_x_seconds} seconds: {requests_timedout_in_last_x_seconds}/{requests_received_in_last_x_seconds}" 229 | ) 230 | 231 | if ( 232 | requests_timedout_in_last_x_seconds / requests_received_in_last_x_seconds 233 | ) * 100 >= max_percentage_of_timedout_requests: 234 | return True 235 | return False 236 | 237 | 238 | def check_if_percentage_of_requests_failed_in_last_x_seconds_is_more_than_y( 239 | last_x_seconds, max_percentage_of_failed_requests 
240 | ): 241 | time_before_x_seconds = time.time() - last_x_seconds 242 | requests_received_in_last_x_seconds = MAIN_INDEX.count( 243 | query={"-1.predicted_at": {"$gte": time_before_x_seconds}} 244 | ) 245 | 246 | if requests_received_in_last_x_seconds == 0: 247 | return False 248 | 249 | requests_received_in_last_x_seconds_that_failed = MAIN_INDEX.count( 250 | query={ 251 | "-1.predicted_at": {"$gte": time_before_x_seconds}, 252 | "last_predictor_success": False, 253 | } 254 | ) 255 | 256 | if ( 257 | requests_received_in_last_x_seconds_that_failed 258 | / requests_received_in_last_x_seconds 259 | ) * 100 >= max_percentage_of_failed_requests: 260 | return True 261 | 262 | return False 263 | 264 | 265 | def check_if_requests_older_than_x_seconds_pending(x): 266 | time_before_x_seconds = time.time() - x 267 | 268 | requests_older_than_x_seconds_pending = MAIN_INDEX.count( 269 | query={ 270 | "-1.received_at": {"$lte": time_before_x_seconds}, 271 | "-1.predicted_at": 0, 272 | "last_predictor_success": {"$ne": False}, 273 | } 274 | ) 275 | 276 | if requests_older_than_x_seconds_pending > 0: 277 | return True 278 | return False 279 | -------------------------------------------------------------------------------- /fastdeploy/monitor.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Function to check if nvidia-smi is available 4 | check_nvidia_smi() { 5 | command -v nvidia-smi >/dev/null 2>&1 6 | } 7 | 8 | # Function to get GPU usage for a PID 9 | get_gpu_usage() { 10 | pid=$1 11 | if check_nvidia_smi; then 12 | gpu_mem=$(nvidia-smi --query-compute-apps=pid,used_memory --format=csv,noheader,nounits | grep "^$pid," | cut -d',' -f2 | tr -d ' ') 13 | gpu_util=$(nvidia-smi --query-compute-apps=pid,gpu_util --format=csv,noheader,nounits | grep "^$pid," | cut -d',' -f2 | tr -d ' ') 14 | 15 | gpu_mem=${gpu_mem:-0} 16 | gpu_util=${gpu_util:-0} 17 | else 18 | gpu_mem=0 19 | gpu_util=0 20 | fi 21 | 22 | echo "$gpu_util $gpu_mem" 23 | } 24 | 25 | # Function to get CPU and memory usage for a single PID 26 | get_usage() { 27 | pid=$1 28 | cpu=$(ps -p $pid -o %cpu= | tr -d ' ') 29 | mem=$(ps -p $pid -o rss= | tr -d ' ') 30 | mem_mb=$(printf "%.2f" $(echo "$mem / 1024" | bc -l)) 31 | echo "$cpu $mem_mb" 32 | } 33 | 34 | # Function to sum CPU and memory usage for multiple PIDs 35 | sum_usage() { 36 | pids=$1 37 | cpu_sum=0 38 | mem_sum=0 39 | 40 | for pid in $pids; do 41 | read cpu mem <<< $(get_usage $pid) 42 | cpu_sum=$(echo "$cpu_sum + $cpu" | bc -l) 43 | mem_sum=$(echo "$mem_sum + $mem" | bc -l) 44 | done 45 | 46 | echo "$cpu_sum $mem_sum" 47 | } 48 | 49 | # Initialize arrays for storing historical data 50 | declare -a loop_cpu_history 51 | declare -a loop_ram_history 52 | declare -a loop_gpu_util_history 53 | declare -a loop_gpu_mem_history 54 | declare -a rest_cpu_history 55 | declare -a rest_ram_history 56 | 57 | # Function to calculate statistics 58 | calculate_stats() { 59 | local values=("$@") 60 | local count=${#values[@]} 61 | 62 | if [ $count -eq 0 ]; then 63 | echo '{"min": "N/A", "max": "N/A", "avg": "N/A"}' 64 | return 65 | fi 66 | 67 | local min=${values[0]} 68 | local max=${values[0]} 69 | local sum=0 70 | 71 | for value in "${values[@]}"; do 72 | sum=$(printf "%.2f" $(echo "$sum + $value" | bc -l)) 73 | 74 | if (( $(echo "$value < $min" | bc -l) )); then 75 | min=$value 76 | fi 77 | 78 | if (( $(echo "$value > $max" | bc -l) )); then 79 | max=$value 80 | fi 81 | done 82 | 83 | local avg=$(printf "%.2f" $(echo "$sum / $count" | bc 
-l)) 84 | 85 | echo "{\"min\": $min, \"max\": $max, \"avg\": $avg}" 86 | } 87 | 88 | # Function to add value to history array (maintaining last 5 values) 89 | add_to_history() { 90 | local array_name=$1 91 | local value=$2 92 | 93 | eval "$array_name[\${#$array_name[@]}]=$value" 94 | 95 | if [ $(eval "echo \${#$array_name[@]}") -gt 5 ]; then 96 | eval "$array_name=(\"\${$array_name[@]:1}\")" 97 | fi 98 | } 99 | 100 | # Function to create JSON output 101 | create_json() { 102 | local loop_pid=$1 103 | local rest_pids=$2 104 | local timestamp=$(date -u +"%Y-%m-%dT%H:%M:%SZ") 105 | local output="" 106 | 107 | output+="{\n" 108 | output+=" \"timestamp\": \"$timestamp\",\n" 109 | 110 | # Loop process data 111 | output+=" \"loop_process\": {\n" 112 | if [ ! -z "$loop_pid" ]; then 113 | read cpu mem <<< $(get_usage $loop_pid) 114 | read gpu_util gpu_mem <<< $(get_gpu_usage $loop_pid) 115 | 116 | add_to_history loop_cpu_history "$cpu" 117 | add_to_history loop_ram_history "$mem" 118 | add_to_history loop_gpu_util_history "$gpu_util" 119 | add_to_history loop_gpu_mem_history "$gpu_mem" 120 | 121 | output+=" \"pid\": $loop_pid,\n" 122 | output+=" \"status\": \"running\",\n" 123 | output+=" \"current\": {\n" 124 | output+=" \"cpu\": $cpu,\n" 125 | output+=" \"ram\": $mem,\n" 126 | output+=" \"gpu_util\": $gpu_util,\n" 127 | output+=" \"gpu_mem\": $gpu_mem\n" 128 | output+=" },\n" 129 | output+=" \"stats\": {\n" 130 | output+=" \"cpu\": $(calculate_stats "${loop_cpu_history[@]}"),\n" 131 | output+=" \"ram\": $(calculate_stats "${loop_ram_history[@]}"),\n" 132 | output+=" \"gpu_util\": $(calculate_stats "${loop_gpu_util_history[@]}"),\n" 133 | output+=" \"gpu_mem\": $(calculate_stats "${loop_gpu_mem_history[@]}")\n" 134 | output+=" }\n" 135 | else 136 | output+=" \"status\": \"not_running\"\n" 137 | fi 138 | output+=" },\n" 139 | 140 | # REST processes data 141 | output+=" \"rest_processes\": {\n" 142 | if [ ! 
-z "$rest_pids" ]; then 143 | read cpu mem <<< $(sum_usage "$rest_pids") 144 | 145 | add_to_history rest_cpu_history "$cpu" 146 | add_to_history rest_ram_history "$mem" 147 | 148 | output+=" \"pids\": [$(echo $rest_pids | sed 's/ /, /g')],\n" 149 | output+=" \"status\": \"running\",\n" 150 | output+=" \"current\": {\n" 151 | output+=" \"cpu\": $cpu,\n" 152 | output+=" \"ram\": $mem\n" 153 | output+=" },\n" 154 | output+=" \"stats\": {\n" 155 | output+=" \"cpu\": $(calculate_stats "${rest_cpu_history[@]}"),\n" 156 | output+=" \"ram\": $(calculate_stats "${rest_ram_history[@]}")\n" 157 | output+=" }\n" 158 | else 159 | output+=" \"status\": \"not_running\"\n" 160 | fi 161 | output+=" }\n" 162 | output+="}" 163 | 164 | echo -e "$output" 165 | } 166 | 167 | # Main monitoring function 168 | monitor() { 169 | # Get PIDs 170 | loop_pid=$(pgrep -f "fastdeploy.*loop") 171 | rest_pids=$(pgrep -f "fastdeploy.*rest") 172 | 173 | # Create JSON and write to file 174 | create_json "$loop_pid" "$rest_pids" > monitoring_results.json 175 | } 176 | 177 | # Run the monitor function every 2 seconds 178 | while true; do 179 | monitor 180 | sleep 1 181 | done 182 | -------------------------------------------------------------------------------- /recipe.md: -------------------------------------------------------------------------------- 1 | ### Serving your pipeline with fastdeploy [example](https://github.com/notAI-tech/fastDeploy/tree/master/recipes/echo) 2 | 3 | - Create a recipe folder with the following structure: 4 | ``` 5 | recipe_folder/ 6 | ├── example.py 7 | ├── predictor.py 8 | ├── requirements.txt (optional) 9 | └── extras.sh (optional) 10 | ``` 11 | 12 | - `example.py` 13 | 14 | ```python 15 | name = "your_app_or_model_name" 16 | 17 | example = [ 18 | example_object_1, 19 | example_object_2, 20 | ] 21 | ``` 22 | 23 | - `predictor.py` 24 | 25 | ```python 26 | # Whatever code and imports you need to load your model and make predictions 27 | 28 | # predictor function must be defined exactly as below 29 | # batch_size is the optimal batch size for your model 30 | # inputs length may or may not be equal to batch_size 31 | # len(outputs) == len(inputs) 32 | def predictor(inputs, batch_size=1): 33 | return outputs 34 | ``` 35 | 36 | - `requirements.txt` (optional): all python dependencies for your pipeline 37 | 38 | - `extras.sh` (optional): any bash commands to run before installing requirements.txt 39 | 40 | - #### start the loop 41 | 42 | ```bash 43 | fastdeploy --loop --recipe recipes/echo_chained 44 | ``` 45 | 46 | - #### start the server 47 | 48 | ```bash 49 | fastdeploy --rest --recipe recipes/echo_chained 50 | ``` 51 | 52 | 53 | ### Chained recipe [example](https://github.com/notAI-tech/fastDeploy/tree/master/recipes/echo_chained) 54 | - Chained recipe means you have multiple predictor_X.py which are chained sequentially 55 | - `predictor_1.py` will be called first, then `predictor_2.py` and so on 56 | - Each predictor_X.py must have a predictor function defined as above 57 | - Each predictor_X.py is run separately i.e: can be in different virtualenvs 58 | 59 | - #### start all the loops 60 | 61 | ```bash 62 | fastdeploy --loop --recipe recipes/echo_chained --config "predictor_name:predictor_1.py" 63 | 64 | fastdeploy --loop --recipe recipes/echo_chained --config "predictor_name:predictor_2.py" 65 | ``` 66 | 67 | - #### start the server 68 | 69 | ```bash 70 | fastdeploy --rest --recipe recipes/echo_chained 71 | ``` 72 | -------------------------------------------------------------------------------- 
/recipes/.gitignore: -------------------------------------------------------------------------------- 1 | */*default* 2 | */fastdeploy_dbs 3 | -------------------------------------------------------------------------------- /recipes/echo/.dockerignore: -------------------------------------------------------------------------------- 1 | *.request_index 2 | *.results_index 3 | *.log_index -------------------------------------------------------------------------------- /recipes/echo/.gitignore: -------------------------------------------------------------------------------- 1 | *.request_index 2 | *.results_index 3 | *.log_index -------------------------------------------------------------------------------- /recipes/echo/example.py: -------------------------------------------------------------------------------- 1 | name = "echo" 2 | 3 | example = ["Any JSON serialiazable Python object can be input"] 4 | -------------------------------------------------------------------------------- /recipes/echo/extra_prometheus_metrics.py: -------------------------------------------------------------------------------- 1 | def get_prometheus_metrics(): 2 | return { 3 | "test_metric": { 4 | "type": "counter", 5 | "help": "This is a test metric", 6 | "value": 1 7 | } 8 | } -------------------------------------------------------------------------------- /recipes/echo/fastDeploy.auto_dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.8-slim 2 | RUN python3 -m pip install --upgrade --no-cache-dir pip fastdeploy 3 | 4 | ENV MAX_REQUEST_BATCH_SIZE=0 WORKERS=3 TIMEOUT=480 HOST=0.0.0.0 PORT=8080 ONLY_ASYNC=false ALLOW_PICKLE=true PREDICTOR_NAME=predictor.py OPTIMAL_BATCH_SIZE=0 KEEP_ALIVE=60 BASE=python:3.8-slim 5 | 6 | ADD . /recipe 7 | WORKDIR /recipe 8 | 9 | RUN python3 -m pip install --no-cache-dir -r /recipe/requirements.txt 10 | RUN cd /recipe && python3 -c "from predictor import predictor; from example import example; predictor(example)" 11 | 12 | ENTRYPOINT ["/bin/sh", "-c"] 13 | 14 | CMD ["ulimit -n 1000000 && python3 -m fastdeploy --recipe /recipe --loop & python3 -m fastdeploy --recipe /recipe --rest"] 15 | -------------------------------------------------------------------------------- /recipes/echo/predictor.py: -------------------------------------------------------------------------------- 1 | # Do the required imports 2 | import os 3 | import time 4 | 5 | # Any code can be here 6 | # Load your models, import your local scripts 7 | # modify the code inside predictor function. 
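# The echo predictor below has no model: it walks input_list in slices of
# batch_size, copies each slice straight to the output, and sleeps SLEEP_TIME
# per slice to imitate per-batch inference latency, so the batching and
# queueing machinery can be exercised without any ML dependencies.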
8 | 9 | SLEEP_TIME = float(os.getenv("SLEEP_TIME", "0.2")) 10 | 11 | def predictor(input_list, batch_size=1): 12 | output_list = [] 13 | while input_list: 14 | input_batch = input_list[:batch_size] 15 | input_list = input_list[batch_size:] 16 | output_list += input_batch 17 | time.sleep(SLEEP_TIME) 18 | 19 | return output_list 20 | -------------------------------------------------------------------------------- /recipes/echo/requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/notAI-tech/fastDeploy/34865d1be99cc5ab98645985c6c7dda7119df1c4/recipes/echo/requirements.txt -------------------------------------------------------------------------------- /recipes/echo_chained/.dockerignore: -------------------------------------------------------------------------------- 1 | *.request_index 2 | *.results_index 3 | *.log_index -------------------------------------------------------------------------------- /recipes/echo_chained/.gitignore: -------------------------------------------------------------------------------- 1 | *.request_index 2 | *.results_index 3 | *.log_index -------------------------------------------------------------------------------- /recipes/echo_chained/example.py: -------------------------------------------------------------------------------- 1 | example = ["Any JSON serialiazable Python object can be input"] 2 | -------------------------------------------------------------------------------- /recipes/echo_chained/predictor_1.py: -------------------------------------------------------------------------------- 1 | # Do the required imports 2 | import os 3 | import time 4 | 5 | # Any code can be here 6 | # Load your models, import your local scripts 7 | # modify the code inside predictor function. 8 | 9 | SLEEP_TIME = float(os.getenv("SLEEP_TIME", "0.1")) 10 | 11 | def predictor(input_list, batch_size=1): 12 | output_list = [] 13 | while input_list: 14 | print(input_list) 15 | input_batch = input_list[:batch_size] 16 | input_list = input_list[batch_size:] 17 | output_list += [(1, _) for _ in input_batch] 18 | time.sleep(SLEEP_TIME) 19 | 20 | return output_list 21 | -------------------------------------------------------------------------------- /recipes/echo_chained/predictor_2.py: -------------------------------------------------------------------------------- 1 | # Do the required imports 2 | import os 3 | import time 4 | 5 | # Any code can be here 6 | # Load your models, import your local scripts 7 | # modify the code inside predictor function. 
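# Second stage of the chained echo recipe: per recipe.md the predictors run
# sequentially, so the items arriving here are predictor_1.py's outputs, i.e.
# (1, original_input) tuples, and each one is wrapped again as (2, ...) below.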
8 | 9 | SLEEP_TIME = float(os.getenv("SLEEP_TIME", "0.2")) 10 | 11 | def predictor(input_list, batch_size=1): 12 | print(input_list) 13 | output_list = [] 14 | while input_list: 15 | input_batch = input_list[:batch_size] 16 | input_list = input_list[batch_size:] 17 | output_list += [(2, _) for _ in input_batch] 18 | time.sleep(SLEEP_TIME) 19 | 20 | return output_list 21 | -------------------------------------------------------------------------------- /recipes/echo_chained/requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/notAI-tech/fastDeploy/34865d1be99cc5ab98645985c6c7dda7119df1c4/recipes/echo_chained/requirements.txt -------------------------------------------------------------------------------- /recipes/text_embeddings/example.py: -------------------------------------------------------------------------------- 1 | # generate random sentence with words of size 1-10 characters and total 5-100 words 2 | 3 | import random 4 | import string 5 | 6 | words = open("words.txt", "r").read().split() 7 | 8 | def generate_random_sentence(): 9 | # Generate random number of words between 5-100 10 | num_words = random.randint(3, 100) 11 | 12 | sentence = [] 13 | for _ in range(num_words): 14 | word = random.choice(words) 15 | sentence.append(word) 16 | 17 | return ' '.join(sentence) 18 | 19 | 20 | def example_function(): 21 | return [generate_random_sentence() for _ in range(random.randint(1, 10))] 22 | 23 | example = example_function() -------------------------------------------------------------------------------- /recipes/text_embeddings/predictor.py: -------------------------------------------------------------------------------- 1 | from sentence_transformers import SentenceTransformer 2 | 3 | sentences = ['That is a happy person', 'That is a very happy person'] 4 | 5 | model = SentenceTransformer('Alibaba-NLP/gte-base-en-v1.5', trust_remote_code=True, backend="onnx", model_kwargs={"file_name": "model.onnx", "provider": "CPUExecutionProvider"}) 6 | 7 | def predictor(input_list, batch_size=16): 8 | return model.encode(input_list, convert_to_numpy=True, normalize_embeddings=True, show_progress_bar=False, batch_size=batch_size) 9 | 10 | -------------------------------------------------------------------------------- /recipes/text_embeddings/requirements.txt: -------------------------------------------------------------------------------- 1 | sentence-transformers[onnx] 2 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Note: To use the 'upload' functionality of this file, you must: 5 | # $ pip install twine 6 | 7 | import io 8 | import os 9 | import sys 10 | from shutil import rmtree 11 | 12 | from setuptools import find_packages, setup, Command 13 | 14 | # Package meta-data. 15 | NAME = "fastdeploy" 16 | DESCRIPTION = "Deploy DL/ ML inference pipelines with minimal extra code. " 17 | URL = "https://github.com/notAI-tech/fastDeploy" 18 | EMAIL = "praneeth@bpraneeth.com" 19 | AUTHOR = "BEDAPUDI PRANEETH" 20 | REQUIRES_PYTHON = ">=3.6.0" 21 | VERSION = "3.1.1" 22 | 23 | # What packages are required for this module to be executed? 24 | REQUIRED = ["falcon", "liteindex==0.0.3.2.dev6", "zstandard", "gunicorn[gevent]", "msgpack", "psutil"] 25 | 26 | # What packages are optional? 
27 | EXTRAS = { 28 | } 29 | 30 | # The rest you shouldn't have to touch too much :) 31 | # ------------------------------------------------ 32 | # Except, perhaps the License and Trove Classifiers! 33 | # If you do change the License, remember to change the Trove Classifier for that! 34 | 35 | here = os.path.abspath(os.path.dirname(__file__)) 36 | 37 | # Import the README and use it as the long-description. 38 | # Note: this will only work if 'README.md' is present in your MANIFEST.in file! 39 | try: 40 | with io.open(os.path.join(here, "README.md"), encoding="utf-8") as f: 41 | long_description = "\n" + f.read() 42 | except FileNotFoundError: 43 | long_description = DESCRIPTION 44 | 45 | # Load the package's __version__.py module as a dictionary. 46 | about = {} 47 | if not VERSION: 48 | with open(os.path.join(here, NAME, "__version__.py")) as f: 49 | exec(f.read(), about) 50 | else: 51 | about["__version__"] = VERSION 52 | 53 | 54 | class UploadCommand(Command): 55 | """Support setup.py upload.""" 56 | 57 | description = "Build and publish the package." 58 | user_options = [] 59 | 60 | @staticmethod 61 | def status(s): 62 | """Prints things in bold.""" 63 | print("\033[1m{0}\033[0m".format(s)) 64 | 65 | def initialize_options(self): 66 | pass 67 | 68 | def finalize_options(self): 69 | pass 70 | 71 | def run(self): 72 | try: 73 | self.status("Removing previous builds…") 74 | rmtree(os.path.join(here, "dist")) 75 | except OSError: 76 | pass 77 | 78 | self.status("Building Source and Wheel (universal) distribution…") 79 | os.system("{0} setup.py sdist bdist_wheel --universal".format(sys.executable)) 80 | 81 | self.status("Uploading the package to PyPI via Twine…") 82 | os.system("twine upload dist/*") 83 | 84 | self.status("Pushing git tags…") 85 | os.system("git tag v{0}".format(about["__version__"])) 86 | os.system("git push --tags") 87 | 88 | sys.exit() 89 | 90 | 91 | # Where the magic happens: 92 | setup( 93 | name=NAME, 94 | version=about["__version__"], 95 | description=DESCRIPTION, 96 | long_description=long_description, 97 | long_description_content_type="text/markdown", 98 | author=AUTHOR, 99 | author_email=EMAIL, 100 | python_requires=REQUIRES_PYTHON, 101 | url=URL, 102 | packages=find_packages(exclude=("tests",)), 103 | # If your package is a single module, use this instead of 'packages': 104 | # py_modules=['mypackage'], 105 | entry_points={"console_scripts": ["fastdeploy=fastdeploy:main"]}, 106 | install_requires=REQUIRED, 107 | extras_require=EXTRAS, 108 | package_data={NAME: ["fastdeploy-ui/*", "fastdeploy-ui/build/*"]}, 109 | include_package_data=True, 110 | license="MIT", 111 | classifiers=[ 112 | # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers 113 | "License :: OSI Approved :: MIT License", 114 | "Programming Language :: Python", 115 | "Programming Language :: Python :: 3", 116 | "Programming Language :: Python :: 3.6", 117 | "Programming Language :: Python :: Implementation :: CPython", 118 | ], 119 | # $ setup.py publish support. 
120 | cmdclass={ 121 | "upload": UploadCommand, 122 | }, 123 | ) 124 | -------------------------------------------------------------------------------- /testing/README.md: -------------------------------------------------------------------------------- 1 | python benchmark.py --target_rps_per_connection 100 --parallel_connections 10 --duration 60 --warmup 1 --server_url http://10.18.9.60:8080 --input_file /Users/praneeth.bedapudi/RINGCENTRAL/marauders-map/ml_serving/nlu/semantic_score_serving/example.py --results_file a.json --request_timeout 0.6 2 | -------------------------------------------------------------------------------- /testing/benchmark.py: -------------------------------------------------------------------------------- 1 | import time 2 | import logging 3 | import argparse 4 | import json 5 | import random 6 | import numpy as np 7 | from datetime import datetime 8 | import os 9 | import importlib.util 10 | from tqdm import tqdm 11 | from fdclient import FDClient 12 | import multiprocessing as mp 13 | from dataclasses import dataclass 14 | from typing import List, Dict, Any 15 | import queue 16 | import signal 17 | 18 | # Configure logging 19 | logging.basicConfig(format='%(asctime)s - %(message)s') 20 | logger = logging.getLogger(__name__) 21 | 22 | @dataclass 23 | class ConnectionStats: 24 | latencies: List[float] 25 | errors: List[str] 26 | successes: int 27 | failures: int 28 | connection_id: int 29 | 30 | 31 | class BenchmarkProcess(mp.Process): 32 | def __init__(self, connection_id, server_url, target_rps, duration, 33 | input_source, request_batch_size, is_warmup, 34 | stats_queue, progress_queue, request_timeout=10): 35 | super().__init__() 36 | self.connection_id = connection_id 37 | self.server_url = server_url 38 | self.target_rps = target_rps 39 | self.duration = duration 40 | self.input_source = input_source 41 | self.request_batch_size = request_batch_size 42 | self.is_warmup = is_warmup 43 | self.stats_queue = stats_queue 44 | self.progress_queue = progress_queue 45 | self.request_timeout = request_timeout 46 | self._loaded_function = None 47 | 48 | def _load_function(self): 49 | """Load the Python function inside the process""" 50 | if self.input_source['type'] == 'function': 51 | path = os.path.abspath(self.input_source['path']) 52 | directory = os.path.dirname(path) 53 | filename = os.path.basename(path) 54 | 55 | original_dir = os.getcwd() 56 | try: 57 | os.chdir(directory) 58 | module_name = os.path.splitext(filename)[0] 59 | spec = importlib.util.spec_from_file_location(module_name, filename) 60 | module = importlib.util.module_from_spec(spec) 61 | spec.loader.exec_module(module) 62 | 63 | if not hasattr(module, 'example_function'): 64 | raise ValueError("Python file must contain example_function()") 65 | 66 | self._loaded_function = module.example_function 67 | finally: 68 | os.chdir(original_dir) 69 | 70 | def generate_payload(self): 71 | """Generate payload based on input source type""" 72 | if self.input_source['type'] == 'json': 73 | return [self.input_source['data'][random.randint(0, len(self.input_source['data']) - 1)] 74 | for _ in range(self.request_batch_size)] 75 | else: # function 76 | if self._loaded_function is None: 77 | self._load_function() 78 | return self._loaded_function()[:self.request_batch_size] 79 | 80 | def run(self): 81 | # Handle Ctrl+C gracefully 82 | signal.signal(signal.SIGINT, signal.SIG_IGN) 83 | 84 | client = FDClient(server_url=self.server_url, request_timeout=self.request_timeout) 85 | 86 | if self.target_rps: 87 | 
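# A per-connection RPS target translates into a time budget of 1/target_rps
# seconds per request; the rate-limiting block at the bottom of the loop
# sleeps off whatever part of that budget the request itself did not consume.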
sleep_time = 1.0 / self.target_rps 88 | else: 89 | sleep_time = 0 90 | 91 | start_time = time.time() 92 | stats = ConnectionStats( 93 | latencies=[], errors=[], successes=0, failures=0, 94 | connection_id=self.connection_id 95 | ) 96 | requests_made = 0 97 | 98 | while time.time() - start_time < self.duration: 99 | request_start = time.time() 100 | 101 | try: 102 | # Generate and send request 103 | inps = self.generate_payload() 104 | request_id = f"{'warm' if self.is_warmup else 'req'}-conn{self.connection_id}-{requests_made}" 105 | 106 | results = client.infer(inps, unique_id=request_id) 107 | latency = (time.time() - request_start) * 1000 # Convert to ms 108 | 109 | if results['success']: 110 | if not self.is_warmup: 111 | stats.successes += 1 112 | stats.latencies.append(latency) 113 | else: 114 | if not self.is_warmup: 115 | stats.failures += 1 116 | stats.errors.append(results.get('reason', 'Unknown error')) 117 | 118 | except Exception as e: 119 | if not self.is_warmup: 120 | stats.failures += 1 121 | stats.errors.append(str(e)) 122 | 123 | requests_made += 1 124 | 125 | # Update progress 126 | elapsed = time.time() - start_time 127 | self.progress_queue.put((self.connection_id, min(elapsed, self.duration))) 128 | 129 | # Rate limiting 130 | elapsed = time.time() - request_start 131 | if sleep_time > elapsed: 132 | time.sleep(sleep_time - elapsed) 133 | 134 | # Send final stats 135 | self.stats_queue.put((self.connection_id, stats)) 136 | 137 | class BenchmarkRunner: 138 | def __init__(self, target_rps_per_connection, duration_seconds, server_url, 139 | parallel_connections=1, warmup_seconds=5, input_source=None, 140 | request_batch_size=1, log_dir=None, debug=False, request_timeout=10): 141 | self.target_rps_per_connection = target_rps_per_connection 142 | self.parallel_connections = parallel_connections 143 | self.duration_seconds = duration_seconds 144 | self.warmup_seconds = warmup_seconds 145 | self.server_url = server_url 146 | self.input_source = input_source 147 | self.request_batch_size = request_batch_size 148 | self.log_dir = log_dir 149 | self.debug = debug 150 | self.request_timeout = request_timeout 151 | 152 | if self.log_dir: 153 | os.makedirs(self.log_dir, exist_ok=True) 154 | 155 | # For handling Ctrl+C gracefully 156 | self.stop_event = mp.Event() 157 | signal.signal(signal.SIGINT, self._handle_interrupt) 158 | 159 | def _handle_interrupt(self, signum, frame): 160 | print("\nStopping benchmark gracefully...") 161 | self.stop_event.set() 162 | 163 | def _update_progress_bars(self, progress_queue, pbars, duration, process_count): 164 | """Update progress bars from queue until duration is reached or stop_event is set""" 165 | start_time = time.time() 166 | while time.time() - start_time < duration and not self.stop_event.is_set(): 167 | try: 168 | conn_id, progress = progress_queue.get(timeout=0.1) 169 | pbars[conn_id].n = progress 170 | pbars[conn_id].refresh() 171 | except queue.Empty: 172 | continue 173 | 174 | def run_benchmark(self): 175 | """Run the benchmark with parallel processes""" 176 | # Create queues for inter-process communication 177 | stats_queue = mp.Queue() 178 | progress_queue = mp.Queue() 179 | 180 | print("\nStarting warmup period...") 181 | 182 | # Create progress bars for warmup 183 | warmup_pbars = { 184 | i: tqdm( 185 | total=self.warmup_seconds, 186 | desc=f"Warmup Conn {i}", 187 | position=i, 188 | unit="s", 189 | leave=True 190 | ) 191 | for i in range(self.parallel_connections) 192 | } 193 | 194 | # Start warmup processes 195 | 
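# Warmup runs the same BenchmarkProcess workers with is_warmup=True, so
# latencies, successes and failures are not recorded; anything they do push
# onto stats_queue is drained and discarded once they exit, and only then are
# the measured benchmark processes started.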
warmup_processes = [ 196 | BenchmarkProcess( 197 | connection_id=i, 198 | server_url=self.server_url, 199 | target_rps=self.target_rps_per_connection, 200 | duration=self.warmup_seconds, 201 | input_source=self.input_source, 202 | request_batch_size=self.request_batch_size, 203 | is_warmup=True, 204 | stats_queue=stats_queue, 205 | progress_queue=progress_queue, 206 | request_timeout=self.request_timeout 207 | ) 208 | for i in range(self.parallel_connections) 209 | ] 210 | 211 | for p in warmup_processes: 212 | p.start() 213 | 214 | # Update warmup progress bars 215 | self._update_progress_bars( 216 | progress_queue, warmup_pbars, 217 | self.warmup_seconds, self.parallel_connections 218 | ) 219 | 220 | # Wait for warmup processes to finish 221 | for p in warmup_processes: 222 | p.join() 223 | 224 | # Clear warmup stats queue 225 | while not stats_queue.empty(): 226 | stats_queue.get() 227 | 228 | # Close warmup progress bars 229 | for pbar in warmup_pbars.values(): 230 | pbar.close() 231 | 232 | if self.stop_event.is_set(): 233 | print("\nBenchmark interrupted during warmup") 234 | return None 235 | 236 | print("\nStarting benchmark...") 237 | 238 | # Create progress bars for benchmark 239 | benchmark_pbars = { 240 | i: tqdm( 241 | total=self.duration_seconds, 242 | desc=f"Benchmark Conn {i}", 243 | position=i, 244 | unit="s", 245 | leave=True 246 | ) 247 | for i in range(self.parallel_connections) 248 | } 249 | 250 | # Start benchmark processes 251 | benchmark_processes = [ 252 | BenchmarkProcess( 253 | connection_id=i, 254 | server_url=self.server_url, 255 | target_rps=self.target_rps_per_connection, 256 | duration=self.duration_seconds, 257 | input_source=self.input_source, 258 | request_batch_size=self.request_batch_size, 259 | is_warmup=False, 260 | stats_queue=stats_queue, 261 | progress_queue=progress_queue, 262 | request_timeout=self.request_timeout 263 | ) 264 | for i in range(self.parallel_connections) 265 | ] 266 | 267 | for p in benchmark_processes: 268 | p.start() 269 | 270 | # Update benchmark progress bars 271 | self._update_progress_bars( 272 | progress_queue, benchmark_pbars, 273 | self.duration_seconds, self.parallel_connections 274 | ) 275 | 276 | # Collect results 277 | connection_stats = {} 278 | for _ in range(self.parallel_connections): 279 | conn_id, stats = stats_queue.get() 280 | connection_stats[conn_id] = stats 281 | 282 | # Wait for all processes to finish 283 | for p in benchmark_processes: 284 | p.join() 285 | 286 | # Close benchmark progress bars 287 | for pbar in benchmark_pbars.values(): 288 | pbar.close() 289 | 290 | # Move cursor to bottom of progress bars 291 | print("\n" * (self.parallel_connections)) 292 | 293 | if self.stop_event.is_set(): 294 | print("\nBenchmark interrupted") 295 | return None 296 | 297 | # Aggregate results 298 | all_latencies = [] 299 | total_successes = 0 300 | total_failures = 0 301 | all_errors = [] 302 | 303 | for stats in connection_stats.values(): 304 | all_latencies.extend(stats.latencies) 305 | total_successes += stats.successes 306 | total_failures += stats.failures 307 | all_errors.extend(stats.errors) 308 | 309 | if all_latencies: 310 | total_time = self.duration_seconds 311 | p50 = np.percentile(all_latencies, 50) 312 | p90 = np.percentile(all_latencies, 90) 313 | p95 = np.percentile(all_latencies, 95) 314 | p99 = np.percentile(all_latencies, 99) 315 | avg_latency = np.mean(all_latencies) 316 | std_latency = np.std(all_latencies) 317 | total_requests = total_successes + total_failures 318 | actual_rps = total_requests 
/ total_time 319 | 320 | results = { 321 | 'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'), 322 | 'total_requests': total_requests, 323 | 'successes': total_successes, 324 | 'failures': total_failures, 325 | 'success_rate': (total_successes/total_requests)*100 if total_requests > 0 else 0, 326 | 'average_latency_ms': float(avg_latency), 327 | 'std_latency_ms': float(std_latency), 328 | 'p50_latency_ms': float(p50), 329 | 'p90_latency_ms': float(p90), 330 | 'p95_latency_ms': float(p95), 331 | 'p99_latency_ms': float(p99), 332 | 'min_latency_ms': float(min(all_latencies)), 333 | 'max_latency_ms': float(max(all_latencies)), 334 | 'actual_rps': float(actual_rps), 335 | 'target_rps_per_connection': self.target_rps_per_connection, 336 | 'parallel_connections': self.parallel_connections, 337 | 'total_target_rps': (self.target_rps_per_connection or 0) * self.parallel_connections, 338 | 'duration_seconds': self.duration_seconds, 339 | 'warmup_seconds': self.warmup_seconds, 340 | 'request_batch_size': self.request_batch_size, 341 | 'errors': all_errors[:10] if all_errors else [], # First 10 errors 342 | 'error_count': len(all_errors), 343 | # Per-connection stats 344 | 'connection_stats': { 345 | conn_id: { 346 | 'requests': stats.successes + stats.failures, 347 | 'successes': stats.successes, 348 | 'failures': stats.failures, 349 | 'success_rate': (stats.successes/(stats.successes + stats.failures))*100 if (stats.successes + stats.failures) > 0 else 0, 350 | 'average_latency_ms': float(np.mean(stats.latencies)) if stats.latencies else 0, 351 | 'actual_rps': (stats.successes + stats.failures) / total_time 352 | } 353 | for conn_id, stats in connection_stats.items() 354 | } 355 | } 356 | return results 357 | return None 358 | 359 | def format_duration(ms): 360 | """Format milliseconds into a readable duration.""" 361 | if ms < 1: 362 | return f"{ms*1000:.2f}μs" 363 | elif ms < 1000: 364 | return f"{ms:.2f}ms" 365 | else: 366 | return f"{ms/1000:.2f}s" 367 | 368 | def print_results(results): 369 | """Print formatted benchmark results.""" 370 | if not results: 371 | return 372 | 373 | print("\n" + "="*80) 374 | print("BENCHMARK RESULTS") 375 | print("="*80) 376 | 377 | # Overall Statistics 378 | print("\n📊 OVERALL STATISTICS") 379 | print("-"*40) 380 | print(f"Total Requests: {results['total_requests']:,}") 381 | print(f"Successful: {results['successes']:,}") 382 | print(f"Failed: {results['failures']:,}") 383 | print(f"Success Rate: {results['success_rate']:.2f}%") 384 | 385 | # Throughput 386 | print("\n🚀 THROUGHPUT") 387 | print("-"*40) 388 | print(f"Actual RPS: {results['actual_rps']:.2f}") 389 | print(f"Target RPS: {results['total_target_rps'] or 'unlimited'}") 390 | print(f"Connections: {results['parallel_connections']}") 391 | print(f"Duration: {results['duration_seconds']}s (+ {results['warmup_seconds']}s warmup)") 392 | print(f"Batch Size: {results['request_batch_size']}") 393 | 394 | # Latency Statistics 395 | print("\n⚡ LATENCY STATISTICS") 396 | print("-"*40) 397 | print(f"Average: {format_duration(results['average_latency_ms'])}") 398 | print(f"Std Dev: {format_duration(results['std_latency_ms'])}") 399 | print(f"Min: {format_duration(results['min_latency_ms'])}") 400 | print(f"Max: {format_duration(results['max_latency_ms'])}") 401 | print(f"P50: {format_duration(results['p50_latency_ms'])}") 402 | print(f"P90: {format_duration(results['p90_latency_ms'])}") 403 | print(f"P95: {format_duration(results['p95_latency_ms'])}") 404 | print(f"P99: 
{format_duration(results['p99_latency_ms'])}") 405 | 406 | # Per-Connection Statistics 407 | print("\n🔌 PER-CONNECTION STATISTICS") 408 | print("-"*40) 409 | for conn_id, stats in results['connection_stats'].items(): 410 | print(f"\nConnection {conn_id}:") 411 | print(f" Requests: {stats['requests']:,}") 412 | print(f" Success Rate: {stats['success_rate']:.2f}%") 413 | print(f" Actual RPS: {stats['actual_rps']:.2f}") 414 | print(f" Avg Latency: {format_duration(stats['average_latency_ms'])}") 415 | 416 | # Error Summary 417 | if results['errors']: 418 | print("\n❌ ERROR SUMMARY") 419 | print("-"*40) 420 | print(f"Total Errors: {results['error_count']}") 421 | print("\nFirst 10 Errors:") 422 | for i, error in enumerate(results['errors'], 1): 423 | print(f"{i}. {error}") 424 | 425 | print("\n" + "="*80) 426 | 427 | 428 | def main(): 429 | parser = argparse.ArgumentParser(description='API Benchmark Tool') 430 | parser.add_argument('--server_url', type=str, required=True, help='Server URL') 431 | parser.add_argument('--target_rps_per_connection', type=int, default=None, 432 | help='Target requests per second per connection') 433 | parser.add_argument('--parallel_connections', type=int, default=1, 434 | help='Number of parallel connections') 435 | parser.add_argument('--duration', type=int, default=60, help='Test duration in seconds') 436 | parser.add_argument('--warmup', type=int, default=5, help='Warmup period in seconds') 437 | parser.add_argument('--debug', action='store_true', help='Enable debug logging') 438 | parser.add_argument('--input_file', type=str, required=True, help='Input .json or .py file path') 439 | parser.add_argument('--request_batch_size', type=int, default=1, help='Request batch size') 440 | parser.add_argument('--log_dir', type=str, default=None, help='Directory to log request inputs and outputs') 441 | parser.add_argument('--results_file', type=str, default='benchmark_results.json', 442 | help='File to write benchmark results') 443 | parser.add_argument('--request_timeout', type=float, default=10, help='Request timeout in seconds') 444 | args = parser.parse_args() 445 | 446 | if args.debug: 447 | logger.setLevel(logging.DEBUG) 448 | 449 | # Load input source 450 | input_source = None 451 | if args.input_file.endswith('.json'): 452 | try: 453 | with open(args.input_file, 'r') as f: 454 | input_data = json.load(f) 455 | input_source = {'type': 'json', 'data': input_data} 456 | except Exception as e: 457 | logger.error(f"Failed to load JSON input file: {e}") 458 | return 459 | elif args.input_file.endswith('.py'): 460 | input_source = {'type': 'function', 'path': args.input_file} 461 | else: 462 | logger.error("Input file must be either .json or .py") 463 | return 464 | 465 | # Initialize and run benchmark 466 | runner = BenchmarkRunner( 467 | target_rps_per_connection=args.target_rps_per_connection, 468 | parallel_connections=args.parallel_connections, 469 | duration_seconds=args.duration, 470 | server_url=args.server_url, 471 | warmup_seconds=args.warmup, 472 | input_source=input_source, 473 | request_batch_size=args.request_batch_size, 474 | log_dir=args.log_dir, 475 | debug=args.debug, 476 | request_timeout=args.request_timeout 477 | ) 478 | 479 | total_target_rps = (args.target_rps_per_connection or 'unlimited') 480 | if args.target_rps_per_connection: 481 | total_target_rps = args.target_rps_per_connection * args.parallel_connections 482 | 483 | print(f"\n{'='*80}") 484 | print("BENCHMARK CONFIGURATION") 485 | print(f"{'='*80}") 486 | print(f"Server URL: 
{args.server_url}") 487 | print(f"Parallel connections: {args.parallel_connections}") 488 | print(f"Target RPS/conn: {args.target_rps_per_connection or 'unlimited'}") 489 | print(f"Total target RPS: {total_target_rps}") 490 | print(f"Duration: {args.duration}s (+ {args.warmup}s warmup)") 491 | print(f"Request batch size: {args.request_batch_size}") 492 | print(f"Input source: {args.input_file}") 493 | print(f"Log directory: {args.log_dir or 'disabled'}") 494 | print(f"Debug mode: {'enabled' if args.debug else 'disabled'}") 495 | print(f"Request timeout: {args.request_timeout}s") 496 | print(f"{'='*80}\n") 497 | 498 | try: 499 | results = runner.run_benchmark() 500 | 501 | if results: 502 | # Write results to file 503 | with open(args.results_file, 'w') as f: 504 | json.dump(results, f, indent=2) 505 | print(f"\nDetailed results saved to: {args.results_file}") 506 | 507 | # Print formatted results 508 | print_results(results) 509 | else: 510 | print("\nNo results generated. Benchmark may have been interrupted.") 511 | except KeyboardInterrupt: 512 | print("\nBenchmark interrupted by user.") 513 | except Exception as e: 514 | logger.error(f"Benchmark failed: {e}") 515 | if args.debug: 516 | raise 517 | 518 | if __name__ == "__main__": 519 | main() --------------------------------------------------------------------------------