├── .github
│   └── workflows
│       ├── main.yml
│       └── python_client.yml
├── .gitignore
├── LICENSE
├── README.md
├── cli.md
├── clients
│   ├── browser_side_js
│   │   └── client.html
│   └── python
│       ├── README.md
│       ├── fdclient
│       │   ├── __init__.py
│       │   └── client.py
│       └── setup.py
├── fastdeploy
│   ├── .gitignore
│   ├── __init__.py
│   ├── __main__.py
│   ├── _infer.py
│   ├── _loop.py
│   ├── _rest.py
│   ├── _utils.py
│   └── monitor.sh
├── recipe.md
├── recipes
│   ├── .gitignore
│   ├── echo
│   │   ├── .dockerignore
│   │   ├── .gitignore
│   │   ├── example.py
│   │   ├── extra_prometheus_metrics.py
│   │   ├── fastDeploy.auto_dockerfile
│   │   ├── predictor.py
│   │   └── requirements.txt
│   ├── echo_chained
│   │   ├── .dockerignore
│   │   ├── .gitignore
│   │   ├── example.py
│   │   ├── predictor_1.py
│   │   ├── predictor_2.py
│   │   └── requirements.txt
│   └── text_embeddings
│       ├── example.py
│       ├── predictor.py
│       ├── requirements.txt
│       └── words.txt
├── setup.py
└── testing
    ├── README.md
    └── benchmark.py
/.github/workflows/main.yml:
--------------------------------------------------------------------------------
1 | name: Upload Python Package
2 |
3 | on:
4 | push:
5 | branches: [ master ]
6 | paths:
7 | - 'setup.py'
8 | workflow_dispatch:
9 |
10 | jobs:
11 | pypi:
12 | runs-on: ubuntu-latest
13 |
14 | permissions:
15 | id-token: write
16 |
17 | steps:
18 | - uses: actions/checkout@v3
19 | - uses: actions/setup-python@v4
20 | with:
21 | python-version: '3.x'
22 |
23 | - name: Install dependencies
24 | run: python -m pip install -U build
25 |
26 | - name: Build
27 | run: python -m build
28 |
29 | - name: Publish
30 | uses: pypa/gh-action-pypi-publish@release/v1
31 |
--------------------------------------------------------------------------------
/.github/workflows/python_client.yml:
--------------------------------------------------------------------------------
1 | name: Upload Python Package
2 |
3 | on:
4 | push:
5 | branches: [ master ]
6 | paths:
7 | - 'clients/python/setup.py'
8 | workflow_dispatch:
9 |
10 | jobs:
11 | pypi:
12 | runs-on: ubuntu-latest
13 |
14 | permissions:
15 | id-token: write
16 |
17 | steps:
18 | - uses: actions/checkout@v3
19 | - uses: actions/setup-python@v4
20 | with:
21 | python-version: '3.x'
22 |
23 | - name: Install dependencies
24 | run: python -m pip install -U build
25 |
26 | - name: Build
27 | run: python -m build
28 | working-directory: clients/python
29 |
30 | - name: Move package
31 | run: mv clients/python/dist ./
32 |
33 | - name: Publish
34 | uses: pypa/gh-action-pypi-publish@release/v1
35 |
36 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | .results_index/
7 | .request_queue/
8 |
9 | # C extensions
10 | *.so
11 |
12 | recipes/*/*index
13 |
14 | # Distribution / packaging
15 | .Python
16 | build/
17 | develop-eggs/
18 | dist/
19 | downloads/
20 | eggs/
21 | .eggs/
22 | lib/
23 | lib64/
24 | parts/
25 | sdist/
26 | var/
27 | wheels/
28 | pip-wheel-metadata/
29 | share/python-wheels/
30 | *.egg-info/
31 | .installed.cfg
32 | *.egg
33 | MANIFEST
34 |
35 | # PyInstaller
36 | # Usually these files are written by a python script from a template
37 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
38 | *.manifest
39 | *.spec
40 |
41 | # Installer logs
42 | pip-log.txt
43 | pip-delete-this-directory.txt
44 |
45 | # Unit test / coverage reports
46 | htmlcov/
47 | .tox/
48 | .nox/
49 | .coverage
50 | .coverage.*
51 | .cache
52 | nosetests.xml
53 | coverage.xml
54 | *.cover
55 | *.py,cover
56 | .hypothesis/
57 | .pytest_cache/
58 |
59 | # Translations
60 | *.mo
61 | *.pot
62 |
63 | # Django stuff:
64 | *.log
65 | local_settings.py
66 | db.sqlite3
67 | db.sqlite3-journal
68 |
69 | # Flask stuff:
70 | instance/
71 | .webassets-cache
72 |
73 | # Scrapy stuff:
74 | .scrapy
75 |
76 | # Sphinx documentation
77 | docs/_build/
78 |
79 | # PyBuilder
80 | target/
81 |
82 | # Jupyter Notebook
83 | .ipynb_checkpoints
84 |
85 | # IPython
86 | profile_default/
87 | ipython_config.py
88 |
89 | # pyenv
90 | .python-version
91 |
92 | # pipenv
93 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
94 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
95 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
96 | # install all needed dependencies.
97 | #Pipfile.lock
98 |
99 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
100 | __pypackages__/
101 |
102 | # Celery stuff
103 | celerybeat-schedule
104 | celerybeat.pid
105 |
106 | # SageMath parsed files
107 | *.sage.py
108 |
109 | # Environments
110 | .env
111 | .venv
112 | env/
113 | venv/
114 | ENV/
115 | env.bak/
116 | venv.bak/
117 |
118 | # Spyder project settings
119 | .spyderproject
120 | .spyproject
121 |
122 | # Rope project settings
123 | .ropeproject
124 |
125 | # mkdocs documentation
126 | /site
127 |
128 | # mypy
129 | .mypy_cache/
130 | .dmypy.json
131 | dmypy.json
132 |
133 | # Pyre type checker
134 | .pyre/
135 |
136 | # Mac DS_Store
137 | .DS_Store
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2020 notAI-tech
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## fastDeploy
2 | #### easy and performant micro-services for Python Deep Learning inference pipelines
3 |
4 | - Deploy any python inference pipeline with minimal extra code
5 | - Auto batching of concurrent inputs is enabled out of the box
6 | - No changes to your inference code (unlike tf-serving etc.), the entire pipeline runs as-is
7 | - Prometheus metrics (OpenMetrics) are exposed for monitoring
8 | - Auto-generates clean dockerfiles and Kubernetes health-check and scaling friendly APIs
9 | - Sequentially chained inference pipelines are supported out of the box
10 | - Can be queried from any language via easy-to-use REST APIs
11 | - Easy to understand (simple consumer-producer architecture) and simple code base
12 |
13 |
14 | #### Installation:
15 | ```bash
16 | pip install --upgrade fastdeploy fdclient
17 | # fdclient is optional, only needed if you want to use python client
18 | ```
19 |
20 | #### [CLI explained](https://github.com/notAI-tech/fastDeploy/blob/master/cli.md)
21 |
22 | #### Start fastDeploy server on a recipe:
23 | ```bash
24 | # Invoke fastdeploy
25 | python -m fastdeploy --help
26 | # or
27 | fastdeploy --help
28 |
29 | # Start prediction "loop" for recipe "echo"
30 | fastdeploy --loop --recipe recipes/echo
31 |
32 | # Start rest apis for recipe "echo"
33 | fastdeploy --rest --recipe recipes/echo
34 | ```
35 |
36 | #### Send a request and get predictions:
37 |
38 | - [Python client usage](https://github.com/notAI-tech/fastDeploy/blob/master/clients/python/README.md)
39 |
40 | - [curl usage]() (see the minimal raw-REST example below)
41 |
42 | - [Nodejs client usage]()
43 |
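Since the REST API can be queried from any language, here is a minimal raw-HTTP example in Python (no client library), based on the `/infer` endpoint in `fastdeploy/_rest.py`; the port, the `unique_id` and the JSON inputs below are just illustrative:

```python
# hedged sketch: query a running fastDeploy server over plain HTTP/JSON
import requests

inputs = ["this is input 1", "this is input 2"]  # inputs must be a list

response = requests.post(
    "http://localhost:8080/infer",
    params={"input_type": "json", "unique_id": "example-request-1"},
    json=inputs,
    timeout=480,
)

result = response.json()
print(result["success"], result["prediction"])
```
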
44 | #### auto generate dockerfile and build docker image:
45 | ```bash
46 | # Write the dockerfile for recipe "echo"
47 | # and prints the docker build command to build the image
48 | # base defaults to python:3.8-slim
49 | fastdeploy --build --recipe recipes/echo
50 |
51 | # Run docker image
52 | docker run -it -p8080:8080 fastdeploy_echo
53 | ```
54 |
55 | #### Serving your model (recipe):
56 |
57 | - [Writing your model/pipeline's recipe](https://github.com/notAI-tech/fastDeploy/blob/master/recipe.md) (a minimal sketch follows below)
58 |
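Until you read the full recipe docs, a minimal sketch of what a recipe contains (modeled on the `recipes/echo` example; the function body below is illustrative):

```python
# predictor.py - exposes predictor(inputs, batch_size=...) which
# takes a list of inputs and returns exactly one output per input, in order
def predictor(inputs, batch_size=1):
    return inputs  # the echo recipe simply returns the inputs


# example.py - exposes `example`, a list of example inputs used for warmup
example = ["example input 1", "example input 2"]
```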
59 |
60 | ### Where to use fastDeploy?
61 |
62 | - to deploy models that are not ultra-lightweight, i.e. most DL models with >50ms inference time per example
63 | - if the model/pipeline benefits from batch inference, fastDeploy is perfect for your use-case
64 | - if you are going to have individual inputs (for example, a user's search query that needs to be vectorized, or an image to be classified)
65 | - in the case of individual inputs, requests arriving at close intervals are batched together and sent to the model as a single batch
66 | - perfect for creating internal micro-services that separate your model, pre- and post-processing from business logic
67 | - since the prediction loop and the inference endpoints are separate and connected via a sqlite-backed queue, they can be scaled independently
68 |
69 |
70 | ### Where not to use fastDeploy?
71 | - models that are not CPU/GPU heavy and are better off running in parallel rather than in batches
72 | - if your predictor calls some external API or uploads to s3 etc. in a blocking way
73 | - IO-heavy, non-batching use cases (e.g. querying ES or a DB for each input)
74 | - for these cases it is better to serve directly from the REST API code (instead of the consumer-producer mechanism) so that high concurrency can be achieved
75 |
--------------------------------------------------------------------------------
/cli.md:
--------------------------------------------------------------------------------
1 |
2 | ### fastDeploy CLI usage explained
3 |
4 |
5 | - invoking the CLI
6 | ```bash
7 | fastdeploy --help
8 | # or
9 | python -m fastdeploy --help
10 | ```
11 |
12 |
13 | #### Prediction loop
14 | - Start prediction loop on your recipe
16 | ```bash
16 | fastdeploy --loop --recipe ./recipes/echo
17 | ```
18 |
19 | - Optional config can be passed with `--config` flag
20 |
21 | ```bash
22 | fastdeploy --loop --recipe ./recipes/echo --config "predictor_name=predictor.py,optimal_batch_size=0"
23 | ```
24 |
25 | | Config | Description | Default |
26 | | --- | --- | --- |
27 | | predictor_name | predictor.py or predictor_N.py, name of the predictor run in the loop | predictor.py |
28 | | optimal_batch_size | integer max batch size for the predictor | 0 (auto determine) |
29 |
30 | - Same config can also be passed as env variables
31 | ```bash
32 | export PREDICTOR_NAME=predictor.py
33 | export OPTIMAL_BATCH_SIZE=0
34 | fastdeploy --loop --recipe ./recipes/echo
35 | ```
36 |
37 |
38 |
39 | #### Start API server
40 | - Start API server on your recipe
41 | ```bash
42 | fastdeploy --rest --recipe ./recipes/echo
43 | ```
44 |
45 | - Optional config can be passed with `--config` flag
46 |
47 | ```bash
48 | fastdeploy --rest --recipe ./recipes/echo --config "max_request_batch_size=0,workers=3,timeout=480,host=0.0.0.0,port=8080,only_async=false,allow_pickle=true,keep_alive=60"
49 | ```
50 |
51 | - Same config can also be passed as env variables
52 | ```bash
53 | export MAX_REQUEST_BATCH_SIZE=0
54 | export WORKERS=3
55 | export TIMEOUT=480
56 | export HOST=0.0.0.0
57 | export PORT=8080
58 | export ONLY_ASYNC=false
59 | export ALLOW_PICKLE=true
60 | export KEEP_ALIVE=60
61 | fastdeploy --rest --recipe ./recipes/echo
62 | ```
63 |
64 | #### --config options explained
65 |
66 | | Config | Description | Default |
67 | | --- | --- | --- |
68 | | max_request_batch_size | integer, max number of inputs allowed in a single request. Useful for limiting request size when the API is exposed externally | 0 (no limit) |
69 | | workers | number of REST API gunicorn workers. 3 is generally more than enough | 3 |
70 | | timeout | seconds after which request will fail | 480 |
71 | | host | host for the REST server | 0.0.0.0 |
72 | | port | port for the REST server | 8080 |
73 | | only_async | true/false | false |
74 | | allow_pickle | true/false. Set to false to disallow the pickle protocol when expecting external/untrusted inputs | true |
75 | | keep_alive | gunicorn gevent keep alive | 60 |
76 |
77 |
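- Note on precedence: environment variables take priority over the `--config` string. A simplified sketch of the parsing done in `fastdeploy/__main__.py` (illustrative, not the exact code):

```python
# simplified sketch of how fastdeploy merges --config with env variables
import os


def parse_config(config_string, defaults):
    config = dict(defaults)  # defaults already include any env var overrides
    for item in config_string.split(","):
        try:
            key, value = item.strip().split("=")
        except ValueError:
            continue  # malformed items are ignored
        if os.getenv(key.upper()) is not None:
            continue  # an explicitly set env variable always wins
        try:
            config[key.strip()] = int(value.strip())
        except ValueError:
            config[key.strip()] = value.strip()
    return config


print(parse_config("workers=4,timeout=300", {"workers": 3, "timeout": 480}))
# {'workers': 4, 'timeout': 300}  (assuming WORKERS/TIMEOUT are not set in the environment)
```
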
78 | #### Build docker image
79 |
80 | - Generate the dockerfile (and the docker build command) for your recipe
81 | ```bash
82 | fastdeploy --build --recipe ./recipes/echo
83 | ```
84 |
85 | - also supports optional config via `--config` flag
86 | - both rest and loop config options can be passed here in the same config string
87 |
88 |
89 |
--------------------------------------------------------------------------------
/clients/browser_side_js/client.html:
--------------------------------------------------------------------------------
(HTML/JS markup mostly elided in this dump; only the recoverable information is kept)
Title: FDClient Test with Multiple File Upload and Download
Page sections: file upload inputs, "Download Processed Files:", "Result:"
--------------------------------------------------------------------------------
/clients/python/README.md:
--------------------------------------------------------------------------------
1 | ## fastDeploy python client
2 |
3 | ```python
4 | from fdclient import FDClient
5 |
6 | client = FDClient('http://localhost:8080') # optional compression=False to disable zstd compression
7 |
8 | # infer
9 | response = client.infer([obj_1, obj_2, ...]) # optional unique_id='some_id' to specify a unique id for the request
10 |
11 | # infer in background
12 | response_future = client.infer_background([obj_1, obj_2, ...]) # optional unique_id='some_id' to specify a unique id for the request
13 | response = response_future.result() # wait for the response and get it
14 | ```
15 |
16 | - By default fdclient communicates with the fastDeploy server via pickles
17 | - pickle is very useful and makes sense when using the fastDeploy server as an internal micro-service, i.e. all requests to fastDeploy originate from code you have written
18 | - ***PICKLE is secure if all the inputs to fastDeploy originate from your own code and not directly from external users' pickles***
19 | - ***PICKLE is insecure if you are passing external user inputs to fastDeploy directly without validation in between***
20 | - start the fastDeploy server with `--config "allow_pickle=false"` if the fastDeploy APIs are exposed to the outside
21 | - with `allow_pickle=false` on the server side, fdclient falls back to `msgpack` if available, or `json` if msgpack is not available.
22 |
23 | #### If pickle is insecure, why use it at all?
24 |
25 | - pickle is great for sending and receiving arbitrary inputs and outputs
26 | - if `allow_pickle=true` (default) your inputs and outputs can be any python objects, e.g. np arrays, pd dataframes, float32, anything ...
27 | - pickle is only insecure if you are unpickling objects pickled by others (since they can insert malicious code)
28 | - if fastDeploy is being used only for internal micro-services, pickle is the most flexible option, so it is enabled by default (see the example below)
29 |
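For example, a quick sketch of the point above (assumes the `echo` recipe is running locally on port 8080 and numpy is installed):

```python
import numpy as np
from fdclient import FDClient

client = FDClient("http://localhost:8080")

# with allow_pickle=true (the default) arbitrary python objects round-trip as-is
response = client.infer([np.zeros((2, 3)), {"any": "python", "object": 1.5}])
print(response["success"], response["prediction"])
```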
--------------------------------------------------------------------------------
/clients/python/fdclient/__init__.py:
--------------------------------------------------------------------------------
1 | from .client import FDClient
2 |
--------------------------------------------------------------------------------
/clients/python/fdclient/client.py:
--------------------------------------------------------------------------------
1 | try:
2 | import zstandard
3 | except:
4 | zstandard = None
5 |
6 | try:
7 | import msgpack
8 | except:
9 | msgpack = None
10 |
11 | import threading
12 | import requests
13 | import pickle
14 | import uuid
15 | import time
16 | import json
17 |
18 |
19 | class FDClient:
20 | def __init__(self, server_url, request_timeout=480, compression=True, use_requests_session=False):  # default request_timeout assumed to match the server-side TIMEOUT default (480s)
21 | assert server_url.startswith("http://") or server_url.startswith(
22 | "https://"
23 | ), "Server URL must start with http:// or https://"
24 |
25 | self.server_url = server_url
26 | self.local_storage = threading.local()
27 | self.requests_session = requests.Session() if use_requests_session else requests
28 | self.compression = compression if zstandard is not None else False
29 | self.input_type = None
30 | self._set_input_type()
31 |
32 | self.request_timeout = request_timeout
33 |
34 | def _set_input_type(self):
35 | if self.input_type is None:
36 | try:
37 | self.input_type = (
38 | "pickle"
39 | if self.requests_session.get(
40 | f"{self.server_url}/meta", params={"is_pickle_allowed": ""}
41 | ).json()["is_pickle_allowed"]
42 | else "msgpack"
43 | if msgpack is not None
44 | else "json"
45 | )
46 | except Exception as e:
47 | self.input_type = None
48 |
49 | @property
50 | def _compressor(self):
51 | if self.compression is False:
52 | return None
53 |
54 | if (
55 | not hasattr(self.local_storage, "compressor")
56 | or self.local_storage.compressor is None
57 | ):
58 | self.local_storage.compressor = zstandard.ZstdCompressor(level=-1)
59 | return self.local_storage.compressor
60 |
61 | @property
62 | def _decompressor(self):
63 | if self.compression is False:
64 | return None
65 |
66 | if (
67 | not hasattr(self.local_storage, "decompressor")
68 | or self.local_storage.decompressor is None
69 | ):
70 | self.local_storage.decompressor = zstandard.ZstdDecompressor()
71 | return self.local_storage.decompressor
72 |
84 |
85 | def infer(self, data, unique_id=None, is_async=False):
86 | if self.input_type is None:
87 | self._set_input_type()
88 | if self.input_type is None:
89 | raise ValueError("Could not connect to server")
90 |
91 | assert isinstance(data, (list, tuple)), "Data must be of type list or tuple"
92 |
93 | unique_id = str(uuid.uuid4()) if not unique_id else unique_id
94 |
95 | if self.input_type == "pickle":
96 | data = pickle.dumps(data, protocol=pickle.HIGHEST_PROTOCOL)
97 | elif self.input_type == "msgpack":
98 | data = msgpack.packb(data, use_bin_type=True)
99 | else:
100 | data = json.dumps(data)
101 |
102 | response = self.requests_session.post(
103 | f"{self.server_url}/infer",
104 | params={
105 | "unique_id": unique_id,
106 | "async": is_async,
107 | "input_type": self.input_type,
108 | "compressed": True if zstandard is not None else False,
109 | "timeout": self.request_timeout,
110 | },
111 | data=self._compressor.compress(data) if self.compression else data,
112 | headers={"Content-Type": "application/octet-stream"},
113 | timeout=self.request_timeout * 1.1,
114 | )
115 |
116 | if self.input_type == "pickle":
117 | return pickle.loads(
118 | self._decompressor.decompress(response.content)
119 | if self.compression
120 | else response.content
121 | )
122 | elif self.input_type == "msgpack":
123 | return msgpack.unpackb(
124 | self._decompressor.decompress(response.content)
125 | if self.compression
126 | else response.content,
127 | raw=False,
128 | use_list=False,
129 | )
130 | else:
131 | return json.loads(
132 | self._decompressor.decompress(response.content)
133 | if self.compression
134 | else response.content
135 | )
136 |
137 | def infer_async(self, data, unique_id=None):
138 | return self.infer(data, unique_id, is_async=True)
139 |
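    def infer_background(self, data, unique_id=None):
        # Referenced by the README and the __main__ test below but missing here;
        # a minimal sketch (assumed implementation): run self.infer in a
        # background thread and return a concurrent.futures.Future.
        from concurrent.futures import ThreadPoolExecutor

        if not hasattr(self, "_background_executor"):
            self._background_executor = ThreadPoolExecutor(max_workers=4)
        return self._background_executor.submit(self.infer, data, unique_id)

    def infer_background_multiple(self, list_of_data, unique_ids=None):
        # Convenience wrapper used by the __main__ test: submit several
        # background inferences and return the list of futures.
        if unique_ids is None:
            unique_ids = [None] * len(list_of_data)
        return [
            self.infer_background(data, uid)
            for data, uid in zip(list_of_data, unique_ids)
        ]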
140 |
141 | if __name__ == "__main__":
142 | client = FDClient("http://localhost:8080")
143 |
144 | print(client.input_type)
145 |
146 | s = time.time()
147 | print("infer", client.infer(["this", "is", "some", b"data"]), time.time() - s)
148 |
149 | s = time.time()
150 | x = client.infer_background(["this", "is", b"some", "data"])
151 | print("infer_background", x.result(), time.time() - s)
152 |
153 | s = time.time()
154 |
155 | print(
156 | "infer_background_multiple 40",
157 | [
158 | _.result()["success"]
159 | for _ in client.infer_background_multiple(
160 | [["this", b"is", "some", "data"]] * 40
161 | )
162 | ],
163 | time.time() - s,
164 | )
165 |
--------------------------------------------------------------------------------
/clients/python/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | # Note: To use the 'upload' functionality of this file, you must:
5 | # $ pip install twine
6 |
7 | import io
8 | import os
9 | import sys
10 | from shutil import rmtree
11 |
12 | from setuptools import find_packages, setup, Command
13 |
14 | # Package meta-data.
15 | NAME = "fdclient"
16 | DESCRIPTION = "fastDeploy python client"
17 | URL = "https://github.com/notAI-tech/fastDeploy"
18 | EMAIL = "praneeth@bpraneeth.com"
19 | AUTHOR = "BEDAPUDI PRANEETH"
20 | REQUIRES_PYTHON = ">=3.6.0"
21 | VERSION = "3.1.1"
22 |
23 | # What packages are required for this module to be executed?
24 | REQUIRED = ["zstandard", "requests", "msgpack"]
25 |
26 | # What packages are optional?
27 | EXTRAS = {
28 | # 'fancy feature': ['django'],
29 | }
30 |
31 | # The rest you shouldn't have to touch too much :)
32 | # ------------------------------------------------
33 | # Except, perhaps the License and Trove Classifiers!
34 | # If you do change the License, remember to change the Trove Classifier for that!
35 |
36 | here = os.path.abspath(os.path.dirname(__file__))
37 |
38 | # Import the README and use it as the long-description.
39 | # Note: this will only work if 'README.md' is present in your MANIFEST.in file!
40 | try:
41 | with io.open(os.path.join(here, "README.md"), encoding="utf-8") as f:
42 | long_description = "\n" + f.read()
43 | except FileNotFoundError:
44 | long_description = DESCRIPTION
45 |
46 | # Load the package's __version__.py module as a dictionary.
47 | about = {}
48 | if not VERSION:
49 | with open(os.path.join(here, NAME, "__version__.py")) as f:
50 | exec(f.read(), about)
51 | else:
52 | about["__version__"] = VERSION
53 |
54 |
55 | class UploadCommand(Command):
56 | """Support setup.py upload."""
57 |
58 | description = "Build and publish the package."
59 | user_options = []
60 |
61 | @staticmethod
62 | def status(s):
63 | """Prints things in bold."""
64 | print("\033[1m{0}\033[0m".format(s))
65 |
66 | def initialize_options(self):
67 | pass
68 |
69 | def finalize_options(self):
70 | pass
71 |
72 | def run(self):
73 | try:
74 | self.status("Removing previous builds…")
75 | rmtree(os.path.join(here, "dist"))
76 | except OSError:
77 | pass
78 |
79 | self.status("Building Source and Wheel (universal) distribution…")
80 | os.system("{0} setup.py sdist bdist_wheel --universal".format(sys.executable))
81 |
82 | self.status("Uploading the package to PyPI via Twine…")
83 | os.system("twine upload dist/*")
84 |
85 | self.status("Pushing git tags…")
86 | os.system("git tag v{0}".format(about["__version__"]))
87 | os.system("git push --tags")
88 |
89 | sys.exit()
90 |
91 |
92 | # Where the magic happens:
93 | setup(
94 | name=NAME,
95 | version=about["__version__"],
96 | description=DESCRIPTION,
97 | long_description=long_description,
98 | long_description_content_type="text/markdown",
99 | author=AUTHOR,
100 | author_email=EMAIL,
101 | python_requires=REQUIRES_PYTHON,
102 | url=URL,
103 | packages=find_packages(exclude=("tests",)),
104 | # If your package is a single module, use this instead of 'packages':
105 | # py_modules=['mypackage'],
106 | install_requires=REQUIRED,
107 | extras_require=EXTRAS,
108 | include_package_data=True,
109 | license="MIT",
110 | classifiers=[
111 | # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers
112 | "License :: OSI Approved :: MIT License",
113 | "Programming Language :: Python",
114 | "Programming Language :: Python :: 3",
115 | "Programming Language :: Python :: 3.6",
116 | "Programming Language :: Python :: Implementation :: CPython",
117 | ],
118 | # $ setup.py publish support.
119 | cmdclass={
120 | "upload": UploadCommand,
121 | },
122 | )
123 |
--------------------------------------------------------------------------------
/fastdeploy/.gitignore:
--------------------------------------------------------------------------------
1 | fastdeploy-ui
2 |
--------------------------------------------------------------------------------
/fastdeploy/__init__.py:
--------------------------------------------------------------------------------
1 | from . import __main__
2 |
--------------------------------------------------------------------------------
/fastdeploy/__main__.py:
--------------------------------------------------------------------------------
1 | import resource
2 |
3 | try:
4 | resource.setrlimit(resource.RLIMIT_NOFILE, (131072, 131072))
5 | except:
6 | pass
7 |
8 | import os
9 | import sys
10 | import glob
11 | import argparse
12 | import subprocess
13 |
14 | parser = argparse.ArgumentParser(
15 | description="CLI for fastDeploy", formatter_class=argparse.RawTextHelpFormatter
16 | )
17 | parser.add_argument(
18 | "--recipe",
19 | type=str,
20 | help="Path to recipe folder that contains predictor.py",
21 | required=False,
22 | )
23 |
24 | parser.add_argument(
25 | "--loop",
26 | help=f"""Start prediction loop""",
27 | required=False,
28 | action="store_true",
29 | )
30 |
31 | parser.add_argument(
32 | "--rest",
33 | help="""Start REST server""",
34 | required=False,
35 | action="store_true",
36 | )
37 |
38 | parser.add_argument(
39 | "--build",
40 | help="""Build docker image""",
41 | required=False,
42 | action="store_true",
43 | )
44 |
45 | parser.add_argument(
46 | "--config",
47 | type=str,
48 | help="""
49 | example usage: --config "workers=3,timeout=480,allow_pickle=true"
50 |
51 | REST
52 | max_request_batch_size: integer max number of inputs in a batch, default=0 (None)
53 | workers: integer number of workers, default=3
54 | timeout: seconds after which request will fail, default=480
55 | host: host for the REST server, default=0.0.0.0
56 | port: port for the REST server, default=8080
57 | allow_pickle: true/false, default=true
58 | keep_alive: gunicorn gevent keep alive, default=60
59 |
60 |
61 | LOOP
62 | predictor_name: predictor.py or predictor_N.py, name of the predictor run in the loop, default: predictor.py
63 | optimal_batch_size: integer max batch size for the predictor, default=0 (auto)
64 |
65 | DOCKER
66 | base: base image for docker, default=python:3.8-slim
67 | """,
68 | required=False,
69 | default="max_request_batch_size=0,workers=3,timeout=480,host=0.0.0.0,port=8080,allow_pickle=true,predictor_name=predictor.py,optimal_batch_size=0,keep_alive=60,base=python:3.8-slim",
70 | )
71 |
72 | args = parser.parse_args()
73 |
74 | CONFIG = {
75 | # rest config
76 | "max_request_batch_size": int(os.getenv("MAX_REQUEST_BATCH_SIZE", "0")),
77 | "workers": int(os.getenv("WORKERS", "3")),
78 | "timeout": int(os.getenv("TIMEOUT", "480")),
79 | "host": os.getenv("HOST", "0.0.0.0"),
80 | "port": int(os.getenv("PORT", "8080")),
81 | "allow_pickle": os.getenv("ALLOW_PICKLE", "true").lower() == "true",
82 | # predictor config
83 | "predictor_name": os.getenv("PREDICTOR_NAME", "predictor.py"),
84 | "optimal_batch_size": int(os.getenv("OPTIMAL_BATCH_SIZE", "0")),
85 | "keep_alive": int(os.getenv("KEEP_ALIVE", "60")),
86 | # building docker config
87 | "base": os.getenv("BASE", "python:3.8-slim"),
88 | }
89 |
90 | if args.config:
91 | for config in args.config.split(","):
92 | try:
93 | k, v = config.strip().split("=")
94 | except:
95 | continue
96 |
97 | if os.getenv(k.upper()) is not None:
98 | continue
99 |
100 | try:
101 | CONFIG[k.strip()] = int(v.strip())
102 | except:
103 | CONFIG[k.strip()] = v.strip()
104 |
105 | for k, v in CONFIG.items():
106 | os.environ[k.upper()] = str(v)
107 |
108 | sys.path.append(os.path.abspath(args.recipe))
109 | os.chdir(os.path.abspath(args.recipe))
110 |
111 | try:
112 | if not os.path.exists(os.path.join(args.recipe, ".gitignore")):
113 | _gitignore_f = open(os.path.join(args.recipe, ".gitignore"), "a")
114 | _gitignore_f.write("\nfastdeploy_dbs\nfastdeploy_dbs/*\n")
115 | _gitignore_f.flush()
116 | _gitignore_f.close()
117 | except:
118 | pass
119 |
120 | try:
121 | if not os.path.exists(os.path.join(args.recipe, ".dockerignore")):
122 | _dockerignore_f = open(os.path.join(args.recipe, ".dockerignore"), "w")
123 | _dockerignore_f.write("\nfastdeploy_dbs\nfastdeploy_dbs/*\n")
124 | _dockerignore_f.flush()
125 | _dockerignore_f.close()
126 | except:
127 | pass
128 |
129 |
130 | def loop():
131 | from ._loop import start_loop
132 |
133 | start_loop()
134 |
135 |
136 | def rest():
137 | from ._rest import app
138 | import gunicorn.app.base
139 |
140 | class StandaloneApplication(gunicorn.app.base.BaseApplication):
141 | def __init__(self, app, options=None):
142 | self.options = options or {}
143 | self.application = app
144 | super().__init__()
145 |
146 | def load_config(self):
147 | config = {
148 | key: value
149 | for key, value in self.options.items()
150 | if key in self.cfg.settings and value is not None
151 | }
152 | for key, value in config.items():
153 | self.cfg.set(key.lower(), value)
154 |
155 | def load(self):
156 | return self.application
157 |
158 | options = {
159 | "preload": "",
160 | "bind": "%s:%s" % (CONFIG["host"], CONFIG["port"]),
161 | "workers": CONFIG["workers"],
162 | "worker_connections": 1000,
163 | "worker_class": "gevent",
164 | "timeout": CONFIG["timeout"],
165 | "allow_redirects": True,
166 | "keepalive": CONFIG["keep_alive"],
167 | "keep_alive": CONFIG["keep_alive"],
168 | }
169 |
170 | print(
171 | f"fastDeploy REST interface active at http://{CONFIG['host']}:{CONFIG['port']}"
172 | )
173 |
174 | StandaloneApplication(app, options).run()
175 |
176 |
177 | def build_docker_image():
178 | if not os.path.exists("requirements.txt"):
179 | raise Exception("requirements.txt not found")
180 |
181 | f = open("fastDeploy.auto_dockerfile", "w")
182 | f.write(
183 | f"""FROM {CONFIG['base']}
184 | RUN python3 -m pip install --upgrade --no-cache-dir pip fastdeploy
185 |
186 | ENV {' '.join([f"{k.upper()}={v}" for k, v in CONFIG.items()])}
187 |
188 | ADD . /recipe
189 | WORKDIR /recipe
190 | {'' if not os.path.exists("extras.sh") else 'RUN chmod +x /recipe/extras.sh && /recipe/extras.sh'}
191 | RUN python3 -m pip install --no-cache-dir -r /recipe/requirements.txt
192 | RUN cd /recipe && python3 -c "from predictor import predictor; from example import example; predictor(example)"
193 |
194 | ENTRYPOINT ["/bin/sh", "-c"]
195 |
196 | CMD ["ulimit -n 1000000 && python3 -m fastdeploy --recipe /recipe --rest & python3 -m fastdeploy --recipe /recipe --loop"]
197 | """
198 | )
199 | f.flush()
200 | f.close()
201 |
202 | print(f"Dockerfile generated at {os.path.abspath('fastDeploy.auto_dockerfile')}")
203 |
204 | print(
205 | f"Run `docker build -f fastDeploy.auto_dockerfile -t {os.path.abspath('.')}` to build the image"
206 | )
207 | exit()
208 |
209 |
210 | if args.loop:
211 | loop()
212 |
213 | elif args.rest:
214 | rest()
215 |
216 | elif args.build:
217 | build_docker_image()
218 |
--------------------------------------------------------------------------------
/fastdeploy/_infer.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 | import json
4 | import pickle
5 |
6 | import msgpack
7 | import zstandard
8 |
9 | import threading
10 |
11 | from . import _utils
12 |
13 | started_at_time = time.time()
14 |
15 | # make sure all predictors are running before starting the inference server
16 | # if any are not yet started/ still loading then wait for them to start
17 | for predictor_file, predictor_sequence in _utils.PREDICTOR_FILE_TO_SEQUENCE.items():
18 | log_printed = False
19 | while True:
20 | try:
21 | time_per_example = _utils.META_INDEX.get(
22 | f"{predictor_sequence}", select_keys=["time_per_example"]
23 | )[f"{predictor_sequence}"]["time_per_example"]
24 | started_at_time = time.time()
25 | break
26 | except:
27 | if not log_printed:
28 | _utils.logger.info(f"Waiting for {predictor_file} to start")
29 | log_printed = True
30 | time.sleep(1)
31 |
32 |
33 | _utils.logger.info(f"pids: {_utils.get_fd_pids()}")
34 |
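# Infer is instantiated by the REST workers: read_inputs()/create_response()
# handle pickle/msgpack/json (de)serialization plus optional zstd
# (de)compression, add_to_infer_queue() writes a request into MAIN_INDEX under
# predictor sequence -1, and get_responses_for_unique_ids() collects finished,
# failed or timed-out results for the response checker loop in _rest.py.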
35 | class Infer:
36 | started_at_time = started_at_time
37 |
38 | def __init__(
39 | self,
40 | allow_pickle=os.getenv("ALLOW_PICKLE", "true").lower() == "true",
41 | ):
42 | self.local_storage = threading.local()
43 | self.allow_pickle = allow_pickle
44 |
45 | @property
46 | def _compressor(self):
47 | if (
48 | not hasattr(self.local_storage, "compressor")
49 | or self.local_storage.compressor is None
50 | ):
51 | self.local_storage.compressor = zstandard.ZstdCompressor(level=-1)
52 | return self.local_storage.compressor
53 |
54 | @property
55 | def _decompressor(self):
56 | if (
57 | not hasattr(self.local_storage, "decompressor")
58 | or self.local_storage.decompressor is None
59 | ):
60 | self.local_storage.decompressor = zstandard.ZstdDecompressor()
61 | return self.local_storage.decompressor
62 |
63 | def read_inputs(self, unique_id, inputs, input_type, is_compressed):
64 | if input_type == "pickle":
65 | if not self.allow_pickle:
66 | _utils.logger.warning(
67 | f"{unique_id}: tried to use pickle input, but pickle is disallowed"
68 | )
69 | raise Exception("pickle input disallowed, use msgpack or json")
70 |
71 | inputs = pickle.loads(
72 | inputs if not is_compressed else self._decompressor.decompress(inputs)
73 | )
74 | _utils.logger.debug(f"pickle input read")
75 |
76 | elif input_type == "msgpack":
77 | inputs = msgpack.unpackb(
78 | inputs if not is_compressed else self._decompressor.decompress(inputs),
79 | use_list=False,
80 | raw=False,
81 | )
82 |
83 | _utils.logger.debug(f"{unique_id}: msgpack input read")
84 |
85 | elif input_type == "json":
86 | inputs = json.loads(
87 | inputs if not is_compressed else self._decompressor.decompress(inputs)
88 | )
89 |
90 | # for backward compatibility
91 | try:
92 | inputs = inputs["data"]
93 | except:
94 | pass
95 |
96 | _utils.logger.debug(f"{unique_id}: json input read")
97 |
98 | else:
99 | _utils.logger.warning(f"{unique_id}: input_type {input_type} not supported")
100 | raise Exception(f"input_type {input_type} not supported")
101 |
102 | return inputs
103 |
104 | def create_response(self, unique_id, response, is_compressed, input_type):
105 | success = response["success"]
106 | if input_type == "pickle":
107 | response = pickle.dumps(response)
108 | elif input_type == "msgpack":
109 | response = msgpack.packb(response, use_bin_type=True)
110 | elif input_type == "json":
111 | pass
112 |
113 | if is_compressed:
114 | response = self._compressor.compress(response)
115 | _utils.logger.debug(f"{unique_id}: response compressed")
116 |
117 | return success, response
118 |
119 | def get_timeout_response(
120 | self, unique_id, is_compressed, input_type, is_client_timeout=False
121 | ):
122 | if is_client_timeout:
123 | _utils.MAIN_INDEX.update(
124 | {
125 | unique_id: {
126 | "-1.predicted_at": time.time(),
127 | "timedout_in_queue": True,
128 | }
129 | }
130 | )
131 | _utils.logger.warning(f"{unique_id}: client timeout")
132 |
133 | return self.create_response(
134 | unique_id,
135 | {
136 | "success": False,
137 | "reason": "timeout" if not is_client_timeout else "client_timeout",
138 | "unique_id": unique_id,
139 | "prediction": None,
140 | },
141 | is_compressed,
142 | input_type,
143 | )
144 |
145 | def add_to_infer_queue(
146 | self, inputs: bytes, unique_id: str, input_type: str, is_compressed: bool
147 | ):
148 | try:
149 | request_received_at = time.time()
150 | _utils.logger.debug(f"{unique_id}: reading inputs")
151 |
152 | inputs = self.read_inputs(unique_id, inputs, input_type, is_compressed)
153 |
154 | if inputs is None:
155 | _utils.logger.warning(f"{unique_id}: inputs are None")
156 | return self.create_response(
157 | unique_id,
158 | {
159 | "success": False,
160 | "reason": f"inputs have to be {'pickle,' if self.allow_pickle else ''} msgpack or json",
161 | "unique_id": unique_id,
162 | "prediction": None,
163 | },
164 | is_compressed,
165 | input_type,
166 | )
167 |
168 | if not isinstance(inputs, (list, tuple)):
169 | _utils.logger.warning(f"{unique_id}: inputs have to be a list or tuple")
170 | return self.create_response(
171 | unique_id,
172 | {
173 | "success": False,
174 | "reason": "inputs have to be a list or tuple",
175 | "unique_id": unique_id,
176 | "prediction": None,
177 | },
178 | is_compressed,
179 | input_type,
180 | )
181 |
182 | if not inputs:
183 | _utils.logger.debug(f"{unique_id}: empty inputs")
184 | return self.create_response(
185 | unique_id,
186 | {
187 | "success": True,
188 | "reason": "empty inputs",
189 | "unique_id": unique_id,
190 | "prediction": [],
191 | },
192 | is_compressed,
193 | input_type,
194 | )
195 |
196 | else:
197 | # -1 is the predictor sequence for the rest server, basically where the request originates
198 | _utils.MAIN_INDEX.update(
199 | {
200 | unique_id: {
201 | "-1.outputs": inputs,
202 | "-1.received_at": request_received_at,
203 | "-1.predicted_in_batch_of": len(inputs),
204 | "-1.predicted_at": 0,
205 | "last_predictor_sequence": -1,
206 | "last_predictor_success": True,
207 | "timedout_in_queue": None,
208 | }
209 | }
210 | )
211 |
212 | _utils.logger.debug(f"{unique_id}: added to request queue")
213 |
214 | return True, None
215 | except Exception as ex:
216 | _utils.logger.exception(ex, exc_info=True)
217 | return self.create_response(
218 | unique_id,
219 | {
220 | "success": False,
221 | "reason": str(ex),
222 | "unique_id": unique_id,
223 | "prediction": None,
224 | },
225 | is_compressed,
226 | input_type,
227 | )
228 |
229 | def get_responses_for_unique_ids(self, unique_ids, is_compresseds, input_types):
230 | all_current_results = _utils.MAIN_INDEX.get(
231 | unique_ids,
232 | select_keys=[
233 | f"{_utils.LAST_PREDICTOR_SEQUENCE}.outputs",
234 | "last_predictor_success",
235 | "last_predictor_sequence",
236 | "timedout_in_queue",
237 | ],
238 | )
239 |
240 | all_responses = {}
241 |
242 | updations = {}
243 | still_processing = []
244 |
245 | for unique_id, is_compressed, input_type in zip(
246 | unique_ids, is_compresseds, input_types
247 | ):
248 | current_results = all_current_results[unique_id]
249 |
250 | if current_results["timedout_in_queue"]:
251 | _utils.logger.warning(f"{unique_id}: timedout in queue")
252 | updations[unique_id] = {
253 | "-1.predicted_at": time.time(),
254 | }
255 | all_responses[unique_id] = self.get_timeout_response(
256 | unique_id, is_compressed, input_type
257 | )
258 | _utils.logger.debug(f"{unique_id}: timedout in queue response created")
259 |
260 | elif (
261 | current_results["last_predictor_success"] is True
262 | and current_results["last_predictor_sequence"]
263 | == _utils.LAST_PREDICTOR_SEQUENCE
264 | ):
265 | updations[unique_id] = {
266 | "-1.predicted_at": time.time(),
267 | }
268 |
269 | all_responses[unique_id] = self.create_response(
270 | unique_id,
271 | {
272 | "success": True,
273 | "unique_id": unique_id,
274 | "prediction": current_results[
275 | f"{_utils.LAST_PREDICTOR_SEQUENCE}.outputs"
276 | ],
277 | "reason": None,
278 | },
279 | is_compressed,
280 | input_type,
281 | )
282 | _utils.logger.debug(f"{unique_id}: response created")
283 | elif current_results["last_predictor_success"] is False:
284 | _utils.logger.warning(
285 | f"{unique_id}: predictor failed at {current_results['last_predictor_sequence']}"
286 | )
287 | updations[unique_id] = {
288 | "-1.predicted_at": time.time(),
289 | }
290 | all_responses[unique_id] = self.create_response(
291 | unique_id,
292 | {
293 | "success": False,
294 | "reason": f"prediction failed predictor {current_results['last_predictor_sequence']}",
295 | "unique_id": unique_id,
296 | "prediction": None,
297 | },
298 | is_compressed,
299 | input_type,
300 | )
301 | _utils.logger.debug(f"{unique_id}: failed response created")
302 |
303 | else:
304 | still_processing.append(unique_id)
305 |
306 | if updations:
307 | _utils.MAIN_INDEX.update(updations)
308 |
309 | if still_processing:
310 | _utils.logger.debug(f"Still processing: {still_processing}")
311 |
312 | return all_responses
313 |
--------------------------------------------------------------------------------
/fastdeploy/_loop.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 | import importlib
4 |
5 | from . import _utils
6 |
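# Overview: start_loop() loads the recipe's predictor, warms it up to estimate
# optimal_batch_size and time_per_example, then repeatedly pulls pending inputs
# for its predictor_sequence from MAIN_INDEX, runs them as a single batch and
# writes the outputs back, so the REST layer (or the next chained predictor)
# can pick them up.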
7 |
8 | def load_predictor(predictor_name):
9 | predictor = importlib.import_module(os.path.splitext(predictor_name)[0]).predictor
10 | predictor_sequence = _utils.PREDICTOR_FILE_TO_SEQUENCE[predictor_name]
11 | _utils.logger.debug(
12 | f"{predictor_name}: predictor loaded with predictor_sequence {predictor_sequence}"
13 | )
14 | return predictor, predictor_sequence
15 |
16 |
17 | def get_example(predictor_sequence):
18 | if predictor_sequence == 0:
19 | return _utils.example
20 |
21 | while True:
22 | _utils.logger.debug(f"Waiting for previous predictor to finish warmup")
23 | try:
24 | example = _utils.META_INDEX.get(
25 | f"{predictor_sequence - 1}", select_keys=["example_output"]
26 | )[f"{predictor_sequence - 1}"]["example_output"]
27 | if example is not None:
28 | return example
29 | except:
30 | time.sleep(1)
31 |
32 |
33 | def initialize_predictor(
34 | predictor,
35 | predictor_name,
36 | predictor_sequence,
37 | example,
38 | optimal_batch_size,
39 | ):
40 | example_output = _utils.warmup(predictor, example)
41 | _utils.logger.info(f"{predictor_name}: warmup done")
42 |
43 | optimal_batch_size, time_per_example = _utils.calculate_optimum_batch_sizes(
44 | predictor, predictor_sequence, example, optimal_batch_size
45 | )
46 |
47 | return {
48 | "optimal_batch_size": optimal_batch_size,
49 | "time_per_example": time_per_example,
50 | "predictor_name": predictor_name,
51 | "predictor_sequence": predictor_sequence,
52 | "request_poll_time": 0.01,
53 | "example_output": example_output,
54 | "status": "running",
55 | }
56 |
57 |
58 | def process_batch(predictor, input_batch, optimal_batch_size):
59 | last_predictor_success = False
60 | received_at = time.time()
61 | try:
62 | results = predictor(input_batch, batch_size=optimal_batch_size)
63 | last_predictor_success = True
64 | except Exception as ex:
65 | _utils.logger.exception(ex, exc_info=True)
66 | results = [None] * len(input_batch)
67 |
68 | predicted_at = time.time()
69 |
70 | if len(results) != len(input_batch):
71 | raise Exception(
72 | f"Predictor returned {len(results)} results for {len(input_batch)} inputs"
73 | )
74 |
75 | return results, last_predictor_success, received_at, predicted_at
76 |
77 |
78 | to_process = {}
79 | current_sum_of_to_process = 0
80 |
81 |
82 | def fetch_batch(
83 | main_index,
84 | predictor_sequence,
85 | optimal_batch_size,
86 | max_wait_time_for_batch_collection,
87 | ):
88 | global to_process
89 | global current_sum_of_to_process
90 |
91 | unique_id_wise_input_count = {}
92 | input_batch = []
93 | current_batch_length = 0
94 | batch_collection_started_at = time.time()
95 | last_input_received_at = time.time()
96 |
97 | while current_batch_length < optimal_batch_size:
98 | if current_sum_of_to_process < optimal_batch_size:
99 | to_process.update(
100 | main_index.search(
101 | query={
102 | "-1.predicted_at": 0, # prediction not yet done
103 | "last_predictor_success": True, # last predictor success
104 | "last_predictor_sequence": predictor_sequence
105 | - 1, # last predictor sequence
106 | "timedout_in_queue": {"$ne": True}, # not timedout in queue
107 | },
108 | n=optimal_batch_size,
109 | select_keys=[f"{predictor_sequence - 1}.outputs"],
110 | update={
111 | "last_predictor_sequence": predictor_sequence, # set last predictor sequence to current predictor sequence
112 | "last_predictor_success": None, # reset last predictor success
113 | f"{predictor_sequence}.received_at": time.time(), # set received at to current time
114 | },
115 | )
116 | )
117 |
118 | for unique_id, data in to_process.items():
119 | if current_batch_length > optimal_batch_size * 0.8:
120 | break
121 | outputs = data[f"{predictor_sequence - 1}.outputs"]
122 | input_count = len(outputs)
123 | unique_id_wise_input_count[unique_id] = input_count
124 | input_batch.extend(outputs)
125 | current_batch_length += input_count
126 | last_input_received_at = time.time()
127 |
128 | for unique_id in unique_id_wise_input_count.keys():
129 | try:
130 | del to_process[unique_id]
131 | except:
132 | pass
133 |
134 | current_sum_of_to_process = sum(
135 | len(v[f"{predictor_sequence - 1}.outputs"]) for v in to_process.values()
136 | )
137 |
138 | if current_batch_length == 0:
139 | if time.time() - last_input_received_at > 5:
140 | time.sleep(0.05)
141 | else:
142 | time.sleep(max_wait_time_for_batch_collection / 2)
143 | continue
144 |
145 | elif (
146 | time.time() - batch_collection_started_at
147 | < max_wait_time_for_batch_collection
148 | and current_batch_length / optimal_batch_size < 0.9
149 | ):
150 | time.sleep(max_wait_time_for_batch_collection / 2)
151 | continue
152 |
153 | else:
154 | # finished collecting batch
155 | break
156 |
157 | _utils.logger.info(
158 | f"Fetched batch {unique_id_wise_input_count} with {current_sum_of_to_process} remaining in memory, to_process: {len(to_process)}"
159 | )
160 |
161 | return unique_id_wise_input_count, input_batch
162 |
163 |
164 | def prepare_results(
165 | unique_id_wise_input_count,
166 | results,
167 | predictor_sequence,
168 | last_predictor_success,
169 | received_at,
170 | predicted_at,
171 | current_batch_length,
172 | ):
173 | """Prepare results for updating the main index."""
174 | unique_id_wise_results = {}
175 | total_input_count_till_now = 0
176 |
177 | for unique_id, input_count in unique_id_wise_input_count.items():
178 | unique_id_wise_results[unique_id] = {
179 | f"{predictor_sequence}.outputs": results[
180 | total_input_count_till_now : total_input_count_till_now + input_count
181 | ],
182 | f"{predictor_sequence}.predicted_at": predicted_at,
183 | "last_predictor_success": last_predictor_success,
184 | f"{predictor_sequence}.received_at": received_at,
185 | f"{predictor_sequence}.predicted_in_batch_of": current_batch_length,
186 | }
187 | total_input_count_till_now += input_count
188 |
189 | return unique_id_wise_results
190 |
191 |
192 | def start_loop(
193 | predictor_name=os.getenv("PREDICTOR_NAME"),
194 | optimal_batch_size=int(os.getenv("OPTIMAL_BATCH_SIZE")),
195 | ):
196 | """Main loop for processing predictions."""
197 | timeout_time = float(os.getenv("TIMEOUT", 0))
198 | predictor, predictor_sequence = load_predictor(predictor_name)
199 | example = get_example(predictor_sequence)
200 | predictor_info = initialize_predictor(
201 | predictor, predictor_name, predictor_sequence, example, optimal_batch_size
202 | )
203 | _utils.META_INDEX.update({f"{predictor_sequence}": predictor_info})
204 |
205 | optimal_batch_size = predictor_info["optimal_batch_size"]
206 | time_per_example = predictor_info["time_per_example"]
207 | max_wait_time_for_batch_collection = max(0.003, time_per_example * 0.51)
208 |
209 | _utils.logger.info(
210 | f"""{predictor_name}
211 | optimal_batch_size: {optimal_batch_size}
212 | time_per_example: {time_per_example}
213 | predictor_sequence: {predictor_sequence}
214 | max_wait_time_for_batch_collection: {max_wait_time_for_batch_collection}
215 | """
216 | )
217 |
218 | prediction_loop_started_at = time.time()
219 |
220 | while True:
221 | """
222 | Set timedout_in_queue to True for all the predictions that have been in the queue for more than timeout_time seconds
223 | and delete older than 30 seconds predictions that have finished prediction
224 | """
225 |
226 | timedout_in_queue_unique_ids = _utils.MAIN_INDEX.search(
227 | query={
228 | "-1.predicted_at": 0,
229 | "-1.received_at": {"$lt": time.time() - timeout_time},
230 | "timedout_in_queue": {"$ne": True},
231 | "last_predictor_sequence": {"$ne": _utils.LAST_PREDICTOR_SEQUENCE},
232 | },
233 | update={"timedout_in_queue": True},
234 | select_keys=[],
235 | )
236 |
237 | if timedout_in_queue_unique_ids:
238 | _utils.logger.warning(
239 | f"{_utils.MAIN_INDEX.count()} in queue, set timedout_in_queue to True for {list(timedout_in_queue_unique_ids)} unique_ids"
240 | )
241 |
242 | _utils.MAIN_INDEX.delete(
243 | query={
244 | "$and": [
245 | {"-1.predicted_at": {"$gt": 0}},
246 | {"-1.predicted_at": {"$lt": time.time() - 40}},
247 | ]
248 | },
249 | )
250 |
251 | unique_id_wise_input_count, input_batch = fetch_batch(
252 | _utils.MAIN_INDEX,
253 | predictor_sequence,
254 | optimal_batch_size,
255 | max_wait_time_for_batch_collection,
256 | )
257 |
258 | _utils.logger.debug(f"Processing batch {unique_id_wise_input_count}")
259 |
260 | process_batch_started_at = time.time()
261 | results, last_predictor_success, received_at, predicted_at = process_batch(
262 | predictor, input_batch, optimal_batch_size
263 | )
264 | process_batch_ended_at = time.time()
265 |
266 | unique_id_wise_results = prepare_results(
267 | unique_id_wise_input_count,
268 | results,
269 | predictor_sequence,
270 | last_predictor_success,
271 | received_at,
272 | predicted_at,
273 | len(input_batch),
274 | )
275 | _utils.MAIN_INDEX.update(unique_id_wise_results)
276 |
277 | _utils.logger.debug(
278 | f"Updated results predictor {predictor_sequence}: {list(unique_id_wise_results)}"
279 | )
280 |
281 | _utils.GLOBAL_METRICS_INDEX.math(
282 | "total_predictor_run_for_hours",
283 | (process_batch_ended_at - process_batch_started_at) / 3600,
284 | "+=",
285 | )
286 |
287 | _utils.GLOBAL_METRICS_INDEX["total_predictor_up_for_hours"] = (
288 | time.time() - prediction_loop_started_at
289 | ) / 3600
290 |
291 |
292 | if __name__ == "__main__":
293 | import sys
294 |
295 | start_loop(sys.argv[1])
296 |
--------------------------------------------------------------------------------
/fastdeploy/_rest.py:
--------------------------------------------------------------------------------
1 | from gevent import monkey
2 |
3 | monkey.patch_all()
4 |
5 | import os
6 | import json
7 | import time
8 | import uuid
9 | import pickle
10 | import falcon
11 | import gevent
12 | import threading
13 | import importlib
14 |
15 | from . import _utils
16 | from . import _infer
17 |
18 | try:
19 | get_prometheus_metrics = importlib.import_module(
20 | "extra_prometheus_metrics"
21 | ).get_prometheus_metrics
22 | except ImportError:
23 | get_prometheus_metrics = None
24 |
25 |
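# AsyncResponseHandler bridges the gevent request handlers and the shared index:
# each incoming request registers a gevent Event; a single background greenlet
# polls for finished predictions in batches via Infer.get_responses_for_unique_ids
# and sets the matching events, so individual handlers never poll the index.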
26 | class AsyncResponseHandler:
27 | def __init__(self, check_interval=0.003):
28 | self.pending_requests = {}
29 | self.check_interval = check_interval
30 | self.lock = threading.Lock()
31 | self.infer = _infer.Infer()
32 |
33 | gevent.spawn(self._response_checker)
34 |
35 | def register_request_and_wait_for_response(
36 | self, unique_id, is_compressed, input_type, timeout
37 | ):
38 | event = gevent.event.Event()
39 |
40 | with self.lock:
41 | self.pending_requests[unique_id] = {
42 | "event": event,
43 | "is_compressed": is_compressed,
44 | "input_type": input_type,
45 | "timestamp": time.time(),
46 | }
47 |
48 | try:
49 | if event.wait(timeout=timeout):
50 | with self.lock:
51 | response = self.pending_requests[unique_id].get("response")
52 | return response
53 | else:
54 | return self.infer.get_timeout_response(
55 | unique_id, is_compressed, input_type, is_client_timeout=True
56 | )
57 | except Exception as e:
58 | _utils.logger.exception(e, exc_info=True)
59 | _utils.logger.error(f"Error registering request and waiting for response: {e}")
60 | return self.infer.get_timeout_response(
61 | unique_id, is_compressed, input_type, is_client_timeout=True
62 | )
63 | finally:
64 | with self.lock:
65 | self.pending_requests.pop(unique_id, None)
66 |
67 | def deregister_request(self, unique_id):
68 | with self.lock:
69 | self.pending_requests.pop(unique_id, None)
70 |
71 | def _response_checker(self):
72 | last_input_received_at = time.time()
73 | while True:
74 | try:
75 | unique_ids = []
76 | is_compresseds = []
77 | input_types = []
78 | with self.lock:
79 | for uid, data in self.pending_requests.items():
80 | unique_ids.append(uid)
81 | is_compresseds.append(data["is_compressed"])
82 | input_types.append(data["input_type"])
83 | last_input_received_at = data["timestamp"]
84 |
85 | if not unique_ids and (time.time() - last_input_received_at) > 5:
86 | time.sleep(0.05)
87 | continue
88 |
89 | if unique_ids:
90 | _utils.logger.debug(
91 | f"Checking responses for unique_ids: {unique_ids}"
92 | )
93 | try:
94 | responses = self.infer.get_responses_for_unique_ids(
95 | unique_ids=unique_ids,
96 | is_compresseds=is_compresseds,
97 | input_types=input_types,
98 | )
99 |
100 | for uid, response in responses.items():
101 | if response is not None:
102 | with self.lock:
103 | if uid in self.pending_requests:
104 | request_data = self.pending_requests[uid]
105 | request_data["response"] = response
106 | request_data["event"].set()
107 |
108 | except Exception as e:
109 | _utils.logger.exception(e, exc_info=True)
110 | _utils.logger.error(f"Error checking responses: {e}")
111 |
112 | except Exception as e:
113 | _utils.logger.error(f"Error in response checker loop: {e}")
114 |
115 | finally:
116 | gevent.sleep(self.check_interval)
117 |
118 |
119 | class Infer(object):
120 | def __init__(self):
121 | self._infer = _infer.Infer()
122 | self._response_handler = AsyncResponseHandler()
123 |
124 | def on_post(self, req, resp):
125 | request_received_at = time.time()
126 |
127 | unique_id = str(req.params.get("unique_id", uuid.uuid4()))
128 | client_timeout = float(req.params.get("timeout", os.getenv("TIMEOUT", 480)))
129 |
130 | is_compressed = req.params.get("compressed", "f")[0].lower() == "t"
131 | input_type = req.params.get("input_type", "json")
132 |
133 | success, failure_response = self._infer.add_to_infer_queue(
134 | inputs=req.stream.read(),
135 | unique_id=unique_id,
136 | input_type=input_type,
137 | is_compressed=is_compressed,
138 | )
139 |
140 | if is_compressed:
141 | resp.content_type = "application/octet-stream"
142 | elif input_type == "json":
143 | resp.content_type = "application/json"
144 | elif input_type == "pickle":
145 | resp.content_type = "application/pickle"
146 | elif input_type == "msgpack":
147 | resp.content_type = "application/msgpack"
148 |
149 | if success is not True:
150 | resp.status = falcon.HTTP_400
151 | if input_type == "json":
152 | resp.media = failure_response
153 | else:
154 | resp.data = failure_response
155 |
156 | else:
157 | (
158 | success,
159 | response,
160 | ) = self._response_handler.register_request_and_wait_for_response(
161 | unique_id, is_compressed, input_type, client_timeout
162 | )
163 | if success:
164 | resp.status = falcon.HTTP_200
165 | else:
166 | resp.status = falcon.HTTP_500
167 |
168 | if input_type == "json":
169 | resp.media = response
170 | else:
171 | resp.data = response
172 |
173 |
174 | class PrometheusMetrics(object):
175 | def on_get(self, req, resp):
176 | _LAST_X_SECONDS = int(
177 | req.params.get("last_x_seconds", int(os.getenv("LAST_X_SECONDS", 30)))
178 | )
179 | CURRENT_TIME = time.time()
180 | LAST_X_SECONDS = time.time() - _LAST_X_SECONDS
181 |
182 | number_of_requests_timedout_in_last_x_seconds = _utils.MAIN_INDEX.count(
183 | query={
184 | "-1.predicted_at": {"$gt": LAST_X_SECONDS, "$lt": CURRENT_TIME},
185 | "timedout_in_queue": True,
186 | }
187 | )
188 |
189 | requests_received_in_last_x_seconds = _utils.MAIN_INDEX.count(
190 | query={"-1.received_at": {"$gt": LAST_X_SECONDS, "$lt": CURRENT_TIME}}
191 | )
192 |
193 | requests_processed_in_last_x_seconds = _utils.MAIN_INDEX.count(
194 | query={"-1.predicted_at": {"$gt": LAST_X_SECONDS, "$lt": CURRENT_TIME}}
195 | )
196 |
197 | requests_received_in_last_x_seconds_that_failed = _utils.MAIN_INDEX.count(
198 | query={
199 | "-1.received_at": {"$gt": LAST_X_SECONDS, "$lt": CURRENT_TIME},
200 | "last_predictor_success": False,
201 | }
202 | )
203 |
204 | requests_processed_in_last_x_seconds_that_failed = _utils.MAIN_INDEX.count(
205 | query={
206 | "-1.predicted_at": {"$gt": LAST_X_SECONDS, "$lt": CURRENT_TIME},
207 | "last_predictor_success": False,
208 | }
209 | )
210 |
211 | requests_received_in_last_x_seconds_that_are_pending = _utils.MAIN_INDEX.count(
212 | query={
213 | "-1.predicted_at": 0,
214 | "last_predictor_success": {"$ne": False},
215 | "-1.received_at": {"$gt": LAST_X_SECONDS, "$lt": CURRENT_TIME},
216 | }
217 | )
218 |
219 | requests_received_in_last_x_seconds_that_are_successful = (
220 | _utils.MAIN_INDEX.count(
221 | query={
222 | "-1.predicted_at": {"$ne": 0},
223 | "last_predictor_success": True,
224 | "-1.received_at": {"$gt": LAST_X_SECONDS, "$lt": CURRENT_TIME},
225 | "timedout_in_queue": {"$ne": True},
226 | }
227 | )
228 | )
229 |
230 | requests_processed_in_last_x_seconds_that_are_successful = (
231 | _utils.MAIN_INDEX.count(
232 | query={
233 | "-1.predicted_at": {"$gt": LAST_X_SECONDS, "$lt": CURRENT_TIME},
234 | "last_predictor_success": True,
235 | "timedout_in_queue": {"$ne": True},
236 | }
237 | )
238 | )
239 |
240 | avg_total_time_per_req_for_reqs_in_last_x_seconds = 0
241 |
242 | __sum_of_received_at = _utils.MAIN_INDEX.math(
243 | "-1.received_at",
244 | "sum",
245 | query={
246 | "-1.received_at": {"$gt": LAST_X_SECONDS, "$lt": CURRENT_TIME},
247 | "-1.predicted_at": {"$ne": 0},
248 | "timedout_in_queue": {"$ne": True},
249 | },
250 | )
251 |
252 | __sum_of_predicted_at = _utils.MAIN_INDEX.math(
253 | "-1.predicted_at",
254 | "sum",
255 | query={
256 | "-1.received_at": {"$gt": LAST_X_SECONDS, "$lt": CURRENT_TIME},
257 | "-1.predicted_at": {"$ne": 0},
258 | "timedout_in_queue": {"$ne": True},
259 | },
260 | )
261 |
262 |         if __sum_of_received_at and __sum_of_predicted_at and requests_received_in_last_x_seconds_that_are_successful:
263 | avg_total_time_per_req_for_reqs_in_last_x_seconds = (
264 | __sum_of_predicted_at - __sum_of_received_at
265 | ) / requests_received_in_last_x_seconds_that_are_successful
266 |
267 | avg_actual_total_time_per_req_for_reqs_in_last_x_seconds = 0
268 |
269 | for executor_n in [0]:
270 | _temp_sum_of_received_at = _utils.MAIN_INDEX.math(
271 | f"{executor_n}.received_at",
272 | "sum",
273 | query={
274 | "-1.received_at": {"$gt": LAST_X_SECONDS, "$lt": CURRENT_TIME},
275 | "-1.predicted_at": {"$ne": 0},
276 | "timedout_in_queue": {"$ne": True},
277 | },
278 | )
279 |
280 | _temp_sum_of_predicted_at = _utils.MAIN_INDEX.math(
281 | f"{executor_n}.predicted_at",
282 | "sum",
283 | query={
284 | "-1.received_at": {"$gt": LAST_X_SECONDS, "$lt": CURRENT_TIME},
285 | "-1.predicted_at": {"$ne": 0},
286 | "timedout_in_queue": {"$ne": True},
287 | },
288 | )
289 |
290 |             if _temp_sum_of_received_at and _temp_sum_of_predicted_at and requests_received_in_last_x_seconds_that_are_successful:
291 | avg_actual_total_time_per_req_for_reqs_in_last_x_seconds = (
292 | _temp_sum_of_predicted_at - _temp_sum_of_received_at
293 | ) / requests_received_in_last_x_seconds_that_are_successful
294 |
295 | prometheus_text = f"""
296 | # HELP requests_received_in_last_x_seconds The number of requests received in last {_LAST_X_SECONDS} seconds.
297 | # TYPE requests_received_in_last_x_seconds gauge
298 | requests_received_in_last_x_seconds {requests_received_in_last_x_seconds}
299 |
300 | # HELP requests_processed_in_last_x_seconds The number of requests processed in last {_LAST_X_SECONDS} seconds.
301 | # TYPE requests_processed_in_last_x_seconds gauge
302 | requests_processed_in_last_x_seconds {requests_processed_in_last_x_seconds}
303 |
304 | # HELP number_of_requests_timedout_in_last_x_seconds The number of requests timedout at predictor(s) in last {_LAST_X_SECONDS} seconds.
305 | # TYPE number_of_requests_timedout_in_last_x_seconds gauge
306 | number_of_requests_timedout_in_last_x_seconds {number_of_requests_timedout_in_last_x_seconds}
307 |
308 | # HELP requests_received_in_last_x_seconds_that_failed The number of requests received in last {_LAST_X_SECONDS} seconds that failed.
309 | # TYPE requests_received_in_last_x_seconds_that_failed gauge
310 | requests_received_in_last_x_seconds_that_failed {requests_received_in_last_x_seconds_that_failed}
311 |
312 | # HELP requests_processed_in_last_x_seconds_that_failed The number of requests processed in last {_LAST_X_SECONDS} seconds that failed.
313 | # TYPE requests_processed_in_last_x_seconds_that_failed gauge
314 | requests_processed_in_last_x_seconds_that_failed {requests_processed_in_last_x_seconds_that_failed}
315 |
316 | # HELP requests_received_in_last_x_seconds_that_are_pending The number of requests received in last {_LAST_X_SECONDS} seconds that are pending.
317 | # TYPE requests_received_in_last_x_seconds_that_are_pending gauge
318 | requests_received_in_last_x_seconds_that_are_pending {requests_received_in_last_x_seconds_that_are_pending}
319 |
320 | # HELP requests_received_in_last_x_seconds_that_are_successful The number of requests received in last {_LAST_X_SECONDS} seconds that are successful.
321 | # TYPE requests_received_in_last_x_seconds_that_are_successful gauge
322 | requests_received_in_last_x_seconds_that_are_successful {requests_received_in_last_x_seconds_that_are_successful}
323 |
324 | # HELP requests_processed_in_last_x_seconds_that_are_successful The number of requests processed in last {_LAST_X_SECONDS} seconds that are successful.
325 | # TYPE requests_processed_in_last_x_seconds_that_are_successful gauge
326 | requests_processed_in_last_x_seconds_that_are_successful {requests_processed_in_last_x_seconds_that_are_successful}
327 |
328 | # HELP avg_total_time_per_req_for_reqs_in_last_x_seconds The average total time per request for requests in last {_LAST_X_SECONDS} seconds.
329 | # TYPE avg_total_time_per_req_for_reqs_in_last_x_seconds gauge
330 | avg_total_time_per_req_for_reqs_in_last_x_seconds {avg_total_time_per_req_for_reqs_in_last_x_seconds}
331 |
332 | # HELP avg_actual_total_time_per_req_for_reqs_in_last_x_seconds The average actual total time per request for requests in last {_LAST_X_SECONDS} seconds.
333 | # TYPE avg_actual_total_time_per_req_for_reqs_in_last_x_seconds gauge
334 | avg_actual_total_time_per_req_for_reqs_in_last_x_seconds {avg_actual_total_time_per_req_for_reqs_in_last_x_seconds}
335 | """.strip()
336 |
337 | if get_prometheus_metrics is not None:
338 | extra_prometheus_metrics_data = get_prometheus_metrics()
339 |
340 | if extra_prometheus_metrics_data:
341 | extra_prometheus_texts = []
342 | for metric_name, metric_data in extra_prometheus_metrics_data.items():
343 | extra_prometheus_texts.append(
344 | f"""
345 | # HELP {metric_name} {metric_data['help']}
346 | # TYPE {metric_name} {metric_data['type']}
347 | {metric_name} {metric_data['value']}
348 | """.strip()
349 | )
350 | prometheus_text += "\n\n" + "\n\n".join(extra_prometheus_texts)
351 |
352 | resp.status = falcon.HTTP_200
353 | resp.content_type = "text/plain; version=0.0.4"
354 | resp.text = prometheus_text
355 |
356 |
357 | class Health(object):
358 | def on_get(self, req, resp):
359 | fail_if_percentage_of_requests_failed_in_last_x_seconds_is_more_than_y_param = req.params.get(
360 | "fail_if_percentage_of_requests_failed_in_last_x_seconds_is_more_than_y",
361 | None,
362 | )
363 |
364 | fail_if_requests_older_than_x_seconds_pending_param = req.params.get(
365 | "fail_if_requests_older_than_x_seconds_pending", None
366 | )
367 |
368 | fail_if_up_time_more_than_x_seconds_param = req.params.get(
369 | "fail_if_up_time_more_than_x_seconds", None
370 | )
371 |
372 | fail_if_requests_timedout_in_last_x_seconds_is_more_than_y_param = (
373 | req.params.get(
374 | "fail_if_requests_timedout_in_last_x_seconds_is_more_than_y", None
375 | )
376 | )
377 |
378 | is_predictor_is_up_param = req.params.get("is_predictor_is_up", None)
379 |
380 | if fail_if_percentage_of_requests_failed_in_last_x_seconds_is_more_than_y_param:
381 | (
382 | x,
383 | y,
384 | ) = fail_if_percentage_of_requests_failed_in_last_x_seconds_is_more_than_y_param.split(
385 | ","
386 | )
387 | x, y = int(x), int(y)
388 | if _utils.check_if_percentage_of_requests_failed_in_last_x_seconds_is_more_than_y(
389 | x, y
390 | ):
391 | resp.status = falcon.HTTP_503
392 | resp.media = {
393 | "reason": f"More than {y}% requests failed in last {x} seconds"
394 | }
395 | return
396 |
397 | if fail_if_requests_older_than_x_seconds_pending_param:
398 | if _utils.check_if_requests_older_than_x_seconds_pending(
399 | int(fail_if_requests_older_than_x_seconds_pending_param)
400 | ):
401 | resp.status = falcon.HTTP_503
402 | resp.media = {
403 | "reason": f"Requests older than {fail_if_requests_older_than_x_seconds_pending_param} seconds are pending"
404 | }
405 | return
406 |
407 | if fail_if_up_time_more_than_x_seconds_param:
408 | if time.time() - Infer.started_at_time > int(
409 | fail_if_up_time_more_than_x_seconds_param
410 | ):
411 | resp.status = falcon.HTTP_503
412 | resp.media = {
413 | "reason": f"Up time more than {fail_if_up_time_more_than_x_seconds_param} seconds"
414 | }
415 | return
416 |
417 | if fail_if_requests_timedout_in_last_x_seconds_is_more_than_y_param:
418 | (
419 | x,
420 | y,
421 | ) = fail_if_requests_timedout_in_last_x_seconds_is_more_than_y_param.split(
422 | ","
423 | )
424 | x, y = int(x), int(y)
425 | if _utils.check_if_requests_timedout_in_last_x_seconds_is_more_than_y(x, y):
426 | resp.status = falcon.HTTP_503
427 | return
428 |
429 | resp.status = falcon.HTTP_200
430 | resp.media = {"status": "ok"}
431 |
432 |
433 | class Meta(object):
434 | def on_get(self, req, resp):
435 | resp.status = falcon.HTTP_200
436 |
437 | if "is_pickle_allowed" in req.params:
438 | resp.media = {
439 | "is_pickle_allowed": os.getenv("ALLOW_PICKLE", "true").lower() == "true"
440 | }
441 |
442 | else:
443 | try:
444 | json.dumps(_utils.example)
445 | __example = _utils.example
446 | except:
447 | __example = None
448 |
449 | resp.media = {
450 | "name": _utils.recipe_name,
451 | "example": __example,
452 | "is_pickle_allowed": os.getenv("ALLOW_PICKLE", "true").lower()
453 | == "true",
454 |                 "timeout": int(os.getenv("TIMEOUT", 480)),
455 | }
456 |
457 |
458 | class Die(object):
459 | def on_get(self, req, resp):
460 | if req.params.get("die", "false").lower()[0] == "t":
461 | resp.status = falcon.HTTP_200
462 | resp.media = {"status": "killed"}
463 | _utils.kill_fd(loop=True, rest=True)
464 |
465 | Infer.started_at_time = time.time()  # process start time; read by /health's fail_if_up_time_more_than_x_seconds check
466 | app = falcon.App(
467 | middleware=falcon.CORSMiddleware(allow_origins="*", allow_credentials="*"),
468 | )
469 |
470 | infer_api = Infer()
471 | prometheus_metrics = PrometheusMetrics()
472 | health_api = Health()
473 | die_api = Die()
474 |
475 | app.add_route("/infer", infer_api)
476 | app.add_route("/sync", infer_api)
477 | app.add_route("/prometheus_metrics", prometheus_metrics)
478 | app.add_route("/health", health_api)
479 | app.add_route("/meta", Meta())
480 | app.add_route("/die", die_api)
481 |
--------------------------------------------------------------------------------
/fastdeploy/_utils.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | logging.basicConfig(
4 | format="%(asctime)s,%(msecs)d %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s",
5 | datefmt="%Y-%m-%d:%H:%M:%S",
6 | level=logging.INFO,
7 | )
8 |
9 | logger = logging.getLogger(__name__)
10 |
11 |
12 | import os
13 | import glob
14 | import json
15 | import time
16 | import psutil
17 | from datetime import datetime
18 | from liteindex import DefinedIndex, KVIndex
19 |
20 | try:
21 | from example import example
22 | except:
23 | raise Exception("example.py not found. Please follow the instructions in README.md")
24 |
25 | try:
26 | from example import name as recipe_name
27 | except:
28 | recipe_name = os.path.basename(os.getcwd()).strip("/")
29 |
30 |
31 | PREDICTOR_SEQUENCE_TO_FILES = {}
32 |
33 | predictor_files = [
34 | _
35 | for _ in glob.glob("predictor*.py")
36 | if _ == "predictor.py" or _.split("predictor_")[1].split(".")[0].isdigit()
37 | ]
38 |
39 | for f in sorted(
40 | predictor_files,
41 | key=lambda x: int(
42 | x.split("predictor_")[1].split(".")[0] if x != "predictor.py" else 0
43 | ),
44 | ):
45 | if f == "predictor.py":
46 | PREDICTOR_SEQUENCE_TO_FILES[0] = f
47 | break
48 | else:
49 | PREDICTOR_SEQUENCE_TO_FILES[len(PREDICTOR_SEQUENCE_TO_FILES)] = f
50 |
51 | PREDICTOR_FILE_TO_SEQUENCE = {v: k for k, v in PREDICTOR_SEQUENCE_TO_FILES.items()}
52 |
53 | LAST_PREDICTOR_SEQUENCE = max(PREDICTOR_SEQUENCE_TO_FILES.keys())
54 | FIRST_PREDICTOR_SEQUENCE = min(PREDICTOR_SEQUENCE_TO_FILES.keys())
55 |
56 | META_INDEX = DefinedIndex(
57 | "meta_index",
58 | schema={
59 | "optimal_batch_size": DefinedIndex.Type.number,
60 | "time_per_example": DefinedIndex.Type.number,
61 | "predictor_name": DefinedIndex.Type.string,
62 | "predictor_sequence": DefinedIndex.Type.number,
63 | "request_poll_time": DefinedIndex.Type.number,
64 | "example_output": DefinedIndex.Type.other,
65 | "status": DefinedIndex.Type.string,
66 | },
67 | db_path=os.path.join("fastdeploy_dbs", f"main_index.db"),
68 | )
69 |
70 | KV_STORE = KVIndex(os.path.join("fastdeploy_dbs", f"kv_store.db"))
71 | KV_STORE.clear()
72 |
73 |
74 | MAIN_INDEX = DefinedIndex(
75 | "main_index",
76 | schema={
77 | **{
78 | "last_predictor_sequence": DefinedIndex.Type.number,
79 | "last_predictor_success": DefinedIndex.Type.boolean,
80 | "-1.outputs": DefinedIndex.Type.other,
81 | "-1.predicted_at": DefinedIndex.Type.number,
82 | "-1.received_at": DefinedIndex.Type.number,
83 | "-1.predicted_in_batch_of": DefinedIndex.Type.number,
84 | "timedout_in_queue": DefinedIndex.Type.boolean,
85 | },
86 | **{f"{_}.outputs": "other" for _ in PREDICTOR_SEQUENCE_TO_FILES},
87 | **{f"{_}.predicted_at": "number" for _ in PREDICTOR_SEQUENCE_TO_FILES},
88 | **{f"{_}.received_at": "number" for _ in PREDICTOR_SEQUENCE_TO_FILES},
89 | **{f"{_}.predicted_in_batch_of": "number" for _ in PREDICTOR_SEQUENCE_TO_FILES},
90 | },
91 | db_path=os.path.join("fastdeploy_dbs", f"main_index.db"),
92 | auto_vacuum=False,
93 | )
94 |
95 | # for setting timedout_in_queue
96 | # used in _loop.py start_loop to set timedout_in_queue to True for all the predictions that have been in the queue for more than timeout_time seconds
97 | MAIN_INDEX.optimize_for_query(
98 | ["-1.predicted_at", "-1.received_at", "timedout_in_queue"]
99 | )
100 |
101 | # for getting next batch to process
102 | # used in _loop.py fetch_batch function
103 | MAIN_INDEX.optimize_for_query(
104 | [
105 | "-1.predicted_at",
106 | "last_predictor_success",
107 | "last_predictor_sequence",
108 | "timedout_in_queue",
109 | ]
110 | )
111 |
112 | # in general queries
113 | MAIN_INDEX.optimize_for_query(["-1.received_at"])
114 | MAIN_INDEX.optimize_for_query(["last_predictor_success"])
115 | MAIN_INDEX.optimize_for_query(["last_predictor_sequence"])
116 | MAIN_INDEX.optimize_for_query(["timedout_in_queue"])
117 |
118 |
119 | GLOBAL_METRICS_INDEX = KVIndex(
120 | os.path.join("fastdeploy_dbs", f"global_metrics_index.db")
121 | )
122 | GLOBAL_METRICS_INDEX["total_predictor_run_for_hours"] = 0
123 | GLOBAL_METRICS_INDEX["total_predictor_up_for_hours"] = 0
124 |
125 |
126 | def get_fd_pids():
127 | # get pids of processes with fastdeploy and rest or loop in their full cmdline
128 | pids = {
129 | "rest": [],
130 | "loop": []
131 | }
132 |
133 | for proc in psutil.process_iter():
134 | try:
135 | full_cmdline = " ".join(proc.cmdline())
136 | if "fastdeploy" in full_cmdline and "--rest" in full_cmdline:
137 | pids["rest"].append(proc.pid)
138 | elif "fastdeploy" in full_cmdline and "--loop" in full_cmdline:
139 | pids["loop"].append(proc.pid)
140 | except Exception as e:
141 | pass
142 |
143 | return pids
144 |
145 |
146 | def kill_fd(loop=True, rest=True):
147 | pids = get_fd_pids()
148 | if loop and pids["loop"]:
149 | os.system(f"kill -9 {' '.join([str(pid) for pid in pids['loop']])}")
150 | if rest and pids["rest"]:
151 | os.system(f"kill -9 {' '.join([str(pid) for pid in pids['rest']])}")
152 |
153 |
154 | def warmup(predictor, example_input, n=3):
155 | """
156 | Run warmup prediction on the model.
157 |
158 | :param n: number of warmup predictions to be run. defaults to 3
159 | """
160 | logger.info("Warming up .. ")
161 | for _ in range(n - 1):
162 | predictor(example_input)
163 |
164 | return predictor(example_input)
165 |
166 |
167 | def calculate_optimum_batch_sizes(
168 | predictor,
169 | predictor_sequence,
170 | example_input,
171 | max_batch_size,
172 | max_batch_search_sec=10,
173 | ):
174 | search_over_batch_sizes = (
175 | [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]
176 | if max_batch_size == 0
177 | else [max_batch_size]
178 | )
179 |
180 | time_per_example = 0
181 | max_batch_size = 0
182 |
183 | for batch_size in search_over_batch_sizes:
184 | logger.info(f"Trying batch size: {batch_size}")
185 | start = time.time()
186 | predictor((example_input * batch_size)[:batch_size], batch_size=batch_size)
187 | end = time.time()
188 |
189 | _time_per_example = (end - start) / batch_size
190 |
191 | logger.info(f"batch_size: {batch_size}, time_per_example: {_time_per_example}")
192 |
193 | if time_per_example == 0:
194 | time_per_example = _time_per_example
195 | max_batch_size = batch_size
196 | elif _time_per_example < time_per_example:
197 | time_per_example = _time_per_example
198 | max_batch_size = batch_size
199 | else:
200 | break
201 |
202 | logger.info(
203 | f"{PREDICTOR_SEQUENCE_TO_FILES[predictor_sequence]}: Optimum batch size: {max_batch_size}, time_per_example: {time_per_example}"
204 | )
205 |
206 | return max_batch_size, time_per_example
207 |
208 |
209 | def check_if_requests_timedout_in_last_x_seconds_is_more_than_y(
210 | last_x_seconds, max_percentage_of_timedout_requests
211 | ):
212 | time_before_x_seconds = time.time() - last_x_seconds
213 | requests_received_in_last_x_seconds = MAIN_INDEX.count(
214 | query={"-1.predicted_at": {"$gte": time_before_x_seconds}}
215 | )
216 |
217 | requests_timedout_in_last_x_seconds = MAIN_INDEX.count(
218 | query={
219 | "-1.predicted_at": {"$gte": time_before_x_seconds},
220 | "timedout_in_queue": True,
221 | }
222 | )
223 |
224 | if requests_received_in_last_x_seconds == 0:
225 | return False
226 |
227 | logger.warning(
228 | f"Requests timedout in last {last_x_seconds} seconds: {requests_timedout_in_last_x_seconds}/{requests_received_in_last_x_seconds}"
229 | )
230 |
231 | if (
232 | requests_timedout_in_last_x_seconds / requests_received_in_last_x_seconds
233 | ) * 100 >= max_percentage_of_timedout_requests:
234 | return True
235 | return False
236 |
237 |
238 | def check_if_percentage_of_requests_failed_in_last_x_seconds_is_more_than_y(
239 | last_x_seconds, max_percentage_of_failed_requests
240 | ):
241 | time_before_x_seconds = time.time() - last_x_seconds
242 | requests_received_in_last_x_seconds = MAIN_INDEX.count(
243 | query={"-1.predicted_at": {"$gte": time_before_x_seconds}}
244 | )
245 |
246 | if requests_received_in_last_x_seconds == 0:
247 | return False
248 |
249 | requests_received_in_last_x_seconds_that_failed = MAIN_INDEX.count(
250 | query={
251 | "-1.predicted_at": {"$gte": time_before_x_seconds},
252 | "last_predictor_success": False,
253 | }
254 | )
255 |
256 | if (
257 | requests_received_in_last_x_seconds_that_failed
258 | / requests_received_in_last_x_seconds
259 | ) * 100 >= max_percentage_of_failed_requests:
260 | return True
261 |
262 | return False
263 |
264 |
265 | def check_if_requests_older_than_x_seconds_pending(x):
266 | time_before_x_seconds = time.time() - x
267 |
268 | requests_older_than_x_seconds_pending = MAIN_INDEX.count(
269 | query={
270 | "-1.received_at": {"$lte": time_before_x_seconds},
271 | "-1.predicted_at": 0,
272 | "last_predictor_success": {"$ne": False},
273 | }
274 | )
275 |
276 | if requests_older_than_x_seconds_pending > 0:
277 | return True
278 | return False
279 |
--------------------------------------------------------------------------------
/fastdeploy/monitor.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Function to check if nvidia-smi is available
4 | check_nvidia_smi() {
5 | command -v nvidia-smi >/dev/null 2>&1
6 | }
7 |
8 | # Function to get GPU usage for a PID
9 | get_gpu_usage() {
10 | pid=$1
11 | if check_nvidia_smi; then
12 | gpu_mem=$(nvidia-smi --query-compute-apps=pid,used_memory --format=csv,noheader,nounits | grep "^$pid," | cut -d',' -f2 | tr -d ' ')
13 | gpu_util=$(nvidia-smi --query-compute-apps=pid,gpu_util --format=csv,noheader,nounits | grep "^$pid," | cut -d',' -f2 | tr -d ' ')
14 |
15 | gpu_mem=${gpu_mem:-0}
16 | gpu_util=${gpu_util:-0}
17 | else
18 | gpu_mem=0
19 | gpu_util=0
20 | fi
21 |
22 | echo "$gpu_util $gpu_mem"
23 | }
24 |
25 | # Function to get CPU and memory usage for a single PID
26 | get_usage() {
27 | pid=$1
28 | cpu=$(ps -p $pid -o %cpu= | tr -d ' ')
29 | mem=$(ps -p $pid -o rss= | tr -d ' ')
30 | mem_mb=$(printf "%.2f" $(echo "$mem / 1024" | bc -l))
31 | echo "$cpu $mem_mb"
32 | }
33 |
34 | # Function to sum CPU and memory usage for multiple PIDs
35 | sum_usage() {
36 | pids=$1
37 | cpu_sum=0
38 | mem_sum=0
39 |
40 | for pid in $pids; do
41 | read cpu mem <<< $(get_usage $pid)
42 | cpu_sum=$(echo "$cpu_sum + $cpu" | bc -l)
43 | mem_sum=$(echo "$mem_sum + $mem" | bc -l)
44 | done
45 |
46 | echo "$cpu_sum $mem_sum"
47 | }
48 |
49 | # Initialize arrays for storing historical data
50 | declare -a loop_cpu_history
51 | declare -a loop_ram_history
52 | declare -a loop_gpu_util_history
53 | declare -a loop_gpu_mem_history
54 | declare -a rest_cpu_history
55 | declare -a rest_ram_history
56 |
57 | # Function to calculate statistics
58 | calculate_stats() {
59 | local values=("$@")
60 | local count=${#values[@]}
61 |
62 | if [ $count -eq 0 ]; then
63 | echo '{"min": "N/A", "max": "N/A", "avg": "N/A"}'
64 | return
65 | fi
66 |
67 | local min=${values[0]}
68 | local max=${values[0]}
69 | local sum=0
70 |
71 | for value in "${values[@]}"; do
72 | sum=$(printf "%.2f" $(echo "$sum + $value" | bc -l))
73 |
74 | if (( $(echo "$value < $min" | bc -l) )); then
75 | min=$value
76 | fi
77 |
78 | if (( $(echo "$value > $max" | bc -l) )); then
79 | max=$value
80 | fi
81 | done
82 |
83 | local avg=$(printf "%.2f" $(echo "$sum / $count" | bc -l))
84 |
85 | echo "{\"min\": $min, \"max\": $max, \"avg\": $avg}"
86 | }
87 |
88 | # Function to add value to history array (maintaining last 5 values)
89 | add_to_history() {
90 | local array_name=$1
91 | local value=$2
92 |
93 | eval "$array_name[\${#$array_name[@]}]=$value"
94 |
95 | if [ $(eval "echo \${#$array_name[@]}") -gt 5 ]; then
96 | eval "$array_name=(\"\${$array_name[@]:1}\")"
97 | fi
98 | }
99 |
100 | # Function to create JSON output
101 | create_json() {
102 | local loop_pid=$1
103 | local rest_pids=$2
104 | local timestamp=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
105 | local output=""
106 |
107 | output+="{\n"
108 | output+=" \"timestamp\": \"$timestamp\",\n"
109 |
110 | # Loop process data
111 | output+=" \"loop_process\": {\n"
112 | if [ ! -z "$loop_pid" ]; then
113 | read cpu mem <<< $(get_usage $loop_pid)
114 | read gpu_util gpu_mem <<< $(get_gpu_usage $loop_pid)
115 |
116 | add_to_history loop_cpu_history "$cpu"
117 | add_to_history loop_ram_history "$mem"
118 | add_to_history loop_gpu_util_history "$gpu_util"
119 | add_to_history loop_gpu_mem_history "$gpu_mem"
120 |
121 | output+=" \"pid\": $loop_pid,\n"
122 | output+=" \"status\": \"running\",\n"
123 | output+=" \"current\": {\n"
124 | output+=" \"cpu\": $cpu,\n"
125 | output+=" \"ram\": $mem,\n"
126 | output+=" \"gpu_util\": $gpu_util,\n"
127 | output+=" \"gpu_mem\": $gpu_mem\n"
128 | output+=" },\n"
129 | output+=" \"stats\": {\n"
130 | output+=" \"cpu\": $(calculate_stats "${loop_cpu_history[@]}"),\n"
131 | output+=" \"ram\": $(calculate_stats "${loop_ram_history[@]}"),\n"
132 | output+=" \"gpu_util\": $(calculate_stats "${loop_gpu_util_history[@]}"),\n"
133 | output+=" \"gpu_mem\": $(calculate_stats "${loop_gpu_mem_history[@]}")\n"
134 | output+=" }\n"
135 | else
136 | output+=" \"status\": \"not_running\"\n"
137 | fi
138 | output+=" },\n"
139 |
140 | # REST processes data
141 | output+=" \"rest_processes\": {\n"
142 | if [ ! -z "$rest_pids" ]; then
143 | read cpu mem <<< $(sum_usage "$rest_pids")
144 |
145 | add_to_history rest_cpu_history "$cpu"
146 | add_to_history rest_ram_history "$mem"
147 |
148 | output+=" \"pids\": [$(echo $rest_pids | sed 's/ /, /g')],\n"
149 | output+=" \"status\": \"running\",\n"
150 | output+=" \"current\": {\n"
151 | output+=" \"cpu\": $cpu,\n"
152 | output+=" \"ram\": $mem\n"
153 | output+=" },\n"
154 | output+=" \"stats\": {\n"
155 | output+=" \"cpu\": $(calculate_stats "${rest_cpu_history[@]}"),\n"
156 | output+=" \"ram\": $(calculate_stats "${rest_ram_history[@]}")\n"
157 | output+=" }\n"
158 | else
159 | output+=" \"status\": \"not_running\"\n"
160 | fi
161 | output+=" }\n"
162 | output+="}"
163 |
164 | echo -e "$output"
165 | }
166 |
167 | # Main monitoring function
168 | monitor() {
169 | # Get PIDs
170 | loop_pid=$(pgrep -f "fastdeploy.*loop")
171 | rest_pids=$(pgrep -f "fastdeploy.*rest")
172 |
173 | # Create JSON and write to file
174 | create_json "$loop_pid" "$rest_pids" > monitoring_results.json
175 | }
176 |
177 | # Run the monitor function every second
178 | while true; do
179 | monitor
180 | sleep 1
181 | done
182 |
--------------------------------------------------------------------------------
/recipe.md:
--------------------------------------------------------------------------------
1 | ### Serving your pipeline with fastdeploy [example](https://github.com/notAI-tech/fastDeploy/tree/master/recipes/echo)
2 |
3 | - Create a recipe folder with the following structure:
4 | ```
5 | recipe_folder/
6 | ├── example.py
7 | ├── predictor.py
8 | ├── requirements.txt (optional)
9 | └── extras.sh (optional)
10 | ```
11 |
12 | - `example.py`
13 |
14 | ```python
15 | name = "your_app_or_model_name"
16 |
17 | example = [
18 | example_object_1,
19 | example_object_2,
20 | ]
21 | ```
22 |
23 | - `predictor.py`
24 |
25 | ```python
26 | # Whatever code and imports you need to load your model and make predictions
27 |
28 | # predictor function must be defined exactly as below
29 | # batch_size is the optimal batch size for your model
30 | # inputs length may or may not be equal to batch_size
31 | # len(outputs) == len(inputs)
32 | def predictor(inputs, batch_size=1):
33 | return outputs
34 | ```
35 |
36 | - `requirements.txt` (optional): all python dependencies for your pipeline
37 |
38 | - `extras.sh` (optional): any bash commands to run before installing requirements.txt
39 |
40 | - #### start the loop
41 |
42 | ```bash
43 | fastdeploy --loop --recipe recipes/echo
44 | ```
45 |
46 | - #### start the server
47 |
48 | ```bash
49 | fastdeploy --rest --recipe recipes/echo
50 | ```
51 |
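- #### query the server

Once the loop and the server are running, any HTTP client can POST a JSON list to `/infer`. Below is a minimal sketch using the Python client from `clients/python` (the `fdclient` package); it assumes the server is listening locally on port 8080 (the default in the auto-generated Dockerfile) and that the echo recipe is being served.

```python
from fdclient import FDClient

# assumes the fastdeploy --rest process is listening on localhost:8080
client = FDClient(server_url="http://localhost:8080")

# inputs: a list of JSON-serializable objects, just like `example` in example.py
results = client.infer(["hello", "world"], unique_id="demo-request-1")

print(results["success"])  # True/False, the same key checked by testing/benchmark.py
print(results)             # full response, including the predictor outputs
```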
52 |
53 | ### Chained recipe [example](https://github.com/notAI-tech/fastDeploy/tree/master/recipes/echo_chained)
54 | - A chained recipe has multiple predictor_X.py files that are run sequentially
55 | - `predictor_1.py` is called first, then `predictor_2.py`, and so on
56 | - Each predictor_X.py must define a `predictor` function exactly as described above
57 | - Each predictor_X.py runs as a separate process, i.e. each can live in its own virtualenv (see the sketch at the end of this section for how outputs flow between predictors)
58 |
59 | - #### start all the loops
60 |
61 | ```bash
62 | fastdeploy --loop --recipe recipes/echo_chained --config "predictor_name:predictor_1.py"
63 |
64 | fastdeploy --loop --recipe recipes/echo_chained --config "predictor_name:predictor_2.py"
65 | ```
66 |
67 | - #### start the server
68 |
69 | ```bash
70 | fastdeploy --rest --recipe recipes/echo_chained
71 | ```
72 |
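- #### what the chain computes

Conceptually, each request flows through the predictors in order: the REST process enqueues the inputs, the `predictor_1.py` loop runs on them, and the `predictor_2.py` loop runs on `predictor_1.py`'s outputs. For the echo_chained recipe the end-to-end effect is roughly the sketch below (an illustration of the data flow only, not how fastdeploy invokes the loops internally):

```python
# run from recipes/echo_chained so the predictor modules are importable
from predictor_1 import predictor as predictor_1
from predictor_2 import predictor as predictor_2

inputs = ["hello"]
intermediate = predictor_1(inputs)   # [(1, "hello")]
final = predictor_2(intermediate)    # [(2, (1, "hello"))]
```
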
--------------------------------------------------------------------------------
/recipes/.gitignore:
--------------------------------------------------------------------------------
1 | */*default*
2 | */fastdeploy_dbs
3 |
--------------------------------------------------------------------------------
/recipes/echo/.dockerignore:
--------------------------------------------------------------------------------
1 | *.request_index
2 | *.results_index
3 | *.log_index
--------------------------------------------------------------------------------
/recipes/echo/.gitignore:
--------------------------------------------------------------------------------
1 | *.request_index
2 | *.results_index
3 | *.log_index
--------------------------------------------------------------------------------
/recipes/echo/example.py:
--------------------------------------------------------------------------------
1 | name = "echo"
2 |
3 | example = ["Any JSON serializable Python object can be input"]
4 |
--------------------------------------------------------------------------------
/recipes/echo/extra_prometheus_metrics.py:
--------------------------------------------------------------------------------
1 | def get_prometheus_metrics():
2 | return {
3 | "test_metric": {
4 | "type": "counter",
5 | "help": "This is a test metric",
6 | "value": 1
7 | }
8 | }
--------------------------------------------------------------------------------
/recipes/echo/fastDeploy.auto_dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.8-slim
2 | RUN python3 -m pip install --upgrade --no-cache-dir pip fastdeploy
3 |
4 | ENV MAX_REQUEST_BATCH_SIZE=0 WORKERS=3 TIMEOUT=480 HOST=0.0.0.0 PORT=8080 ONLY_ASYNC=false ALLOW_PICKLE=true PREDICTOR_NAME=predictor.py OPTIMAL_BATCH_SIZE=0 KEEP_ALIVE=60 BASE=python:3.8-slim
5 |
6 | ADD . /recipe
7 | WORKDIR /recipe
8 |
9 | RUN python3 -m pip install --no-cache-dir -r /recipe/requirements.txt
10 | RUN cd /recipe && python3 -c "from predictor import predictor; from example import example; predictor(example)"
11 |
12 | ENTRYPOINT ["/bin/sh", "-c"]
13 |
14 | CMD ["ulimit -n 1000000 && python3 -m fastdeploy --recipe /recipe --loop & python3 -m fastdeploy --recipe /recipe --rest"]
15 |
--------------------------------------------------------------------------------
/recipes/echo/predictor.py:
--------------------------------------------------------------------------------
1 | # Do the required imports
2 | import os
3 | import time
4 |
5 | # Any code can be here
6 | # Load your models, import your local scripts
7 | # modify the code inside predictor function.
8 |
9 | SLEEP_TIME = float(os.getenv("SLEEP_TIME", "0.2"))
10 |
11 | def predictor(input_list, batch_size=1):
12 | output_list = []
13 | while input_list:
14 | input_batch = input_list[:batch_size]
15 | input_list = input_list[batch_size:]
16 | output_list += input_batch
17 | time.sleep(SLEEP_TIME)
18 |
19 | return output_list
20 |
--------------------------------------------------------------------------------
/recipes/echo/requirements.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/notAI-tech/fastDeploy/34865d1be99cc5ab98645985c6c7dda7119df1c4/recipes/echo/requirements.txt
--------------------------------------------------------------------------------
/recipes/echo_chained/.dockerignore:
--------------------------------------------------------------------------------
1 | *.request_index
2 | *.results_index
3 | *.log_index
--------------------------------------------------------------------------------
/recipes/echo_chained/.gitignore:
--------------------------------------------------------------------------------
1 | *.request_index
2 | *.results_index
3 | *.log_index
--------------------------------------------------------------------------------
/recipes/echo_chained/example.py:
--------------------------------------------------------------------------------
1 | example = ["Any JSON serializable Python object can be input"]
2 |
--------------------------------------------------------------------------------
/recipes/echo_chained/predictor_1.py:
--------------------------------------------------------------------------------
1 | # Do the required imports
2 | import os
3 | import time
4 |
5 | # Any code can be here
6 | # Load your models, import your local scripts
7 | # modify the code inside predictor function.
8 |
9 | SLEEP_TIME = float(os.getenv("SLEEP_TIME", "0.1"))
10 |
11 | def predictor(input_list, batch_size=1):
12 | output_list = []
13 | while input_list:
14 | print(input_list)
15 | input_batch = input_list[:batch_size]
16 | input_list = input_list[batch_size:]
17 | output_list += [(1, _) for _ in input_batch]
18 | time.sleep(SLEEP_TIME)
19 |
20 | return output_list
21 |
--------------------------------------------------------------------------------
/recipes/echo_chained/predictor_2.py:
--------------------------------------------------------------------------------
1 | # Do the required imports
2 | import os
3 | import time
4 |
5 | # Any code can be here
6 | # Load your models, import your local scripts
7 | # modify the code inside predictor function.
8 |
9 | SLEEP_TIME = float(os.getenv("SLEEP_TIME", "0.2"))
10 |
11 | def predictor(input_list, batch_size=1):
12 | print(input_list)
13 | output_list = []
14 | while input_list:
15 | input_batch = input_list[:batch_size]
16 | input_list = input_list[batch_size:]
17 | output_list += [(2, _) for _ in input_batch]
18 | time.sleep(SLEEP_TIME)
19 |
20 | return output_list
21 |
--------------------------------------------------------------------------------
/recipes/echo_chained/requirements.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/notAI-tech/fastDeploy/34865d1be99cc5ab98645985c6c7dda7119df1c4/recipes/echo_chained/requirements.txt
--------------------------------------------------------------------------------
/recipes/text_embeddings/example.py:
--------------------------------------------------------------------------------
1 | # generate a random sentence of 3-100 words sampled from words.txt
2 |
3 | import random
4 | import string
5 |
6 | words = open("words.txt", "r").read().split()
7 |
8 | def generate_random_sentence():
9 | # Generate random number of words between 5-100
10 | num_words = random.randint(3, 100)
11 |
12 | sentence = []
13 | for _ in range(num_words):
14 | word = random.choice(words)
15 | sentence.append(word)
16 |
17 | return ' '.join(sentence)
18 |
19 |
20 | def example_function():
21 | return [generate_random_sentence() for _ in range(random.randint(1, 10))]
22 |
23 | example = example_function()
--------------------------------------------------------------------------------
/recipes/text_embeddings/predictor.py:
--------------------------------------------------------------------------------
1 | from sentence_transformers import SentenceTransformer
2 |
3 | sentences = ['That is a happy person', 'That is a very happy person']
4 |
5 | model = SentenceTransformer('Alibaba-NLP/gte-base-en-v1.5', trust_remote_code=True, backend="onnx", model_kwargs={"file_name": "model.onnx", "provider": "CPUExecutionProvider"})
6 |
7 | def predictor(input_list, batch_size=16):
8 | return model.encode(input_list, convert_to_numpy=True, normalize_embeddings=True, show_progress_bar=False, batch_size=batch_size)
9 |
10 |
--------------------------------------------------------------------------------
/recipes/text_embeddings/requirements.txt:
--------------------------------------------------------------------------------
1 | sentence-transformers[onnx]
2 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | # Note: To use the 'upload' functionality of this file, you must:
5 | # $ pip install twine
6 |
7 | import io
8 | import os
9 | import sys
10 | from shutil import rmtree
11 |
12 | from setuptools import find_packages, setup, Command
13 |
14 | # Package meta-data.
15 | NAME = "fastdeploy"
16 | DESCRIPTION = "Deploy DL/ML inference pipelines with minimal extra code."
17 | URL = "https://github.com/notAI-tech/fastDeploy"
18 | EMAIL = "praneeth@bpraneeth.com"
19 | AUTHOR = "BEDAPUDI PRANEETH"
20 | REQUIRES_PYTHON = ">=3.6.0"
21 | VERSION = "3.1.1"
22 |
23 | # What packages are required for this module to be executed?
24 | REQUIRED = ["falcon", "liteindex==0.0.3.2.dev6", "zstandard", "gunicorn[gevent]", "msgpack", "psutil"]
25 |
26 | # What packages are optional?
27 | EXTRAS = {
28 | }
29 |
30 | # The rest you shouldn't have to touch too much :)
31 | # ------------------------------------------------
32 | # Except, perhaps the License and Trove Classifiers!
33 | # If you do change the License, remember to change the Trove Classifier for that!
34 |
35 | here = os.path.abspath(os.path.dirname(__file__))
36 |
37 | # Import the README and use it as the long-description.
38 | # Note: this will only work if 'README.md' is present in your MANIFEST.in file!
39 | try:
40 | with io.open(os.path.join(here, "README.md"), encoding="utf-8") as f:
41 | long_description = "\n" + f.read()
42 | except FileNotFoundError:
43 | long_description = DESCRIPTION
44 |
45 | # Load the package's __version__.py module as a dictionary.
46 | about = {}
47 | if not VERSION:
48 | with open(os.path.join(here, NAME, "__version__.py")) as f:
49 | exec(f.read(), about)
50 | else:
51 | about["__version__"] = VERSION
52 |
53 |
54 | class UploadCommand(Command):
55 | """Support setup.py upload."""
56 |
57 | description = "Build and publish the package."
58 | user_options = []
59 |
60 | @staticmethod
61 | def status(s):
62 | """Prints things in bold."""
63 | print("\033[1m{0}\033[0m".format(s))
64 |
65 | def initialize_options(self):
66 | pass
67 |
68 | def finalize_options(self):
69 | pass
70 |
71 | def run(self):
72 | try:
73 | self.status("Removing previous builds…")
74 | rmtree(os.path.join(here, "dist"))
75 | except OSError:
76 | pass
77 |
78 | self.status("Building Source and Wheel (universal) distribution…")
79 | os.system("{0} setup.py sdist bdist_wheel --universal".format(sys.executable))
80 |
81 | self.status("Uploading the package to PyPI via Twine…")
82 | os.system("twine upload dist/*")
83 |
84 | self.status("Pushing git tags…")
85 | os.system("git tag v{0}".format(about["__version__"]))
86 | os.system("git push --tags")
87 |
88 | sys.exit()
89 |
90 |
91 | # Where the magic happens:
92 | setup(
93 | name=NAME,
94 | version=about["__version__"],
95 | description=DESCRIPTION,
96 | long_description=long_description,
97 | long_description_content_type="text/markdown",
98 | author=AUTHOR,
99 | author_email=EMAIL,
100 | python_requires=REQUIRES_PYTHON,
101 | url=URL,
102 | packages=find_packages(exclude=("tests",)),
103 | # If your package is a single module, use this instead of 'packages':
104 | # py_modules=['mypackage'],
105 | entry_points={"console_scripts": ["fastdeploy=fastdeploy:main"]},
106 | install_requires=REQUIRED,
107 | extras_require=EXTRAS,
108 | package_data={NAME: ["fastdeploy-ui/*", "fastdeploy-ui/build/*"]},
109 | include_package_data=True,
110 | license="MIT",
111 | classifiers=[
112 | # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers
113 | "License :: OSI Approved :: MIT License",
114 | "Programming Language :: Python",
115 | "Programming Language :: Python :: 3",
116 | "Programming Language :: Python :: 3.6",
117 | "Programming Language :: Python :: Implementation :: CPython",
118 | ],
119 | # $ setup.py publish support.
120 | cmdclass={
121 | "upload": UploadCommand,
122 | },
123 | )
124 |
--------------------------------------------------------------------------------
/testing/README.md:
--------------------------------------------------------------------------------
1 | python benchmark.py --target_rps_per_connection 100 --parallel_connections 10 --duration 60 --warmup 1 --server_url http://10.18.9.60:8080 --input_file /Users/praneeth.bedapudi/RINGCENTRAL/marauders-map/ml_serving/nlu/semantic_score_serving/example.py --results_file a.json --request_timeout 0.6
2 |
--------------------------------------------------------------------------------
/testing/benchmark.py:
--------------------------------------------------------------------------------
1 | import time
2 | import logging
3 | import argparse
4 | import json
5 | import random
6 | import numpy as np
7 | from datetime import datetime
8 | import os
9 | import importlib.util
10 | from tqdm import tqdm
11 | from fdclient import FDClient
12 | import multiprocessing as mp
13 | from dataclasses import dataclass
14 | from typing import List, Dict, Any
15 | import queue
16 | import signal
17 |
18 | # Configure logging
19 | logging.basicConfig(format='%(asctime)s - %(message)s')
20 | logger = logging.getLogger(__name__)
21 |
22 | @dataclass
23 | class ConnectionStats:
24 | latencies: List[float]
25 | errors: List[str]
26 | successes: int
27 | failures: int
28 | connection_id: int
29 |
30 |
31 | class BenchmarkProcess(mp.Process):
32 | def __init__(self, connection_id, server_url, target_rps, duration,
33 | input_source, request_batch_size, is_warmup,
34 | stats_queue, progress_queue, request_timeout=10):
35 | super().__init__()
36 | self.connection_id = connection_id
37 | self.server_url = server_url
38 | self.target_rps = target_rps
39 | self.duration = duration
40 | self.input_source = input_source
41 | self.request_batch_size = request_batch_size
42 | self.is_warmup = is_warmup
43 | self.stats_queue = stats_queue
44 | self.progress_queue = progress_queue
45 | self.request_timeout = request_timeout
46 | self._loaded_function = None
47 |
48 | def _load_function(self):
49 | """Load the Python function inside the process"""
50 | if self.input_source['type'] == 'function':
51 | path = os.path.abspath(self.input_source['path'])
52 | directory = os.path.dirname(path)
53 | filename = os.path.basename(path)
54 |
55 | original_dir = os.getcwd()
56 | try:
57 | os.chdir(directory)
58 | module_name = os.path.splitext(filename)[0]
59 | spec = importlib.util.spec_from_file_location(module_name, filename)
60 | module = importlib.util.module_from_spec(spec)
61 | spec.loader.exec_module(module)
62 |
63 | if not hasattr(module, 'example_function'):
64 | raise ValueError("Python file must contain example_function()")
65 |
66 | self._loaded_function = module.example_function
67 | finally:
68 | os.chdir(original_dir)
69 |
70 | def generate_payload(self):
71 | """Generate payload based on input source type"""
72 | if self.input_source['type'] == 'json':
73 | return [self.input_source['data'][random.randint(0, len(self.input_source['data']) - 1)]
74 | for _ in range(self.request_batch_size)]
75 | else: # function
76 | if self._loaded_function is None:
77 | self._load_function()
78 | return self._loaded_function()[:self.request_batch_size]
79 |
80 | def run(self):
81 | # Handle Ctrl+C gracefully
82 | signal.signal(signal.SIGINT, signal.SIG_IGN)
83 |
84 | client = FDClient(server_url=self.server_url, request_timeout=self.request_timeout)
85 |
86 | if self.target_rps:
87 | sleep_time = 1.0 / self.target_rps
88 | else:
89 | sleep_time = 0
90 |
91 | start_time = time.time()
92 | stats = ConnectionStats(
93 | latencies=[], errors=[], successes=0, failures=0,
94 | connection_id=self.connection_id
95 | )
96 | requests_made = 0
97 |
98 | while time.time() - start_time < self.duration:
99 | request_start = time.time()
100 |
101 | try:
102 | # Generate and send request
103 | inps = self.generate_payload()
104 | request_id = f"{'warm' if self.is_warmup else 'req'}-conn{self.connection_id}-{requests_made}"
105 |
106 | results = client.infer(inps, unique_id=request_id)
107 | latency = (time.time() - request_start) * 1000 # Convert to ms
108 |
109 | if results['success']:
110 | if not self.is_warmup:
111 | stats.successes += 1
112 | stats.latencies.append(latency)
113 | else:
114 | if not self.is_warmup:
115 | stats.failures += 1
116 | stats.errors.append(results.get('reason', 'Unknown error'))
117 |
118 | except Exception as e:
119 | if not self.is_warmup:
120 | stats.failures += 1
121 | stats.errors.append(str(e))
122 |
123 | requests_made += 1
124 |
125 | # Update progress
126 | elapsed = time.time() - start_time
127 | self.progress_queue.put((self.connection_id, min(elapsed, self.duration)))
128 |
129 | # Rate limiting
130 | elapsed = time.time() - request_start
131 | if sleep_time > elapsed:
132 | time.sleep(sleep_time - elapsed)
133 |
134 | # Send final stats
135 | self.stats_queue.put((self.connection_id, stats))
136 |
137 | class BenchmarkRunner:
138 | def __init__(self, target_rps_per_connection, duration_seconds, server_url,
139 | parallel_connections=1, warmup_seconds=5, input_source=None,
140 | request_batch_size=1, log_dir=None, debug=False, request_timeout=10):
141 | self.target_rps_per_connection = target_rps_per_connection
142 | self.parallel_connections = parallel_connections
143 | self.duration_seconds = duration_seconds
144 | self.warmup_seconds = warmup_seconds
145 | self.server_url = server_url
146 | self.input_source = input_source
147 | self.request_batch_size = request_batch_size
148 | self.log_dir = log_dir
149 | self.debug = debug
150 | self.request_timeout = request_timeout
151 |
152 | if self.log_dir:
153 | os.makedirs(self.log_dir, exist_ok=True)
154 |
155 | # For handling Ctrl+C gracefully
156 | self.stop_event = mp.Event()
157 | signal.signal(signal.SIGINT, self._handle_interrupt)
158 |
159 | def _handle_interrupt(self, signum, frame):
160 | print("\nStopping benchmark gracefully...")
161 | self.stop_event.set()
162 |
163 | def _update_progress_bars(self, progress_queue, pbars, duration, process_count):
164 | """Update progress bars from queue until duration is reached or stop_event is set"""
165 | start_time = time.time()
166 | while time.time() - start_time < duration and not self.stop_event.is_set():
167 | try:
168 | conn_id, progress = progress_queue.get(timeout=0.1)
169 | pbars[conn_id].n = progress
170 | pbars[conn_id].refresh()
171 | except queue.Empty:
172 | continue
173 |
174 | def run_benchmark(self):
175 | """Run the benchmark with parallel processes"""
176 | # Create queues for inter-process communication
177 | stats_queue = mp.Queue()
178 | progress_queue = mp.Queue()
179 |
180 | print("\nStarting warmup period...")
181 |
182 | # Create progress bars for warmup
183 | warmup_pbars = {
184 | i: tqdm(
185 | total=self.warmup_seconds,
186 | desc=f"Warmup Conn {i}",
187 | position=i,
188 | unit="s",
189 | leave=True
190 | )
191 | for i in range(self.parallel_connections)
192 | }
193 |
194 | # Start warmup processes
195 | warmup_processes = [
196 | BenchmarkProcess(
197 | connection_id=i,
198 | server_url=self.server_url,
199 | target_rps=self.target_rps_per_connection,
200 | duration=self.warmup_seconds,
201 | input_source=self.input_source,
202 | request_batch_size=self.request_batch_size,
203 | is_warmup=True,
204 | stats_queue=stats_queue,
205 | progress_queue=progress_queue,
206 | request_timeout=self.request_timeout
207 | )
208 | for i in range(self.parallel_connections)
209 | ]
210 |
211 | for p in warmup_processes:
212 | p.start()
213 |
214 | # Update warmup progress bars
215 | self._update_progress_bars(
216 | progress_queue, warmup_pbars,
217 | self.warmup_seconds, self.parallel_connections
218 | )
219 |
220 | # Wait for warmup processes to finish
221 | for p in warmup_processes:
222 | p.join()
223 |
224 | # Clear warmup stats queue
225 | while not stats_queue.empty():
226 | stats_queue.get()
227 |
228 | # Close warmup progress bars
229 | for pbar in warmup_pbars.values():
230 | pbar.close()
231 |
232 | if self.stop_event.is_set():
233 | print("\nBenchmark interrupted during warmup")
234 | return None
235 |
236 | print("\nStarting benchmark...")
237 |
238 | # Create progress bars for benchmark
239 | benchmark_pbars = {
240 | i: tqdm(
241 | total=self.duration_seconds,
242 | desc=f"Benchmark Conn {i}",
243 | position=i,
244 | unit="s",
245 | leave=True
246 | )
247 | for i in range(self.parallel_connections)
248 | }
249 |
250 | # Start benchmark processes
251 | benchmark_processes = [
252 | BenchmarkProcess(
253 | connection_id=i,
254 | server_url=self.server_url,
255 | target_rps=self.target_rps_per_connection,
256 | duration=self.duration_seconds,
257 | input_source=self.input_source,
258 | request_batch_size=self.request_batch_size,
259 | is_warmup=False,
260 | stats_queue=stats_queue,
261 | progress_queue=progress_queue,
262 | request_timeout=self.request_timeout
263 | )
264 | for i in range(self.parallel_connections)
265 | ]
266 |
267 | for p in benchmark_processes:
268 | p.start()
269 |
270 | # Update benchmark progress bars
271 | self._update_progress_bars(
272 | progress_queue, benchmark_pbars,
273 | self.duration_seconds, self.parallel_connections
274 | )
275 |
276 | # Collect results
277 | connection_stats = {}
278 | for _ in range(self.parallel_connections):
279 | conn_id, stats = stats_queue.get()
280 | connection_stats[conn_id] = stats
281 |
282 | # Wait for all processes to finish
283 | for p in benchmark_processes:
284 | p.join()
285 |
286 | # Close benchmark progress bars
287 | for pbar in benchmark_pbars.values():
288 | pbar.close()
289 |
290 | # Move cursor to bottom of progress bars
291 | print("\n" * (self.parallel_connections))
292 |
293 | if self.stop_event.is_set():
294 | print("\nBenchmark interrupted")
295 | return None
296 |
297 | # Aggregate results
298 | all_latencies = []
299 | total_successes = 0
300 | total_failures = 0
301 | all_errors = []
302 |
303 | for stats in connection_stats.values():
304 | all_latencies.extend(stats.latencies)
305 | total_successes += stats.successes
306 | total_failures += stats.failures
307 | all_errors.extend(stats.errors)
308 |
309 | if all_latencies:
310 | total_time = self.duration_seconds
311 | p50 = np.percentile(all_latencies, 50)
312 | p90 = np.percentile(all_latencies, 90)
313 | p95 = np.percentile(all_latencies, 95)
314 | p99 = np.percentile(all_latencies, 99)
315 | avg_latency = np.mean(all_latencies)
316 | std_latency = np.std(all_latencies)
317 | total_requests = total_successes + total_failures
318 | actual_rps = total_requests / total_time
319 |
320 | results = {
321 | 'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
322 | 'total_requests': total_requests,
323 | 'successes': total_successes,
324 | 'failures': total_failures,
325 | 'success_rate': (total_successes/total_requests)*100 if total_requests > 0 else 0,
326 | 'average_latency_ms': float(avg_latency),
327 | 'std_latency_ms': float(std_latency),
328 | 'p50_latency_ms': float(p50),
329 | 'p90_latency_ms': float(p90),
330 | 'p95_latency_ms': float(p95),
331 | 'p99_latency_ms': float(p99),
332 | 'min_latency_ms': float(min(all_latencies)),
333 | 'max_latency_ms': float(max(all_latencies)),
334 | 'actual_rps': float(actual_rps),
335 | 'target_rps_per_connection': self.target_rps_per_connection,
336 | 'parallel_connections': self.parallel_connections,
337 | 'total_target_rps': (self.target_rps_per_connection or 0) * self.parallel_connections,
338 | 'duration_seconds': self.duration_seconds,
339 | 'warmup_seconds': self.warmup_seconds,
340 | 'request_batch_size': self.request_batch_size,
341 | 'errors': all_errors[:10] if all_errors else [], # First 10 errors
342 | 'error_count': len(all_errors),
343 | # Per-connection stats
344 | 'connection_stats': {
345 | conn_id: {
346 | 'requests': stats.successes + stats.failures,
347 | 'successes': stats.successes,
348 | 'failures': stats.failures,
349 | 'success_rate': (stats.successes/(stats.successes + stats.failures))*100 if (stats.successes + stats.failures) > 0 else 0,
350 | 'average_latency_ms': float(np.mean(stats.latencies)) if stats.latencies else 0,
351 | 'actual_rps': (stats.successes + stats.failures) / total_time
352 | }
353 | for conn_id, stats in connection_stats.items()
354 | }
355 | }
356 | return results
357 | return None
358 |
359 | def format_duration(ms):
360 | """Format milliseconds into a readable duration."""
361 | if ms < 1:
362 | return f"{ms*1000:.2f}μs"
363 | elif ms < 1000:
364 | return f"{ms:.2f}ms"
365 | else:
366 | return f"{ms/1000:.2f}s"
367 |
368 | def print_results(results):
369 | """Print formatted benchmark results."""
370 | if not results:
371 | return
372 |
373 | print("\n" + "="*80)
374 | print("BENCHMARK RESULTS")
375 | print("="*80)
376 |
377 | # Overall Statistics
378 | print("\n📊 OVERALL STATISTICS")
379 | print("-"*40)
380 | print(f"Total Requests: {results['total_requests']:,}")
381 | print(f"Successful: {results['successes']:,}")
382 | print(f"Failed: {results['failures']:,}")
383 | print(f"Success Rate: {results['success_rate']:.2f}%")
384 |
385 | # Throughput
386 | print("\n🚀 THROUGHPUT")
387 | print("-"*40)
388 | print(f"Actual RPS: {results['actual_rps']:.2f}")
389 | print(f"Target RPS: {results['total_target_rps'] or 'unlimited'}")
390 | print(f"Connections: {results['parallel_connections']}")
391 | print(f"Duration: {results['duration_seconds']}s (+ {results['warmup_seconds']}s warmup)")
392 | print(f"Batch Size: {results['request_batch_size']}")
393 |
394 | # Latency Statistics
395 | print("\n⚡ LATENCY STATISTICS")
396 | print("-"*40)
397 | print(f"Average: {format_duration(results['average_latency_ms'])}")
398 | print(f"Std Dev: {format_duration(results['std_latency_ms'])}")
399 | print(f"Min: {format_duration(results['min_latency_ms'])}")
400 | print(f"Max: {format_duration(results['max_latency_ms'])}")
401 | print(f"P50: {format_duration(results['p50_latency_ms'])}")
402 | print(f"P90: {format_duration(results['p90_latency_ms'])}")
403 | print(f"P95: {format_duration(results['p95_latency_ms'])}")
404 | print(f"P99: {format_duration(results['p99_latency_ms'])}")
405 |
406 | # Per-Connection Statistics
407 | print("\n🔌 PER-CONNECTION STATISTICS")
408 | print("-"*40)
409 | for conn_id, stats in results['connection_stats'].items():
410 | print(f"\nConnection {conn_id}:")
411 | print(f" Requests: {stats['requests']:,}")
412 | print(f" Success Rate: {stats['success_rate']:.2f}%")
413 | print(f" Actual RPS: {stats['actual_rps']:.2f}")
414 | print(f" Avg Latency: {format_duration(stats['average_latency_ms'])}")
415 |
416 | # Error Summary
417 | if results['errors']:
418 | print("\n❌ ERROR SUMMARY")
419 | print("-"*40)
420 | print(f"Total Errors: {results['error_count']}")
421 | print("\nFirst 10 Errors:")
422 | for i, error in enumerate(results['errors'], 1):
423 | print(f"{i}. {error}")
424 |
425 | print("\n" + "="*80)
426 |
427 |
428 | def main():
429 | parser = argparse.ArgumentParser(description='API Benchmark Tool')
430 | parser.add_argument('--server_url', type=str, required=True, help='Server URL')
431 | parser.add_argument('--target_rps_per_connection', type=int, default=None,
432 | help='Target requests per second per connection')
433 | parser.add_argument('--parallel_connections', type=int, default=1,
434 | help='Number of parallel connections')
435 | parser.add_argument('--duration', type=int, default=60, help='Test duration in seconds')
436 | parser.add_argument('--warmup', type=int, default=5, help='Warmup period in seconds')
437 | parser.add_argument('--debug', action='store_true', help='Enable debug logging')
438 | parser.add_argument('--input_file', type=str, required=True, help='Input .json or .py file path')
439 | parser.add_argument('--request_batch_size', type=int, default=1, help='Request batch size')
440 | parser.add_argument('--log_dir', type=str, default=None, help='Directory to log request inputs and outputs')
441 | parser.add_argument('--results_file', type=str, default='benchmark_results.json',
442 | help='File to write benchmark results')
443 | parser.add_argument('--request_timeout', type=float, default=10, help='Request timeout in seconds')
444 | args = parser.parse_args()
445 |
446 | if args.debug:
447 | logger.setLevel(logging.DEBUG)
448 |
449 | # Load input source
450 | input_source = None
451 | if args.input_file.endswith('.json'):
452 | try:
453 | with open(args.input_file, 'r') as f:
454 | input_data = json.load(f)
455 | input_source = {'type': 'json', 'data': input_data}
456 | except Exception as e:
457 | logger.error(f"Failed to load JSON input file: {e}")
458 | return
459 | elif args.input_file.endswith('.py'):
460 | input_source = {'type': 'function', 'path': args.input_file}
461 | else:
462 | logger.error("Input file must be either .json or .py")
463 | return
464 |
465 | # Initialize and run benchmark
466 | runner = BenchmarkRunner(
467 | target_rps_per_connection=args.target_rps_per_connection,
468 | parallel_connections=args.parallel_connections,
469 | duration_seconds=args.duration,
470 | server_url=args.server_url,
471 | warmup_seconds=args.warmup,
472 | input_source=input_source,
473 | request_batch_size=args.request_batch_size,
474 | log_dir=args.log_dir,
475 | debug=args.debug,
476 | request_timeout=args.request_timeout
477 | )
478 |
479 | total_target_rps = (args.target_rps_per_connection or 'unlimited')
480 | if args.target_rps_per_connection:
481 | total_target_rps = args.target_rps_per_connection * args.parallel_connections
482 |
483 | print(f"\n{'='*80}")
484 | print("BENCHMARK CONFIGURATION")
485 | print(f"{'='*80}")
486 | print(f"Server URL: {args.server_url}")
487 | print(f"Parallel connections: {args.parallel_connections}")
488 | print(f"Target RPS/conn: {args.target_rps_per_connection or 'unlimited'}")
489 | print(f"Total target RPS: {total_target_rps}")
490 | print(f"Duration: {args.duration}s (+ {args.warmup}s warmup)")
491 | print(f"Request batch size: {args.request_batch_size}")
492 | print(f"Input source: {args.input_file}")
493 | print(f"Log directory: {args.log_dir or 'disabled'}")
494 | print(f"Debug mode: {'enabled' if args.debug else 'disabled'}")
495 | print(f"Request timeout: {args.request_timeout}s")
496 | print(f"{'='*80}\n")
497 |
498 | try:
499 | results = runner.run_benchmark()
500 |
501 | if results:
502 | # Write results to file
503 | with open(args.results_file, 'w') as f:
504 | json.dump(results, f, indent=2)
505 | print(f"\nDetailed results saved to: {args.results_file}")
506 |
507 | # Print formatted results
508 | print_results(results)
509 | else:
510 | print("\nNo results generated. Benchmark may have been interrupted.")
511 | except KeyboardInterrupt:
512 | print("\nBenchmark interrupted by user.")
513 | except Exception as e:
514 | logger.error(f"Benchmark failed: {e}")
515 | if args.debug:
516 | raise
517 |
518 | if __name__ == "__main__":
519 | main()
--------------------------------------------------------------------------------