├── .github └── workflows │ ├── main.yml │ └── python_client.yml ├── .gitignore ├── LICENSE ├── README.md ├── cli.md ├── clients ├── browser_side_js │ └── client.html └── python │ ├── README.md │ ├── fdclient │ ├── __init__.py │ └── client.py │ └── setup.py ├── fastdeploy ├── .gitignore ├── __init__.py ├── __main__.py ├── _infer.py ├── _loop.py ├── _rest.py ├── _utils.py └── monitor.sh ├── recipe.md ├── recipes ├── .gitignore ├── echo │ ├── .dockerignore │ ├── .gitignore │ ├── example.py │ ├── extra_prometheus_metrics.py │ ├── fastDeploy.auto_dockerfile │ ├── predictor.py │ └── requirements.txt ├── echo_chained │ ├── .dockerignore │ ├── .gitignore │ ├── example.py │ ├── predictor_1.py │ ├── predictor_2.py │ └── requirements.txt └── text_embeddings │ ├── example.py │ ├── predictor.py │ ├── requirements.txt │ └── words.txt ├── setup.py └── testing ├── README.md └── benchmark.py /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: Upload Python Package 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | paths: 7 | - 'setup.py' 8 | workflow_dispatch: 9 | 10 | jobs: 11 | pypi: 12 | runs-on: ubuntu-latest 13 | 14 | permissions: 15 | id-token: write 16 | 17 | steps: 18 | - uses: actions/checkout@v3 19 | - uses: actions/setup-python@v4 20 | with: 21 | python-version: '3.x' 22 | 23 | - name: Install dependencies 24 | run: python -m pip install -U build 25 | 26 | - name: Build 27 | run: python -m build 28 | 29 | - name: Publish 30 | uses: pypa/gh-action-pypi-publish@release/v1 31 | -------------------------------------------------------------------------------- /.github/workflows/python_client.yml: -------------------------------------------------------------------------------- 1 | name: Upload Python Package 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | paths: 7 | - 'clients/python/setup.py' 8 | workflow_dispatch: 9 | 10 | jobs: 11 | pypi: 12 | runs-on: ubuntu-latest 13 | 14 | permissions: 15 | id-token: write 16 | 17 | steps: 18 | - uses: actions/checkout@v3 19 | - uses: actions/setup-python@v4 20 | with: 21 | python-version: '3.x' 22 | 23 | - name: Install dependencies 24 | run: python -m pip install -U build 25 | 26 | - name: Build 27 | run: python -m build 28 | working-directory: clients/python 29 | 30 | - name: Move package 31 | run: mv clients/python/dist ./ 32 | 33 | - name: Publish 34 | uses: pypa/gh-action-pypi-publish@release/v1 35 | 36 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | .results_index/ 7 | .request_queue/ 8 | 9 | # C extensions 10 | *.so 11 | 12 | recipes/*/*index 13 | 14 | # Distribution / packaging 15 | .Python 16 | build/ 17 | develop-eggs/ 18 | dist/ 19 | downloads/ 20 | eggs/ 21 | .eggs/ 22 | lib/ 23 | lib64/ 24 | parts/ 25 | sdist/ 26 | var/ 27 | wheels/ 28 | pip-wheel-metadata/ 29 | share/python-wheels/ 30 | *.egg-info/ 31 | .installed.cfg 32 | *.egg 33 | MANIFEST 34 | 35 | # PyInstaller 36 | # Usually these files are written by a python script from a template 37 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
38 | *.manifest 39 | *.spec 40 | 41 | # Installer logs 42 | pip-log.txt 43 | pip-delete-this-directory.txt 44 | 45 | # Unit test / coverage reports 46 | htmlcov/ 47 | .tox/ 48 | .nox/ 49 | .coverage 50 | .coverage.* 51 | .cache 52 | nosetests.xml 53 | coverage.xml 54 | *.cover 55 | *.py,cover 56 | .hypothesis/ 57 | .pytest_cache/ 58 | 59 | # Translations 60 | *.mo 61 | *.pot 62 | 63 | # Django stuff: 64 | *.log 65 | local_settings.py 66 | db.sqlite3 67 | db.sqlite3-journal 68 | 69 | # Flask stuff: 70 | instance/ 71 | .webassets-cache 72 | 73 | # Scrapy stuff: 74 | .scrapy 75 | 76 | # Sphinx documentation 77 | docs/_build/ 78 | 79 | # PyBuilder 80 | target/ 81 | 82 | # Jupyter Notebook 83 | .ipynb_checkpoints 84 | 85 | # IPython 86 | profile_default/ 87 | ipython_config.py 88 | 89 | # pyenv 90 | .python-version 91 | 92 | # pipenv 93 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 94 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 95 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 96 | # install all needed dependencies. 97 | #Pipfile.lock 98 | 99 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 100 | __pypackages__/ 101 | 102 | # Celery stuff 103 | celerybeat-schedule 104 | celerybeat.pid 105 | 106 | # SageMath parsed files 107 | *.sage.py 108 | 109 | # Environments 110 | .env 111 | .venv 112 | env/ 113 | venv/ 114 | ENV/ 115 | env.bak/ 116 | venv.bak/ 117 | 118 | # Spyder project settings 119 | .spyderproject 120 | .spyproject 121 | 122 | # Rope project settings 123 | .ropeproject 124 | 125 | # mkdocs documentation 126 | /site 127 | 128 | # mypy 129 | .mypy_cache/ 130 | .dmypy.json 131 | dmypy.json 132 | 133 | # Pyre type checker 134 | .pyre/ 135 | 136 | # Mac DS_Store 137 | .DS_Store -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 notAI-tech 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## fastDeploy 2 | #### easy and performant micro-services for Python Deep Learning inference pipelines 3 | 4 | - Deploy any Python inference pipeline with minimal extra code 5 | - Auto batching of concurrent inputs is enabled out of the box 6 | - No changes to inference code (unlike tf-serving etc.); the entire pipeline runs as is 7 | - Prometheus metrics (OpenMetrics) are exposed for monitoring 8 | - Auto generates clean dockerfiles and Kubernetes health-check and scaling friendly APIs 9 | - Sequentially chained inference pipelines are supported out of the box 10 | - Can be queried from any language via easy-to-use REST APIs 11 | - Easy to understand (simple consumer-producer architecture) and simple code base 12 | 13 | 14 | #### Installation: 15 | ```bash 16 | pip install --upgrade fastdeploy fdclient 17 | # fdclient is optional, only needed if you want to use the python client 18 | ``` 19 | 20 | #### [CLI explained](https://github.com/notAI-tech/fastDeploy/blob/master/cli.md) 21 | 22 | #### Start fastDeploy server on a recipe: 23 | ```bash 24 | # Invoke fastdeploy 25 | python -m fastdeploy --help 26 | # or 27 | fastdeploy --help 28 | 29 | # Start prediction "loop" for recipe "echo" 30 | fastdeploy --loop --recipe recipes/echo 31 | 32 | # Start rest apis for recipe "echo" 33 | fastdeploy --rest --recipe recipes/echo 34 | ``` 35 | 36 | #### Send a request and get predictions: 37 | 38 | - [Python client usage](https://github.com/notAI-tech/fastDeploy/blob/master/clients/python/README.md) 39 | 40 | - [curl usage]() 41 | 42 | - [Nodejs client usage]() 43 | 44 | #### Auto generate dockerfile and build docker image: 45 | ```bash 46 | # Writes the dockerfile for recipe "echo" 47 | # and builds the docker image if docker is installed 48 | # base defaults to python:3.8-slim 49 | fastdeploy --build --recipe recipes/echo 50 | 51 | # Run docker image 52 | docker run -it -p8080:8080 fastdeploy_echo 53 | ``` 54 | 55 | #### Serving your model (recipe): 56 | 57 | - [Writing your model/pipeline's recipe](https://github.com/notAI-tech/fastDeploy/blob/master/recipe.md) 58 | 59 | 60 | ### Where to use fastDeploy? 61 | 62 | - To deploy any non-ultra-lightweight model, i.e.: most DL models with >50ms inference time per example 63 | - If the model/pipeline benefits from batch inference, fastDeploy is perfect for your use-case 64 | - If you are going to have individual inputs (for example, a user's search input that needs to be vectorized, or an image to be classified) 65 | - In the case of individual inputs, requests coming in at close intervals will be batched together and sent to the model as a batch 66 | - Perfect for creating internal micro-services that separate your model, pre- and post-processing from business logic 67 | - Since the prediction loop and inference endpoints are separate and connected via a sqlite-backed queue, they can be scaled independently 68 | 69 | 70 | ### Where not to use fastDeploy?
71 | - Non CPU/GPU-heavy models that are better off running in parallel rather than in batches 72 | - If your predictor calls some external API or uploads to S3 etc. in a blocking way 73 | - IO-heavy, non-batching use cases (e.g.: querying ES or a DB for each input) 74 | - For these cases it is better to serve directly from the REST API code (instead of the consumer-producer mechanism) so that high concurrency can be achieved 75 | -------------------------------------------------------------------------------- /cli.md: -------------------------------------------------------------------------------- 1 | 2 | ### fastDeploy CLI usage explained 3 | 4 | 5 | - Invoking the CLI 6 | ```bash 7 | fastdeploy --help 8 | # or 9 | python -m fastdeploy --help 10 | ``` 11 | 12 | 13 | #### Prediction loop 14 | - Start the prediction loop on your recipe 15 | ```bash 16 | fastdeploy --loop --recipe ./recipes/echo 17 | ``` 18 | 19 | - Optional config can be passed with the `--config` flag 20 | 21 | ```bash 22 | fastdeploy --loop --recipe ./recipes/echo --config "predictor_name=predictor.py,optimal_batch_size=0" 23 | ``` 24 | 25 | | Config | Description | Default | 26 | | --- | --- | --- | 27 | | predictor_name | predictor.py or predictor_N.py, name of the predictor run in the loop | predictor.py | 28 | | optimal_batch_size | integer max batch size for the predictor | 0 (auto determine) | 29 | 30 | - The same config can also be passed as env variables 31 | ```bash 32 | export PREDICTOR_NAME=predictor.py 33 | export OPTIMAL_BATCH_SIZE=0 34 | fastdeploy --loop --recipe ./recipes/echo 35 | ``` 36 | 37 | 38 | 39 | #### Start API server 40 | - Start the API server on your recipe 41 | ```bash 42 | fastdeploy --rest --recipe ./recipes/echo 43 | ``` 44 | 45 | - Optional config can be passed with the `--config` flag 46 | 47 | ```bash 48 | fastdeploy --rest --recipe ./recipes/echo --config "max_request_batch_size=0,workers=3,timeout=480,host=0.0.0.0,port=8080,only_async=false,allow_pickle=true,keep_alive=60" 49 | ``` 50 | 51 | - The same config can also be passed as env variables 52 | ```bash 53 | export MAX_REQUEST_BATCH_SIZE=0 54 | export WORKERS=3 55 | export TIMEOUT=480 56 | export HOST=0.0.0.0 57 | export PORT=8080 58 | export ONLY_ASYNC=false 59 | export ALLOW_PICKLE=true 60 | export KEEP_ALIVE=60 61 | fastdeploy --rest --recipe ./recipes/echo 62 | ``` 63 | 64 | #### --config options explained 65 | 66 | | Config | Description | Default | 67 | | --- | --- | --- | 68 | | max_request_batch_size | integer max number of inputs in a batch. useful when the API is exposed externally, to limit the max number of inputs allowed in a single request | 0 (None) | 69 | | workers | number of REST API gunicorn workers.
3 is more than enough generally | 3 | 70 | | timeout | seconds after which the request will fail | 480 | 71 | | host | host for the REST server | 0.0.0.0 | 72 | | port | port for the REST server | 8080 | 73 | | only_async | true/false | false | 74 | | allow_pickle | true/false - set to false to disallow the pickle protocol when expecting external inputs | true | 75 | | keep_alive | gunicorn gevent keep alive | 60 | 76 | 77 | 78 | #### Build docker image 79 | 80 | - Generate the dockerfile and build the docker image for your recipe 81 | ```bash 82 | fastdeploy --build --recipe ./recipes/echo 83 | ``` 84 | 85 | - Also supports optional config via the `--config` flag 86 | - Both rest and loop config options can be passed here in the same config string 87 | 88 | 89 | -------------------------------------------------------------------------------- /clients/browser_side_js/client.html: --------------------------------------------------------------------------------
[client.html's HTML markup and inline JavaScript did not survive text extraction; only the visible page text remains: the heading "FDClient Test with Multiple File Upload and Download", a multiple-file upload form, a "Download Processed Files:" section, and a "Result:" output area.]
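Since the original page script is not recoverable here, below is a minimal illustrative sketch of how a browser page can call the fastDeploy REST API with plain JSON, based on the `/infer` endpoint behaviour in `fastdeploy/_rest.py`; it is not the original client.html code, and the helper name `inferJSON` is only for illustration.

```javascript
// Illustrative sketch only (not the original client.html script).
// Sends an uncompressed JSON request to fastDeploy's /infer endpoint
// (input_type=json) and reads the JSON response, which has the shape
// {success, unique_id, prediction, reason} as built in fastdeploy/_infer.py.
async function inferJSON(inputs, serverUrl = "http://localhost:8080") {
  const uniqueId = crypto.randomUUID(); // optional; the server generates one if omitted
  const resp = await fetch(`${serverUrl}/infer?input_type=json&unique_id=${uniqueId}`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify(inputs), // must be a JSON array, e.g. ["some", "inputs"]
  });
  const result = await resp.json();
  if (!result.success) {
    throw new Error(result.reason || "prediction failed");
  }
  return result.prediction; // one output per input
}

// Example usage:
// inferJSON(["hello", "world"]).then(console.log).catch(console.error);
```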
-------------------------------------------------------------------------------- /clients/python/README.md: -------------------------------------------------------------------------------- 1 | ## fastDeploy python client 2 | 3 | ```python 4 | from fdclient import FDClient 5 | 6 | client = FDClient('http://localhost:8080') # optional compression=False to disable zstd compression 7 | 8 | # infer 9 | response = client.infer([obj_1, obj_2, ...]) # optional unique_id='some_id' to specify a unique id for the request 10 | 11 | # infer in background 12 | response_future = client.infer_background([obj_1, obj_2, ...]) # optional unique_id='some_id' to specify a unique id for the request 13 | response = response_future.result() # wait for the response and get it 14 | ``` 15 | 16 | - By default fdclient communicates with the fastDeploy server via pickles 17 | - pickle is very useful and makes sense when using the fastDeploy server as a micro service internally, i.e.: all requests to fastDeploy originate from code you have written 18 | - ***PICKLE is secure if all the inputs to fastDeploy originate from your code and not directly from external users' pickles*** 19 | - ***PICKLE is unsecure if you are passing external user inputs to fastDeploy directly without validation in between*** 20 | - start the fastDeploy server with `--config "allow_pickle=false"` if the fastDeploy APIs are exposed to the outside 21 | - the `allow_pickle=false` config on the server side makes fdclient use `msgpack` if available, or `json` if msgpack is not available. 22 | 23 | #### If pickle is unsecure, why use it at all? 24 | 25 | - pickle is great for sending or receiving arbitrary inputs and outputs 26 | - if `allow_pickle=true` (default) your inputs and outputs can be any python objects, eg: np arrays, pd dataframes, float32 anything .... 
27 | - pickle is only unsecure if you are unpickling objects pickled by others (since they can insert malicious code) 28 | - If fastDeploy is being used only for internal microservices, pickle is the best way so enabled by default 29 | -------------------------------------------------------------------------------- /clients/python/fdclient/__init__.py: -------------------------------------------------------------------------------- 1 | from .client import FDClient 2 | -------------------------------------------------------------------------------- /clients/python/fdclient/client.py: -------------------------------------------------------------------------------- 1 | try: 2 | import zstandard 3 | except: 4 | zstandard = None 5 | 6 | try: 7 | import msgpack 8 | except: 9 | msgpack = None 10 | 11 | import threading 12 | import requests 13 | import pickle 14 | import uuid 15 | import time 16 | import json 17 | 18 | 19 | class FDClient: 20 | def __init__(self, server_url, request_timeout, compression=True, use_requests_session=False): 21 | assert server_url.startswith("http://") or server_url.startswith( 22 | "https://" 23 | ), "Server URL must start with http:// or https://" 24 | 25 | self.server_url = server_url 26 | self.local_storage = threading.local() 27 | self.requests_session = requests.Session() if use_requests_session else requests 28 | self.compression = compression if zstandard is not None else False 29 | self.input_type = None 30 | self._set_input_type() 31 | 32 | self.request_timeout = request_timeout 33 | 34 | def _set_input_type(self): 35 | if self.input_type is None: 36 | try: 37 | self.input_type = ( 38 | "pickle" 39 | if self.requests_session.get( 40 | f"{self.server_url}/meta", params={"is_pickle_allowed": ""} 41 | ).json()["is_pickle_allowed"] 42 | else "msgpack" 43 | if msgpack is not None 44 | else "json" 45 | ) 46 | except Exception as e: 47 | self.input_type = None 48 | 49 | @property 50 | def _compressor(self): 51 | if self.compression is False: 52 | return None 53 | 54 | if ( 55 | not hasattr(self.local_storage, "compressor") 56 | or self.local_storage.compressor is None 57 | ): 58 | self.local_storage.compressor = zstandard.ZstdCompressor(level=-1) 59 | return self.local_storage.compressor 60 | 61 | @property 62 | def _decompressor(self): 63 | if self.compression is False: 64 | return None 65 | 66 | if ( 67 | not hasattr(self.local_storage, "decompressor") 68 | or self.local_storage.decompressor is None 69 | ): 70 | self.local_storage.decompressor = zstandard.ZstdDecompressor() 71 | return self.local_storage.decompressor 72 | 73 | @property 74 | def _decompressor(self): 75 | if self.compression is False: 76 | return None 77 | 78 | if ( 79 | not hasattr(self.local_storage, "decompressor") 80 | or self.local_storage.decompressor is None 81 | ): 82 | self.local_storage.decompressor = zstandard.ZstdDecompressor() 83 | return self.local_storage.decompressor 84 | 85 | def infer(self, data, unique_id=None, is_async=False): 86 | if self.input_type is None: 87 | self._set_input_type() 88 | if self.input_type is None: 89 | raise ValueError("Could not connect to server") 90 | 91 | assert isinstance(data, (list, tuple)), "Data must be of type list or tuple" 92 | 93 | unique_id = str(uuid.uuid4()) if not unique_id else unique_id 94 | 95 | if self.input_type == "pickle": 96 | data = pickle.dumps(data, protocol=pickle.HIGHEST_PROTOCOL) 97 | elif self.input_type == "msgpack": 98 | data = msgpack.packb(data, use_bin_type=True) 99 | else: 100 | data = json.dumps(data) 101 | 102 | 
response = self.requests_session.post( 103 | f"{self.server_url}/infer", 104 | params={ 105 | "unique_id": unique_id, 106 | "async": is_async, 107 | "input_type": self.input_type, 108 | "compressed": True if zstandard is not None else False, 109 | "timeout": self.request_timeout, 110 | }, 111 | data=self._compressor.compress(data) if zstandard is not None else data, 112 | headers={"Content-Type": "application/octet-stream"}, 113 | timeout=self.request_timeout * 1.1, 114 | ) 115 | 116 | if self.input_type == "pickle": 117 | return pickle.loads( 118 | self._decompressor.decompress(response.content) 119 | if zstandard is not None 120 | else response.content 121 | ) 122 | elif self.input_type == "msgpack": 123 | return msgpack.unpackb( 124 | self._decompressor.decompress(response.content) 125 | if zstandard is not None 126 | else response.content, 127 | raw=False, 128 | use_list=False, 129 | ) 130 | else: 131 | return json.loads( 132 | self._decompressor.decompress(response.content) 133 | if zstandard is not None 134 | else response.content 135 | ) 136 | 137 | def infer_async(self, data, unique_id=None): 138 | return self.infer(data, unique_id, is_async=True) 139 | 140 | 141 | if __name__ == "__main__": 142 | client = FDClient("http://localhost:8080") 143 | 144 | print(client.input_type) 145 | 146 | s = time.time() 147 | print("infer", client.infer(["this", "is", "some", b"data"]), time.time() - s) 148 | 149 | s = time.time() 150 | x = client.infer_background(["this", "is", b"some", "data"]) 151 | print("infer_background", x.result(), time.time() - s) 152 | 153 | s = time.time() 154 | 155 | print( 156 | "infer_background_multiple 40", 157 | [ 158 | _.result()["success"] 159 | for _ in client.infer_background_multiple( 160 | [["this", b"is", "some", "data"]] * 40 161 | ) 162 | ], 163 | time.time() - s, 164 | ) 165 | -------------------------------------------------------------------------------- /clients/python/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Note: To use the 'upload' functionality of this file, you must: 5 | # $ pip install twine 6 | 7 | import io 8 | import os 9 | import sys 10 | from shutil import rmtree 11 | 12 | from setuptools import find_packages, setup, Command 13 | 14 | # Package meta-data. 15 | NAME = "fdclient" 16 | DESCRIPTION = "fastDeploy python client" 17 | URL = "https://github.com/notAI-tech/fastDeploy" 18 | EMAIL = "praneeth@bpraneeth.com" 19 | AUTHOR = "BEDAPUDI PRANEETH" 20 | REQUIRES_PYTHON = ">=3.6.0" 21 | VERSION = "3.1.1" 22 | 23 | # What packages are required for this module to be executed? 24 | REQUIRED = ["zstandard", "requests", "msgpack"] 25 | 26 | # What packages are optional? 27 | EXTRAS = { 28 | # 'fancy feature': ['django'], 29 | } 30 | 31 | # The rest you shouldn't have to touch too much :) 32 | # ------------------------------------------------ 33 | # Except, perhaps the License and Trove Classifiers! 34 | # If you do change the License, remember to change the Trove Classifier for that! 35 | 36 | here = os.path.abspath(os.path.dirname(__file__)) 37 | 38 | # Import the README and use it as the long-description. 39 | # Note: this will only work if 'README.md' is present in your MANIFEST.in file! 
40 | try: 41 | with io.open(os.path.join(here, "README.md"), encoding="utf-8") as f: 42 | long_description = "\n" + f.read() 43 | except FileNotFoundError: 44 | long_description = DESCRIPTION 45 | 46 | # Load the package's __version__.py module as a dictionary. 47 | about = {} 48 | if not VERSION: 49 | with open(os.path.join(here, NAME, "__version__.py")) as f: 50 | exec(f.read(), about) 51 | else: 52 | about["__version__"] = VERSION 53 | 54 | 55 | class UploadCommand(Command): 56 | """Support setup.py upload.""" 57 | 58 | description = "Build and publish the package." 59 | user_options = [] 60 | 61 | @staticmethod 62 | def status(s): 63 | """Prints things in bold.""" 64 | print("\033[1m{0}\033[0m".format(s)) 65 | 66 | def initialize_options(self): 67 | pass 68 | 69 | def finalize_options(self): 70 | pass 71 | 72 | def run(self): 73 | try: 74 | self.status("Removing previous builds…") 75 | rmtree(os.path.join(here, "dist")) 76 | except OSError: 77 | pass 78 | 79 | self.status("Building Source and Wheel (universal) distribution…") 80 | os.system("{0} setup.py sdist bdist_wheel --universal".format(sys.executable)) 81 | 82 | self.status("Uploading the package to PyPI via Twine…") 83 | os.system("twine upload dist/*") 84 | 85 | self.status("Pushing git tags…") 86 | os.system("git tag v{0}".format(about["__version__"])) 87 | os.system("git push --tags") 88 | 89 | sys.exit() 90 | 91 | 92 | # Where the magic happens: 93 | setup( 94 | name=NAME, 95 | version=about["__version__"], 96 | description=DESCRIPTION, 97 | long_description=long_description, 98 | long_description_content_type="text/markdown", 99 | author=AUTHOR, 100 | author_email=EMAIL, 101 | python_requires=REQUIRES_PYTHON, 102 | url=URL, 103 | packages=find_packages(exclude=("tests",)), 104 | # If your package is a single module, use this instead of 'packages': 105 | # py_modules=['mypackage'], 106 | install_requires=REQUIRED, 107 | extras_require=EXTRAS, 108 | include_package_data=True, 109 | license="MIT", 110 | classifiers=[ 111 | # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers 112 | "License :: OSI Approved :: MIT License", 113 | "Programming Language :: Python", 114 | "Programming Language :: Python :: 3", 115 | "Programming Language :: Python :: 3.6", 116 | "Programming Language :: Python :: Implementation :: CPython", 117 | ], 118 | # $ setup.py publish support. 119 | cmdclass={ 120 | "upload": UploadCommand, 121 | }, 122 | ) 123 | -------------------------------------------------------------------------------- /fastdeploy/.gitignore: -------------------------------------------------------------------------------- 1 | fastdeploy-ui 2 | -------------------------------------------------------------------------------- /fastdeploy/__init__.py: -------------------------------------------------------------------------------- 1 | from . 
import __main__ 2 | -------------------------------------------------------------------------------- /fastdeploy/__main__.py: -------------------------------------------------------------------------------- 1 | import resource 2 | 3 | try: 4 | resource.setrlimit(resource.RLIMIT_NOFILE, (131072, 131072)) 5 | except: 6 | pass 7 | 8 | import os 9 | import sys 10 | import glob 11 | import argparse 12 | import subprocess 13 | 14 | parser = argparse.ArgumentParser( 15 | description="CLI for fastDeploy", formatter_class=argparse.RawTextHelpFormatter 16 | ) 17 | parser.add_argument( 18 | "--recipe", 19 | type=str, 20 | help="Path to recipe folder that contains predictor.py", 21 | required=False, 22 | ) 23 | 24 | parser.add_argument( 25 | "--loop", 26 | help=f"""Start prediction loop""", 27 | required=False, 28 | action="store_true", 29 | ) 30 | 31 | parser.add_argument( 32 | "--rest", 33 | help="""Start REST server""", 34 | required=False, 35 | action="store_true", 36 | ) 37 | 38 | parser.add_argument( 39 | "--build", 40 | help="""Build docker image""", 41 | required=False, 42 | action="store_true", 43 | ) 44 | 45 | parser.add_argument( 46 | "--config", 47 | type=str, 48 | help=""" 49 | example usage: --config "workers=3, timeout:480, allow_pickle=true" 50 | 51 | REST 52 | max_request_batch_size: integer max number of inputs in a batch, default=0 (None) 53 | workers: integer number of workers, default=3 54 | timeout: seconds after which request will fail, default=480 55 | host: host for the REST server, default=0.0.0.0 56 | port: port for the REST server, default=8080 57 | allow_pickle: true/false, default=true 58 | keep_alive: gunicorn gevent keep alive, default=60 59 | 60 | 61 | LOOP 62 | predictor_name: predictor.py or predictor_N.py, name of the predictor run in the loop, default: predictor.py 63 | optimal_batch_size: integer max batch size for the predictor, default=0 (auto) 64 | 65 | DOCKER 66 | base: base image for docker, default=python:3.8-slim 67 | """, 68 | required=False, 69 | default="max_request_batch_size=0,workers=3,timeout=480,host=0.0.0.0,port=8080,allow_pickle=true,predictor_name=predictor.py,optimal_batch_size=0,keep_alive=60,base=python:3.8-slim", 70 | ) 71 | 72 | args = parser.parse_args() 73 | 74 | CONFIG = { 75 | # rest config 76 | "max_request_batch_size": int(os.getenv("MAX_REQUEST_BATCH_SIZE", "0")), 77 | "workers": int(os.getenv("WORKERS", "3")), 78 | "timeout": int(os.getenv("TIMEOUT", "480")), 79 | "host": os.getenv("HOST", "0.0.0.0"), 80 | "port": int(os.getenv("PORT", "8080")), 81 | "allow_pickle": os.getenv("ALLOW_PICKLE", "true").lower() == "true", 82 | # predictor config 83 | "predictor_name": os.getenv("PREDICTOR_NAME", "predictor.py"), 84 | "optimal_batch_size": int(os.getenv("OPTIMAL_BATCH_SIZE", "0")), 85 | "keep_alive": int(os.getenv("KEEP_ALIVE", "60")), 86 | # building docker config 87 | "base": os.getenv("BASE", "python:3.8-slim"), 88 | } 89 | 90 | if args.config: 91 | for config in args.config.split(","): 92 | try: 93 | k, v = config.strip().split("=") 94 | except: 95 | continue 96 | 97 | if os.getenv(k.upper()) is not None: 98 | continue 99 | 100 | try: 101 | CONFIG[k.strip()] = int(v.strip()) 102 | except: 103 | CONFIG[k.strip()] = v.strip() 104 | 105 | for k, v in CONFIG.items(): 106 | os.environ[k.upper()] = str(v) 107 | 108 | sys.path.append(os.path.abspath(args.recipe)) 109 | os.chdir(os.path.abspath(args.recipe)) 110 | 111 | try: 112 | if not os.path.exists(os.path.join(args.recipe, ".gitignore")): 113 | _gitignore_f = 
open(os.path.join(args.recipe, ".gitignore"), "a") 114 | _gitignore_f.write("\nfastdeploy_dbs\nfastdeploy_dbs/*\n") 115 | _gitignore_f.flush() 116 | _gitignore_f.close() 117 | except: 118 | pass 119 | 120 | try: 121 | if not os.path.exists(os.path.join(args.recipe, ".dockerignore")): 122 | _dockerignore_f = open(os.path.join(args.recipe, ".dockerignore"), "w") 123 | _dockerignore_f.write("\nfastdeploy_dbs\nfastdeploy_dbs/*\n") 124 | _dockerignore_f.flush() 125 | _dockerignore_f.close() 126 | except: 127 | pass 128 | 129 | 130 | def loop(): 131 | from ._loop import start_loop 132 | 133 | start_loop() 134 | 135 | 136 | def rest(): 137 | from ._rest import app 138 | import gunicorn.app.base 139 | 140 | class StandaloneApplication(gunicorn.app.base.BaseApplication): 141 | def __init__(self, app, options=None): 142 | self.options = options or {} 143 | self.application = app 144 | super().__init__() 145 | 146 | def load_config(self): 147 | config = { 148 | key: value 149 | for key, value in self.options.items() 150 | if key in self.cfg.settings and value is not None 151 | } 152 | for key, value in config.items(): 153 | self.cfg.set(key.lower(), value) 154 | 155 | def load(self): 156 | return self.application 157 | 158 | options = { 159 | "preload": "", 160 | "bind": "%s:%s" % (CONFIG["host"], CONFIG["port"]), 161 | "workers": CONFIG["workers"], 162 | "worker_connections": 1000, 163 | "worker_class": "gevent", 164 | "timeout": CONFIG["timeout"], 165 | "allow_redirects": True, 166 | "keepalive": CONFIG["keep_alive"], 167 | "keep_alive": CONFIG["keep_alive"], 168 | } 169 | 170 | print( 171 | f"fastDeploy REST interface active at http://{CONFIG['host']}:{CONFIG['port']}" 172 | ) 173 | 174 | StandaloneApplication(app, options).run() 175 | 176 | 177 | def build_docker_image(): 178 | if not os.path.exists("requirements.txt"): 179 | raise Exception("requirements.txt not found") 180 | 181 | f = open("fastDeploy.auto_dockerfile", "w") 182 | f.write( 183 | f"""FROM {CONFIG['base']} 184 | RUN python3 -m pip install --upgrade --no-cache-dir pip fastdeploy 185 | 186 | ENV {' '.join([f"{k.upper()}={v}" for k, v in CONFIG.items()])} 187 | 188 | ADD . /recipe 189 | WORKDIR /recipe 190 | {'' if not os.path.exists("extras.sh") else 'RUN chmod +x /recipe/extras.sh && /recipe/extras.sh'} 191 | RUN python3 -m pip install --no-cache-dir -r /recipe/requirements.txt 192 | RUN cd /recipe && python3 -c "from predictor import predictor; from example import example; predictor(example)" 193 | 194 | ENTRYPOINT ["/bin/sh", "-c"] 195 | 196 | CMD ["ulimit -n 1000000 && python3 -m fastdeploy --recipe /recipe --rest & python3 -m fastdeploy --recipe /recipe --loop"] 197 | """ 198 | ) 199 | f.flush() 200 | f.close() 201 | 202 | print(f"Dockerfile generated at {os.path.abspath('fastDeploy.auto_dockerfile')}") 203 | 204 | print( 205 | f"Run `docker build -f fastDeploy.auto_dockerfile -t {os.path.abspath('.')}` to build the image" 206 | ) 207 | exit() 208 | 209 | 210 | if args.loop: 211 | loop() 212 | 213 | elif args.rest: 214 | rest() 215 | 216 | elif args.build: 217 | build_docker_image() 218 | -------------------------------------------------------------------------------- /fastdeploy/_infer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import json 4 | import pickle 5 | 6 | import msgpack 7 | import zstandard 8 | 9 | import threading 10 | 11 | from . 
import _utils 12 | 13 | started_at_time = time.time() 14 | 15 | # make sure all predictors are running before starting the inference server 16 | # if any are not yet started/ still loading then wait for them to start 17 | for predictor_file, predictor_sequence in _utils.PREDICTOR_FILE_TO_SEQUENCE.items(): 18 | log_printed = False 19 | while True: 20 | try: 21 | time_per_example = _utils.META_INDEX.get( 22 | f"{predictor_sequence}", select_keys=["time_per_example"] 23 | )[f"{predictor_sequence}"]["time_per_example"] 24 | started_at_time = time.time() 25 | break 26 | except: 27 | if not log_printed: 28 | _utils.logger.info(f"Waiting for {predictor_file} to start") 29 | log_printed = True 30 | time.sleep(1) 31 | 32 | 33 | _utils.logger.info(f"pids: {_utils.get_fd_pids()}") 34 | 35 | class Infer: 36 | started_at_time = started_at_time 37 | 38 | def __init__( 39 | self, 40 | allow_pickle=os.getenv("ALLOW_PICKLE", "true").lower() == "true", 41 | ): 42 | self.local_storage = threading.local() 43 | self.allow_pickle = allow_pickle 44 | 45 | @property 46 | def _compressor(self): 47 | if ( 48 | not hasattr(self.local_storage, "compressor") 49 | or self.local_storage.compressor is None 50 | ): 51 | self.local_storage.compressor = zstandard.ZstdCompressor(level=-1) 52 | return self.local_storage.compressor 53 | 54 | @property 55 | def _decompressor(self): 56 | if ( 57 | not hasattr(self.local_storage, "decompressor") 58 | or self.local_storage.decompressor is None 59 | ): 60 | self.local_storage.decompressor = zstandard.ZstdDecompressor() 61 | return self.local_storage.decompressor 62 | 63 | def read_inputs(self, unique_id, inputs, input_type, is_compressed): 64 | if input_type == "pickle": 65 | if not self.allow_pickle: 66 | _utils.logger.warning( 67 | f"{unique_id}: tried to use pickle input, but pickle is disallowed" 68 | ) 69 | raise Exception("pickle input disallowed, use msgpack or json") 70 | 71 | inputs = pickle.loads( 72 | inputs if not is_compressed else self._decompressor.decompress(inputs) 73 | ) 74 | _utils.logger.debug(f"pickle input read") 75 | 76 | elif input_type == "msgpack": 77 | inputs = msgpack.unpackb( 78 | inputs if not is_compressed else self._decompressor.decompress(inputs), 79 | use_list=False, 80 | raw=False, 81 | ) 82 | 83 | _utils.logger.debug(f"{unique_id}: msgpack input read") 84 | 85 | elif input_type == "json": 86 | inputs = json.loads( 87 | inputs if not is_compressed else self._decompressor.decompress(inputs) 88 | ) 89 | 90 | # for backward compatibility 91 | try: 92 | inputs = inputs["data"] 93 | except: 94 | pass 95 | 96 | _utils.logger.debug(f"{unique_id}: json input read") 97 | 98 | else: 99 | _utils.logger.warning(f"{unique_id}: input_type {input_type} not supported") 100 | raise Exception(f"input_type {input_type} not supported") 101 | 102 | return inputs 103 | 104 | def create_response(self, unique_id, response, is_compressed, input_type): 105 | success = response["success"] 106 | if input_type == "pickle": 107 | response = pickle.dumps(response) 108 | elif input_type == "msgpack": 109 | response = msgpack.packb(response, use_bin_type=True) 110 | elif input_type == "json": 111 | pass 112 | 113 | if is_compressed: 114 | response = self._compressor.compress(response) 115 | _utils.logger.debug(f"{unique_id}: response compressed") 116 | 117 | return success, response 118 | 119 | def get_timeout_response( 120 | self, unique_id, is_compressed, input_type, is_client_timeout=False 121 | ): 122 | if is_client_timeout: 123 | _utils.MAIN_INDEX.update( 124 | { 125 | 
unique_id: { 126 | "-1.predicted_at": time.time(), 127 | "timedout_in_queue": True, 128 | } 129 | } 130 | ) 131 | _utils.logger.warning(f"{unique_id}: client timeout") 132 | 133 | return self.create_response( 134 | unique_id, 135 | { 136 | "success": False, 137 | "reason": "timeout" if not is_client_timeout else "client_timeout", 138 | "unique_id": unique_id, 139 | "prediction": None, 140 | }, 141 | is_compressed, 142 | input_type, 143 | ) 144 | 145 | def add_to_infer_queue( 146 | self, inputs: bytes, unique_id: str, input_type: str, is_compressed: bool 147 | ): 148 | try: 149 | request_received_at = time.time() 150 | _utils.logger.debug(f"{unique_id}: reading inputs") 151 | 152 | inputs = self.read_inputs(unique_id, inputs, input_type, is_compressed) 153 | 154 | if inputs is None: 155 | _utils.logger.warning(f"{unique_id}: inputs are None") 156 | return self.create_response( 157 | unique_id, 158 | { 159 | "success": False, 160 | "reason": f"inputs have to be {'pickle,' if self.allow_pickle else ''} msgpack or json", 161 | "unique_id": unique_id, 162 | "prediction": None, 163 | }, 164 | is_compressed, 165 | input_type, 166 | ) 167 | 168 | if not isinstance(inputs, (list, tuple)): 169 | _utils.logger.warning(f"{unique_id}: inputs have to be a list or tuple") 170 | return self.create_response( 171 | unique_id, 172 | { 173 | "success": False, 174 | "reason": "inputs have to be a list or tuple", 175 | "unique_id": unique_id, 176 | "prediction": None, 177 | }, 178 | is_compressed, 179 | input_type, 180 | ) 181 | 182 | if not inputs: 183 | _utils.logger.debug(f"{unique_id}: empty inputs") 184 | return self.create_response( 185 | unique_id, 186 | { 187 | "success": True, 188 | "reason": "empty inputs", 189 | "unique_id": unique_id, 190 | "prediction": [], 191 | }, 192 | is_compressed, 193 | input_type, 194 | ) 195 | 196 | else: 197 | # -1 is the predictor sequence for the rest server, basically where the request originates 198 | _utils.MAIN_INDEX.update( 199 | { 200 | unique_id: { 201 | "-1.outputs": inputs, 202 | "-1.received_at": request_received_at, 203 | "-1.predicted_in_batch_of": len(inputs), 204 | "-1.predicted_at": 0, 205 | "last_predictor_sequence": -1, 206 | "last_predictor_success": True, 207 | "timedout_in_queue": None, 208 | } 209 | } 210 | ) 211 | 212 | _utils.logger.debug(f"{unique_id}: added to request queue") 213 | 214 | return True, None 215 | except Exception as ex: 216 | _utils.logger.exception(ex, exc_info=True) 217 | return self.create_response( 218 | unique_id, 219 | { 220 | "success": False, 221 | "reason": str(ex), 222 | "unique_id": unique_id, 223 | "prediction": None, 224 | }, 225 | is_compressed, 226 | input_type, 227 | ) 228 | 229 | def get_responses_for_unique_ids(self, unique_ids, is_compresseds, input_types): 230 | all_current_results = _utils.MAIN_INDEX.get( 231 | unique_ids, 232 | select_keys=[ 233 | f"{_utils.LAST_PREDICTOR_SEQUENCE}.outputs", 234 | "last_predictor_success", 235 | "last_predictor_sequence", 236 | "timedout_in_queue", 237 | ], 238 | ) 239 | 240 | all_responses = {} 241 | 242 | updations = {} 243 | still_processing = [] 244 | 245 | for unique_id, is_compressed, input_type in zip( 246 | unique_ids, is_compresseds, input_types 247 | ): 248 | current_results = all_current_results[unique_id] 249 | 250 | if current_results["timedout_in_queue"]: 251 | _utils.logger.warning(f"{unique_id}: timedout in queue") 252 | updations[unique_id] = { 253 | "-1.predicted_at": time.time(), 254 | } 255 | all_responses[unique_id] = self.get_timeout_response( 256 | 
unique_id, is_compressed, input_type 257 | ) 258 | _utils.logger.debug(f"{unique_id}: timedout in queue response created") 259 | 260 | elif ( 261 | current_results["last_predictor_success"] is True 262 | and current_results["last_predictor_sequence"] 263 | == _utils.LAST_PREDICTOR_SEQUENCE 264 | ): 265 | updations[unique_id] = { 266 | "-1.predicted_at": time.time(), 267 | } 268 | 269 | all_responses[unique_id] = self.create_response( 270 | unique_id, 271 | { 272 | "success": True, 273 | "unique_id": unique_id, 274 | "prediction": current_results[ 275 | f"{_utils.LAST_PREDICTOR_SEQUENCE}.outputs" 276 | ], 277 | "reason": None, 278 | }, 279 | is_compressed, 280 | input_type, 281 | ) 282 | _utils.logger.debug(f"{unique_id}: response created") 283 | elif current_results["last_predictor_success"] is False: 284 | _utils.logger.warning( 285 | f"{unique_id}: predictor failed at {current_results['last_predictor_sequence']}" 286 | ) 287 | updations[unique_id] = { 288 | "-1.predicted_at": time.time(), 289 | } 290 | all_responses[unique_id] = self.create_response( 291 | unique_id, 292 | { 293 | "success": False, 294 | "reason": f"prediction failed predictor {current_results['last_predictor_sequence']}", 295 | "unique_id": unique_id, 296 | "prediction": None, 297 | }, 298 | is_compressed, 299 | input_type, 300 | ) 301 | _utils.logger.debug(f"{unique_id}: failed response created") 302 | 303 | else: 304 | still_processing.append(unique_id) 305 | 306 | if updations: 307 | _utils.MAIN_INDEX.update(updations) 308 | 309 | if still_processing: 310 | _utils.logger.debug(f"Still processing: {still_processing}") 311 | 312 | return all_responses 313 | -------------------------------------------------------------------------------- /fastdeploy/_loop.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import importlib 4 | 5 | from . 
import _utils 6 | 7 | 8 | def load_predictor(predictor_name): 9 | predictor = importlib.import_module(os.path.splitext(predictor_name)[0]).predictor 10 | predictor_sequence = _utils.PREDICTOR_FILE_TO_SEQUENCE[predictor_name] 11 | _utils.logger.debug( 12 | f"{predictor_name}: predictor loaded with predictor_sequence {predictor_sequence}" 13 | ) 14 | return predictor, predictor_sequence 15 | 16 | 17 | def get_example(predictor_sequence): 18 | if predictor_sequence == 0: 19 | return _utils.example 20 | 21 | while True: 22 | _utils.logger.debug(f"Waiting for previous predictor to finish warmup") 23 | try: 24 | example = _utils.META_INDEX.get( 25 | f"{predictor_sequence - 1}", select_keys=["example_output"] 26 | )[f"{predictor_sequence - 1}"]["example_output"] 27 | if example is not None: 28 | return example 29 | except: 30 | time.sleep(1) 31 | 32 | 33 | def initialize_predictor( 34 | predictor, 35 | predictor_name, 36 | predictor_sequence, 37 | example, 38 | optimal_batch_size, 39 | ): 40 | example_output = _utils.warmup(predictor, example) 41 | _utils.logger.info(f"{predictor_name}: warmup done") 42 | 43 | optimal_batch_size, time_per_example = _utils.calculate_optimum_batch_sizes( 44 | predictor, predictor_sequence, example, optimal_batch_size 45 | ) 46 | 47 | return { 48 | "optimal_batch_size": optimal_batch_size, 49 | "time_per_example": time_per_example, 50 | "predictor_name": predictor_name, 51 | "predictor_sequence": predictor_sequence, 52 | "request_poll_time": 0.01, 53 | "example_output": example_output, 54 | "status": "running", 55 | } 56 | 57 | 58 | def process_batch(predictor, input_batch, optimal_batch_size): 59 | last_predictor_success = False 60 | received_at = time.time() 61 | try: 62 | results = predictor(input_batch, batch_size=optimal_batch_size) 63 | last_predictor_success = True 64 | except Exception as ex: 65 | _utils.logger.exception(ex, exc_info=True) 66 | results = [None] * len(input_batch) 67 | 68 | predicted_at = time.time() 69 | 70 | if len(results) != len(input_batch): 71 | raise Exception( 72 | f"Predictor returned {len(results)} results for {len(input_batch)} inputs" 73 | ) 74 | 75 | return results, last_predictor_success, received_at, predicted_at 76 | 77 | 78 | to_process = {} 79 | current_sum_of_to_process = 0 80 | 81 | 82 | def fetch_batch( 83 | main_index, 84 | predictor_sequence, 85 | optimal_batch_size, 86 | max_wait_time_for_batch_collection, 87 | ): 88 | global to_process 89 | global current_sum_of_to_process 90 | 91 | unique_id_wise_input_count = {} 92 | input_batch = [] 93 | current_batch_length = 0 94 | batch_collection_started_at = time.time() 95 | last_input_received_at = time.time() 96 | 97 | while current_batch_length < optimal_batch_size: 98 | if current_sum_of_to_process < optimal_batch_size: 99 | to_process.update( 100 | main_index.search( 101 | query={ 102 | "-1.predicted_at": 0, # prediction not yet done 103 | "last_predictor_success": True, # last predictor success 104 | "last_predictor_sequence": predictor_sequence 105 | - 1, # last predictor sequence 106 | "timedout_in_queue": {"$ne": True}, # not timedout in queue 107 | }, 108 | n=optimal_batch_size, 109 | select_keys=[f"{predictor_sequence - 1}.outputs"], 110 | update={ 111 | "last_predictor_sequence": predictor_sequence, # set last predictor sequence to current predictor sequence 112 | "last_predictor_success": None, # reset last predictor success 113 | f"{predictor_sequence}.received_at": time.time(), # set received at to current time 114 | }, 115 | ) 116 | ) 117 | 118 | for unique_id, 
data in to_process.items(): 119 | if current_batch_length > optimal_batch_size * 0.8: 120 | break 121 | outputs = data[f"{predictor_sequence - 1}.outputs"] 122 | input_count = len(outputs) 123 | unique_id_wise_input_count[unique_id] = input_count 124 | input_batch.extend(outputs) 125 | current_batch_length += input_count 126 | last_input_received_at = time.time() 127 | 128 | for unique_id in unique_id_wise_input_count.keys(): 129 | try: 130 | del to_process[unique_id] 131 | except: 132 | pass 133 | 134 | current_sum_of_to_process = sum( 135 | len(v[f"{predictor_sequence - 1}.outputs"]) for v in to_process.values() 136 | ) 137 | 138 | if current_batch_length == 0: 139 | if time.time() - last_input_received_at > 5: 140 | time.sleep(0.05) 141 | else: 142 | time.sleep(max_wait_time_for_batch_collection / 2) 143 | continue 144 | 145 | elif ( 146 | time.time() - batch_collection_started_at 147 | < max_wait_time_for_batch_collection 148 | and current_batch_length / optimal_batch_size < 0.9 149 | ): 150 | time.sleep(max_wait_time_for_batch_collection / 2) 151 | continue 152 | 153 | else: 154 | # finished collecting batch 155 | break 156 | 157 | _utils.logger.info( 158 | f"Fetched batch {unique_id_wise_input_count} with {current_sum_of_to_process} remaining in memory, to_process: {len(to_process)}" 159 | ) 160 | 161 | return unique_id_wise_input_count, input_batch 162 | 163 | 164 | def prepare_results( 165 | unique_id_wise_input_count, 166 | results, 167 | predictor_sequence, 168 | last_predictor_success, 169 | received_at, 170 | predicted_at, 171 | current_batch_length, 172 | ): 173 | """Prepare results for updating the main index.""" 174 | unique_id_wise_results = {} 175 | total_input_count_till_now = 0 176 | 177 | for unique_id, input_count in unique_id_wise_input_count.items(): 178 | unique_id_wise_results[unique_id] = { 179 | f"{predictor_sequence}.outputs": results[ 180 | total_input_count_till_now : total_input_count_till_now + input_count 181 | ], 182 | f"{predictor_sequence}.predicted_at": predicted_at, 183 | "last_predictor_success": last_predictor_success, 184 | f"{predictor_sequence}.received_at": received_at, 185 | f"{predictor_sequence}.predicted_in_batch_of": current_batch_length, 186 | } 187 | total_input_count_till_now += input_count 188 | 189 | return unique_id_wise_results 190 | 191 | 192 | def start_loop( 193 | predictor_name=os.getenv("PREDICTOR_NAME"), 194 | optimal_batch_size=int(os.getenv("OPTIMAL_BATCH_SIZE")), 195 | ): 196 | """Main loop for processing predictions.""" 197 | timeout_time = float(os.getenv("TIMEOUT", 0)) 198 | predictor, predictor_sequence = load_predictor(predictor_name) 199 | example = get_example(predictor_sequence) 200 | predictor_info = initialize_predictor( 201 | predictor, predictor_name, predictor_sequence, example, optimal_batch_size 202 | ) 203 | _utils.META_INDEX.update({f"{predictor_sequence}": predictor_info}) 204 | 205 | optimal_batch_size = predictor_info["optimal_batch_size"] 206 | time_per_example = predictor_info["time_per_example"] 207 | max_wait_time_for_batch_collection = max(0.003, time_per_example * 0.51) 208 | 209 | _utils.logger.info( 210 | f"""{predictor_name} 211 | optimal_batch_size: {optimal_batch_size} 212 | time_per_example: {time_per_example} 213 | predictor_sequence: {predictor_sequence} 214 | max_wait_time_for_batch_collection: {max_wait_time_for_batch_collection} 215 | """ 216 | ) 217 | 218 | prediction_loop_started_at = time.time() 219 | 220 | while True: 221 | """ 222 | Set timedout_in_queue to True for all the 
predictions that have been in the queue for more than timeout_time seconds 223 | and delete older than 30 seconds predictions that have finished prediction 224 | """ 225 | 226 | timedout_in_queue_unique_ids = _utils.MAIN_INDEX.search( 227 | query={ 228 | "-1.predicted_at": 0, 229 | "-1.received_at": {"$lt": time.time() - timeout_time}, 230 | "timedout_in_queue": {"$ne": True}, 231 | "last_predictor_sequence": {"$ne": _utils.LAST_PREDICTOR_SEQUENCE}, 232 | }, 233 | update={"timedout_in_queue": True}, 234 | select_keys=[], 235 | ) 236 | 237 | if timedout_in_queue_unique_ids: 238 | _utils.logger.warning( 239 | f"{_utils.MAIN_INDEX.count()} in queue, set timedout_in_queue to True for {list(timedout_in_queue_unique_ids)} unique_ids" 240 | ) 241 | 242 | _utils.MAIN_INDEX.delete( 243 | query={ 244 | "$and": [ 245 | {"-1.predicted_at": {"$gt": 0}}, 246 | {"-1.predicted_at": {"$lt": time.time() - 40}}, 247 | ] 248 | }, 249 | ) 250 | 251 | unique_id_wise_input_count, input_batch = fetch_batch( 252 | _utils.MAIN_INDEX, 253 | predictor_sequence, 254 | optimal_batch_size, 255 | max_wait_time_for_batch_collection, 256 | ) 257 | 258 | _utils.logger.debug(f"Processing batch {unique_id_wise_input_count}") 259 | 260 | process_batch_started_at = time.time() 261 | results, last_predictor_success, received_at, predicted_at = process_batch( 262 | predictor, input_batch, optimal_batch_size 263 | ) 264 | process_batch_ended_at = time.time() 265 | 266 | unique_id_wise_results = prepare_results( 267 | unique_id_wise_input_count, 268 | results, 269 | predictor_sequence, 270 | last_predictor_success, 271 | received_at, 272 | predicted_at, 273 | len(input_batch), 274 | ) 275 | _utils.MAIN_INDEX.update(unique_id_wise_results) 276 | 277 | _utils.logger.debug( 278 | f"Updated results predictor {predictor_sequence}: {list(unique_id_wise_results)}" 279 | ) 280 | 281 | _utils.GLOBAL_METRICS_INDEX.math( 282 | "total_predictor_run_for_hours", 283 | (process_batch_ended_at - process_batch_started_at) / 3600, 284 | "+=", 285 | ) 286 | 287 | _utils.GLOBAL_METRICS_INDEX["total_predictor_up_for_hours"] = ( 288 | time.time() - prediction_loop_started_at 289 | ) / 3600 290 | 291 | 292 | if __name__ == "__main__": 293 | import sys 294 | 295 | start_loop(sys.argv[1]) 296 | -------------------------------------------------------------------------------- /fastdeploy/_rest.py: -------------------------------------------------------------------------------- 1 | from gevent import monkey 2 | 3 | monkey.patch_all() 4 | 5 | import os 6 | import json 7 | import time 8 | import uuid 9 | import pickle 10 | import falcon 11 | import gevent 12 | import threading 13 | import importlib 14 | 15 | from . import _utils 16 | from . 
import _infer 17 | 18 | try: 19 | get_prometheus_metrics = importlib.import_module( 20 | "extra_prometheus_metrics" 21 | ).get_prometheus_metrics 22 | except ImportError: 23 | get_prometheus_metrics = None 24 | 25 | 26 | class AsyncResponseHandler: 27 | def __init__(self, check_interval=0.003): 28 | self.pending_requests = {} 29 | self.check_interval = check_interval 30 | self.lock = threading.Lock() 31 | self.infer = _infer.Infer() 32 | 33 | gevent.spawn(self._response_checker) 34 | 35 | def register_request_and_wait_for_response( 36 | self, unique_id, is_compressed, input_type, timeout 37 | ): 38 | event = gevent.event.Event() 39 | 40 | with self.lock: 41 | self.pending_requests[unique_id] = { 42 | "event": event, 43 | "is_compressed": is_compressed, 44 | "input_type": input_type, 45 | "timestamp": time.time(), 46 | } 47 | 48 | try: 49 | if event.wait(timeout=timeout): 50 | with self.lock: 51 | response = self.pending_requests[unique_id].get("response") 52 | return response 53 | else: 54 | return self.infer.get_timeout_response( 55 | unique_id, is_compressed, input_type, is_client_timeout=True 56 | ) 57 | except Exception as e: 58 | _utils.logger.exception(e, exc_info=True) 59 | _utils.logger.error(f"Error registering request and waiting for response: {e}") 60 | return self.infer.get_timeout_response( 61 | unique_id, is_compressed, input_type, is_client_timeout=True 62 | ) 63 | finally: 64 | with self.lock: 65 | self.pending_requests.pop(unique_id, None) 66 | 67 | def deregister_request(self, unique_id): 68 | with self.lock: 69 | self.pending_requests.pop(unique_id, None) 70 | 71 | def _response_checker(self): 72 | last_input_received_at = time.time() 73 | while True: 74 | try: 75 | unique_ids = [] 76 | is_compresseds = [] 77 | input_types = [] 78 | with self.lock: 79 | for uid, data in self.pending_requests.items(): 80 | unique_ids.append(uid) 81 | is_compresseds.append(data["is_compressed"]) 82 | input_types.append(data["input_type"]) 83 | last_input_received_at = data["timestamp"] 84 | 85 | if not unique_ids and (time.time() - last_input_received_at) > 5: 86 | time.sleep(0.05) 87 | continue 88 | 89 | if unique_ids: 90 | _utils.logger.debug( 91 | f"Checking responses for unique_ids: {unique_ids}" 92 | ) 93 | try: 94 | responses = self.infer.get_responses_for_unique_ids( 95 | unique_ids=unique_ids, 96 | is_compresseds=is_compresseds, 97 | input_types=input_types, 98 | ) 99 | 100 | for uid, response in responses.items(): 101 | if response is not None: 102 | with self.lock: 103 | if uid in self.pending_requests: 104 | request_data = self.pending_requests[uid] 105 | request_data["response"] = response 106 | request_data["event"].set() 107 | 108 | except Exception as e: 109 | _utils.logger.exception(e, exc_info=True) 110 | _utils.logger.error(f"Error checking responses: {e}") 111 | 112 | except Exception as e: 113 | _utils.logger.error(f"Error in response checker loop: {e}") 114 | 115 | finally: 116 | gevent.sleep(self.check_interval) 117 | 118 | 119 | class Infer(object): 120 | def __init__(self): 121 | self._infer = _infer.Infer() 122 | self._response_handler = AsyncResponseHandler() 123 | 124 | def on_post(self, req, resp): 125 | request_received_at = time.time() 126 | 127 | unique_id = str(req.params.get("unique_id", uuid.uuid4())) 128 | client_timeout = float(req.params.get("timeout", os.getenv("TIMEOUT", 480))) 129 | 130 | is_compressed = req.params.get("compressed", "f")[0].lower() == "t" 131 | input_type = req.params.get("input_type", "json") 132 | 133 | success, failure_response = 
self._infer.add_to_infer_queue( 134 | inputs=req.stream.read(), 135 | unique_id=unique_id, 136 | input_type=input_type, 137 | is_compressed=is_compressed, 138 | ) 139 | 140 | if is_compressed: 141 | resp.content_type = "application/octet-stream" 142 | elif input_type == "json": 143 | resp.content_type = "application/json" 144 | elif input_type == "pickle": 145 | resp.content_type = "application/pickle" 146 | elif input_type == "msgpack": 147 | resp.content_type = "application/msgpack" 148 | 149 | if success is not True: 150 | resp.status = falcon.HTTP_400 151 | if input_type == "json": 152 | resp.media = failure_response 153 | else: 154 | resp.data = failure_response 155 | 156 | else: 157 | ( 158 | success, 159 | response, 160 | ) = self._response_handler.register_request_and_wait_for_response( 161 | unique_id, is_compressed, input_type, client_timeout 162 | ) 163 | if success: 164 | resp.status = falcon.HTTP_200 165 | else: 166 | resp.status = falcon.HTTP_500 167 | 168 | if input_type == "json": 169 | resp.media = response 170 | else: 171 | resp.data = response 172 | 173 | 174 | class PrometheusMetrics(object): 175 | def on_get(self, req, resp): 176 | _LAST_X_SECONDS = int( 177 | req.params.get("last_x_seconds", int(os.getenv("LAST_X_SECONDS", 30))) 178 | ) 179 | CURRENT_TIME = time.time() 180 | LAST_X_SECONDS = time.time() - _LAST_X_SECONDS 181 | 182 | number_of_requests_timedout_in_last_x_seconds = _utils.MAIN_INDEX.count( 183 | query={ 184 | "-1.predicted_at": {"$gt": LAST_X_SECONDS, "$lt": CURRENT_TIME}, 185 | "timedout_in_queue": True, 186 | } 187 | ) 188 | 189 | requests_received_in_last_x_seconds = _utils.MAIN_INDEX.count( 190 | query={"-1.received_at": {"$gt": LAST_X_SECONDS, "$lt": CURRENT_TIME}} 191 | ) 192 | 193 | requests_processed_in_last_x_seconds = _utils.MAIN_INDEX.count( 194 | query={"-1.predicted_at": {"$gt": LAST_X_SECONDS, "$lt": CURRENT_TIME}} 195 | ) 196 | 197 | requests_received_in_last_x_seconds_that_failed = _utils.MAIN_INDEX.count( 198 | query={ 199 | "-1.received_at": {"$gt": LAST_X_SECONDS, "$lt": CURRENT_TIME}, 200 | "last_predictor_success": False, 201 | } 202 | ) 203 | 204 | requests_processed_in_last_x_seconds_that_failed = _utils.MAIN_INDEX.count( 205 | query={ 206 | "-1.predicted_at": {"$gt": LAST_X_SECONDS, "$lt": CURRENT_TIME}, 207 | "last_predictor_success": False, 208 | } 209 | ) 210 | 211 | requests_received_in_last_x_seconds_that_are_pending = _utils.MAIN_INDEX.count( 212 | query={ 213 | "-1.predicted_at": 0, 214 | "last_predictor_success": {"$ne": False}, 215 | "-1.received_at": {"$gt": LAST_X_SECONDS, "$lt": CURRENT_TIME}, 216 | } 217 | ) 218 | 219 | requests_received_in_last_x_seconds_that_are_successful = ( 220 | _utils.MAIN_INDEX.count( 221 | query={ 222 | "-1.predicted_at": {"$ne": 0}, 223 | "last_predictor_success": True, 224 | "-1.received_at": {"$gt": LAST_X_SECONDS, "$lt": CURRENT_TIME}, 225 | "timedout_in_queue": {"$ne": True}, 226 | } 227 | ) 228 | ) 229 | 230 | requests_processed_in_last_x_seconds_that_are_successful = ( 231 | _utils.MAIN_INDEX.count( 232 | query={ 233 | "-1.predicted_at": {"$gt": LAST_X_SECONDS, "$lt": CURRENT_TIME}, 234 | "last_predictor_success": True, 235 | "timedout_in_queue": {"$ne": True}, 236 | } 237 | ) 238 | ) 239 | 240 | avg_total_time_per_req_for_reqs_in_last_x_seconds = 0 241 | 242 | __sum_of_received_at = _utils.MAIN_INDEX.math( 243 | "-1.received_at", 244 | "sum", 245 | query={ 246 | "-1.received_at": {"$gt": LAST_X_SECONDS, "$lt": CURRENT_TIME}, 247 | "-1.predicted_at": {"$ne": 0}, 248 | 
"timedout_in_queue": {"$ne": True}, 249 | }, 250 | ) 251 | 252 | __sum_of_predicted_at = _utils.MAIN_INDEX.math( 253 | "-1.predicted_at", 254 | "sum", 255 | query={ 256 | "-1.received_at": {"$gt": LAST_X_SECONDS, "$lt": CURRENT_TIME}, 257 | "-1.predicted_at": {"$ne": 0}, 258 | "timedout_in_queue": {"$ne": True}, 259 | }, 260 | ) 261 | 262 | if __sum_of_received_at and __sum_of_predicted_at: 263 | avg_total_time_per_req_for_reqs_in_last_x_seconds = ( 264 | __sum_of_predicted_at - __sum_of_received_at 265 | ) / requests_received_in_last_x_seconds_that_are_successful 266 | 267 | avg_actual_total_time_per_req_for_reqs_in_last_x_seconds = 0 268 | 269 | for executor_n in [0]: 270 | _temp_sum_of_received_at = _utils.MAIN_INDEX.math( 271 | f"{executor_n}.received_at", 272 | "sum", 273 | query={ 274 | "-1.received_at": {"$gt": LAST_X_SECONDS, "$lt": CURRENT_TIME}, 275 | "-1.predicted_at": {"$ne": 0}, 276 | "timedout_in_queue": {"$ne": True}, 277 | }, 278 | ) 279 | 280 | _temp_sum_of_predicted_at = _utils.MAIN_INDEX.math( 281 | f"{executor_n}.predicted_at", 282 | "sum", 283 | query={ 284 | "-1.received_at": {"$gt": LAST_X_SECONDS, "$lt": CURRENT_TIME}, 285 | "-1.predicted_at": {"$ne": 0}, 286 | "timedout_in_queue": {"$ne": True}, 287 | }, 288 | ) 289 | 290 | if _temp_sum_of_received_at and _temp_sum_of_predicted_at: 291 | avg_actual_total_time_per_req_for_reqs_in_last_x_seconds = ( 292 | _temp_sum_of_predicted_at - _temp_sum_of_received_at 293 | ) / requests_received_in_last_x_seconds_that_are_successful 294 | 295 | prometheus_text = f""" 296 | # HELP requests_received_in_last_x_seconds The number of requests received in last {_LAST_X_SECONDS} seconds. 297 | # TYPE requests_received_in_last_x_seconds gauge 298 | requests_received_in_last_x_seconds {requests_received_in_last_x_seconds} 299 | 300 | # HELP requests_processed_in_last_x_seconds The number of requests processed in last {_LAST_X_SECONDS} seconds. 301 | # TYPE requests_processed_in_last_x_seconds gauge 302 | requests_processed_in_last_x_seconds {requests_processed_in_last_x_seconds} 303 | 304 | # HELP number_of_requests_timedout_in_last_x_seconds The number of requests timedout at predictor(s) in last {_LAST_X_SECONDS} seconds. 305 | # TYPE number_of_requests_timedout_in_last_x_seconds gauge 306 | number_of_requests_timedout_in_last_x_seconds {number_of_requests_timedout_in_last_x_seconds} 307 | 308 | # HELP requests_received_in_last_x_seconds_that_failed The number of requests received in last {_LAST_X_SECONDS} seconds that failed. 309 | # TYPE requests_received_in_last_x_seconds_that_failed gauge 310 | requests_received_in_last_x_seconds_that_failed {requests_received_in_last_x_seconds_that_failed} 311 | 312 | # HELP requests_processed_in_last_x_seconds_that_failed The number of requests processed in last {_LAST_X_SECONDS} seconds that failed. 313 | # TYPE requests_processed_in_last_x_seconds_that_failed gauge 314 | requests_processed_in_last_x_seconds_that_failed {requests_processed_in_last_x_seconds_that_failed} 315 | 316 | # HELP requests_received_in_last_x_seconds_that_are_pending The number of requests received in last {_LAST_X_SECONDS} seconds that are pending. 317 | # TYPE requests_received_in_last_x_seconds_that_are_pending gauge 318 | requests_received_in_last_x_seconds_that_are_pending {requests_received_in_last_x_seconds_that_are_pending} 319 | 320 | # HELP requests_received_in_last_x_seconds_that_are_successful The number of requests received in last {_LAST_X_SECONDS} seconds that are successful. 
321 | # TYPE requests_received_in_last_x_seconds_that_are_successful gauge 322 | requests_received_in_last_x_seconds_that_are_successful {requests_received_in_last_x_seconds_that_are_successful} 323 | 324 | # HELP requests_processed_in_last_x_seconds_that_are_successful The number of requests processed in last {_LAST_X_SECONDS} seconds that are successful. 325 | # TYPE requests_processed_in_last_x_seconds_that_are_successful gauge 326 | requests_processed_in_last_x_seconds_that_are_successful {requests_processed_in_last_x_seconds_that_are_successful} 327 | 328 | # HELP avg_total_time_per_req_for_reqs_in_last_x_seconds The average total time per request for requests in last {_LAST_X_SECONDS} seconds. 329 | # TYPE avg_total_time_per_req_for_reqs_in_last_x_seconds gauge 330 | avg_total_time_per_req_for_reqs_in_last_x_seconds {avg_total_time_per_req_for_reqs_in_last_x_seconds} 331 | 332 | # HELP avg_actual_total_time_per_req_for_reqs_in_last_x_seconds The average actual total time per request for requests in last {_LAST_X_SECONDS} seconds. 333 | # TYPE avg_actual_total_time_per_req_for_reqs_in_last_x_seconds gauge 334 | avg_actual_total_time_per_req_for_reqs_in_last_x_seconds {avg_actual_total_time_per_req_for_reqs_in_last_x_seconds} 335 | """.strip() 336 | 337 | if get_prometheus_metrics is not None: 338 | extra_prometheus_metrics_data = get_prometheus_metrics() 339 | 340 | if extra_prometheus_metrics_data: 341 | extra_prometheus_texts = [] 342 | for metric_name, metric_data in extra_prometheus_metrics_data.items(): 343 | extra_prometheus_texts.append( 344 | f""" 345 | # HELP {metric_name} {metric_data['help']} 346 | # TYPE {metric_name} {metric_data['type']} 347 | {metric_name} {metric_data['value']} 348 | """.strip() 349 | ) 350 | prometheus_text += "\n\n" + "\n\n".join(extra_prometheus_texts) 351 | 352 | resp.status = falcon.HTTP_200 353 | resp.content_type = "text/plain; version=0.0.4" 354 | resp.text = prometheus_text 355 | 356 | 357 | class Health(object): 358 | def on_get(self, req, resp): 359 | fail_if_percentage_of_requests_failed_in_last_x_seconds_is_more_than_y_param = req.params.get( 360 | "fail_if_percentage_of_requests_failed_in_last_x_seconds_is_more_than_y", 361 | None, 362 | ) 363 | 364 | fail_if_requests_older_than_x_seconds_pending_param = req.params.get( 365 | "fail_if_requests_older_than_x_seconds_pending", None 366 | ) 367 | 368 | fail_if_up_time_more_than_x_seconds_param = req.params.get( 369 | "fail_if_up_time_more_than_x_seconds", None 370 | ) 371 | 372 | fail_if_requests_timedout_in_last_x_seconds_is_more_than_y_param = ( 373 | req.params.get( 374 | "fail_if_requests_timedout_in_last_x_seconds_is_more_than_y", None 375 | ) 376 | ) 377 | 378 | is_predictor_is_up_param = req.params.get("is_predictor_is_up", None) 379 | 380 | if fail_if_percentage_of_requests_failed_in_last_x_seconds_is_more_than_y_param: 381 | ( 382 | x, 383 | y, 384 | ) = fail_if_percentage_of_requests_failed_in_last_x_seconds_is_more_than_y_param.split( 385 | "," 386 | ) 387 | x, y = int(x), int(y) 388 | if _utils.check_if_percentage_of_requests_failed_in_last_x_seconds_is_more_than_y( 389 | x, y 390 | ): 391 | resp.status = falcon.HTTP_503 392 | resp.media = { 393 | "reason": f"More than {y}% requests failed in last {x} seconds" 394 | } 395 | return 396 | 397 | if fail_if_requests_older_than_x_seconds_pending_param: 398 | if _utils.check_if_requests_older_than_x_seconds_pending( 399 | int(fail_if_requests_older_than_x_seconds_pending_param) 400 | ): 401 | resp.status = falcon.HTTP_503 402 | 
resp.media = { 403 | "reason": f"Requests older than {fail_if_requests_older_than_x_seconds_pending_param} seconds are pending" 404 | } 405 | return 406 | 407 | if fail_if_up_time_more_than_x_seconds_param: 408 | if time.time() - Infer.started_at_time > int( 409 | fail_if_up_time_more_than_x_seconds_param 410 | ): 411 | resp.status = falcon.HTTP_503 412 | resp.media = { 413 | "reason": f"Up time more than {fail_if_up_time_more_than_x_seconds_param} seconds" 414 | } 415 | return 416 | 417 | if fail_if_requests_timedout_in_last_x_seconds_is_more_than_y_param: 418 | ( 419 | x, 420 | y, 421 | ) = fail_if_requests_timedout_in_last_x_seconds_is_more_than_y_param.split( 422 | "," 423 | ) 424 | x, y = int(x), int(y) 425 | if _utils.check_if_requests_timedout_in_last_x_seconds_is_more_than_y(x, y): 426 | resp.status = falcon.HTTP_503 427 | return 428 | 429 | resp.status = falcon.HTTP_200 430 | resp.media = {"status": "ok"} 431 | 432 | 433 | class Meta(object): 434 | def on_get(self, req, resp): 435 | resp.status = falcon.HTTP_200 436 | 437 | if "is_pickle_allowed" in req.params: 438 | resp.media = { 439 | "is_pickle_allowed": os.getenv("ALLOW_PICKLE", "true").lower() == "true" 440 | } 441 | 442 | else: 443 | try: 444 | json.dumps(_utils.example) 445 | __example = _utils.example 446 | except: 447 | __example = None 448 | 449 | resp.media = { 450 | "name": _utils.recipe_name, 451 | "example": __example, 452 | "is_pickle_allowed": os.getenv("ALLOW_PICKLE", "true").lower() 453 | == "true", 454 | "timeout": int(os.getenv("TIMEOUT")), 455 | } 456 | 457 | 458 | class Die(object): 459 | def on_get(self, req, resp): 460 | if req.params.get("die", "false").lower()[0] == "t": 461 | resp.status = falcon.HTTP_200 462 | resp.media = {"status": "killed"} 463 | _utils.kill_fd(loop=True, rest=True) 464 | 465 | 466 | app = falcon.App( 467 | middleware=falcon.CORSMiddleware(allow_origins="*", allow_credentials="*"), 468 | ) 469 | 470 | infer_api = Infer() 471 | prometheus_metrics = PrometheusMetrics() 472 | health_api = Health() 473 | die_api = Die() 474 | 475 | app.add_route("/infer", infer_api) 476 | app.add_route("/sync", infer_api) 477 | app.add_route("/prometheus_metrics", prometheus_metrics) 478 | app.add_route("/health", health_api) 479 | app.add_route("/meta", Meta()) 480 | app.add_route("/die", die_api) 481 | -------------------------------------------------------------------------------- /fastdeploy/_utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | logging.basicConfig( 4 | format="%(asctime)s,%(msecs)d %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s", 5 | datefmt="%Y-%m-%d:%H:%M:%S", 6 | level=logging.INFO, 7 | ) 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | import os 13 | import glob 14 | import json 15 | import time 16 | import psutil 17 | from datetime import datetime 18 | from liteindex import DefinedIndex, KVIndex 19 | 20 | try: 21 | from example import example 22 | except: 23 | raise Exception("example.py not found. 
Please follow the instructions in README.md") 24 | 25 | try: 26 | from example import name as recipe_name 27 | except: 28 | recipe_name = os.path.basename(os.getcwd()).strip("/") 29 | 30 | 31 | PREDICTOR_SEQUENCE_TO_FILES = {} 32 | 33 | predictor_files = [ 34 | _ 35 | for _ in glob.glob("predictor*.py") 36 | if _ == "predictor.py" or _.split("predictor_")[1].split(".")[0].isdigit() 37 | ] 38 | 39 | for f in sorted( 40 | predictor_files, 41 | key=lambda x: int( 42 | x.split("predictor_")[1].split(".")[0] if x != "predictor.py" else 0 43 | ), 44 | ): 45 | if f == "predictor.py": 46 | PREDICTOR_SEQUENCE_TO_FILES[0] = f 47 | break 48 | else: 49 | PREDICTOR_SEQUENCE_TO_FILES[len(PREDICTOR_SEQUENCE_TO_FILES)] = f 50 | 51 | PREDICTOR_FILE_TO_SEQUENCE = {v: k for k, v in PREDICTOR_SEQUENCE_TO_FILES.items()} 52 | 53 | LAST_PREDICTOR_SEQUENCE = max(PREDICTOR_SEQUENCE_TO_FILES.keys()) 54 | FIRST_PREDICTOR_SEQUENCE = min(PREDICTOR_SEQUENCE_TO_FILES.keys()) 55 | 56 | META_INDEX = DefinedIndex( 57 | "meta_index", 58 | schema={ 59 | "optimal_batch_size": DefinedIndex.Type.number, 60 | "time_per_example": DefinedIndex.Type.number, 61 | "predictor_name": DefinedIndex.Type.string, 62 | "predictor_sequence": DefinedIndex.Type.number, 63 | "request_poll_time": DefinedIndex.Type.number, 64 | "example_output": DefinedIndex.Type.other, 65 | "status": DefinedIndex.Type.string, 66 | }, 67 | db_path=os.path.join("fastdeploy_dbs", f"main_index.db"), 68 | ) 69 | 70 | KV_STORE = KVIndex(os.path.join("fastdeploy_dbs", f"kv_store.db")) 71 | KV_STORE.clear() 72 | 73 | 74 | MAIN_INDEX = DefinedIndex( 75 | "main_index", 76 | schema={ 77 | **{ 78 | "last_predictor_sequence": DefinedIndex.Type.number, 79 | "last_predictor_success": DefinedIndex.Type.boolean, 80 | "-1.outputs": DefinedIndex.Type.other, 81 | "-1.predicted_at": DefinedIndex.Type.number, 82 | "-1.received_at": DefinedIndex.Type.number, 83 | "-1.predicted_in_batch_of": DefinedIndex.Type.number, 84 | "timedout_in_queue": DefinedIndex.Type.boolean, 85 | }, 86 | **{f"{_}.outputs": "other" for _ in PREDICTOR_SEQUENCE_TO_FILES}, 87 | **{f"{_}.predicted_at": "number" for _ in PREDICTOR_SEQUENCE_TO_FILES}, 88 | **{f"{_}.received_at": "number" for _ in PREDICTOR_SEQUENCE_TO_FILES}, 89 | **{f"{_}.predicted_in_batch_of": "number" for _ in PREDICTOR_SEQUENCE_TO_FILES}, 90 | }, 91 | db_path=os.path.join("fastdeploy_dbs", f"main_index.db"), 92 | auto_vacuum=False, 93 | ) 94 | 95 | # for setting timedout_in_queue 96 | # used in _loop.py start_loop to set timedout_in_queue to True for all the predictions that have been in the queue for more than timeout_time seconds 97 | MAIN_INDEX.optimize_for_query( 98 | ["-1.predicted_at", "-1.received_at", "timedout_in_queue"] 99 | ) 100 | 101 | # for getting next batch to process 102 | # used in _loop.py fetch_batch function 103 | MAIN_INDEX.optimize_for_query( 104 | [ 105 | "-1.predicted_at", 106 | "last_predictor_success", 107 | "last_predictor_sequence", 108 | "timedout_in_queue", 109 | ] 110 | ) 111 | 112 | # in general queries 113 | MAIN_INDEX.optimize_for_query(["-1.received_at"]) 114 | MAIN_INDEX.optimize_for_query(["last_predictor_success"]) 115 | MAIN_INDEX.optimize_for_query(["last_predictor_sequence"]) 116 | MAIN_INDEX.optimize_for_query(["timedout_in_queue"]) 117 | 118 | 119 | GLOBAL_METRICS_INDEX = KVIndex( 120 | os.path.join("fastdeploy_dbs", f"global_metrics_index.db") 121 | ) 122 | GLOBAL_METRICS_INDEX["total_predictor_run_for_hours"] = 0 123 | GLOBAL_METRICS_INDEX["total_predictor_up_for_hours"] = 0 124 | 125 | 126 | def 
get_fd_pids(): 127 | # get pids of processes with fastdeploy and rest or loop in their full cmdline 128 | pids = { 129 | "rest": [], 130 | "loop": [] 131 | } 132 | 133 | for proc in psutil.process_iter(): 134 | try: 135 | full_cmdline = " ".join(proc.cmdline()) 136 | if "fastdeploy" in full_cmdline and "--rest" in full_cmdline: 137 | pids["rest"].append(proc.pid) 138 | elif "fastdeploy" in full_cmdline and "--loop" in full_cmdline: 139 | pids["loop"].append(proc.pid) 140 | except Exception as e: 141 | pass 142 | 143 | return pids 144 | 145 | 146 | def kill_fd(loop=True, rest=True): 147 | pids = get_fd_pids() 148 | if loop and pids["loop"]: 149 | os.system(f"kill -9 {' '.join([str(pid) for pid in pids['loop']])}") 150 | if rest and pids["rest"]: 151 | os.system(f"kill -9 {' '.join([str(pid) for pid in pids['rest']])}") 152 | 153 | 154 | def warmup(predictor, example_input, n=3): 155 | """ 156 | Run warmup prediction on the model. 157 | 158 | :param n: number of warmup predictions to be run. defaults to 3 159 | """ 160 | logger.info("Warming up .. ") 161 | for _ in range(n - 1): 162 | predictor(example_input) 163 | 164 | return predictor(example_input) 165 | 166 | 167 | def calculate_optimum_batch_sizes( 168 | predictor, 169 | predictor_sequence, 170 | example_input, 171 | max_batch_size, 172 | max_batch_search_sec=10, 173 | ): 174 | search_over_batch_sizes = ( 175 | [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024] 176 | if max_batch_size == 0 177 | else [max_batch_size] 178 | ) 179 | 180 | time_per_example = 0 181 | max_batch_size = 0 182 | 183 | for batch_size in search_over_batch_sizes: 184 | logger.info(f"Trying batch size: {batch_size}") 185 | start = time.time() 186 | predictor((example_input * batch_size)[:batch_size], batch_size=batch_size) 187 | end = time.time() 188 | 189 | _time_per_example = (end - start) / batch_size 190 | 191 | logger.info(f"batch_size: {batch_size}, time_per_example: {_time_per_example}") 192 | 193 | if time_per_example == 0: 194 | time_per_example = _time_per_example 195 | max_batch_size = batch_size 196 | elif _time_per_example < time_per_example: 197 | time_per_example = _time_per_example 198 | max_batch_size = batch_size 199 | else: 200 | break 201 | 202 | logger.info( 203 | f"{PREDICTOR_SEQUENCE_TO_FILES[predictor_sequence]}: Optimum batch size: {max_batch_size}, time_per_example: {time_per_example}" 204 | ) 205 | 206 | return max_batch_size, time_per_example 207 | 208 | 209 | def check_if_requests_timedout_in_last_x_seconds_is_more_than_y( 210 | last_x_seconds, max_percentage_of_timedout_requests 211 | ): 212 | time_before_x_seconds = time.time() - last_x_seconds 213 | requests_received_in_last_x_seconds = MAIN_INDEX.count( 214 | query={"-1.predicted_at": {"$gte": time_before_x_seconds}} 215 | ) 216 | 217 | requests_timedout_in_last_x_seconds = MAIN_INDEX.count( 218 | query={ 219 | "-1.predicted_at": {"$gte": time_before_x_seconds}, 220 | "timedout_in_queue": True, 221 | } 222 | ) 223 | 224 | if requests_received_in_last_x_seconds == 0: 225 | return False 226 | 227 | logger.warning( 228 | f"Requests timedout in last {last_x_seconds} seconds: {requests_timedout_in_last_x_seconds}/{requests_received_in_last_x_seconds}" 229 | ) 230 | 231 | if ( 232 | requests_timedout_in_last_x_seconds / requests_received_in_last_x_seconds 233 | ) * 100 >= max_percentage_of_timedout_requests: 234 | return True 235 | return False 236 | 237 | 238 | def check_if_percentage_of_requests_failed_in_last_x_seconds_is_more_than_y( 239 | last_x_seconds, max_percentage_of_failed_requests 
240 | ): 241 | time_before_x_seconds = time.time() - last_x_seconds 242 | requests_received_in_last_x_seconds = MAIN_INDEX.count( 243 | query={"-1.predicted_at": {"$gte": time_before_x_seconds}} 244 | ) 245 | 246 | if requests_received_in_last_x_seconds == 0: 247 | return False 248 | 249 | requests_received_in_last_x_seconds_that_failed = MAIN_INDEX.count( 250 | query={ 251 | "-1.predicted_at": {"$gte": time_before_x_seconds}, 252 | "last_predictor_success": False, 253 | } 254 | ) 255 | 256 | if ( 257 | requests_received_in_last_x_seconds_that_failed 258 | / requests_received_in_last_x_seconds 259 | ) * 100 >= max_percentage_of_failed_requests: 260 | return True 261 | 262 | return False 263 | 264 | 265 | def check_if_requests_older_than_x_seconds_pending(x): 266 | time_before_x_seconds = time.time() - x 267 | 268 | requests_older_than_x_seconds_pending = MAIN_INDEX.count( 269 | query={ 270 | "-1.received_at": {"$lte": time_before_x_seconds}, 271 | "-1.predicted_at": 0, 272 | "last_predictor_success": {"$ne": False}, 273 | } 274 | ) 275 | 276 | if requests_older_than_x_seconds_pending > 0: 277 | return True 278 | return False 279 | -------------------------------------------------------------------------------- /fastdeploy/monitor.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Function to check if nvidia-smi is available 4 | check_nvidia_smi() { 5 | command -v nvidia-smi >/dev/null 2>&1 6 | } 7 | 8 | # Function to get GPU usage for a PID 9 | get_gpu_usage() { 10 | pid=$1 11 | if check_nvidia_smi; then 12 | gpu_mem=$(nvidia-smi --query-compute-apps=pid,used_memory --format=csv,noheader,nounits | grep "^$pid," | cut -d',' -f2 | tr -d ' ') 13 | gpu_util=$(nvidia-smi --query-compute-apps=pid,gpu_util --format=csv,noheader,nounits | grep "^$pid," | cut -d',' -f2 | tr -d ' ') 14 | 15 | gpu_mem=${gpu_mem:-0} 16 | gpu_util=${gpu_util:-0} 17 | else 18 | gpu_mem=0 19 | gpu_util=0 20 | fi 21 | 22 | echo "$gpu_util $gpu_mem" 23 | } 24 | 25 | # Function to get CPU and memory usage for a single PID 26 | get_usage() { 27 | pid=$1 28 | cpu=$(ps -p $pid -o %cpu= | tr -d ' ') 29 | mem=$(ps -p $pid -o rss= | tr -d ' ') 30 | mem_mb=$(printf "%.2f" $(echo "$mem / 1024" | bc -l)) 31 | echo "$cpu $mem_mb" 32 | } 33 | 34 | # Function to sum CPU and memory usage for multiple PIDs 35 | sum_usage() { 36 | pids=$1 37 | cpu_sum=0 38 | mem_sum=0 39 | 40 | for pid in $pids; do 41 | read cpu mem <<< $(get_usage $pid) 42 | cpu_sum=$(echo "$cpu_sum + $cpu" | bc -l) 43 | mem_sum=$(echo "$mem_sum + $mem" | bc -l) 44 | done 45 | 46 | echo "$cpu_sum $mem_sum" 47 | } 48 | 49 | # Initialize arrays for storing historical data 50 | declare -a loop_cpu_history 51 | declare -a loop_ram_history 52 | declare -a loop_gpu_util_history 53 | declare -a loop_gpu_mem_history 54 | declare -a rest_cpu_history 55 | declare -a rest_ram_history 56 | 57 | # Function to calculate statistics 58 | calculate_stats() { 59 | local values=("$@") 60 | local count=${#values[@]} 61 | 62 | if [ $count -eq 0 ]; then 63 | echo '{"min": "N/A", "max": "N/A", "avg": "N/A"}' 64 | return 65 | fi 66 | 67 | local min=${values[0]} 68 | local max=${values[0]} 69 | local sum=0 70 | 71 | for value in "${values[@]}"; do 72 | sum=$(printf "%.2f" $(echo "$sum + $value" | bc -l)) 73 | 74 | if (( $(echo "$value < $min" | bc -l) )); then 75 | min=$value 76 | fi 77 | 78 | if (( $(echo "$value > $max" | bc -l) )); then 79 | max=$value 80 | fi 81 | done 82 | 83 | local avg=$(printf "%.2f" $(echo "$sum / $count" | bc 
-l)) 84 | 85 | echo "{\"min\": $min, \"max\": $max, \"avg\": $avg}" 86 | } 87 | 88 | # Function to add value to history array (maintaining last 5 values) 89 | add_to_history() { 90 | local array_name=$1 91 | local value=$2 92 | 93 | eval "$array_name[\${#$array_name[@]}]=$value" 94 | 95 | if [ $(eval "echo \${#$array_name[@]}") -gt 5 ]; then 96 | eval "$array_name=(\"\${$array_name[@]:1}\")" 97 | fi 98 | } 99 | 100 | # Function to create JSON output 101 | create_json() { 102 | local loop_pid=$1 103 | local rest_pids=$2 104 | local timestamp=$(date -u +"%Y-%m-%dT%H:%M:%SZ") 105 | local output="" 106 | 107 | output+="{\n" 108 | output+=" \"timestamp\": \"$timestamp\",\n" 109 | 110 | # Loop process data 111 | output+=" \"loop_process\": {\n" 112 | if [ ! -z "$loop_pid" ]; then 113 | read cpu mem <<< $(get_usage $loop_pid) 114 | read gpu_util gpu_mem <<< $(get_gpu_usage $loop_pid) 115 | 116 | add_to_history loop_cpu_history "$cpu" 117 | add_to_history loop_ram_history "$mem" 118 | add_to_history loop_gpu_util_history "$gpu_util" 119 | add_to_history loop_gpu_mem_history "$gpu_mem" 120 | 121 | output+=" \"pid\": $loop_pid,\n" 122 | output+=" \"status\": \"running\",\n" 123 | output+=" \"current\": {\n" 124 | output+=" \"cpu\": $cpu,\n" 125 | output+=" \"ram\": $mem,\n" 126 | output+=" \"gpu_util\": $gpu_util,\n" 127 | output+=" \"gpu_mem\": $gpu_mem\n" 128 | output+=" },\n" 129 | output+=" \"stats\": {\n" 130 | output+=" \"cpu\": $(calculate_stats "${loop_cpu_history[@]}"),\n" 131 | output+=" \"ram\": $(calculate_stats "${loop_ram_history[@]}"),\n" 132 | output+=" \"gpu_util\": $(calculate_stats "${loop_gpu_util_history[@]}"),\n" 133 | output+=" \"gpu_mem\": $(calculate_stats "${loop_gpu_mem_history[@]}")\n" 134 | output+=" }\n" 135 | else 136 | output+=" \"status\": \"not_running\"\n" 137 | fi 138 | output+=" },\n" 139 | 140 | # REST processes data 141 | output+=" \"rest_processes\": {\n" 142 | if [ ! 
-z "$rest_pids" ]; then 143 | read cpu mem <<< $(sum_usage "$rest_pids") 144 | 145 | add_to_history rest_cpu_history "$cpu" 146 | add_to_history rest_ram_history "$mem" 147 | 148 | output+=" \"pids\": [$(echo $rest_pids | sed 's/ /, /g')],\n" 149 | output+=" \"status\": \"running\",\n" 150 | output+=" \"current\": {\n" 151 | output+=" \"cpu\": $cpu,\n" 152 | output+=" \"ram\": $mem\n" 153 | output+=" },\n" 154 | output+=" \"stats\": {\n" 155 | output+=" \"cpu\": $(calculate_stats "${rest_cpu_history[@]}"),\n" 156 | output+=" \"ram\": $(calculate_stats "${rest_ram_history[@]}")\n" 157 | output+=" }\n" 158 | else 159 | output+=" \"status\": \"not_running\"\n" 160 | fi 161 | output+=" }\n" 162 | output+="}" 163 | 164 | echo -e "$output" 165 | } 166 | 167 | # Main monitoring function 168 | monitor() { 169 | # Get PIDs 170 | loop_pid=$(pgrep -f "fastdeploy.*loop") 171 | rest_pids=$(pgrep -f "fastdeploy.*rest") 172 | 173 | # Create JSON and write to file 174 | create_json "$loop_pid" "$rest_pids" > monitoring_results.json 175 | } 176 | 177 | # Run the monitor function every 2 seconds 178 | while true; do 179 | monitor 180 | sleep 1 181 | done 182 | -------------------------------------------------------------------------------- /recipe.md: -------------------------------------------------------------------------------- 1 | ### Serving your pipeline with fastdeploy [example](https://github.com/notAI-tech/fastDeploy/tree/master/recipes/echo) 2 | 3 | - Create a recipe folder with the following structure: 4 | ``` 5 | recipe_folder/ 6 | ├── example.py 7 | ├── predictor.py 8 | ├── requirements.txt (optional) 9 | └── extras.sh (optional) 10 | ``` 11 | 12 | - `example.py` 13 | 14 | ```python 15 | name = "your_app_or_model_name" 16 | 17 | example = [ 18 | example_object_1, 19 | example_object_2, 20 | ] 21 | ``` 22 | 23 | - `predictor.py` 24 | 25 | ```python 26 | # Whatever code and imports you need to load your model and make predictions 27 | 28 | # predictor function must be defined exactly as below 29 | # batch_size is the optimal batch size for your model 30 | # inputs length may or may not be equal to batch_size 31 | # len(outputs) == len(inputs) 32 | def predictor(inputs, batch_size=1): 33 | return outputs 34 | ``` 35 | 36 | - `requirements.txt` (optional): all python dependencies for your pipeline 37 | 38 | - `extras.sh` (optional): any bash commands to run before installing requirements.txt 39 | 40 | - #### start the loop 41 | 42 | ```bash 43 | fastdeploy --loop --recipe recipes/echo_chained 44 | ``` 45 | 46 | - #### start the server 47 | 48 | ```bash 49 | fastdeploy --rest --recipe recipes/echo_chained 50 | ``` 51 | 52 | 53 | ### Chained recipe [example](https://github.com/notAI-tech/fastDeploy/tree/master/recipes/echo_chained) 54 | - Chained recipe means you have multiple predictor_X.py which are chained sequentially 55 | - `predictor_1.py` will be called first, then `predictor_2.py` and so on 56 | - Each predictor_X.py must have a predictor function defined as above 57 | - Each predictor_X.py is run separately i.e: can be in different virtualenvs 58 | 59 | - #### start all the loops 60 | 61 | ```bash 62 | fastdeploy --loop --recipe recipes/echo_chained --config "predictor_name:predictor_1.py" 63 | 64 | fastdeploy --loop --recipe recipes/echo_chained --config "predictor_name:predictor_2.py" 65 | ``` 66 | 67 | - #### start the server 68 | 69 | ```bash 70 | fastdeploy --rest --recipe recipes/echo_chained 71 | ``` 72 | -------------------------------------------------------------------------------- 
/recipes/.gitignore: -------------------------------------------------------------------------------- 1 | */*default* 2 | */fastdeploy_dbs 3 | -------------------------------------------------------------------------------- /recipes/echo/.dockerignore: -------------------------------------------------------------------------------- 1 | *.request_index 2 | *.results_index 3 | *.log_index -------------------------------------------------------------------------------- /recipes/echo/.gitignore: -------------------------------------------------------------------------------- 1 | *.request_index 2 | *.results_index 3 | *.log_index -------------------------------------------------------------------------------- /recipes/echo/example.py: -------------------------------------------------------------------------------- 1 | name = "echo" 2 | 3 | example = ["Any JSON serialiazable Python object can be input"] 4 | -------------------------------------------------------------------------------- /recipes/echo/extra_prometheus_metrics.py: -------------------------------------------------------------------------------- 1 | def get_prometheus_metrics(): 2 | return { 3 | "test_metric": { 4 | "type": "counter", 5 | "help": "This is a test metric", 6 | "value": 1 7 | } 8 | } -------------------------------------------------------------------------------- /recipes/echo/fastDeploy.auto_dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.8-slim 2 | RUN python3 -m pip install --upgrade --no-cache-dir pip fastdeploy 3 | 4 | ENV MAX_REQUEST_BATCH_SIZE=0 WORKERS=3 TIMEOUT=480 HOST=0.0.0.0 PORT=8080 ONLY_ASYNC=false ALLOW_PICKLE=true PREDICTOR_NAME=predictor.py OPTIMAL_BATCH_SIZE=0 KEEP_ALIVE=60 BASE=python:3.8-slim 5 | 6 | ADD . /recipe 7 | WORKDIR /recipe 8 | 9 | RUN python3 -m pip install --no-cache-dir -r /recipe/requirements.txt 10 | RUN cd /recipe && python3 -c "from predictor import predictor; from example import example; predictor(example)" 11 | 12 | ENTRYPOINT ["/bin/sh", "-c"] 13 | 14 | CMD ["ulimit -n 1000000 && python3 -m fastdeploy --recipe /recipe --loop & python3 -m fastdeploy --recipe /recipe --rest"] 15 | -------------------------------------------------------------------------------- /recipes/echo/predictor.py: -------------------------------------------------------------------------------- 1 | # Do the required imports 2 | import os 3 | import time 4 | 5 | # Any code can be here 6 | # Load your models, import your local scripts 7 | # modify the code inside predictor function. 
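# The echo predictor below has no model: it walks input_list in slices of
# batch_size, copies each slice straight to the output, and sleeps SLEEP_TIME
# per slice to imitate per-batch inference latency, so the batching and
# queueing machinery can be exercised without any ML dependencies.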
8 | 9 | SLEEP_TIME = float(os.getenv("SLEEP_TIME", "0.2")) 10 | 11 | def predictor(input_list, batch_size=1): 12 | output_list = [] 13 | while input_list: 14 | input_batch = input_list[:batch_size] 15 | input_list = input_list[batch_size:] 16 | output_list += input_batch 17 | time.sleep(SLEEP_TIME) 18 | 19 | return output_list 20 | -------------------------------------------------------------------------------- /recipes/echo/requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/notAI-tech/fastDeploy/34865d1be99cc5ab98645985c6c7dda7119df1c4/recipes/echo/requirements.txt -------------------------------------------------------------------------------- /recipes/echo_chained/.dockerignore: -------------------------------------------------------------------------------- 1 | *.request_index 2 | *.results_index 3 | *.log_index -------------------------------------------------------------------------------- /recipes/echo_chained/.gitignore: -------------------------------------------------------------------------------- 1 | *.request_index 2 | *.results_index 3 | *.log_index -------------------------------------------------------------------------------- /recipes/echo_chained/example.py: -------------------------------------------------------------------------------- 1 | example = ["Any JSON serialiazable Python object can be input"] 2 | -------------------------------------------------------------------------------- /recipes/echo_chained/predictor_1.py: -------------------------------------------------------------------------------- 1 | # Do the required imports 2 | import os 3 | import time 4 | 5 | # Any code can be here 6 | # Load your models, import your local scripts 7 | # modify the code inside predictor function. 8 | 9 | SLEEP_TIME = float(os.getenv("SLEEP_TIME", "0.1")) 10 | 11 | def predictor(input_list, batch_size=1): 12 | output_list = [] 13 | while input_list: 14 | print(input_list) 15 | input_batch = input_list[:batch_size] 16 | input_list = input_list[batch_size:] 17 | output_list += [(1, _) for _ in input_batch] 18 | time.sleep(SLEEP_TIME) 19 | 20 | return output_list 21 | -------------------------------------------------------------------------------- /recipes/echo_chained/predictor_2.py: -------------------------------------------------------------------------------- 1 | # Do the required imports 2 | import os 3 | import time 4 | 5 | # Any code can be here 6 | # Load your models, import your local scripts 7 | # modify the code inside predictor function. 
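# Second stage of the chained echo recipe: per recipe.md the predictors run
# sequentially, so the items arriving here are predictor_1.py's outputs, i.e.
# (1, original_input) tuples, and each one is wrapped again as (2, ...) below.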
8 | 9 | SLEEP_TIME = float(os.getenv("SLEEP_TIME", "0.2")) 10 | 11 | def predictor(input_list, batch_size=1): 12 | print(input_list) 13 | output_list = [] 14 | while input_list: 15 | input_batch = input_list[:batch_size] 16 | input_list = input_list[batch_size:] 17 | output_list += [(2, _) for _ in input_batch] 18 | time.sleep(SLEEP_TIME) 19 | 20 | return output_list 21 | -------------------------------------------------------------------------------- /recipes/echo_chained/requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/notAI-tech/fastDeploy/34865d1be99cc5ab98645985c6c7dda7119df1c4/recipes/echo_chained/requirements.txt -------------------------------------------------------------------------------- /recipes/text_embeddings/example.py: -------------------------------------------------------------------------------- 1 | # generate random sentence with words of size 1-10 characters and total 5-100 words 2 | 3 | import random 4 | import string 5 | 6 | words = open("words.txt", "r").read().split() 7 | 8 | def generate_random_sentence(): 9 | # Generate random number of words between 5-100 10 | num_words = random.randint(3, 100) 11 | 12 | sentence = [] 13 | for _ in range(num_words): 14 | word = random.choice(words) 15 | sentence.append(word) 16 | 17 | return ' '.join(sentence) 18 | 19 | 20 | def example_function(): 21 | return [generate_random_sentence() for _ in range(random.randint(1, 10))] 22 | 23 | example = example_function() -------------------------------------------------------------------------------- /recipes/text_embeddings/predictor.py: -------------------------------------------------------------------------------- 1 | from sentence_transformers import SentenceTransformer 2 | 3 | sentences = ['That is a happy person', 'That is a very happy person'] 4 | 5 | model = SentenceTransformer('Alibaba-NLP/gte-base-en-v1.5', trust_remote_code=True, backend="onnx", model_kwargs={"file_name": "model.onnx", "provider": "CPUExecutionProvider"}) 6 | 7 | def predictor(input_list, batch_size=16): 8 | return model.encode(input_list, convert_to_numpy=True, normalize_embeddings=True, show_progress_bar=False, batch_size=batch_size) 9 | 10 | -------------------------------------------------------------------------------- /recipes/text_embeddings/requirements.txt: -------------------------------------------------------------------------------- 1 | sentence-transformers[onnx] 2 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Note: To use the 'upload' functionality of this file, you must: 5 | # $ pip install twine 6 | 7 | import io 8 | import os 9 | import sys 10 | from shutil import rmtree 11 | 12 | from setuptools import find_packages, setup, Command 13 | 14 | # Package meta-data. 15 | NAME = "fastdeploy" 16 | DESCRIPTION = "Deploy DL/ ML inference pipelines with minimal extra code. " 17 | URL = "https://github.com/notAI-tech/fastDeploy" 18 | EMAIL = "praneeth@bpraneeth.com" 19 | AUTHOR = "BEDAPUDI PRANEETH" 20 | REQUIRES_PYTHON = ">=3.6.0" 21 | VERSION = "3.1.1" 22 | 23 | # What packages are required for this module to be executed? 24 | REQUIRED = ["falcon", "liteindex==0.0.3.2.dev6", "zstandard", "gunicorn[gevent]", "msgpack", "psutil"] 25 | 26 | # What packages are optional? 
27 | EXTRAS = { 28 | } 29 | 30 | # The rest you shouldn't have to touch too much :) 31 | # ------------------------------------------------ 32 | # Except, perhaps the License and Trove Classifiers! 33 | # If you do change the License, remember to change the Trove Classifier for that! 34 | 35 | here = os.path.abspath(os.path.dirname(__file__)) 36 | 37 | # Import the README and use it as the long-description. 38 | # Note: this will only work if 'README.md' is present in your MANIFEST.in file! 39 | try: 40 | with io.open(os.path.join(here, "README.md"), encoding="utf-8") as f: 41 | long_description = "\n" + f.read() 42 | except FileNotFoundError: 43 | long_description = DESCRIPTION 44 | 45 | # Load the package's __version__.py module as a dictionary. 46 | about = {} 47 | if not VERSION: 48 | with open(os.path.join(here, NAME, "__version__.py")) as f: 49 | exec(f.read(), about) 50 | else: 51 | about["__version__"] = VERSION 52 | 53 | 54 | class UploadCommand(Command): 55 | """Support setup.py upload.""" 56 | 57 | description = "Build and publish the package." 58 | user_options = [] 59 | 60 | @staticmethod 61 | def status(s): 62 | """Prints things in bold.""" 63 | print("\033[1m{0}\033[0m".format(s)) 64 | 65 | def initialize_options(self): 66 | pass 67 | 68 | def finalize_options(self): 69 | pass 70 | 71 | def run(self): 72 | try: 73 | self.status("Removing previous builds…") 74 | rmtree(os.path.join(here, "dist")) 75 | except OSError: 76 | pass 77 | 78 | self.status("Building Source and Wheel (universal) distribution…") 79 | os.system("{0} setup.py sdist bdist_wheel --universal".format(sys.executable)) 80 | 81 | self.status("Uploading the package to PyPI via Twine…") 82 | os.system("twine upload dist/*") 83 | 84 | self.status("Pushing git tags…") 85 | os.system("git tag v{0}".format(about["__version__"])) 86 | os.system("git push --tags") 87 | 88 | sys.exit() 89 | 90 | 91 | # Where the magic happens: 92 | setup( 93 | name=NAME, 94 | version=about["__version__"], 95 | description=DESCRIPTION, 96 | long_description=long_description, 97 | long_description_content_type="text/markdown", 98 | author=AUTHOR, 99 | author_email=EMAIL, 100 | python_requires=REQUIRES_PYTHON, 101 | url=URL, 102 | packages=find_packages(exclude=("tests",)), 103 | # If your package is a single module, use this instead of 'packages': 104 | # py_modules=['mypackage'], 105 | entry_points={"console_scripts": ["fastdeploy=fastdeploy:main"]}, 106 | install_requires=REQUIRED, 107 | extras_require=EXTRAS, 108 | package_data={NAME: ["fastdeploy-ui/*", "fastdeploy-ui/build/*"]}, 109 | include_package_data=True, 110 | license="MIT", 111 | classifiers=[ 112 | # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers 113 | "License :: OSI Approved :: MIT License", 114 | "Programming Language :: Python", 115 | "Programming Language :: Python :: 3", 116 | "Programming Language :: Python :: 3.6", 117 | "Programming Language :: Python :: Implementation :: CPython", 118 | ], 119 | # $ setup.py publish support. 
120 | cmdclass={ 121 | "upload": UploadCommand, 122 | }, 123 | ) 124 | -------------------------------------------------------------------------------- /testing/README.md: -------------------------------------------------------------------------------- 1 | python benchmark.py --target_rps_per_connection 100 --parallel_connections 10 --duration 60 --warmup 1 --server_url http://10.18.9.60:8080 --input_file /Users/praneeth.bedapudi/RINGCENTRAL/marauders-map/ml_serving/nlu/semantic_score_serving/example.py --results_file a.json --request_timeout 0.6 2 | -------------------------------------------------------------------------------- /testing/benchmark.py: -------------------------------------------------------------------------------- 1 | import time 2 | import logging 3 | import argparse 4 | import json 5 | import random 6 | import numpy as np 7 | from datetime import datetime 8 | import os 9 | import importlib.util 10 | from tqdm import tqdm 11 | from fdclient import FDClient 12 | import multiprocessing as mp 13 | from dataclasses import dataclass 14 | from typing import List, Dict, Any 15 | import queue 16 | import signal 17 | 18 | # Configure logging 19 | logging.basicConfig(format='%(asctime)s - %(message)s') 20 | logger = logging.getLogger(__name__) 21 | 22 | @dataclass 23 | class ConnectionStats: 24 | latencies: List[float] 25 | errors: List[str] 26 | successes: int 27 | failures: int 28 | connection_id: int 29 | 30 | 31 | class BenchmarkProcess(mp.Process): 32 | def __init__(self, connection_id, server_url, target_rps, duration, 33 | input_source, request_batch_size, is_warmup, 34 | stats_queue, progress_queue, request_timeout=10): 35 | super().__init__() 36 | self.connection_id = connection_id 37 | self.server_url = server_url 38 | self.target_rps = target_rps 39 | self.duration = duration 40 | self.input_source = input_source 41 | self.request_batch_size = request_batch_size 42 | self.is_warmup = is_warmup 43 | self.stats_queue = stats_queue 44 | self.progress_queue = progress_queue 45 | self.request_timeout = request_timeout 46 | self._loaded_function = None 47 | 48 | def _load_function(self): 49 | """Load the Python function inside the process""" 50 | if self.input_source['type'] == 'function': 51 | path = os.path.abspath(self.input_source['path']) 52 | directory = os.path.dirname(path) 53 | filename = os.path.basename(path) 54 | 55 | original_dir = os.getcwd() 56 | try: 57 | os.chdir(directory) 58 | module_name = os.path.splitext(filename)[0] 59 | spec = importlib.util.spec_from_file_location(module_name, filename) 60 | module = importlib.util.module_from_spec(spec) 61 | spec.loader.exec_module(module) 62 | 63 | if not hasattr(module, 'example_function'): 64 | raise ValueError("Python file must contain example_function()") 65 | 66 | self._loaded_function = module.example_function 67 | finally: 68 | os.chdir(original_dir) 69 | 70 | def generate_payload(self): 71 | """Generate payload based on input source type""" 72 | if self.input_source['type'] == 'json': 73 | return [self.input_source['data'][random.randint(0, len(self.input_source['data']) - 1)] 74 | for _ in range(self.request_batch_size)] 75 | else: # function 76 | if self._loaded_function is None: 77 | self._load_function() 78 | return self._loaded_function()[:self.request_batch_size] 79 | 80 | def run(self): 81 | # Handle Ctrl+C gracefully 82 | signal.signal(signal.SIGINT, signal.SIG_IGN) 83 | 84 | client = FDClient(server_url=self.server_url, request_timeout=self.request_timeout) 85 | 86 | if self.target_rps: 87 | 
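# A per-connection RPS target translates into a time budget of 1/target_rps
# seconds per request; the rate-limiting block at the bottom of the loop
# sleeps off whatever part of that budget the request itself did not consume.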
sleep_time = 1.0 / self.target_rps 88 | else: 89 | sleep_time = 0 90 | 91 | start_time = time.time() 92 | stats = ConnectionStats( 93 | latencies=[], errors=[], successes=0, failures=0, 94 | connection_id=self.connection_id 95 | ) 96 | requests_made = 0 97 | 98 | while time.time() - start_time < self.duration: 99 | request_start = time.time() 100 | 101 | try: 102 | # Generate and send request 103 | inps = self.generate_payload() 104 | request_id = f"{'warm' if self.is_warmup else 'req'}-conn{self.connection_id}-{requests_made}" 105 | 106 | results = client.infer(inps, unique_id=request_id) 107 | latency = (time.time() - request_start) * 1000 # Convert to ms 108 | 109 | if results['success']: 110 | if not self.is_warmup: 111 | stats.successes += 1 112 | stats.latencies.append(latency) 113 | else: 114 | if not self.is_warmup: 115 | stats.failures += 1 116 | stats.errors.append(results.get('reason', 'Unknown error')) 117 | 118 | except Exception as e: 119 | if not self.is_warmup: 120 | stats.failures += 1 121 | stats.errors.append(str(e)) 122 | 123 | requests_made += 1 124 | 125 | # Update progress 126 | elapsed = time.time() - start_time 127 | self.progress_queue.put((self.connection_id, min(elapsed, self.duration))) 128 | 129 | # Rate limiting 130 | elapsed = time.time() - request_start 131 | if sleep_time > elapsed: 132 | time.sleep(sleep_time - elapsed) 133 | 134 | # Send final stats 135 | self.stats_queue.put((self.connection_id, stats)) 136 | 137 | class BenchmarkRunner: 138 | def __init__(self, target_rps_per_connection, duration_seconds, server_url, 139 | parallel_connections=1, warmup_seconds=5, input_source=None, 140 | request_batch_size=1, log_dir=None, debug=False, request_timeout=10): 141 | self.target_rps_per_connection = target_rps_per_connection 142 | self.parallel_connections = parallel_connections 143 | self.duration_seconds = duration_seconds 144 | self.warmup_seconds = warmup_seconds 145 | self.server_url = server_url 146 | self.input_source = input_source 147 | self.request_batch_size = request_batch_size 148 | self.log_dir = log_dir 149 | self.debug = debug 150 | self.request_timeout = request_timeout 151 | 152 | if self.log_dir: 153 | os.makedirs(self.log_dir, exist_ok=True) 154 | 155 | # For handling Ctrl+C gracefully 156 | self.stop_event = mp.Event() 157 | signal.signal(signal.SIGINT, self._handle_interrupt) 158 | 159 | def _handle_interrupt(self, signum, frame): 160 | print("\nStopping benchmark gracefully...") 161 | self.stop_event.set() 162 | 163 | def _update_progress_bars(self, progress_queue, pbars, duration, process_count): 164 | """Update progress bars from queue until duration is reached or stop_event is set""" 165 | start_time = time.time() 166 | while time.time() - start_time < duration and not self.stop_event.is_set(): 167 | try: 168 | conn_id, progress = progress_queue.get(timeout=0.1) 169 | pbars[conn_id].n = progress 170 | pbars[conn_id].refresh() 171 | except queue.Empty: 172 | continue 173 | 174 | def run_benchmark(self): 175 | """Run the benchmark with parallel processes""" 176 | # Create queues for inter-process communication 177 | stats_queue = mp.Queue() 178 | progress_queue = mp.Queue() 179 | 180 | print("\nStarting warmup period...") 181 | 182 | # Create progress bars for warmup 183 | warmup_pbars = { 184 | i: tqdm( 185 | total=self.warmup_seconds, 186 | desc=f"Warmup Conn {i}", 187 | position=i, 188 | unit="s", 189 | leave=True 190 | ) 191 | for i in range(self.parallel_connections) 192 | } 193 | 194 | # Start warmup processes 195 | 
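# Warmup runs the same BenchmarkProcess workers with is_warmup=True, so
# latencies, successes and failures are not recorded; anything they do push
# onto stats_queue is drained and discarded once they exit, and only then are
# the measured benchmark processes started.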
warmup_processes = [ 196 | BenchmarkProcess( 197 | connection_id=i, 198 | server_url=self.server_url, 199 | target_rps=self.target_rps_per_connection, 200 | duration=self.warmup_seconds, 201 | input_source=self.input_source, 202 | request_batch_size=self.request_batch_size, 203 | is_warmup=True, 204 | stats_queue=stats_queue, 205 | progress_queue=progress_queue, 206 | request_timeout=self.request_timeout 207 | ) 208 | for i in range(self.parallel_connections) 209 | ] 210 | 211 | for p in warmup_processes: 212 | p.start() 213 | 214 | # Update warmup progress bars 215 | self._update_progress_bars( 216 | progress_queue, warmup_pbars, 217 | self.warmup_seconds, self.parallel_connections 218 | ) 219 | 220 | # Wait for warmup processes to finish 221 | for p in warmup_processes: 222 | p.join() 223 | 224 | # Clear warmup stats queue 225 | while not stats_queue.empty(): 226 | stats_queue.get() 227 | 228 | # Close warmup progress bars 229 | for pbar in warmup_pbars.values(): 230 | pbar.close() 231 | 232 | if self.stop_event.is_set(): 233 | print("\nBenchmark interrupted during warmup") 234 | return None 235 | 236 | print("\nStarting benchmark...") 237 | 238 | # Create progress bars for benchmark 239 | benchmark_pbars = { 240 | i: tqdm( 241 | total=self.duration_seconds, 242 | desc=f"Benchmark Conn {i}", 243 | position=i, 244 | unit="s", 245 | leave=True 246 | ) 247 | for i in range(self.parallel_connections) 248 | } 249 | 250 | # Start benchmark processes 251 | benchmark_processes = [ 252 | BenchmarkProcess( 253 | connection_id=i, 254 | server_url=self.server_url, 255 | target_rps=self.target_rps_per_connection, 256 | duration=self.duration_seconds, 257 | input_source=self.input_source, 258 | request_batch_size=self.request_batch_size, 259 | is_warmup=False, 260 | stats_queue=stats_queue, 261 | progress_queue=progress_queue, 262 | request_timeout=self.request_timeout 263 | ) 264 | for i in range(self.parallel_connections) 265 | ] 266 | 267 | for p in benchmark_processes: 268 | p.start() 269 | 270 | # Update benchmark progress bars 271 | self._update_progress_bars( 272 | progress_queue, benchmark_pbars, 273 | self.duration_seconds, self.parallel_connections 274 | ) 275 | 276 | # Collect results 277 | connection_stats = {} 278 | for _ in range(self.parallel_connections): 279 | conn_id, stats = stats_queue.get() 280 | connection_stats[conn_id] = stats 281 | 282 | # Wait for all processes to finish 283 | for p in benchmark_processes: 284 | p.join() 285 | 286 | # Close benchmark progress bars 287 | for pbar in benchmark_pbars.values(): 288 | pbar.close() 289 | 290 | # Move cursor to bottom of progress bars 291 | print("\n" * (self.parallel_connections)) 292 | 293 | if self.stop_event.is_set(): 294 | print("\nBenchmark interrupted") 295 | return None 296 | 297 | # Aggregate results 298 | all_latencies = [] 299 | total_successes = 0 300 | total_failures = 0 301 | all_errors = [] 302 | 303 | for stats in connection_stats.values(): 304 | all_latencies.extend(stats.latencies) 305 | total_successes += stats.successes 306 | total_failures += stats.failures 307 | all_errors.extend(stats.errors) 308 | 309 | if all_latencies: 310 | total_time = self.duration_seconds 311 | p50 = np.percentile(all_latencies, 50) 312 | p90 = np.percentile(all_latencies, 90) 313 | p95 = np.percentile(all_latencies, 95) 314 | p99 = np.percentile(all_latencies, 99) 315 | avg_latency = np.mean(all_latencies) 316 | std_latency = np.std(all_latencies) 317 | total_requests = total_successes + total_failures 318 | actual_rps = total_requests 
/ total_time 319 | 320 | results = { 321 | 'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'), 322 | 'total_requests': total_requests, 323 | 'successes': total_successes, 324 | 'failures': total_failures, 325 | 'success_rate': (total_successes/total_requests)*100 if total_requests > 0 else 0, 326 | 'average_latency_ms': float(avg_latency), 327 | 'std_latency_ms': float(std_latency), 328 | 'p50_latency_ms': float(p50), 329 | 'p90_latency_ms': float(p90), 330 | 'p95_latency_ms': float(p95), 331 | 'p99_latency_ms': float(p99), 332 | 'min_latency_ms': float(min(all_latencies)), 333 | 'max_latency_ms': float(max(all_latencies)), 334 | 'actual_rps': float(actual_rps), 335 | 'target_rps_per_connection': self.target_rps_per_connection, 336 | 'parallel_connections': self.parallel_connections, 337 | 'total_target_rps': (self.target_rps_per_connection or 0) * self.parallel_connections, 338 | 'duration_seconds': self.duration_seconds, 339 | 'warmup_seconds': self.warmup_seconds, 340 | 'request_batch_size': self.request_batch_size, 341 | 'errors': all_errors[:10] if all_errors else [], # First 10 errors 342 | 'error_count': len(all_errors), 343 | # Per-connection stats 344 | 'connection_stats': { 345 | conn_id: { 346 | 'requests': stats.successes + stats.failures, 347 | 'successes': stats.successes, 348 | 'failures': stats.failures, 349 | 'success_rate': (stats.successes/(stats.successes + stats.failures))*100 if (stats.successes + stats.failures) > 0 else 0, 350 | 'average_latency_ms': float(np.mean(stats.latencies)) if stats.latencies else 0, 351 | 'actual_rps': (stats.successes + stats.failures) / total_time 352 | } 353 | for conn_id, stats in connection_stats.items() 354 | } 355 | } 356 | return results 357 | return None 358 | 359 | def format_duration(ms): 360 | """Format milliseconds into a readable duration.""" 361 | if ms < 1: 362 | return f"{ms*1000:.2f}μs" 363 | elif ms < 1000: 364 | return f"{ms:.2f}ms" 365 | else: 366 | return f"{ms/1000:.2f}s" 367 | 368 | def print_results(results): 369 | """Print formatted benchmark results.""" 370 | if not results: 371 | return 372 | 373 | print("\n" + "="*80) 374 | print("BENCHMARK RESULTS") 375 | print("="*80) 376 | 377 | # Overall Statistics 378 | print("\n📊 OVERALL STATISTICS") 379 | print("-"*40) 380 | print(f"Total Requests: {results['total_requests']:,}") 381 | print(f"Successful: {results['successes']:,}") 382 | print(f"Failed: {results['failures']:,}") 383 | print(f"Success Rate: {results['success_rate']:.2f}%") 384 | 385 | # Throughput 386 | print("\n🚀 THROUGHPUT") 387 | print("-"*40) 388 | print(f"Actual RPS: {results['actual_rps']:.2f}") 389 | print(f"Target RPS: {results['total_target_rps'] or 'unlimited'}") 390 | print(f"Connections: {results['parallel_connections']}") 391 | print(f"Duration: {results['duration_seconds']}s (+ {results['warmup_seconds']}s warmup)") 392 | print(f"Batch Size: {results['request_batch_size']}") 393 | 394 | # Latency Statistics 395 | print("\n⚡ LATENCY STATISTICS") 396 | print("-"*40) 397 | print(f"Average: {format_duration(results['average_latency_ms'])}") 398 | print(f"Std Dev: {format_duration(results['std_latency_ms'])}") 399 | print(f"Min: {format_duration(results['min_latency_ms'])}") 400 | print(f"Max: {format_duration(results['max_latency_ms'])}") 401 | print(f"P50: {format_duration(results['p50_latency_ms'])}") 402 | print(f"P90: {format_duration(results['p90_latency_ms'])}") 403 | print(f"P95: {format_duration(results['p95_latency_ms'])}") 404 | print(f"P99: 
{format_duration(results['p99_latency_ms'])}") 405 | 406 | # Per-Connection Statistics 407 | print("\n🔌 PER-CONNECTION STATISTICS") 408 | print("-"*40) 409 | for conn_id, stats in results['connection_stats'].items(): 410 | print(f"\nConnection {conn_id}:") 411 | print(f" Requests: {stats['requests']:,}") 412 | print(f" Success Rate: {stats['success_rate']:.2f}%") 413 | print(f" Actual RPS: {stats['actual_rps']:.2f}") 414 | print(f" Avg Latency: {format_duration(stats['average_latency_ms'])}") 415 | 416 | # Error Summary 417 | if results['errors']: 418 | print("\n❌ ERROR SUMMARY") 419 | print("-"*40) 420 | print(f"Total Errors: {results['error_count']}") 421 | print("\nFirst 10 Errors:") 422 | for i, error in enumerate(results['errors'], 1): 423 | print(f"{i}. {error}") 424 | 425 | print("\n" + "="*80) 426 | 427 | 428 | def main(): 429 | parser = argparse.ArgumentParser(description='API Benchmark Tool') 430 | parser.add_argument('--server_url', type=str, required=True, help='Server URL') 431 | parser.add_argument('--target_rps_per_connection', type=int, default=None, 432 | help='Target requests per second per connection') 433 | parser.add_argument('--parallel_connections', type=int, default=1, 434 | help='Number of parallel connections') 435 | parser.add_argument('--duration', type=int, default=60, help='Test duration in seconds') 436 | parser.add_argument('--warmup', type=int, default=5, help='Warmup period in seconds') 437 | parser.add_argument('--debug', action='store_true', help='Enable debug logging') 438 | parser.add_argument('--input_file', type=str, required=True, help='Input .json or .py file path') 439 | parser.add_argument('--request_batch_size', type=int, default=1, help='Request batch size') 440 | parser.add_argument('--log_dir', type=str, default=None, help='Directory to log request inputs and outputs') 441 | parser.add_argument('--results_file', type=str, default='benchmark_results.json', 442 | help='File to write benchmark results') 443 | parser.add_argument('--request_timeout', type=float, default=10, help='Request timeout in seconds') 444 | args = parser.parse_args() 445 | 446 | if args.debug: 447 | logger.setLevel(logging.DEBUG) 448 | 449 | # Load input source 450 | input_source = None 451 | if args.input_file.endswith('.json'): 452 | try: 453 | with open(args.input_file, 'r') as f: 454 | input_data = json.load(f) 455 | input_source = {'type': 'json', 'data': input_data} 456 | except Exception as e: 457 | logger.error(f"Failed to load JSON input file: {e}") 458 | return 459 | elif args.input_file.endswith('.py'): 460 | input_source = {'type': 'function', 'path': args.input_file} 461 | else: 462 | logger.error("Input file must be either .json or .py") 463 | return 464 | 465 | # Initialize and run benchmark 466 | runner = BenchmarkRunner( 467 | target_rps_per_connection=args.target_rps_per_connection, 468 | parallel_connections=args.parallel_connections, 469 | duration_seconds=args.duration, 470 | server_url=args.server_url, 471 | warmup_seconds=args.warmup, 472 | input_source=input_source, 473 | request_batch_size=args.request_batch_size, 474 | log_dir=args.log_dir, 475 | debug=args.debug, 476 | request_timeout=args.request_timeout 477 | ) 478 | 479 | total_target_rps = (args.target_rps_per_connection or 'unlimited') 480 | if args.target_rps_per_connection: 481 | total_target_rps = args.target_rps_per_connection * args.parallel_connections 482 | 483 | print(f"\n{'='*80}") 484 | print("BENCHMARK CONFIGURATION") 485 | print(f"{'='*80}") 486 | print(f"Server URL: 
{args.server_url}") 487 | print(f"Parallel connections: {args.parallel_connections}") 488 | print(f"Target RPS/conn: {args.target_rps_per_connection or 'unlimited'}") 489 | print(f"Total target RPS: {total_target_rps}") 490 | print(f"Duration: {args.duration}s (+ {args.warmup}s warmup)") 491 | print(f"Request batch size: {args.request_batch_size}") 492 | print(f"Input source: {args.input_file}") 493 | print(f"Log directory: {args.log_dir or 'disabled'}") 494 | print(f"Debug mode: {'enabled' if args.debug else 'disabled'}") 495 | print(f"Request timeout: {args.request_timeout}s") 496 | print(f"{'='*80}\n") 497 | 498 | try: 499 | results = runner.run_benchmark() 500 | 501 | if results: 502 | # Write results to file 503 | with open(args.results_file, 'w') as f: 504 | json.dump(results, f, indent=2) 505 | print(f"\nDetailed results saved to: {args.results_file}") 506 | 507 | # Print formatted results 508 | print_results(results) 509 | else: 510 | print("\nNo results generated. Benchmark may have been interrupted.") 511 | except KeyboardInterrupt: 512 | print("\nBenchmark interrupted by user.") 513 | except Exception as e: 514 | logger.error(f"Benchmark failed: {e}") 515 | if args.debug: 516 | raise 517 | 518 | if __name__ == "__main__": 519 | main() --------------------------------------------------------------------------------