├── .gitignore ├── LICENSE ├── README.md ├── clearml_serving ├── __init__.py ├── __main__.py ├── engines │ ├── __init__.py │ └── triton │ │ ├── Dockerfile │ │ ├── Dockerfile.tr2207 │ │ ├── __init__.py │ │ ├── entrypoint.sh │ │ ├── requirements.txt │ │ └── triton_helper.py ├── preprocess │ └── preprocess_template.py ├── serving │ ├── Dockerfile │ ├── __init__.py │ ├── endpoints.py │ ├── entrypoint.sh │ ├── init.py │ ├── main.py │ ├── model_request_processor.py │ ├── preprocess_service.py │ ├── requirements.txt │ ├── utils.py │ └── uvicorn_mp_entrypoint.py ├── statistics │ ├── Dockerfile │ ├── __init__.py │ ├── entrypoint.sh │ ├── main.py │ ├── metrics.py │ └── requirements.txt └── version.py ├── docker ├── datasource.yml ├── docker-compose-triton-gpu.yml ├── docker-compose-triton.yml ├── docker-compose.yml ├── example.env └── prometheus.yml ├── docs ├── design_diagram.png ├── grafana_screenshot.png └── webapp_screenshots.gif ├── examples ├── custom │ ├── preprocess.py │ ├── readme.md │ ├── requirements.txt │ └── train_model.py ├── ensemble │ ├── preprocess.py │ ├── readme.md │ ├── requirements.txt │ └── train_ensemble.py ├── huggingface │ ├── docker-compose-override.yml │ ├── example_payload.json │ ├── preprocess.py │ ├── readme.md │ └── requirements.txt ├── keras │ ├── preprocess.py │ ├── readme.md │ ├── requirements.txt │ └── train_keras_mnist.py ├── lightgbm │ ├── preprocess.py │ ├── readme.md │ ├── requirements.txt │ └── train_model.py ├── pipeline │ ├── async_preprocess.py │ ├── preprocess.py │ └── readme.md ├── preprocess_template │ └── preprocess_template.py ├── pytorch │ ├── 5.jpg │ ├── preprocess.py │ ├── readme.md │ ├── requirements.txt │ └── train_pytorch_mnist.py ├── sklearn │ ├── preprocess.py │ ├── readme.md │ ├── requirements.txt │ └── train_model.py └── xgboost │ ├── preprocess.py │ ├── readme.md │ ├── requirements.txt │ └── train_model.py ├── requirements.txt └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | # python build 2 | dist/ 3 | build/ 4 | *.egg-info/ 5 | .tmp/ 6 | 7 | 8 | # Compiled Python bytecode 9 | *.py[cod] 10 | 11 | # Log files 12 | *.log 13 | 14 | # JetBrains IDE 15 | .idea/ 16 | .vscode/ 17 | 18 | tests/huggingface 19 | 20 | # Generated by MacOS 21 | .DS_Store 22 | 23 | # Generated by Windows 24 | Thumbs.db 25 | 26 | # Virtual environment 27 | .venv 28 | 29 | # Applications 30 | *.app 31 | *.exe 32 | *.war 33 | *.pkl 34 | *.pt 35 | *.pb 36 | data/ 37 | runs/ 38 | variables/ 39 | 40 | # Large media files 41 | *.mp4 42 | *.tiff 43 | *.avi 44 | *.flv 45 | *.mov 46 | *.wmv 47 | 48 | # models 49 | *.pbtxt 50 | *.h5 51 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. 
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2025 ClearML Inc 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 |
3 | 4 | 5 | 6 | **ClearML Serving - Model deployment made easy** 7 | 8 | ## **`clearml-serving v1.3.1`
:sparkles: Model Serving (ML/DL) Made Easy :tada:**
:fire: NEW version 1.3 :rocket: 20% faster ! 9 | 10 | 11 | [![GitHub license](https://img.shields.io/github/license/clearml/clearml-serving.svg)](https://img.shields.io/github/license/clearml/clearml-serving.svg) 12 | [![PyPI pyversions](https://img.shields.io/pypi/pyversions/clearml-serving.svg)](https://img.shields.io/pypi/pyversions/clearml-serving.svg) 13 | [![PyPI version shields.io](https://img.shields.io/pypi/v/clearml-serving.svg)](https://img.shields.io/pypi/v/clearml-serving.svg) 14 | [![Artifact Hub](https://img.shields.io/endpoint?url=https://artifacthub.io/badge/repository/clearml)](https://artifacthub.io/packages/helm/clearml/clearml-serving) 15 | [![Slack Channel](https://img.shields.io/badge/slack-%23clearml--community-blueviolet?logo=slack)](https://joinslack.clear.ml) 16 | 17 | `🌟 ClearML is open-source - Leave a star to support the project! 🌟` 18 | 19 |
20 | 21 | 22 | **`clearml-serving`** is a command line utility for model deployment and orchestration. 23 | It enables model deployment including serving and preprocessing code to a Kubernetes cluster or custom container based solution. 24 | 25 | ### :fire: NEW :confetti_ball: Take it for a spin with a simple `docker-compose` [command](#nail_care-initial-setup) :magic_wand: :sparkles: 26 | 27 | 28 | 29 | 30 | Features: 31 | * Easy to deploy & configure 32 | * Support Machine Learning Models (Scikit Learn, XGBoost, LightGBM) 33 | * Support Deep Learning Models (Tensorflow, PyTorch, ONNX) 34 | * Customizable RestAPI for serving (i.e. allow per model pre/post-processing for easy integration) 35 | * Flexible 36 | * On-line model deployment 37 | * On-line endpoint model/version deployment (i.e. no need to take the service down) 38 | * Per model standalone preprocessing and postprocessing python code 39 | * Scalable 40 | * Multi model per container 41 | * Multi models per serving service 42 | * Multi-service support (fully seperated multiple serving service running independently) 43 | * Multi cluster support 44 | * Out-of-the-box node auto-scaling based on load/usage 45 | * Efficient 46 | * Multi-container resource utilization 47 | * Support for CPU & GPU nodes 48 | * Auto-batching for DL models 49 | * Automatic deployment 50 | * Automatic model upgrades w/ canary support 51 | * Programmable API for model deployment 52 | * Canary A/B deployment 53 | * Online Canary updates 54 | * Model Monitoring 55 | * Usage Metric reporting 56 | * Metric Dashboard 57 | * Model performance metric 58 | * Model performance Dashboard 59 | 60 | ## ClearML Serving Design 61 | 62 | ### ClearML Serving Design Principles 63 | 64 | **Modular** , **Scalable** , **Flexible** , **Customizable** , **Open Source** 65 | 66 | ## Installation 67 | 68 | ### Prerequisites 69 | 70 | * ClearML-Server : Model repository, Service Health, Control plane 71 | * Kubernetes / Single-instance Machine : Deploying containers 72 | * CLI : Configuration & model deployment interface 73 | 74 | ### :nail_care: Initial Setup 75 | 76 | 1. Setup your [**ClearML Server**](https://github.com/clearml/clearml-server) or use the [Free tier Hosting](https://app.clear.ml) 77 | 2. Setup local access (if you haven't already), see instructions [here](https://clear.ml/docs/latest/docs/getting_started/ds/ds_first_steps#install-clearml) 78 | 3. Install clearml-serving CLI: 79 | ```bash 80 | pip3 install clearml-serving 81 | ``` 82 | 4. Create the Serving Service Controller 83 | - `clearml-serving create --name "serving example"` 84 | - The new serving service UID should be printed `New Serving Service created: id=aa11bb22aa11bb22` 85 | 5. Write down the Serving Service UID 86 | 6. Clone clearml-serving repository 87 | ```bash 88 | git clone https://github.com/clearml/clearml-serving.git 89 | ``` 90 | 7. Edit the environment variables file (`docker/example.env`) with your clearml-server credentials and Serving Service UID. For example, you should have something like 91 | ```bash 92 | cat docker/example.env 93 | ``` 94 | ```bash 95 | CLEARML_WEB_HOST="https://app.clear.ml" 96 | CLEARML_API_HOST="https://api.clear.ml" 97 | CLEARML_FILES_HOST="https://files.clear.ml" 98 | CLEARML_API_ACCESS_KEY="" 99 | CLEARML_API_SECRET_KEY="" 100 | CLEARML_SERVING_TASK_ID="" 101 | ``` 102 | 8. 
Spin the clearml-serving containers with docker-compose (or if running on Kubernetes use the helm chart)
103 | ```bash
104 | cd docker && docker-compose --env-file example.env -f docker-compose.yml up
105 | ```
106 | If you need Triton support (keras/pytorch/onnx etc.), use the triton docker-compose file
107 | ```bash
108 | cd docker && docker-compose --env-file example.env -f docker-compose-triton.yml up
109 | ```
110 | :muscle: If running on a GPU instance w/ Triton support (keras/pytorch/onnx etc.), use the triton gpu docker-compose file
111 | ```bash
112 | cd docker && docker-compose --env-file example.env -f docker-compose-triton-gpu.yml up
113 | ```
114 | 
115 | > **Notice**: Any model that registers with the "Triton" engine will run the pre/post processing code on the Inference service container, and the model inference itself will be executed on the Triton Engine container.
116 | 
117 | 
118 | ### :ocean: Optional: advanced setup - S3/GS/Azure access
119 | 
120 | To add access credentials and allow the inference containers to download models from your S3/GS/Azure object-storage,
121 | add the respective environment variables to your env files (`example.env`).
122 | See further details on configuring the storage access [here](https://clear.ml/docs/latest/docs/integrations/storage#configuring-storage)
123 | 
124 | ```bash
125 | AWS_ACCESS_KEY_ID
126 | AWS_SECRET_ACCESS_KEY
127 | AWS_DEFAULT_REGION
128 | 
129 | GOOGLE_APPLICATION_CREDENTIALS
130 | 
131 | AZURE_STORAGE_ACCOUNT
132 | AZURE_STORAGE_KEY
133 | ```
134 | 
135 | ### :information_desk_person: Concepts
136 | 
137 | **CLI** - Secure configuration interface for on-line model upgrade/deployment on running Serving Services
138 | 
139 | **Serving Service Task** - Control plane object storing the configuration of all the endpoints. Supports multiple separate instances, deployed on multiple clusters.
140 | 
141 | **Inference Services** - Inference containers, performing model serving pre/post processing. Also support CPU-based model inference.
142 | 
143 | **Serving Engine Services** - Inference engine containers (e.g. Nvidia Triton, TorchServe etc.) used by the Inference Services for heavier model inference.
144 | 
145 | **Statistics Service** - Single instance per Serving Service collecting and broadcasting model serving & performance statistics
146 | 
147 | **Time-series DB** - Statistics collection service used by the Statistics Service, e.g. Prometheus
148 | 
149 | **Dashboards** - Customizable dashboarding solution on top of the collected statistics, e.g. Grafana
150 | 
151 | ### :point_right: Toy model (scikit learn) deployment example
152 | 
153 | 1. Train toy scikit-learn model
154 |    - create new python virtual environment
155 |    - `pip3 install -r examples/sklearn/requirements.txt`
156 |    - `python3 examples/sklearn/train_model.py`
157 |    - Model was automatically registered and uploaded into the model repository. For manual model registration see [here](#turtle-registering--deploying-new-models-manually)
158 | 2. Register the new Model on the Serving Service
159 |    - `clearml-serving --id <service_id> model add --engine sklearn --endpoint "test_model_sklearn" --preprocess "examples/sklearn/preprocess.py" --name "train sklearn model" --project "serving examples"`
160 |    - **Notice** the preprocessing python code is packaged and uploaded to the "Serving Service", to be used by any inference container, and downloaded in realtime when updated
161 | 3. Spin the Inference Container
162 |    - Customize container [Dockerfile](clearml_serving/serving/Dockerfile) if needed
163 |    - Build container `docker build --tag clearml-serving-inference:latest -f clearml_serving/serving/Dockerfile .`
164 |    - Spin the inference container: `docker run -v ~/clearml.conf:/root/clearml.conf -p 8080:8080 -e CLEARML_SERVING_TASK_ID=<service_id> -e CLEARML_SERVING_POLL_FREQ=5 clearml-serving-inference:latest`
165 | 4. Test the new model inference endpoint (a Python alternative to `curl` is sketched after the notes below)
166 |    - `curl -X POST "http://127.0.0.1:8080/serve/test_model_sklearn" -H "accept: application/json" -H "Content-Type: application/json" -d '{"x0": 1, "x1": 2}'`
167 | 
168 | **Notice**, now that we have an inference container running, we can add new model inference endpoints directly with the CLI. The inference container will automatically sync once every 5 minutes.
169 | 
170 | **Notice** On the first few requests the inference container needs to download the model file and the preprocessing python code; this means the request might take a little longer. Once everything is cached, it will return almost immediately.
171 | 
172 | **Notes:**
173 | > Review the model repository in the ClearML web UI, under the "serving examples" Project on your ClearML account/server ([free hosted](https://app.clear.ml) or [self-deployed](https://github.com/clearml/clearml-server)).
174 | 
175 | > Inference services status, console outputs and machine metrics are available in the ClearML UI in the Serving Service project (default: "DevOps" project)
176 | 
177 | > To learn more on training models and the ClearML model repository, see the [ClearML documentation](https://clear.ml/docs)
178 | 
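As an alternative to the `curl` test in step 4 above, the endpoint can also be called from Python. Below is a minimal, illustrative sketch using the `requests` package (it assumes the toy sklearn endpoint registered above is served locally on port 8080):

```python
import requests

# send a single test request to the toy sklearn endpoint
response = requests.post(
    "http://127.0.0.1:8080/serve/test_model_sklearn",
    json={"x0": 1, "x1": 2},
)
response.raise_for_status()
print(response.json())  # prediction dict returned by the endpoint's postprocess code
```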
179 | ### :turtle: Registering & Deploying new models manually
180 | 
181 | Uploading an existing model file into the model repository can be done via the `clearml` RestAPI, the python interface, or with the `clearml-serving` CLI.
182 | 
183 | > To learn more on training models and the ClearML model repository, see the [ClearML documentation](https://clear.ml/docs)
184 | 
185 | - local model file on our laptop: 'examples/sklearn/sklearn-model.pkl'
186 | - Upload the model file to the `clearml-server` file storage and register it
187 |   `clearml-serving --id <service_id> model upload --name "manual sklearn model" --project "serving examples" --framework "scikit-learn" --path examples/sklearn/sklearn-model.pkl`
188 | - We now have a new Model in the "serving examples" project, by the name of "manual sklearn model". The CLI output prints the UID of the newly created model; we will use it to register a new endpoint
189 | - In the `clearml` web UI we can see the new model listed under the `Models` tab in the associated project. We can also download the model file itself directly from the web UI
190 | - Register a new endpoint with the new model
191 |   `clearml-serving --id <service_id> model add --engine sklearn --endpoint "test_model_sklearn" --preprocess "examples/sklearn/preprocess.py" --model-id <model_id>`
192 | 
193 | **Notice** we can also provide a different storage destination for the model, such as S3/GS/Azure, by passing
194 | `--destination="s3://bucket/folder"`, `gs://bucket/folder`, `azure://bucket/folder`. There is no need to provide a unique path in the destination argument; the location of the model will be a unique path based on the serving service ID and the model name
195 | 
196 | 
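For reference, the same manual upload can also be done from Python. The following is a minimal, illustrative sketch using the ClearML SDK (the exact framework string and upload destination depend on your setup):

```python
from clearml import Task, OutputModel

# hypothetical manual registration of the local sklearn model file
task = Task.init(project_name="serving examples", task_name="manual sklearn model upload")
model = OutputModel(task=task, name="manual sklearn model", framework="ScikitLearn")
# upload the weights file to the configured files server (or an S3/GS/Azure destination)
model.update_weights(weights_filename="examples/sklearn/sklearn-model.pkl")
print(model.id)  # use this model ID with `clearml-serving ... model add --model-id ...`
```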
197 | ### :rabbit: Automatic model deployment
198 | 
199 | The ClearML Serving Service supports automatic model deployment and upgrades, directly connected with the model repository and API. When the model auto-deploy is configured, new model versions will be automatically deployed when you "publish" or "tag" a new model in the `clearml` model repository. This automation interface allows for a simpler CI/CD model deployment process, as a single API call automatically deploys (or removes) a model from the Serving Service.
200 | 
201 | #### :bulb: Automatic model deployment example
202 | 
203 | 1. Configure the model auto-update on the Serving Service
204 |    - `clearml-serving --id <service_id> model auto-update --engine sklearn --endpoint "test_model_sklearn_auto" --preprocess "preprocess.py" --name "train sklearn model" --project "serving examples" --max-versions 2`
205 | 2. Deploy the Inference container (if not already deployed)
206 | 3. Publish a new model to the model repository
207 |    - Go to the "serving examples" project in the ClearML web UI, click on the Models Tab, search for "train sklearn model", right click and select "Publish"
208 |    - Use the RestAPI [details](https://clear.ml/docs/latest/docs/references/api/models#post-modelspublish_many)
209 |    - Use the Python interface:
210 |    ```python
211 |    from clearml import Model
212 |    Model(model_id="unique_model_id_here").publish()
213 |    ```
214 | 4. The new model is available on a new endpoint version (1), test with:
215 |    `curl -X POST "http://127.0.0.1:8080/serve/test_model_sklearn_auto/1" -H "accept: application/json" -H "Content-Type: application/json" -d '{"x0": 1, "x1": 2}'`
216 | 
217 | ### :bird: Canary endpoint setup
218 | 
219 | Canary endpoint deployment adds a new endpoint where the actual request is sent to a preconfigured set of endpoints with a pre-provided distribution. For example, let's create a new endpoint "test_model_sklearn_canary", for which we provide a list of endpoints and probabilities (weights).
220 | 
221 | ```bash
222 | clearml-serving --id <service_id> model canary --endpoint "test_model_sklearn_canary" --weights 0.1 0.9 --input-endpoints test_model_sklearn/2 test_model_sklearn/1
223 | ```
224 | This means that any request coming to `/test_model_sklearn_canary/` will be routed with probability of 90% to
225 | `/test_model_sklearn/1/` and with probability of 10% to `/test_model_sklearn/2/`.
226 | 
227 | **Note:**
228 | > As with any other Serving Service configuration, we can configure the Canary endpoint while the Inference containers are already running and deployed; they will get updated in their next update cycle (default: once every 5 minutes)
229 | 
230 | We can also prepare a "fixed" canary endpoint, always splitting the load between the last two deployed models:
231 | ```bash
232 | clearml-serving --id <service_id> model canary --endpoint "test_model_sklearn_canary" --weights 0.1 0.9 --input-endpoints-prefix test_model_sklearn/
233 | ```
234 | 
235 | This means that if we have two model inference endpoints, `/test_model_sklearn/1/` and `/test_model_sklearn/2/`, the 10% probability (weight 0.1) will match the last endpoint (ordered by version number), i.e. `/test_model_sklearn/2/`, and the 90% will match `/test_model_sklearn/1/`.
236 | When we add a new model endpoint version, e.g. `/test_model_sklearn/3/`, the canary distribution will automatically match the 90% probability to `/test_model_sklearn/2/` and the 10% to the new endpoint `/test_model_sklearn/3/`.
237 | 
238 | Example:
239 | 1. Add two endpoints:
240 |    - `clearml-serving --id <service_id> model add --engine sklearn --endpoint "test_model_sklearn" --preprocess "examples/sklearn/preprocess.py" --name "train sklearn model" --version 1 --project "serving examples"`
241 |    - `clearml-serving --id <service_id> model add --engine sklearn --endpoint "test_model_sklearn" --preprocess "examples/sklearn/preprocess.py" --name "train sklearn model" --version 2 --project "serving examples"`
242 | 2. Add Canary endpoint:
243 |    - `clearml-serving --id <service_id> model canary --endpoint "test_model_sklearn_canary" --weights 0.1 0.9 --input-endpoints test_model_sklearn/2 test_model_sklearn/1`
244 | 3. Test Canary endpoint:
245 |    - `curl -X POST "http://127.0.0.1:8080/serve/test_model_sklearn_canary" -H "accept: application/json" -H "Content-Type: application/json" -d '{"x0": 1, "x1": 2}'`
246 | 
247 | 
248 | ### :bar_chart: Model monitoring and performance metrics :bell:
249 | 
250 | ![Grafana Screenshot](docs/grafana_screenshot.png)
251 | 
252 | ClearML serving instances send serving statistics (count/latency) automatically to Prometheus, and Grafana can be used
253 | to visualize and create live dashboards.
254 | 
255 | The default docker-compose installation is preconfigured with Prometheus and Grafana; do notice that by default the data of both containers is *not* persistent. To add persistence we recommend adding a volume mount.
256 | 
257 | You can also add many custom metrics on the input/predictions of your models.
258 | Once a model endpoint is registered, adding custom metrics can be done using the CLI.
259 | For example, assume we have our mock scikit-learn model deployed on endpoint `test_model_sklearn`,
260 | we can log the request inputs and outputs (see the examples/sklearn/preprocess.py example):
261 | ```bash
262 | clearml-serving --id <service_id> metrics add --endpoint test_model_sklearn --variable-scalar
263 | x0=0,0.1,0.5,1,10 x1=0,0.1,0.5,1,10 y=0,0.1,0.5,0.75,1
264 | ```
265 | 
266 | This will create a distribution histogram (buckets specified via a list of less-equal values after the `=` sign)
267 | that we will be able to visualize on Grafana.
268 | Notice we can also log time-series values with `--variable-value x2` or discrete results (e.g. classification strings) with `--variable-enum animal=cat,dog,sheep`.
269 | Additional custom variables can be reported from the preprocess and postprocess code with a call to `collect_custom_statistics_fn({'new_var': 1.337})`, see clearml_serving/preprocess/preprocess_template.py
270 | 
271 | With the new metrics logged we can create a visualization dashboard over the latency of the calls, and the output distribution.
272 | 
273 | Grafana model performance example:
274 | 
275 | - browse to http://localhost:3000
276 | - login with: admin/admin
277 | - create a new dashboard
278 | - select Prometheus as data source
279 | - Add a query: `100 * increase(test_model_sklearn:_latency_bucket[1m]) / increase(test_model_sklearn:_latency_sum[1m])`
280 | - Change the type to heatmap, and on the right hand-side under "Data Format" select "Time series buckets"
281 | - You now have the latency distribution, over time.
282 | - Repeat the same process for x0, the query would be `100 * increase(test_model_sklearn:x0_bucket[1m]) / increase(test_model_sklearn:x0_sum[1m])`
283 | 
284 | > **Notice**: If not specified, all serving requests will be logged. To change the default, configure "CLEARML_DEFAULT_METRIC_LOG_FREQ"; for example, CLEARML_DEFAULT_METRIC_LOG_FREQ=0.2 means only 20% of all requests will be logged. You can also specify a per-endpoint log frequency with the `clearml-serving` CLI. Check the CLI documentation with `clearml-serving metrics --help`
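For reference, below is a minimal sketch of what such a preprocess script can look like for the toy sklearn endpoint, including the custom-statistics callback mentioned above. This is illustrative only; see `clearml_serving/preprocess/preprocess_template.py` and `examples/sklearn/preprocess.py` for the full interface.

```python
from typing import Any, Callable, Optional


# the class must be named "Preprocess"; it is instantiated once per endpoint
class Preprocess(object):
    def preprocess(self, body: dict, state: dict,
                   collect_custom_statistics_fn: Optional[Callable[[dict], None]]) -> Any:
        # report the raw inputs as custom metrics (visible in Grafana once registered via the CLI)
        if collect_custom_statistics_fn:
            collect_custom_statistics_fn({"x0": body.get("x0"), "x1": body.get("x1")})
        # convert the request body into the 2D array the sklearn model expects
        return [[body.get("x0", 0), body.get("x1", 0)]]

    def postprocess(self, data: Any, state: dict,
                    collect_custom_statistics_fn: Optional[Callable[[dict], None]]) -> dict:
        # wrap the model prediction into a JSON-serializable dict
        return {"y": data.tolist() if hasattr(data, "tolist") else data}
```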
285 | 
286 | ### :fire: Model Serving Examples
287 | 
288 | - Scikit-Learn [example](examples/sklearn/readme.md) - random data
289 | - Scikit-Learn Model Ensemble [example](examples/ensemble/readme.md) - random data
290 | - XGBoost [example](examples/xgboost/readme.md) - iris dataset
291 | - LightGBM [example](examples/lightgbm/readme.md) - iris dataset
292 | - PyTorch [example](examples/pytorch/readme.md) - mnist dataset
293 | - TensorFlow/Keras [example](examples/keras/readme.md) - mnist dataset
294 | - Multi-Model Pipeline [example](examples/pipeline/readme.md) - multiple models
295 | - Multi-Model Async Pipeline [example](examples/pipeline/async_preprocess.py) - multiple models
296 | - Custom Model [example](examples/custom/readme.md) - custom data
297 | 
298 | ### :pray: Status
299 | 
300 | - [x] FastAPI integration for inference service
301 | - [x] multi-process Gunicorn for inference service
302 | - [x] Dynamic preprocess python code loading (no need for container/process restart)
303 | - [x] Model files download/caching (http/s3/gs/azure)
304 | - [x] Scikit-learn, XGBoost, LightGBM integration
305 | - [x] Custom inference, including dynamic code loading
306 | - [x] Manual model upload/registration to model repository (http/s3/gs/azure)
307 | - [x] Canary load balancing
308 | - [x] Auto model endpoint deployment based on model repository state
309 | - [x] Machine/Node health metrics
310 | - [x] Dynamic online configuration
311 | - [x] CLI configuration tool
312 | - [x] Nvidia Triton integration
313 | - [x] GZip request compression
314 | - [x] TorchServe engine integration
315 | - [x] Prebuilt Docker containers (dockerhub)
316 | - [x] Docker-compose deployment (CPU/GPU)
317 | - [x] Scikit-Learn example
318 | - [x] XGBoost example
319 | - [x] LightGBM example
320 | - [x] PyTorch example
321 | - [x] TensorFlow/Keras example
322 | - [x] Model ensemble example
323 | - [x] Model pipeline example
324 | - [x] Statistics Service
325 | - [x] Kafka install instructions
326 | - [x] Prometheus install instructions
327 | - [x] Grafana install instructions
328 | - [x] Kubernetes Helm Chart
329 | - [ ] Intel optimized container (python, numpy, daal, scikit-learn)
330 | 
331 | ## Contributing
332 | 
333 | **PRs are always welcome** :heart: See more details in the ClearML [Guidelines for Contributing](https://github.com/clearml/clearml/blob/master/docs/contributing.md).
334 | 335 | 336 | -------------------------------------------------------------------------------- /clearml_serving/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clearml/clearml-serving/1def0a6901617767687f2b747aaffdc060f96046/clearml_serving/__init__.py -------------------------------------------------------------------------------- /clearml_serving/engines/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clearml/clearml-serving/1def0a6901617767687f2b747aaffdc060f96046/clearml_serving/engines/__init__.py -------------------------------------------------------------------------------- /clearml_serving/engines/triton/Dockerfile: -------------------------------------------------------------------------------- 1 | 2 | FROM nvcr.io/nvidia/tritonserver:22.08-py3 3 | 4 | 5 | ENV LC_ALL=C.UTF-8 6 | 7 | # install base package 8 | RUN python3 -m pip install --no-cache-dir -U pip clearml-serving 9 | 10 | # get latest execution code from the git repository 11 | # RUN cd $HOME && git clone https://github.com/allegroai/clearml-serving.git 12 | COPY clearml_serving /root/clearml/clearml_serving 13 | 14 | RUN python3 -m pip install --no-cache-dir -r /root/clearml/clearml_serving/engines/triton/requirements.txt 15 | 16 | # default serving port 17 | EXPOSE 8001 18 | 19 | # environement variable to load Task from CLEARML_SERVING_TASK_ID, CLEARML_SERVING_PORT 20 | 21 | WORKDIR /root/clearml/ 22 | ENTRYPOINT ["clearml_serving/engines/triton/entrypoint.sh"] 23 | -------------------------------------------------------------------------------- /clearml_serving/engines/triton/Dockerfile.tr2207: -------------------------------------------------------------------------------- 1 | 2 | FROM nvcr.io/nvidia/tritonserver:22.07-py3 3 | 4 | 5 | ENV LC_ALL=C.UTF-8 6 | 7 | # install base package 8 | RUN pip3 install --no-cache-dir -U pip 9 | RUN pip3 install --no-cache-dir clearml-serving 10 | 11 | # get latest execution code from the git repository 12 | # RUN cd $HOME && git clone https://github.com/allegroai/clearml-serving.git 13 | COPY clearml_serving /root/clearml/clearml_serving 14 | 15 | RUN pip3 install --no-cache-dir -r /root/clearml/clearml_serving/engines/triton/requirements.txt 16 | 17 | # default serving port 18 | EXPOSE 8001 19 | 20 | # environement variable to load Task from CLEARML_SERVING_TASK_ID, CLEARML_SERVING_PORT 21 | 22 | WORKDIR /root/clearml/ 23 | ENTRYPOINT ["clearml_serving/engines/triton/entrypoint.sh"] 24 | -------------------------------------------------------------------------------- /clearml_serving/engines/triton/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clearml/clearml-serving/1def0a6901617767687f2b747aaffdc060f96046/clearml_serving/engines/triton/__init__.py -------------------------------------------------------------------------------- /clearml_serving/engines/triton/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # print configuration 4 | echo CLEARML_SERVING_TASK_ID="$CLEARML_SERVING_TASK_ID" 5 | echo CLEARML_TRITON_POLL_FREQ="$CLEARML_TRITON_POLL_FREQ" 6 | echo CLEARML_TRITON_METRIC_FREQ="$CLEARML_TRITON_METRIC_FREQ" 7 | echo CLEARML_TRITON_HELPER_ARGS="$CLEARML_TRITON_HELPER_ARGS" 8 | echo CLEARML_EXTRA_PYTHON_PACKAGES="$CLEARML_EXTRA_PYTHON_PACKAGES" 9 | 10 | # we 
should also have clearml-server configurations 11 | 12 | if [ ! -z "$CLEARML_EXTRA_PYTHON_PACKAGES" ] 13 | then 14 | python3 -m pip install $CLEARML_EXTRA_PYTHON_PACKAGES 15 | fi 16 | 17 | # start service 18 | PYTHONPATH=$(pwd) python3 clearml_serving/engines/triton/triton_helper.py $CLEARML_TRITON_HELPER_ARGS $@ 19 | -------------------------------------------------------------------------------- /clearml_serving/engines/triton/requirements.txt: -------------------------------------------------------------------------------- 1 | clearml >= 1.3.1 2 | clearml-serving 3 | tritonclient[grpc]>=2.32,<2.33 4 | starlette 5 | grpcio 6 | Pillow>=10.0.1 7 | pathlib2 8 | -------------------------------------------------------------------------------- /clearml_serving/engines/triton/triton_helper.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import shutil 4 | import subprocess 5 | from argparse import ArgumentParser 6 | from time import time 7 | from typing import Optional 8 | 9 | import numpy as np 10 | from clearml import Task, Logger, InputModel 11 | from clearml.backend_api.utils import get_http_session_with_retry 12 | from clearml.utilities.pyhocon import ConfigFactory, ConfigTree, HOCONConverter 13 | from pathlib import Path 14 | 15 | from clearml_serving.serving.endpoints import ModelEndpoint 16 | from clearml_serving.serving.model_request_processor import ModelRequestProcessor 17 | 18 | 19 | class TritonHelper(object): 20 | _metric_line_parsing = r"(\w+){(gpu_uuid=\"[\w\W]*\",)?model=\"(\w+)\",\s*version=\"(\d+)\"}\s*([0-9.]*)" 21 | _default_metrics_port = 8002 22 | 23 | def __init__( 24 | self, 25 | args, # Any 26 | task, # type: Task 27 | serving_id, # type: str 28 | metric_host=None, # type: Optional[str] 29 | metric_port=None, # type: int 30 | ): 31 | # type: (...) 
-> None 32 | self._http_session = get_http_session_with_retry() 33 | self.args = dict(**args.__dict__) if args else {} 34 | self.task = task 35 | self._serving_service_task_id = serving_id 36 | self._serving_service_task = None # type: Optional[ModelRequestProcessor] 37 | self._current_endpoints = {} 38 | self.metric_host = metric_host or '0.0.0.0' 39 | self.metric_port = metric_port or self._default_metrics_port 40 | self._parse_metric = re.compile(self._metric_line_parsing) 41 | self._timestamp = time() 42 | self._last_update_step = None 43 | print('String Triton Helper service\n{}\n'.format(self.args)) 44 | 45 | def report_metrics(self, remote_logger): 46 | # type: (Optional[Logger]) -> bool 47 | # iterations are seconds from start 48 | iteration = int(time() - self._timestamp) 49 | 50 | report_msg = "reporting metrics: relative time {} sec".format(iteration) 51 | self.task.get_logger().report_text(report_msg) 52 | if remote_logger: 53 | remote_logger.report_text(report_msg, print_console=False) 54 | 55 | # noinspection PyBroadException 56 | try: 57 | # this is inside the container 58 | request = self._http_session.get('http://{}:{}/metrics'.format(self.metric_host, self.metric_port)) # noqa 59 | if not request.ok: 60 | return False 61 | content = request.content.decode().split('\n') 62 | except Exception: 63 | return False 64 | 65 | for line in content: 66 | line = line.strip() 67 | if not line or line.startswith('#'): 68 | continue 69 | # noinspection PyBroadException 70 | try: 71 | metric, gpu_uuid, variant, version, value = self._parse_metric.match(line).groups() 72 | value = float(value) 73 | except Exception: 74 | continue 75 | self.task.get_logger().report_scalar( 76 | title=metric, 77 | series='{}.v{}'.format(variant, version), 78 | iteration=iteration, 79 | value=value 80 | ) 81 | # on the remote logger we add our own Task ID (unique ID), 82 | # to support multiple servers reporting to the same service controller 83 | if remote_logger: 84 | remote_logger.report_scalar( 85 | title=metric, 86 | series='{}.v{}.{}'.format(variant, version, self.task.id), 87 | iteration=iteration, 88 | value=value 89 | ) 90 | 91 | def model_service_update_step(self, model_repository_folder=None, verbose=True): 92 | # type: (Optional[str], bool) -> bool 93 | 94 | if not self._serving_service_task: 95 | return False 96 | 97 | active_endpoints = self._serving_service_task.get_synced_endpoints() 98 | 99 | self._last_update_step = time() 100 | 101 | # nothing to do 102 | if self._current_endpoints == active_endpoints: 103 | return False 104 | 105 | if not model_repository_folder: 106 | model_repository_folder = '/models/' 107 | 108 | if verbose: 109 | print('Updating local model folder: {}'.format(model_repository_folder)) 110 | 111 | for url, endpoint in active_endpoints.items(): 112 | 113 | # Triton model folder structure reference: 114 | # https://github.com/triton-inference-server/server/blob/r22.07/docs/model_repository.md#model-repository 115 | 116 | # skip if there is no change 117 | if url in self._current_endpoints and self._current_endpoints.get(url) == endpoint: 118 | continue 119 | 120 | # skip if this is not a triton engine endpoint: 121 | if endpoint.engine_type != "triton": 122 | continue 123 | 124 | url = url.replace("/", "_") 125 | 126 | folder = Path(model_repository_folder) / url 127 | folder.mkdir(parents=True, exist_ok=True) 128 | 129 | config_pbtxt = folder / 'config.pbtxt' 130 | # download model versions 131 | version = 1 132 | model_id = endpoint.model_id 133 | 134 | model_folder 
= folder / str(version) 135 | 136 | model_folder.mkdir(parents=True, exist_ok=True) 137 | model = None 138 | # noinspection PyBroadException 139 | try: 140 | model = InputModel(model_id) 141 | local_path = model.get_local_copy() 142 | except Exception: 143 | local_path = None 144 | 145 | if not local_path or not model: 146 | print("Error retrieving model ID {} []".format(model_id, model.url if model else '')) 147 | continue 148 | 149 | local_path = Path(local_path) 150 | 151 | # prepare config.pbtxt 152 | self.create_config_pbtxt( 153 | endpoint, target_pbtxt_file=config_pbtxt.as_posix(), platform=model.framework 154 | ) 155 | 156 | if verbose: 157 | print('Update model v{} in {}'.format(version, model_folder)) 158 | 159 | framework = str(model.framework).lower() 160 | 161 | # if this is a folder copy every and delete the temp folder 162 | if local_path.is_dir() and model and ("tensorflow" in framework or "keras" in framework): 163 | # we assume we have a `tensorflow.savedmodel` folder 164 | model_folder /= 'model.savedmodel' 165 | self._extract_folder(local_path, model_folder, verbose, remove_existing=True) 166 | elif "torch" in framework and local_path.is_file(): 167 | # single file should be moved 168 | self._extract_single_file(local_path, model_folder / "model.pt", verbose) 169 | elif "onnx" in framework and local_path.is_dir(): 170 | # just unzip both model.bin & model.xml into the model folder 171 | self._extract_folder(local_path, model_folder, verbose) 172 | elif ("tensorflow" in framework or "keras" in framework) and local_path.is_file(): 173 | # just rename the single file to "model.graphdef" 174 | self._extract_single_file(local_path, model_folder / "model.graphdef", verbose) 175 | elif "tensorrt" in framework and local_path.is_file(): 176 | # just rename the single file to "model.plan" 177 | self._extract_single_file(local_path, model_folder / "model.plan", verbose) 178 | elif local_path.is_file(): 179 | # generic model will be stored as 'model.bin' 180 | self._extract_single_file(local_path, model_folder / "model.bin", verbose) 181 | elif local_path.is_dir(): 182 | # generic model will be stored into the model folder 183 | self._extract_folder(local_path, model_folder, verbose) 184 | else: 185 | print("Model type could not be inferred skipping", model.id, model.framework, model.name) 186 | continue 187 | 188 | # todo: trigger triton model reloading (instead of relaying on current poll mechanism) 189 | # based on the model endpoint changes 190 | 191 | # update current state 192 | self._current_endpoints = active_endpoints 193 | 194 | return True 195 | 196 | @staticmethod 197 | def _extract_single_file(local_path, target_path, verbose): 198 | old_file = None 199 | if target_path.exists(): 200 | old_file = target_path.parent / '.old.{}'.format(target_path.name) 201 | target_path.replace(old_file) 202 | if verbose: 203 | print('copy model into {}'.format(target_path)) 204 | shutil.move(local_path.as_posix(), target_path.as_posix()) 205 | if old_file: 206 | old_file.unlink() 207 | 208 | @staticmethod 209 | def _extract_folder(local_path, model_folder, verbose, remove_existing=False): 210 | model_folder.mkdir(parents=True, exist_ok=True) 211 | # rename to old 212 | old_folder = None 213 | if remove_existing and model_folder.exists(): 214 | old_folder = model_folder.parent / '.old.{}'.format(model_folder.name) 215 | model_folder.replace(old_folder) 216 | if verbose: 217 | print('copy model into {}'.format(model_folder)) 218 | shutil.copytree( 219 | local_path.as_posix(), 
model_folder.as_posix(), symlinks=False, dirs_exist_ok=True 220 | ) 221 | if old_folder: 222 | shutil.rmtree(path=old_folder.as_posix()) 223 | # delete temp folder 224 | shutil.rmtree(local_path.as_posix()) 225 | 226 | def maintenance_daemon( 227 | self, 228 | local_model_repo='/models', # type: str 229 | update_frequency_sec=60.0, # type: float 230 | metric_frequency_sec=60.0 # type: float 231 | ): 232 | # type: (...) -> None 233 | 234 | Path(local_model_repo).mkdir(parents=True, exist_ok=True) 235 | 236 | self._serving_service_task = ModelRequestProcessor(task_id=self._serving_service_task_id) 237 | self.model_service_update_step(model_repository_folder=local_model_repo, verbose=True) 238 | 239 | # noinspection PyProtectedMember 240 | remote_logger = self._serving_service_task._task.get_logger() 241 | 242 | # todo: log triton server outputs when running locally 243 | 244 | # we assume we can run the triton server 245 | cmd = [ 246 | 'tritonserver', 247 | '--model-control-mode=poll', 248 | '--model-repository={}'.format(local_model_repo), 249 | '--repository-poll-secs={}'.format(update_frequency_sec), 250 | '--metrics-port={}'.format(self._default_metrics_port), 251 | '--allow-metrics=true', 252 | '--allow-gpu-metrics=true', 253 | ] 254 | for k, v in self.args.items(): 255 | if not v or not str(k).startswith('t_'): 256 | continue 257 | cmd.append('--{}={}'.format(k, v)) 258 | 259 | print('Starting server: {}'.format(cmd)) 260 | try: 261 | proc = subprocess.Popen(cmd) 262 | except FileNotFoundError: 263 | raise ValueError( 264 | "Triton Server Engine (tritonserver) could not be found!\n" 265 | "Verify you running inside the `nvcr.io/nvidia/tritonserver` docker container") 266 | base_freq = min(update_frequency_sec, metric_frequency_sec) 267 | metric_tic = update_tic = time() 268 | while True: 269 | try: 270 | error_code = proc.wait(timeout=base_freq) 271 | if error_code == 0: 272 | print("triton-server process ended with error code {}".format(error_code)) 273 | return 274 | raise ValueError("triton-server process ended with error code {}".format(error_code)) 275 | except subprocess.TimeoutExpired: 276 | pass 277 | pass 278 | 279 | # update models 280 | if time() - update_tic > update_frequency_sec: 281 | print("Info: syncing models from main serving service") 282 | if self.model_service_update_step(model_repository_folder=local_model_repo, verbose=True): 283 | print("Info: Models updated from main serving service") 284 | update_tic = time() 285 | 286 | # update stats 287 | if time() - metric_tic > metric_frequency_sec: 288 | metric_tic = time() 289 | self.report_metrics(remote_logger) 290 | 291 | @classmethod 292 | def create_config_pbtxt(cls, endpoint, target_pbtxt_file, platform=None): 293 | # type: (ModelEndpoint, str, Optional[str]) -> bool 294 | """ 295 | Full spec available here: 296 | https://github.com/triton-inference-server/server/blob/main/docs/model_configuration.md 297 | """ 298 | 299 | def _convert_lists(config): 300 | if isinstance(config, list): 301 | return [_convert_lists(i) for i in config] 302 | 303 | if not isinstance(config, ConfigTree): 304 | return config 305 | 306 | for k in list(config.keys()): 307 | v = config[k] 308 | # try to convert to list 309 | if isinstance(v, (ConfigTree, list)): 310 | # noinspection PyBroadException 311 | try: 312 | a_list = config.get_list(k, []) 313 | if a_list: 314 | config[k] = _convert_lists(a_list) 315 | continue 316 | except Exception: 317 | pass 318 | 319 | config[k] = _convert_lists(v) 320 | 321 | return config 322 | 323 | 
final_config_pbtxt = "" 324 | config_dict = dict() 325 | 326 | if endpoint.auxiliary_cfg and isinstance(endpoint.auxiliary_cfg, str): 327 | final_config_pbtxt = endpoint.auxiliary_cfg + "\n" 328 | elif endpoint.auxiliary_cfg and isinstance(endpoint.auxiliary_cfg, dict): 329 | config_dict = dict(**endpoint.auxiliary_cfg) 330 | 331 | config_dict = ConfigFactory.from_dict(config_dict) 332 | 333 | # The framework for the model. Possible values are: 334 | # "tensorrt_plan", "tensorflow_graphdef", 335 | # "tensorflow_savedmodel", "onnxruntime_onnx", 336 | # "pytorch_libtorch". 337 | # Default for TF: "tensorflow_savedmodel" 338 | 339 | # replace ": [{" with ": [{" (currently not needed) 340 | # pattern = re.compile(r"(?P\w+)(?P\s+)(?P(\[)|({))") 341 | 342 | for i, s in enumerate(endpoint.input_size or []): 343 | config_dict.put("input.{}.dims".format(i), s) 344 | 345 | for i, s in enumerate(endpoint.output_size or []): 346 | config_dict.put("output.{}.dims".format(i), s) 347 | 348 | for i, s in enumerate(endpoint.input_type or []): 349 | input_type = "TYPE_" + cls.np_to_triton_dtype(np.dtype(s)) 350 | config_dict.put("input.{}.data_type".format(i), input_type) 351 | 352 | for i, s in enumerate(endpoint.output_type or []): 353 | output_type = "TYPE_" + cls.np_to_triton_dtype(np.dtype(s)) 354 | config_dict.put("output.{}.data_type".format(i), output_type) 355 | 356 | for i, s in enumerate(endpoint.input_name or []): 357 | config_dict.put("input.{}.name".format(i), "\"{}\"".format(s)) 358 | 359 | for i, s in enumerate(endpoint.output_name or []): 360 | config_dict.put("output.{}.name".format(i), "\"{}\"".format(s)) 361 | 362 | # check if we have platform in the auxiliary config pbtxt 363 | if platform and final_config_pbtxt: 364 | # noinspection PyBroadException 365 | try: 366 | final_config_pbtxt_dict = ConfigFactory.parse_string(final_config_pbtxt) 367 | # if we found it, null the requested platform and use the auxiliary config pbtxt platform `value` 368 | if final_config_pbtxt_dict.get("platform", None): 369 | print( 370 | "WARNING: ignoring auto-detecetd `platform={}` " 371 | "and using auxiliary pbtxt `platform={}`".format( 372 | str(platform).lower(), final_config_pbtxt_dict.get("platform"))) 373 | platform = None 374 | except Exception: 375 | # we failed parsing the auxiliary pbtxt 376 | pass 377 | 378 | if platform and not config_dict.get("platform", None) and not config_dict.get("backend", None): 379 | platform = str(platform).lower() 380 | if platform.startswith("tensorflow") or platform.startswith("keras"): 381 | config_dict["platform"] = "\"tensorflow_savedmodel\"" 382 | elif platform.startswith("pytorch") or platform.startswith("caffe"): 383 | config_dict["backend"] = "\"pytorch\"" 384 | elif platform.startswith("onnx"): 385 | config_dict["platform"] = "\"onnxruntime_onnx\"" 386 | 387 | # convert to lists anything that we can: 388 | if config_dict: 389 | config_dict = _convert_lists(config_dict) 390 | # Convert HOCON standard to predefined message format 391 | config_pbtxt = "\n" + HOCONConverter.to_hocon(config_dict). 
\ 392 | replace("=", ":").replace(" : ", ": ") 393 | 394 | # conform types (remove string quotes) 395 | config_pbtxt = config_pbtxt.replace("\\\"", "").\ 396 | replace("\\\'", "").replace("\"", "").replace("\'", "").\ 397 | replace("", "\"").replace("", "\'") 398 | else: 399 | config_pbtxt = "" 400 | 401 | # merge the two 402 | final_config_pbtxt += config_pbtxt 403 | print("INFO: target config.pbtxt file for endpoint '{}':\n{}\n".format( 404 | endpoint.serving_url, final_config_pbtxt)) 405 | 406 | with open(target_pbtxt_file, "w") as config_file: 407 | config_file.write(final_config_pbtxt) 408 | 409 | return True 410 | 411 | @staticmethod 412 | def np_to_triton_dtype(np_dtype): 413 | # type (np.dtype) -> str 414 | """ 415 | copied from tritonclientutils import np_to_triton_dtype 416 | """ 417 | if np_dtype == bool: 418 | return "BOOL" 419 | elif np_dtype == np.int8: 420 | return "INT8" 421 | elif np_dtype == np.int16: 422 | return "INT16" 423 | elif np_dtype == np.int32: 424 | return "INT32" 425 | elif np_dtype == np.int64: 426 | return "INT64" 427 | elif np_dtype == np.uint8: 428 | return "UINT8" 429 | elif np_dtype == np.uint16: 430 | return "UINT16" 431 | elif np_dtype == np.uint32: 432 | return "UINT32" 433 | elif np_dtype == np.uint64: 434 | return "UINT64" 435 | elif np_dtype == np.float16: 436 | return "FP16" 437 | elif np_dtype == np.float32: 438 | return "FP32" 439 | elif np_dtype == np.float64: 440 | return "FP64" 441 | elif np_dtype == str: 442 | return "STRING" 443 | elif np_dtype == np.object_ or np_dtype.type == np.bytes_: 444 | return "BYTES" 445 | return None 446 | 447 | @staticmethod 448 | def triton_to_np_dtype(dtype): 449 | if dtype == "BOOL": 450 | return bool 451 | elif dtype == "INT8": 452 | return np.int8 453 | elif dtype == "INT16": 454 | return np.int16 455 | elif dtype == "INT32": 456 | return np.int32 457 | elif dtype == "INT64": 458 | return np.int64 459 | elif dtype == "UINT8": 460 | return np.uint8 461 | elif dtype == "UINT16": 462 | return np.uint16 463 | elif dtype == "UINT32": 464 | return np.uint32 465 | elif dtype == "UINT64": 466 | return np.uint64 467 | elif dtype == "FP16": 468 | return np.float16 469 | elif dtype == "FP32": 470 | return np.float32 471 | elif dtype == "FP64": 472 | return np.float64 473 | elif dtype == "BYTES": 474 | return np.object_ 475 | return None 476 | 477 | 478 | def main(): 479 | title = 'clearml-serving - Nvidia Triton Engine Controller' 480 | print(title) 481 | parser = ArgumentParser(prog='clearml-serving', description=title) 482 | parser.add_argument( 483 | '--serving-id', default=os.environ.get('CLEARML_SERVING_TASK_ID'), type=str, 484 | help='Specify main serving service Task ID') 485 | parser.add_argument( 486 | '--project', default=None, type=str, 487 | help='Optional specify project for the serving engine Task') 488 | parser.add_argument( 489 | '--name', default='triton engine', type=str, 490 | help='Optional specify task name for the serving engine Task') 491 | parser.add_argument( 492 | '--update-frequency', default=os.environ.get('CLEARML_TRITON_POLL_FREQ') or 10., type=float, 493 | help='Model update frequency in minutes') 494 | parser.add_argument( 495 | '--metric-frequency', default=os.environ.get('CLEARML_TRITON_METRIC_FREQ') or 1., type=float, 496 | help='Metric reporting update frequency in minutes') 497 | parser.add_argument( 498 | '--inference-task-id', default=None, type=str, 499 | help='Optional: Specify the inference Task ID to report to. 
default: create a new one') 500 | parser.add_argument( 501 | '--t-http-port', type=str, help=' The port for the server to listen on for HTTP requests') 502 | parser.add_argument( 503 | '--t-http-thread-count', type=str, help=' Number of threads handling HTTP requests') 504 | parser.add_argument( 505 | '--t-allow-grpc', type=str, help=' Allow the server to listen for GRPC requests') 506 | parser.add_argument( 507 | '--t-grpc-port', type=str, help=' The port for the server to listen on for GRPC requests') 508 | parser.add_argument( 509 | '--t-grpc-infer-allocation-pool-size', type=str, 510 | help=' The maximum number of inference request/response objects that remain ' 511 | 'allocated for reuse. As long as the number of in-flight requests doesn\'t exceed ' 512 | 'this value there will be no allocation/deallocation of request/response objects') 513 | parser.add_argument( 514 | '--t-pinned-memory-pool-byte-size', type=str, 515 | help=' The total byte size that can be allocated as pinned system ' 516 | 'memory. If GPU support is enabled, the server will allocate pinned ' 517 | 'system memory to accelerate data transfer between host and devices ' 518 | 'until it exceeds the specified byte size. This option will not affect ' 519 | 'the allocation conducted by the backend frameworks. Default is 256 MB') 520 | parser.add_argument( 521 | '--t-cuda-memory-pool-byte-size', type=str, 522 | help='<:> The total byte size that can be allocated as CUDA memory for ' 523 | 'the GPU device. If GPU support is enabled, the server will allocate ' 524 | 'CUDA memory to minimize data transfer between host and devices ' 525 | 'until it exceeds the specified byte size. This option will not affect ' 526 | 'the allocation conducted by the backend frameworks. The argument ' 527 | 'should be 2 integers separated by colons in the format :. This option can be used multiple times, but only ' 529 | 'once per GPU device. Subsequent uses will overwrite previous uses for ' 530 | 'the same GPU device. Default is 64 MB') 531 | parser.add_argument( 532 | '--t-min-supported-compute-capability', type=str, 533 | help=' The minimum supported CUDA compute capability. GPUs that ' 534 | 'don\'t support this compute capability will not be used by the server') 535 | parser.add_argument( 536 | '--t-buffer-manager-thread-count', type=str, 537 | help=' The number of threads used to accelerate copies and other' 538 | 'operations required to manage input and output tensor contents.' 539 | 'Default is 0') 540 | parser.add_argument( 541 | '--t-log-verbose', type=str, 542 | help=' Triton server logging verbosity (default disabled)') 543 | parser.add_argument( 544 | '--t-exit-on-error', type=bool, default=True, 545 | help='Exits the inference server if any error occurs during initialization.' 546 | 'Recommended to set to True to catch any unanticipated errors.' 547 | 'False prevents single models breaking the whole tritonserver.' 
548 | ) 549 | 550 | args = parser.parse_args() 551 | 552 | # check Args OS overrides 553 | prefix = "CLEARML_TRITON_" 554 | for k, v in os.environ.items(): 555 | if not k.startswith(prefix): 556 | continue 557 | args_var = k.replace(prefix, "", 1).replace("-", "_").lower() 558 | if args_var in args.__dict__: 559 | # casting 560 | t = type(getattr(args, args_var, None)) 561 | setattr(args, args_var, type(t)(v) if t is not None else v) 562 | 563 | # noinspection PyProtectedMember 564 | serving_task = ModelRequestProcessor._get_control_plane_task(task_id=args.inference_task_id) 565 | 566 | task = Task.init( 567 | project_name=args.project or serving_task.get_project_name() or "serving", 568 | task_name="{} - {}".format(serving_task.name, args.name), 569 | task_type=Task.TaskTypes.inference, 570 | continue_last_task=args.inference_task_id or None 571 | ) 572 | print("configuration args: {}".format(args)) 573 | helper = TritonHelper(args, task, serving_id=args.serving_id) 574 | 575 | # safe casting 576 | try: 577 | update_frequency_sec = float(args.update_frequency) * 60.0 578 | except (ValueError, TypeError): 579 | update_frequency_sec = 600 580 | try: 581 | metric_frequency_sec = float(args.metric_frequency) * 60.0 582 | except (ValueError, TypeError): 583 | metric_frequency_sec = 60 584 | 585 | # this function will never return 586 | helper.maintenance_daemon( 587 | local_model_repo='/models', 588 | update_frequency_sec=update_frequency_sec, 589 | metric_frequency_sec=metric_frequency_sec, 590 | ) 591 | 592 | 593 | if __name__ == '__main__': 594 | main() 595 | -------------------------------------------------------------------------------- /clearml_serving/preprocess/preprocess_template.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Optional, Callable, Union 2 | 3 | 4 | # Preprocess class Must be named "Preprocess" 5 | # No need to inherit or to implement all methods 6 | class Preprocess(object): 7 | """ 8 | Preprocess class Must be named "Preprocess" 9 | Otherwise there are No limitations, No need to inherit or to implement all methods 10 | Notice! This is not thread safe! the same instance may be accessed from multiple threads simultaneously 11 | to store date in a safe way push it into the `state` dict argument of preprocessing/postprocessing functions 12 | 13 | Notice the execution flows is synchronous as follows: 14 | 15 | 1. RestAPI(...) -> body: Union[bytes, dict] 16 | 2. preprocess(body: Union[bytes, dict], ...) -> data: Any 17 | 3. process(data: Any, ...) -> data: Any 18 | 4. postprocess(data: Any, ...) -> result: dict 19 | 5. RestAPI(result: dict) -> returned request 20 | """ 21 | 22 | def __init__(self): 23 | # set internal state, this will be called only once. (i.e. not per request) 24 | # it will also set the internal model_endpoint to reference the specific model endpoint object being served 25 | self.model_endpoint = None # type: clearml_serving.serving.endpoints.ModelEndpoint 26 | 27 | def load(self, local_file_name: str) -> Any: # noqa 28 | """ 29 | OPTIONAL: provide loading method for the model 30 | useful if we need to load a model in a specific way for the prediction engine to work 31 | 32 | REMOVE FUNCTION IF NOT USED 33 | 34 | Notice! When used with specific engines (i.e. not Custom) 35 | The returned object will be passed as is to the inference engine, 36 | this means it must not be None, otherwise the endpoint will be ignored! 
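
        Usage example (illustrative only; assumes the stored model is a joblib/pickle
        file and that joblib is available in the serving container):
        >>> def load(self, local_file_name):
        >>>     import joblib
        >>>     return joblib.load(local_file_name)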
37 | 38 | :param local_file_name: file name / path to read load the model from 39 | 40 | :return: Object that will be called with .predict() method for inference. 41 | """ 42 | pass 43 | 44 | def preprocess( 45 | self, 46 | body: Union[bytes, dict], 47 | state: dict, 48 | collect_custom_statistics_fn: Optional[Callable[[dict], None]], 49 | ) -> Any: # noqa 50 | """ 51 | Optional: do something with the request data, return any type of object. 52 | The returned object will be passed as is to the inference engine 53 | 54 | :param body: dictionary or bytes as recieved from the RestAPI 55 | :param state: Use state dict to store data passed to the post-processing function call. 56 | This is a per-request state dict (meaning a new empty dict will be passed per request) 57 | Usage example: 58 | >>> def preprocess(..., state): 59 | state['preprocess_aux_data'] = [1,2,3] 60 | >>> def postprocess(..., state): 61 | print(state['preprocess_aux_data']) 62 | :param collect_custom_statistics_fn: Optional, if provided allows to send a custom set of key/values 63 | to the statictics collector servicd. 64 | None is passed if statiscs collector is not configured, or if the current request should not be collected 65 | 66 | Usage example: 67 | >>> print(body) 68 | {"x0": 1, "x1": 2} 69 | >>> if collect_custom_statistics_fn: 70 | >>> collect_custom_statistics_fn({"x0": 1, "x1": 2}) 71 | 72 | :return: Object to be passed directly to the model inference 73 | """ 74 | return body 75 | 76 | def postprocess( 77 | self, 78 | data: Any, 79 | state: dict, 80 | collect_custom_statistics_fn: Optional[Callable[[dict], None]], 81 | ) -> dict: # noqa 82 | """ 83 | Optional: post process the data returned from the model inference engine 84 | returned dict will be passed back as the request result as is. 85 | 86 | :param data: object as recieved from the inference model function 87 | :param state: Use state dict to store data passed to the post-processing function call. 88 | This is a per-request state dict (meaning a dict instance per request) 89 | Usage example: 90 | >>> def preprocess(..., state): 91 | state['preprocess_aux_data'] = [1,2,3] 92 | >>> def postprocess(..., state): 93 | print(state['preprocess_aux_data']) 94 | :param collect_custom_statistics_fn: Optional, if provided allows to send a custom set of key/values 95 | to the statictics collector servicd. 96 | None is passed if statiscs collector is not configured, or if the current request should not be collected 97 | 98 | Usage example: 99 | >>> if collect_custom_statistics_fn: 100 | >>> collect_custom_statistics_fn({"y": 1}) 101 | 102 | :return: Dictionary passed directly as the returned result of the RestAPI 103 | """ 104 | return data 105 | 106 | def process( 107 | self, 108 | data: Any, 109 | state: dict, 110 | collect_custom_statistics_fn: Optional[Callable[[dict], None]], 111 | ) -> Any: # noqa 112 | """ 113 | OPTIONAL: do something with the actual data, return any type of object. 114 | The returned object will be passed as is to the postprocess function engine 115 | 116 | REMOVE FUNCTION IF NOT USED 117 | 118 | :param data: object as recieved from the preprocessing function 119 | :param state: Use state dict to store data passed to the post-processing function call. 
120 | This is a per-request state dict (meaning a dict instance per request) 121 | Usage example: 122 | >>> def preprocess(..., state): 123 | state['preprocess_aux_data'] = [1,2,3] 124 | >>> def postprocess(..., state): 125 | print(state['preprocess_aux_data']) 126 | :param collect_custom_statistics_fn: Optional, if provided allows to send a custom set of key/values 127 | to the statictics collector servicd. 128 | None is passed if statiscs collector is not configured, or if the current request should not be collected 129 | 130 | Usage example: 131 | >>> if collect_custom_statistics_fn: 132 | >>> collect_custom_statistics_fn({"type": "classification"}) 133 | 134 | :return: Object to be passed tp the post-processing function 135 | """ 136 | return data 137 | 138 | def send_request( # noqa 139 | self, 140 | endpoint: str, 141 | version: Optional[str] = None, 142 | data: Optional[dict] = None 143 | ) -> Optional[dict]: 144 | """ 145 | NOTICE! This method will be replaced in runtime, by the inference service 146 | 147 | Helper method to send model inference requests to the inference service itself. 148 | This is designed to help with model ensemble, model pipelines, etc. 149 | On request error return None, otherwise the request result data dictionary 150 | 151 | Usage example: 152 | 153 | >>> x0, x1 = 1, 2 154 | >>> result = self.send_request(endpoint="test_model_sklearn", version="1", data={"x0": x0, "x1": x1}) 155 | >>> y = result["y"] 156 | """ 157 | pass 158 | -------------------------------------------------------------------------------- /clearml_serving/serving/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.11-bullseye 2 | 3 | 4 | ENV LC_ALL=C.UTF-8 5 | 6 | # install base package 7 | RUN pip3 install --no-cache-dir clearml-serving 8 | 9 | # get latest execution code from the git repository 10 | # RUN cd $HOME && git clone https://github.com/allegroai/clearml-serving.git 11 | COPY clearml_serving /root/clearml/clearml_serving 12 | 13 | RUN pip3 install --no-cache-dir -r /root/clearml/clearml_serving/serving/requirements.txt 14 | 15 | # default serving port 16 | EXPOSE 8080 17 | 18 | # environement variable to load Task from CLEARML_SERVING_TASK_ID, CLEARML_SERVING_PORT 19 | 20 | WORKDIR /root/clearml/ 21 | ENTRYPOINT ["clearml_serving/serving/entrypoint.sh"] 22 | -------------------------------------------------------------------------------- /clearml_serving/serving/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clearml/clearml-serving/1def0a6901617767687f2b747aaffdc060f96046/clearml_serving/serving/__init__.py -------------------------------------------------------------------------------- /clearml_serving/serving/endpoints.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from attr import attrib, attrs, asdict, validators 3 | 4 | 5 | def _engine_validator(inst, attr, value): # noqa 6 | from .preprocess_service import BasePreprocessRequest 7 | if not BasePreprocessRequest.validate_engine_type(value): 8 | raise TypeError("{} not supported engine type".format(value)) 9 | 10 | 11 | def _matrix_type_validator(inst, attr, value): # noqa 12 | if isinstance(value, (tuple, list)): 13 | for v in value: 14 | if v and not np.dtype(v): 15 | raise TypeError("{} not supported matrix type".format(v)) 16 | 17 | elif value and not np.dtype(value): 18 | raise TypeError("{} not supported matrix 
type".format(value)) 19 | 20 | 21 | def _list_type_convertor(inst): # noqa 22 | if inst is None: 23 | return None 24 | return inst if isinstance(inst, (tuple, list)) else [inst] 25 | 26 | 27 | def _nested_list_type_convertor(inst): # noqa 28 | if inst is None: 29 | return None 30 | if isinstance(inst, (tuple, list)) and all(not isinstance(i, (tuple, list)) for i in inst): 31 | return [inst] 32 | inst = inst if isinstance(inst, (tuple, list)) else [inst] 33 | return inst 34 | 35 | 36 | @attrs 37 | class BaseStruct(object): 38 | def as_dict(self, remove_null_entries=False): 39 | if not remove_null_entries: 40 | return asdict(self) 41 | return {k: v for k, v in asdict(self).items() if v is not None} 42 | 43 | 44 | @attrs 45 | class ModelMonitoring(BaseStruct): 46 | base_serving_url = attrib(type=str) # serving point url prefix (example: "detect_cat") 47 | engine_type = attrib(type=str, validator=_engine_validator) # engine type 48 | monitor_project = attrib(type=str, default=None) # monitor model project (for model auto update) 49 | monitor_name = attrib(type=str, default=None) # monitor model name (for model auto update, regexp selection) 50 | monitor_tags = attrib(type=list, default=[]) # monitor model tag (for model auto update) 51 | only_published = attrib(type=bool, default=False) # only select published models 52 | max_versions = attrib(type=int, default=None) # Maximum number of models to keep serving (latest X models) 53 | input_size = attrib(type=list, default=None, converter=_nested_list_type_convertor) # optional, model matrix size 54 | input_type = attrib(type=list, default=None, validator=_matrix_type_validator, converter=_list_type_convertor) 55 | input_name = attrib(type=list, default=None, converter=_list_type_convertor) # optional, input layer names 56 | output_size = attrib(type=list, default=None, converter=_nested_list_type_convertor) # optional, model matrix size 57 | output_type = attrib(type=list, default=None, validator=_matrix_type_validator, converter=_list_type_convertor) 58 | output_name = attrib(type=list, default=None, converter=_list_type_convertor) # optional, output layer names 59 | preprocess_artifact = attrib( 60 | type=str, default=None) # optional artifact name storing the model preprocessing code 61 | auxiliary_cfg = attrib(type=dict, default=None) # Auxiliary configuration (e.g. 
triton conf), Union[str, dict] 62 | 63 | 64 | @attrs 65 | class ModelEndpoint(BaseStruct): 66 | engine_type = attrib(type=str, validator=_engine_validator) # engine type 67 | serving_url = attrib(type=str) # full serving point url (including version) example: "detect_cat/v1" 68 | model_id = attrib(type=str, default=None) # model ID to serve (and download) 69 | version = attrib(type=str, default="") # key (version string), default no version 70 | preprocess_artifact = attrib( 71 | type=str, default=None) # optional artifact name storing the model preprocessing code 72 | input_size = attrib(type=list, default=None, converter=_nested_list_type_convertor) # optional, model matrix size 73 | input_type = attrib(type=list, default=None, validator=_matrix_type_validator, converter=_list_type_convertor) 74 | input_name = attrib(type=list, default=None, converter=_list_type_convertor) # optional, input layer names 75 | output_size = attrib(type=list, default=None, converter=_nested_list_type_convertor) # optional, model matrix size 76 | output_type = attrib(type=list, default=None, validator=_matrix_type_validator, converter=_list_type_convertor) 77 | output_name = attrib(type=list, default=None, converter=_list_type_convertor) # optional, output layer names 78 | auxiliary_cfg = attrib(type=dict, default=None) # Optional: Auxiliary configuration (e.g. triton conf), [str, dict] 79 | 80 | 81 | @attrs 82 | class CanaryEP(BaseStruct): 83 | endpoint = attrib(type=str) # load balancer endpoint 84 | weights = attrib(type=list) # list of weights (order should be matching fixed_endpoints or prefix) 85 | load_endpoints = attrib(type=list, default=[]) # list of endpoints to balance and route 86 | load_endpoint_prefix = attrib( 87 | type=str, default=None) # endpoint prefix to list 88 | # (any endpoint starting with this prefix will be listed, sorted lexicographically, or broken into /) 89 | 90 | 91 | @attrs 92 | class EndpointMetricLogging(BaseStruct): 93 | @attrs 94 | class MetricType(BaseStruct): 95 | type = attrib(type=str, validator=validators.in_(("scalar", "enum", "value", "counter"))) 96 | buckets = attrib(type=list, default=None) 97 | 98 | endpoint = attrib(type=str) # Specific endpoint to log metrics w/ version (example: "model/1") 99 | # If endpoint name ends with a "*" any endpoint with a matching prefix will be selected 100 | 101 | log_frequency = attrib(type=float, default=None) # Specific endpoint to log frequency 102 | # (0.0 to 1.0, where 1.0 is 100% of all requests are logged) 103 | 104 | metrics = attrib( 105 | type=dict, default={}, 106 | converter=lambda x: { 107 | k: v if isinstance(v, EndpointMetricLogging.MetricType) 108 | else EndpointMetricLogging.MetricType(**v) for k, v in x.items() 109 | } 110 | ) # key=variable, value=MetricType 111 | 112 | # example: 113 | # {"x1": dict(type="scalar", buckets=[0,1,2,3]), 114 | # "y": dict(type="enum", buckets=["cat", "dog"]). 115 | # "latency": dict(type="value", buckets=[]). 
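    #  "error_count": dict(type="counter")   # illustrative entry only: "counter" is the remaining accepted MetricType.type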
116 | # } 117 | 118 | def as_dict(self, remove_null_entries=False): 119 | if not remove_null_entries: 120 | return {k: v.as_dict(remove_null_entries) if isinstance(v, BaseStruct) else v 121 | for k, v in asdict(self).items()} 122 | 123 | return {k: v.as_dict(remove_null_entries) if isinstance(v, BaseStruct) else v 124 | for k, v in asdict(self).items() if v is not None} 125 | -------------------------------------------------------------------------------- /clearml_serving/serving/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # print configuration 4 | echo CLEARML_SERVING_TASK_ID="$CLEARML_SERVING_TASK_ID" 5 | echo CLEARML_INFERENCE_TASK_ID="$CLEARML_INFERENCE_TASK_ID" 6 | echo CLEARML_SERVING_PORT="$CLEARML_SERVING_PORT" 7 | echo CLEARML_USE_GUNICORN="$CLEARML_USE_GUNICORN" 8 | echo CLEARML_EXTRA_PYTHON_PACKAGES="$CLEARML_EXTRA_PYTHON_PACKAGES" 9 | echo CLEARML_SERVING_NUM_PROCESS="$CLEARML_SERVING_NUM_PROCESS" 10 | echo CLEARML_SERVING_POLL_FREQ="$CLEARML_SERVING_POLL_FREQ" 11 | echo CLEARML_DEFAULT_KAFKA_SERVE_URL="$CLEARML_DEFAULT_KAFKA_SERVE_URL" 12 | 13 | SERVING_PORT="${CLEARML_SERVING_PORT:-8080}" 14 | GUNICORN_NUM_PROCESS="${CLEARML_SERVING_NUM_PROCESS:-4}" 15 | GUNICORN_SERVING_TIMEOUT="${GUNICORN_SERVING_TIMEOUT:-600}" 16 | GUNICORN_MAX_REQUESTS="${GUNICORN_MAX_REQUESTS:-0}" 17 | UVICORN_SERVE_LOOP="${UVICORN_SERVE_LOOP:-uvloop}" 18 | UVICORN_LOG_LEVEL="${UVICORN_LOG_LEVEL:-warning}" 19 | 20 | # set default internal serve endpoint (for request pipelining) 21 | CLEARML_DEFAULT_BASE_SERVE_URL="${CLEARML_DEFAULT_BASE_SERVE_URL:-http://127.0.0.1:$SERVING_PORT/serve}" 22 | CLEARML_DEFAULT_TRITON_GRPC_ADDR="${CLEARML_DEFAULT_TRITON_GRPC_ADDR:-127.0.0.1:8001}" 23 | 24 | # print configuration 25 | echo WEB_CONCURRENCY="$WEB_CONCURRENCY" 26 | echo SERVING_PORT="$SERVING_PORT" 27 | echo GUNICORN_NUM_PROCESS="$GUNICORN_NUM_PROCESS" 28 | echo GUNICORN_SERVING_TIMEOUT="$GUNICORN_SERVING_TIMEOUT" 29 | echo GUNICORN_MAX_REQUESTS="$GUNICORN_MAX_REQUESTS" 30 | echo GUNICORN_EXTRA_ARGS="$GUNICORN_EXTRA_ARGS" 31 | echo UVICORN_SERVE_LOOP="$UVICORN_SERVE_LOOP" 32 | echo UVICORN_EXTRA_ARGS="$UVICORN_EXTRA_ARGS" 33 | echo UVICORN_LOG_LEVEL="$UVICORN_LOG_LEVEL" 34 | echo CLEARML_DEFAULT_BASE_SERVE_URL="$CLEARML_DEFAULT_BASE_SERVE_URL" 35 | echo CLEARML_DEFAULT_TRITON_GRPC_ADDR="$CLEARML_DEFAULT_TRITON_GRPC_ADDR" 36 | 37 | # runtime add extra python packages 38 | if [ ! 
-z "$CLEARML_EXTRA_PYTHON_PACKAGES" ] 39 | then 40 | python3 -m pip install $CLEARML_EXTRA_PYTHON_PACKAGES 41 | fi 42 | 43 | if [ -z "$CLEARML_USE_GUNICORN" ] 44 | then 45 | if [ -z "$CLEARML_SERVING_NUM_PROCESS" ] 46 | then 47 | echo "Starting Uvicorn server - single worker" 48 | PYTHONPATH=$(pwd) python3 -m uvicorn \ 49 | clearml_serving.serving.main:app --log-level $UVICORN_LOG_LEVEL --host 0.0.0.0 --port $SERVING_PORT --loop $UVICORN_SERVE_LOOP \ 50 | $UVICORN_EXTRA_ARGS 51 | else 52 | echo "Starting Uvicorn server - multi worker" 53 | PYTHONPATH=$(pwd) python3 clearml_serving/serving/uvicorn_mp_entrypoint.py \ 54 | clearml_serving.serving.main:app --log-level $UVICORN_LOG_LEVEL --host 0.0.0.0 --port $SERVING_PORT --loop $UVICORN_SERVE_LOOP \ 55 | --workers $CLEARML_SERVING_NUM_PROCESS $UVICORN_EXTRA_ARGS 56 | fi 57 | else 58 | echo "Starting Gunicorn server" 59 | # start service 60 | PYTHONPATH=$(pwd) python3 -m gunicorn \ 61 | --preload clearml_serving.serving.main:app \ 62 | --workers $GUNICORN_NUM_PROCESS \ 63 | --worker-class uvicorn.workers.UvicornWorker \ 64 | --max-requests $GUNICORN_MAX_REQUESTS \ 65 | --timeout $GUNICORN_SERVING_TIMEOUT \ 66 | --bind 0.0.0.0:$SERVING_PORT \ 67 | $GUNICORN_EXTRA_ARGS 68 | fi 69 | -------------------------------------------------------------------------------- /clearml_serving/serving/init.py: -------------------------------------------------------------------------------- 1 | import os 2 | from clearml import Task 3 | from clearml_serving.serving.model_request_processor import ModelRequestProcessor 4 | from clearml_serving.serving.preprocess_service import BasePreprocessRequest 5 | 6 | 7 | def setup_task(force_threaded_logging=None): 8 | serving_service_task_id = os.environ.get("CLEARML_SERVING_TASK_ID", None) 9 | inference_service_task_id = os.environ.get("CLEARML_INFERENCE_TASK_ID", False) # according Task.init() docs 10 | 11 | # always use background thread, it requires less memory 12 | if force_threaded_logging or os.environ.get("CLEARML_BKG_THREAD_REPORT") in ("1", "Y", "y", "true"): 13 | os.environ["CLEARML_BKG_THREAD_REPORT"] = "1" 14 | Task._report_subprocess_enabled = False 15 | 16 | # get the serving controller task 17 | # noinspection PyProtectedMember 18 | serving_task = ModelRequestProcessor._get_control_plane_task(task_id=serving_service_task_id) 19 | # set to running (because we are here) 20 | if serving_task.status != "in_progress": 21 | serving_task.started(force=True) 22 | 23 | # create a new serving instance (for visibility and monitoring) 24 | instance_task = Task.init( 25 | project_name=serving_task.get_project_name(), 26 | task_name="{} - serve instance".format(serving_task.name), 27 | task_type="inference", # noqa 28 | continue_last_task=inference_service_task_id, 29 | ) 30 | instance_task.set_system_tags(["service"]) 31 | # make sure we start logging thread/process 32 | instance_logger = instance_task.get_logger() # noqa 33 | # this will use the main thread/process 34 | session_logger = serving_task.get_logger() 35 | 36 | # preload modules into memory before forking 37 | BasePreprocessRequest.load_modules() 38 | 39 | return serving_service_task_id, session_logger, instance_task.id 40 | -------------------------------------------------------------------------------- /clearml_serving/serving/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shlex 3 | import traceback 4 | import gzip 5 | import asyncio 6 | 7 | from fastapi import FastAPI, Request, Response, 
APIRouter, HTTPException 8 | from fastapi.routing import APIRoute 9 | from fastapi.responses import PlainTextResponse 10 | from grpc.aio import AioRpcError 11 | 12 | from starlette.background import BackgroundTask 13 | 14 | from typing import Optional, Dict, Any, Callable, Union 15 | 16 | from clearml_serving.version import __version__ 17 | from clearml_serving.serving.init import setup_task 18 | from clearml_serving.serving.model_request_processor import ( 19 | ModelRequestProcessor, 20 | EndpointNotFoundException, 21 | EndpointBackendEngineException, 22 | EndpointModelLoadException, 23 | ServingInitializationException, 24 | ) 25 | from clearml_serving.serving.utils import parse_grpc_errors 26 | 27 | 28 | class GzipRequest(Request): 29 | async def body(self) -> bytes: 30 | if not hasattr(self, "_body"): 31 | body = await super().body() 32 | if "gzip" in self.headers.getlist("Content-Encoding"): 33 | body = gzip.decompress(body) 34 | self._body = body # noqa 35 | return self._body 36 | 37 | 38 | class GzipRoute(APIRoute): 39 | def get_route_handler(self) -> Callable: 40 | original_route_handler = super().get_route_handler() 41 | 42 | async def custom_route_handler(request: Request) -> Response: 43 | request = GzipRequest(request.scope, request.receive) 44 | return await original_route_handler(request) 45 | 46 | return custom_route_handler 47 | 48 | 49 | # process Lock, so that we can have only a single process doing the model reloading at a time 50 | singleton_sync_lock = None # Lock() 51 | # shared Model processor object 52 | processor = None # type: Optional[ModelRequestProcessor] 53 | 54 | # create clearml Task and load models 55 | serving_service_task_id, session_logger, instance_id = setup_task() 56 | # polling frequency 57 | model_sync_frequency_secs = 5 58 | try: 59 | model_sync_frequency_secs = float(os.environ.get("CLEARML_SERVING_POLL_FREQ", model_sync_frequency_secs)) 60 | except (ValueError, TypeError): 61 | pass 62 | 63 | 64 | grpc_aio_ignore_errors = parse_grpc_errors(shlex.split(os.environ.get("CLEARML_SERVING_AIO_RPC_IGNORE_ERRORS", ""))) 65 | grpc_aio_verbose_errors = parse_grpc_errors(shlex.split(os.environ.get("CLEARML_SERVING_AIO_RPC_VERBOSE_ERRORS", ""))) 66 | 67 | 68 | class CUDAException(Exception): 69 | def __init__(self, exception: str): 70 | self.exception = exception 71 | 72 | 73 | # start FastAPI app 74 | app = FastAPI(title="ClearML Serving Service", version=__version__, description="ClearML Service Service router") 75 | 76 | 77 | @app.on_event("startup") 78 | async def startup_event(): 79 | global processor 80 | 81 | if processor: 82 | print( 83 | "ModelRequestProcessor already initialized [pid={}] [service_id={}]".format( 84 | os.getpid(), serving_service_task_id 85 | ) 86 | ) 87 | else: 88 | print("Starting up ModelRequestProcessor [pid={}] [service_id={}]".format(os.getpid(), serving_service_task_id)) 89 | processor = ModelRequestProcessor( 90 | task_id=serving_service_task_id, 91 | update_lock_guard=singleton_sync_lock, 92 | ) 93 | print("ModelRequestProcessor [id={}] loaded".format(processor.get_id())) 94 | processor.launch(poll_frequency_sec=model_sync_frequency_secs * 60) 95 | 96 | 97 | @app.on_event("shutdown") 98 | def shutdown_event(): 99 | print("RESTARTING INFERENCE SERVICE!") 100 | 101 | 102 | async def exit_app(): 103 | loop = asyncio.get_running_loop() 104 | loop.stop() 105 | 106 | 107 | @app.exception_handler(CUDAException) 108 | async def cuda_exception_handler(request, exc): 109 | task = BackgroundTask(exit_app) 110 | return 
PlainTextResponse("CUDA out of memory. Restarting service", status_code=500, background=task) 111 | 112 | 113 | router = APIRouter( 114 | prefix="/serve", 115 | tags=["models"], 116 | responses={404: {"description": "Model Serving Endpoint Not found"}}, 117 | route_class=GzipRoute, # mark-out to remove support for GZip content encoding 118 | ) 119 | 120 | 121 | # cover all routing options for model version `/{model_id}`, `/{model_id}/123`, `/{model_id}?version=123` 122 | @router.post("/{model_id}/{version}") 123 | @router.post("/{model_id}/") 124 | @router.post("/{model_id}") 125 | async def serve_model(model_id: str, version: Optional[str] = None, request: Union[bytes, Dict[Any, Any]] = None): 126 | try: 127 | return_value = await processor.process_request(base_url=model_id, version=version, request_body=request) 128 | except EndpointNotFoundException as ex: 129 | raise HTTPException(status_code=404, detail="Error processing request, endpoint was not found: {}".format(ex)) 130 | except (EndpointModelLoadException, EndpointBackendEngineException) as ex: 131 | session_logger.report_text( 132 | "[{}] Exception [{}] {} while processing request: {}\n{}".format( 133 | instance_id, type(ex), ex, request, "".join(traceback.format_exc()) 134 | ) 135 | ) 136 | raise HTTPException(status_code=422, detail="Error [{}] processing request: {}".format(type(ex), ex)) 137 | except ServingInitializationException as ex: 138 | session_logger.report_text( 139 | "[{}] Exception [{}] {} while loading serving inference: {}\n{}".format( 140 | instance_id, type(ex), ex, request, "".join(traceback.format_exc()) 141 | ) 142 | ) 143 | raise HTTPException(status_code=500, detail="Error [{}] processing request: {}".format(type(ex), ex)) 144 | except ValueError as ex: 145 | session_logger.report_text( 146 | "[{}] Exception [{}] {} while processing request: {}\n{}".format( 147 | instance_id, type(ex), ex, request, "".join(traceback.format_exc()) 148 | ) 149 | ) 150 | if "CUDA out of memory. 
" in str(ex) or "NVML_SUCCESS == r INTERNAL ASSERT FAILED" in str(ex): 151 | raise CUDAException(exception=ex) 152 | else: 153 | raise HTTPException(status_code=422, detail="Error [{}] processing request: {}".format(type(ex), ex)) 154 | except AioRpcError as ex: 155 | if grpc_aio_verbose_errors and ex.code() in grpc_aio_verbose_errors: 156 | session_logger.report_text( 157 | "[{}] Exception [AioRpcError] {} while processing request: {}".format(instance_id, ex, request) 158 | ) 159 | elif not grpc_aio_ignore_errors or ex.code() not in grpc_aio_ignore_errors: 160 | session_logger.report_text("[{}] Exception [AioRpcError] status={} ".format(instance_id, ex.code())) 161 | raise HTTPException( 162 | status_code=500, detail="Error [AioRpcError] processing request: status={}".format(ex.code()) 163 | ) 164 | except Exception as ex: 165 | session_logger.report_text( 166 | "[{}] Exception [{}] {} while processing request: {}\n{}".format( 167 | instance_id, type(ex), ex, request, "".join(traceback.format_exc()) 168 | ) 169 | ) 170 | raise HTTPException(status_code=500, detail="Error [{}] processing request: {}".format(type(ex), ex)) 171 | return return_value 172 | 173 | 174 | app.include_router(router) 175 | -------------------------------------------------------------------------------- /clearml_serving/serving/requirements.txt: -------------------------------------------------------------------------------- 1 | clearml>=1.10.1,<2 2 | attrs>=20.3.0,<24 3 | fastapi[all]>=0.109.1,<0.111 4 | uvicorn[standard] 5 | gunicorn>=20.1.0,<20.2 6 | asyncio>=3.4.3,<3.5 ; python_version < '3.10' 7 | aiocache>=0.12,<0.13 8 | tritonclient[grpc]>=2.32,<2.33 9 | starlette 10 | numpy>=1.24,<1.27 11 | scikit-learn>=1.2.2,<1.3 12 | pandas>=1.5.3,<1.6 13 | grpcio 14 | referencing>=0.31.0 15 | Pillow>=10.0.1 16 | xgboost>=1.7.5,<1.8 17 | lightgbm>=3.3.2,<3.4 18 | requests>=2.31.0 19 | kafka-python>=2.0.2,<2.1 20 | lz4>=4.0.0,<5 21 | -------------------------------------------------------------------------------- /clearml_serving/serving/utils.py: -------------------------------------------------------------------------------- 1 | from typing import List, Set 2 | 3 | import grpc 4 | 5 | 6 | def parse_grpc_errors(errors: List[str]) -> Set[grpc.StatusCode]: 7 | try: 8 | typed_errors = { 9 | int(e) if e.isdigit() else e.lower().replace("-", " ").replace("_", " ") 10 | for e in errors 11 | } 12 | if len(typed_errors) == 1 and next(iter(typed_errors)) in ("true", "false"): 13 | return set(grpc.StatusCode if next(iter(typed_errors)) == "true" else []) 14 | return {e for e in grpc.StatusCode if typed_errors.intersection(e.value)} 15 | except (ValueError, TypeError): 16 | pass 17 | return set() 18 | -------------------------------------------------------------------------------- /clearml_serving/serving/uvicorn_mp_entrypoint.py: -------------------------------------------------------------------------------- 1 | import uvicorn 2 | from clearml_serving.serving.init import setup_task 3 | 4 | if __name__ == "__main__": 5 | setup_task(force_threaded_logging=True) 6 | uvicorn.main() 7 | -------------------------------------------------------------------------------- /clearml_serving/statistics/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.11-bullseye 2 | 3 | 4 | ENV LC_ALL=C.UTF-8 5 | 6 | # install base package 7 | RUN pip3 install --no-cache-dir clearml-serving 8 | 9 | # get latest execution code from the git repository 10 | # RUN cd $HOME && git clone 
https://github.com/allegroai/clearml-serving.git 11 | COPY clearml_serving /root/clearml/clearml_serving 12 | 13 | RUN pip3 install --no-cache-dir -r /root/clearml/clearml_serving/statistics/requirements.txt 14 | 15 | # default serving port 16 | EXPOSE 9999 17 | 18 | # environement variable to load Task from CLEARML_SERVING_TASK_ID, CLEARML_SERVING_PORT 19 | 20 | WORKDIR /root/clearml/ 21 | ENTRYPOINT ["clearml_serving/statistics/entrypoint.sh"] 22 | -------------------------------------------------------------------------------- /clearml_serving/statistics/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clearml/clearml-serving/1def0a6901617767687f2b747aaffdc060f96046/clearml_serving/statistics/__init__.py -------------------------------------------------------------------------------- /clearml_serving/statistics/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # print configuration 4 | echo CLEARML_SERVING_TASK_ID="$CLEARML_SERVING_TASK_ID" 5 | echo CLEARML_SERVING_PORT="$CLEARML_SERVING_PORT" 6 | echo CLEARML_EXTRA_PYTHON_PACKAGES="$CLEARML_EXTRA_PYTHON_PACKAGES" 7 | echo CLEARML_SERVING_POLL_FREQ="$CLEARML_SERVING_POLL_FREQ" 8 | echo CLEARML_DEFAULT_KAFKA_SERVE_URL="$CLEARML_DEFAULT_KAFKA_SERVE_URL" 9 | 10 | SERVING_PORT="${CLEARML_SERVING_PORT:-9999}" 11 | 12 | # set default internal serve endpoint (for request pipelining) 13 | CLEARML_DEFAULT_BASE_SERVE_URL="${CLEARML_DEFAULT_BASE_SERVE_URL:-http://127.0.0.1:$SERVING_PORT/serve}" 14 | CLEARML_DEFAULT_TRITON_GRPC_ADDR="${CLEARML_DEFAULT_TRITON_GRPC_ADDR:-127.0.0.1:8001}" 15 | 16 | # print configuration 17 | echo SERVING_PORT="$SERVING_PORT" 18 | 19 | # runtime add extra python packages 20 | if [ ! 
-z "$CLEARML_EXTRA_PYTHON_PACKAGES" ] 21 | then 22 | python3 -m pip install $CLEARML_EXTRA_PYTHON_PACKAGES 23 | fi 24 | 25 | echo "Starting Statistics Controller server" 26 | PYTHONPATH=$(pwd) python3 clearml_serving/statistics/main.py 27 | -------------------------------------------------------------------------------- /clearml_serving/statistics/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import prometheus_client 4 | from clearml import Task 5 | 6 | from clearml_serving.serving.model_request_processor import ModelRequestProcessor 7 | from clearml_serving.statistics.metrics import StatisticsController 8 | 9 | 10 | def main(): 11 | serving_service_task_id = os.environ.get("CLEARML_SERVING_TASK_ID", None) 12 | model_sync_frequency_secs = 5 13 | try: 14 | model_sync_frequency_secs = float(os.environ.get("CLEARML_SERVING_POLL_FREQ", model_sync_frequency_secs)) 15 | except (ValueError, TypeError): 16 | pass 17 | 18 | # noinspection PyProtectedMember 19 | serving_task = ModelRequestProcessor._get_control_plane_task(task_id=serving_service_task_id) 20 | # create a new serving instance (for visibility and monitoring) 21 | instance_task = Task.init( 22 | project_name=serving_task.get_project_name(), 23 | task_name="{} - statistics controller".format(serving_task.name), 24 | task_type="monitor", 25 | ) 26 | instance_task.set_system_tags(["service"]) 27 | # noinspection PyProtectedMember 28 | kafka_server_url = os.environ.get("CLEARML_DEFAULT_KAFKA_SERVE_URL", "localhost:9092") 29 | stats_controller = StatisticsController( 30 | task=instance_task, 31 | kafka_server_url=kafka_server_url, 32 | serving_id=serving_service_task_id, 33 | poll_frequency_min=model_sync_frequency_secs 34 | ) 35 | prometheus_client.start_http_server(int(os.environ.get("CLEARML_SERVING_PORT", 9999))) 36 | # we will never leave here 37 | stats_controller.start() 38 | 39 | 40 | if __name__ == '__main__': 41 | main() 42 | -------------------------------------------------------------------------------- /clearml_serving/statistics/metrics.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import re 4 | from copy import deepcopy 5 | from functools import partial 6 | from threading import Event, Thread 7 | from time import time, sleep 8 | 9 | from clearml import Task 10 | from typing import Optional, Dict, Any, Iterable, Set 11 | 12 | from prometheus_client import Histogram, Enum, Gauge, Counter, values 13 | from kafka import KafkaConsumer 14 | from prometheus_client.metrics import MetricWrapperBase, _validate_exemplar 15 | from prometheus_client.registry import REGISTRY 16 | from prometheus_client.samples import Exemplar, Sample 17 | from prometheus_client.context_managers import Timer 18 | from prometheus_client.utils import floatToGoString 19 | 20 | from ..serving.endpoints import EndpointMetricLogging 21 | from ..serving.model_request_processor import ModelRequestProcessor 22 | 23 | 24 | class ScalarHistogram(Histogram): 25 | 26 | def __init__(self, *args, **kwargs): 27 | super().__init__(*args, **kwargs) 28 | 29 | def observe(self, amount, exemplar=None): 30 | """Observe the given amount. 31 | 32 | The amount is usually positive or zero. Negative values are 33 | accepted but prevent current versions of Prometheus from 34 | properly detecting counter resets in the sum of 35 | observations. See 36 | https://prometheus.io/docs/practices/histograms/#count-and-sum-of-observations 37 | for details. 
38 | """ 39 | self._raise_if_not_observable() 40 | if not isinstance(amount, (list, tuple)): 41 | amount = [amount] 42 | self._sum.inc(len(amount)) 43 | for v in amount: 44 | for i, bound in enumerate(self._upper_bounds): 45 | if v <= bound: 46 | self._buckets[i].inc(1) 47 | if exemplar: 48 | _validate_exemplar(exemplar) 49 | self._buckets[i].set_exemplar(Exemplar(exemplar, v, time())) 50 | break 51 | 52 | def _child_samples(self) -> Iterable[Sample]: 53 | samples = [] 54 | for i, bound in enumerate(self._upper_bounds): 55 | acc = self._buckets[i].get() 56 | samples.append( 57 | Sample('_bucket', {'le': floatToGoString(bound)}, acc, None, self._buckets[i].get_exemplar()) 58 | ) 59 | samples.append(Sample('_sum', {'le': floatToGoString(bound)}, self._sum.get(), None, None)) 60 | 61 | return tuple(samples) 62 | 63 | 64 | class EnumHistogram(MetricWrapperBase): 65 | """A Histogram tracks the size and number of events in buckets. 66 | 67 | You can use Histograms for aggregatable calculation of quantiles. 68 | 69 | Example use cases: 70 | - Response latency 71 | - Request size 72 | 73 | Example for a Histogram: 74 | 75 | from prometheus_client import Histogram 76 | 77 | h = Histogram('request_size_bytes', 'Request size (bytes)') 78 | h.observe(512) # Observe 512 (bytes) 79 | 80 | Example for a Histogram using time: 81 | 82 | from prometheus_client import Histogram 83 | 84 | REQUEST_TIME = Histogram('response_latency_seconds', 'Response latency (seconds)') 85 | 86 | @REQUEST_TIME.time() 87 | def create_response(request): 88 | '''A dummy function''' 89 | time.sleep(1) 90 | 91 | Example of using the same Histogram object as a context manager: 92 | 93 | with REQUEST_TIME.time(): 94 | pass # Logic to be timed 95 | 96 | The default buckets are intended to cover a typical web/rpc request from milliseconds to seconds. 97 | They can be overridden by passing `buckets` keyword argument to `Histogram`. 98 | """ 99 | _type = 'histogram' 100 | 101 | def __init__(self, 102 | name, 103 | documentation, 104 | buckets, 105 | labelnames=(), 106 | namespace='', 107 | subsystem='', 108 | unit='', 109 | registry=REGISTRY, 110 | _labelvalues=None, 111 | ): 112 | self._prepare_buckets(buckets) 113 | super().__init__( 114 | name=name, 115 | documentation=documentation, 116 | labelnames=labelnames, 117 | namespace=namespace, 118 | subsystem=subsystem, 119 | unit=unit, 120 | registry=registry, 121 | _labelvalues=_labelvalues, 122 | ) 123 | self._kwargs['buckets'] = buckets 124 | 125 | def _prepare_buckets(self, buckets): 126 | buckets = [str(b) for b in buckets] 127 | if buckets != sorted(buckets): 128 | # This is probably an error on the part of the user, 129 | # so raise rather than sorting for them. 130 | raise ValueError('Buckets not in sorted order') 131 | 132 | if len(buckets) < 2: 133 | raise ValueError('Must have at least two buckets') 134 | self._upper_bounds = buckets 135 | 136 | def _metric_init(self): 137 | self._buckets = {} 138 | self._created = time() 139 | bucket_labelnames = self._upper_bounds 140 | self._sum = values.ValueClass( 141 | self._type, self._name, self._name + '_sum', self._labelnames, self._labelvalues) 142 | for b in self._upper_bounds: 143 | self._buckets[b] = values.ValueClass( 144 | self._type, 145 | self._name, 146 | self._name + '_bucket', 147 | bucket_labelnames, 148 | self._labelvalues + (b,)) 149 | 150 | def observe(self, amount, exemplar=None): 151 | """Observe the given amount. 152 | 153 | The amount is usually positive or zero. 
Negative values are 154 | accepted but prevent current versions of Prometheus from 155 | properly detecting counter resets in the sum of 156 | observations. See 157 | https://prometheus.io/docs/practices/histograms/#count-and-sum-of-observations 158 | for details. 159 | """ 160 | self._raise_if_not_observable() 161 | if not isinstance(amount, (list, tuple)): 162 | amount = [amount] 163 | self._sum.inc(len(amount)) 164 | for v in amount: 165 | self._buckets[v].inc(1) 166 | if exemplar: 167 | _validate_exemplar(exemplar) 168 | self._buckets[v].set_exemplar(Exemplar(exemplar, 1, time())) 169 | 170 | def time(self): 171 | """Time a block of code or function, and observe the duration in seconds. 172 | 173 | Can be used as a function decorator or context manager. 174 | """ 175 | return Timer(self, 'observe') 176 | 177 | def _child_samples(self) -> Iterable[Sample]: 178 | samples = [] 179 | for i in self._buckets: 180 | acc = self._buckets[i].get() 181 | samples.append(Sample( 182 | '_bucket', {'enum': i}, acc, None, self._buckets[i].get_exemplar())) 183 | samples.append(Sample('_sum', {'enum': i}, self._sum.get(), None, None)) 184 | 185 | return tuple(samples) 186 | 187 | 188 | class StatisticsController(object): 189 | _reserved = { 190 | '_latency': partial(ScalarHistogram, buckets=(.005, .01, .025, .05, .075, .1, .25, .5, .75, 1.0, 2.5, 5.0)), 191 | '_count': Counter 192 | } 193 | _metric_type_class = {"scalar": ScalarHistogram, "enum": EnumHistogram, "value": Gauge, "counter": Counter} 194 | 195 | def __init__( 196 | self, 197 | task: Task, 198 | kafka_server_url: str, 199 | serving_id: Optional[str], 200 | poll_frequency_min: float = 5 201 | ): 202 | self.task = task 203 | self._serving_service_task_id = serving_id 204 | self._poll_frequency_min = float(poll_frequency_min) 205 | self._serving_service = None # type: Optional[ModelRequestProcessor] 206 | self._current_endpoints = {} # type: Optional[Dict[str, EndpointMetricLogging]] 207 | self._auto_added_endpoints = set() # type: Set[str] 208 | self._prometheus_metrics = {} # type: Optional[Dict[str, Dict[str, MetricWrapperBase]]] 209 | self._timestamp = time() 210 | self._sync_thread = None 211 | self._last_sync_time = time() 212 | self._dirty = False 213 | self._sync_event = Event() 214 | self._sync_threshold_sec = 30 215 | self._kafka_server = kafka_server_url 216 | # noinspection PyProtectedMember 217 | self._kafka_topic = ModelRequestProcessor._kafka_topic 218 | 219 | def start(self): 220 | self._serving_service = ModelRequestProcessor(task_id=self._serving_service_task_id) 221 | 222 | if not self._sync_thread: 223 | self._sync_thread = Thread(target=self._sync_daemon, daemon=True) 224 | self._sync_thread.start() 225 | 226 | # noinspection PyProtectedMember 227 | kafka_server = \ 228 | self._serving_service.get_configuration().get(ModelRequestProcessor._config_key_kafka_stats) or \ 229 | self._kafka_server 230 | 231 | print("Starting Kafka Statistics processing: {}".format(kafka_server)) 232 | 233 | while True: 234 | try: 235 | consumer = KafkaConsumer(self._kafka_topic, bootstrap_servers=kafka_server) 236 | break 237 | except Exception as ex: 238 | print("Error: failed opening Kafka consumer [{}]: {}".format(kafka_server, ex)) 239 | print("Retrying in 30 seconds") 240 | sleep(30) 241 | 242 | # we will never leave this loop 243 | while True: 244 | # noinspection PyBroadException 245 | try: 246 | message = next(consumer) 247 | except Exception: 248 | print("Warning: failed to pull kafka consumer pipe") 249 | sleep(5) 250 | continue 251 | 
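            # each Kafka message is expected to hold a JSON-encoded list of per-request
            # stat dictionaries, each carrying a "_url" key identifying the serving
            # endpoint (popped below before the remaining values are reported)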
252 | # noinspection PyBroadException 253 | try: 254 | list_data = json.loads(message.value.decode("utf-8")) 255 | except Exception: 256 | print("Warning: failed to decode kafka stats message") 257 | continue 258 | 259 | for data in list_data: 260 | try: 261 | url = data.pop("_url", None) 262 | if not url: 263 | # should not happen 264 | continue 265 | endpoint_metric = self._current_endpoints.get(url) 266 | if not endpoint_metric: 267 | # add default one, we will just log the reserved valued: 268 | endpoint_metric = dict() 269 | self._current_endpoints[url] = EndpointMetricLogging(endpoint=url) 270 | self._auto_added_endpoints.add(url) 271 | # we should sync, 272 | if time()-self._last_sync_time > self._sync_threshold_sec: 273 | self._last_sync_time = time() 274 | self._sync_event.set() 275 | 276 | metric_url_log = self._prometheus_metrics.get(url) 277 | if not metric_url_log: 278 | # create a new one 279 | metric_url_log = dict() 280 | self._prometheus_metrics[url] = metric_url_log 281 | 282 | # check if we have the prometheus_logger 283 | for k, v in data.items(): 284 | prometheus_logger = metric_url_log.get(k) 285 | if not prometheus_logger: 286 | prometheus_logger = self._create_prometheus_logger_class(url, k, endpoint_metric) 287 | if not prometheus_logger: 288 | continue 289 | metric_url_log[k] = prometheus_logger 290 | 291 | self._report_value(prometheus_logger, v) 292 | 293 | except Exception as ex: 294 | print("Warning: failed to report stat to Prometheus: {}".format(ex)) 295 | continue 296 | 297 | @staticmethod 298 | def _report_value(prometheus_logger: Optional[MetricWrapperBase], v: Any) -> bool: 299 | if not prometheus_logger: 300 | # this means no one configured the variable to log 301 | return False 302 | elif isinstance(prometheus_logger, (Histogram, EnumHistogram)): 303 | prometheus_logger.observe(amount=v) 304 | elif isinstance(prometheus_logger, Gauge): 305 | prometheus_logger.set(value=v) 306 | elif isinstance(prometheus_logger, Counter): 307 | prometheus_logger.inc(amount=v) 308 | elif isinstance(prometheus_logger, Enum): 309 | prometheus_logger.state(state=v) 310 | else: 311 | # we should not get here 312 | return False 313 | 314 | return True 315 | 316 | def _create_prometheus_logger_class( 317 | self, 318 | url: str, 319 | variable_name: str, 320 | endpoint_config: EndpointMetricLogging 321 | ) -> Optional[MetricWrapperBase]: 322 | reserved_cls = self._reserved.get(variable_name) 323 | name = "{}:{}".format(url, variable_name) 324 | name = re.sub(r"[^(a-zA-Z0-9_:)]", "_", name) 325 | if reserved_cls: 326 | return reserved_cls(name=name, documentation="Built in {}".format(variable_name)) 327 | 328 | if not endpoint_config: 329 | # we should not end up here 330 | return None 331 | 332 | metric_ = endpoint_config.metrics.get(variable_name) 333 | if not metric_: 334 | return None 335 | metric_cls = self._metric_type_class.get(metric_.type) 336 | if not metric_cls: 337 | return None 338 | if metric_cls in (ScalarHistogram, EnumHistogram): 339 | return metric_cls( 340 | name=name, 341 | documentation="User defined metric {}".format(metric_.type), 342 | buckets=metric_.buckets 343 | ) 344 | return metric_cls(name=name, documentation="User defined metric {}".format(metric_.type)) 345 | 346 | def _sync_daemon(self): 347 | self._last_sync_time = time() 348 | poll_freq_sec = self._poll_frequency_min*60 349 | print("Instance [{}, pid={}]: Launching - configuration sync every {} sec".format( 350 | self.task.id, os.getpid(), poll_freq_sec)) 351 | while True: 352 | try: 353 | 
self._serving_service.reload() 354 | endpoint_metrics = self._serving_service.list_endpoint_logging() 355 | self._last_sync_time = time() 356 | # we might have added new urls (auto metric logging), we need to compare only configured keys 357 | current_endpoints = { 358 | k: v for k, v in self._current_endpoints.items() 359 | if k not in self._auto_added_endpoints} 360 | if current_endpoints == endpoint_metrics: 361 | self._sync_event.wait(timeout=poll_freq_sec) 362 | self._sync_event.clear() 363 | continue 364 | 365 | # update metrics: 366 | self._dirty = True 367 | self._auto_added_endpoints -= set(endpoint_metrics.keys()) 368 | # merge top level configuration (we might have auto logged url endpoints) 369 | self._current_endpoints.update(deepcopy(endpoint_metrics)) 370 | print("New configuration synced") 371 | except Exception as ex: 372 | print("Warning: failed to sync state from serving service Task: {}".format(ex)) 373 | continue 374 | -------------------------------------------------------------------------------- /clearml_serving/statistics/requirements.txt: -------------------------------------------------------------------------------- 1 | clearml>=1.3.1 2 | numpy>=1.20,<1.24 3 | requests>=2.31.0 4 | kafka-python>=2.0.2,<2.1 5 | prometheus_client>=0.13.1,<0.14 6 | lz4>=4.0.0,<5 7 | pillow>=10.0.1 # not directly required, pinned by Snyk to avoid a vulnerability 8 | -------------------------------------------------------------------------------- /clearml_serving/version.py: -------------------------------------------------------------------------------- 1 | __version__ = '1.3.2' 2 | -------------------------------------------------------------------------------- /docker/datasource.yml: -------------------------------------------------------------------------------- 1 | apiVersion: 1 2 | 3 | datasources: 4 | - name: Prometheus 5 | type: prometheus 6 | # Access mode - proxy (server in the UI) or direct (browser in the UI). 
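    # proxy mode means the Grafana backend issues the query, so the Prometheus container name below resolves over the shared docker network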
7 | access: proxy 8 | url: http://clearml-serving-prometheus:9090 9 | -------------------------------------------------------------------------------- /docker/docker-compose-triton-gpu.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | 3 | services: 4 | zookeeper: 5 | image: bitnami/zookeeper:3.7.0 6 | container_name: clearml-serving-zookeeper 7 | # ports: 8 | # - "2181:2181" 9 | environment: 10 | - ALLOW_ANONYMOUS_LOGIN=yes 11 | networks: 12 | - clearml-serving-backend 13 | 14 | kafka: 15 | image: bitnami/kafka:3.1.1 16 | container_name: clearml-serving-kafka 17 | # ports: 18 | # - "9092:9092" 19 | environment: 20 | - KAFKA_BROKER_ID=1 21 | - KAFKA_CFG_LISTENERS=PLAINTEXT://clearml-serving-kafka:9092 22 | - KAFKA_CFG_ADVERTISED_LISTENERS=PLAINTEXT://clearml-serving-kafka:9092 23 | - KAFKA_CFG_ZOOKEEPER_CONNECT=clearml-serving-zookeeper:2181 24 | - ALLOW_PLAINTEXT_LISTENER=yes 25 | - KAFKA_CREATE_TOPICS="topic_test:1:1" 26 | depends_on: 27 | - zookeeper 28 | networks: 29 | - clearml-serving-backend 30 | 31 | prometheus: 32 | image: prom/prometheus:v2.34.0 33 | container_name: clearml-serving-prometheus 34 | volumes: 35 | - ./prometheus.yml:/prometheus.yml 36 | command: 37 | - '--config.file=/prometheus.yml' 38 | - '--storage.tsdb.path=/prometheus' 39 | - '--web.console.libraries=/etc/prometheus/console_libraries' 40 | - '--web.console.templates=/etc/prometheus/consoles' 41 | - '--storage.tsdb.retention.time=200h' 42 | - '--web.enable-lifecycle' 43 | restart: unless-stopped 44 | # ports: 45 | # - "9090:9090" 46 | depends_on: 47 | - clearml-serving-statistics 48 | networks: 49 | - clearml-serving-backend 50 | 51 | alertmanager: 52 | image: prom/alertmanager:v0.23.0 53 | container_name: clearml-serving-alertmanager 54 | restart: unless-stopped 55 | # ports: 56 | # - "9093:9093" 57 | depends_on: 58 | - prometheus 59 | - grafana 60 | networks: 61 | - clearml-serving-backend 62 | 63 | grafana: 64 | image: grafana/grafana:8.4.4-ubuntu 65 | container_name: clearml-serving-grafana 66 | volumes: 67 | - './datasource.yml:/etc/grafana/provisioning/datasources/datasource.yaml' 68 | restart: unless-stopped 69 | ports: 70 | - "3000:3000" 71 | depends_on: 72 | - prometheus 73 | networks: 74 | - clearml-serving-backend 75 | 76 | 77 | clearml-serving-inference: 78 | image: allegroai/clearml-serving-inference:latest 79 | container_name: clearml-serving-inference 80 | restart: unless-stopped 81 | # optimize perforamnce 82 | security_opt: 83 | - seccomp:unconfined 84 | ports: 85 | - "8080:8080" 86 | environment: 87 | CLEARML_WEB_HOST: ${CLEARML_WEB_HOST:-https://app.clear.ml} 88 | CLEARML_API_HOST: ${CLEARML_API_HOST:-https://api.clear.ml} 89 | CLEARML_FILES_HOST: ${CLEARML_FILES_HOST:-https://files.clear.ml} 90 | CLEARML_API_ACCESS_KEY: ${CLEARML_API_ACCESS_KEY} 91 | CLEARML_API_SECRET_KEY: ${CLEARML_API_SECRET_KEY} 92 | CLEARML_SERVING_TASK_ID: ${CLEARML_SERVING_TASK_ID:-} 93 | CLEARML_SERVING_PORT: ${CLEARML_SERVING_PORT:-8080} 94 | CLEARML_SERVING_POLL_FREQ: ${CLEARML_SERVING_POLL_FREQ:-1.0} 95 | CLEARML_DEFAULT_BASE_SERVE_URL: ${CLEARML_DEFAULT_BASE_SERVE_URL:-http://127.0.0.1:8080/serve} 96 | CLEARML_DEFAULT_KAFKA_SERVE_URL: ${CLEARML_DEFAULT_KAFKA_SERVE_URL:-clearml-serving-kafka:9092} 97 | CLEARML_DEFAULT_TRITON_GRPC_ADDR: ${CLEARML_DEFAULT_TRITON_GRPC_ADDR:-clearml-serving-triton:8001} 98 | CLEARML_USE_GUNICORN: ${CLEARML_USE_GUNICORN:-} 99 | CLEARML_SERVING_NUM_PROCESS: ${CLEARML_SERVING_NUM_PROCESS:-} 100 | CLEARML_EXTRA_PYTHON_PACKAGES: 
${CLEARML_EXTRA_PYTHON_PACKAGES:-} 101 | AWS_ACCESS_KEY_ID: ${AWS_ACCESS_KEY_ID:-} 102 | AWS_SECRET_ACCESS_KEY: ${AWS_SECRET_ACCESS_KEY:-} 103 | AWS_DEFAULT_REGION: ${AWS_DEFAULT_REGION:-} 104 | GOOGLE_APPLICATION_CREDENTIALS: ${GOOGLE_APPLICATION_CREDENTIALS:-} 105 | AZURE_STORAGE_ACCOUNT: ${AZURE_STORAGE_ACCOUNT:-} 106 | AZURE_STORAGE_KEY: ${AZURE_STORAGE_KEY:-} 107 | depends_on: 108 | - kafka 109 | - clearml-serving-triton 110 | networks: 111 | - clearml-serving-backend 112 | 113 | clearml-serving-triton: 114 | image: allegroai/clearml-serving-triton:latest 115 | container_name: clearml-serving-triton 116 | restart: unless-stopped 117 | # optimize perforamnce 118 | security_opt: 119 | - seccomp:unconfined 120 | # ports: 121 | # - "8001:8001" 122 | environment: 123 | CLEARML_WEB_HOST: ${CLEARML_WEB_HOST:-https://app.clear.ml} 124 | CLEARML_API_HOST: ${CLEARML_API_HOST:-https://api.clear.ml} 125 | CLEARML_FILES_HOST: ${CLEARML_FILES_HOST:-https://files.clear.ml} 126 | CLEARML_API_ACCESS_KEY: ${CLEARML_API_ACCESS_KEY} 127 | CLEARML_API_SECRET_KEY: ${CLEARML_API_SECRET_KEY} 128 | CLEARML_SERVING_TASK_ID: ${CLEARML_SERVING_TASK_ID:-} 129 | CLEARML_TRITON_POLL_FREQ: ${CLEARML_TRITON_POLL_FREQ:-1.0} 130 | CLEARML_TRITON_METRIC_FREQ: ${CLEARML_TRITON_METRIC_FREQ:-1.0} 131 | CLEARML_EXTRA_PYTHON_PACKAGES: ${CLEARML_EXTRA_PYTHON_PACKAGES:-} 132 | AWS_ACCESS_KEY_ID: ${AWS_ACCESS_KEY_ID:-} 133 | AWS_SECRET_ACCESS_KEY: ${AWS_SECRET_ACCESS_KEY:-} 134 | AWS_DEFAULT_REGION: ${AWS_DEFAULT_REGION:-} 135 | GOOGLE_APPLICATION_CREDENTIALS: ${GOOGLE_APPLICATION_CREDENTIALS:-} 136 | AZURE_STORAGE_ACCOUNT: ${AZURE_STORAGE_ACCOUNT:-} 137 | AZURE_STORAGE_KEY: ${AZURE_STORAGE_KEY:-} 138 | depends_on: 139 | - kafka 140 | networks: 141 | - clearml-serving-backend 142 | deploy: 143 | resources: 144 | reservations: 145 | devices: 146 | - capabilities: [gpu] 147 | 148 | clearml-serving-statistics: 149 | image: allegroai/clearml-serving-statistics:latest 150 | container_name: clearml-serving-statistics 151 | restart: unless-stopped 152 | # optimize perforamnce 153 | security_opt: 154 | - seccomp:unconfined 155 | # ports: 156 | # - "9999:9999" 157 | environment: 158 | CLEARML_WEB_HOST: ${CLEARML_WEB_HOST:-https://app.clear.ml} 159 | CLEARML_API_HOST: ${CLEARML_API_HOST:-https://api.clear.ml} 160 | CLEARML_FILES_HOST: ${CLEARML_FILES_HOST:-https://files.clear.ml} 161 | CLEARML_API_ACCESS_KEY: ${CLEARML_API_ACCESS_KEY} 162 | CLEARML_API_SECRET_KEY: ${CLEARML_API_SECRET_KEY} 163 | CLEARML_SERVING_TASK_ID: ${CLEARML_SERVING_TASK_ID:-} 164 | CLEARML_DEFAULT_KAFKA_SERVE_URL: ${CLEARML_DEFAULT_KAFKA_SERVE_URL:-clearml-serving-kafka:9092} 165 | CLEARML_SERVING_POLL_FREQ: ${CLEARML_SERVING_POLL_FREQ:-1.0} 166 | depends_on: 167 | - kafka 168 | networks: 169 | - clearml-serving-backend 170 | 171 | 172 | networks: 173 | clearml-serving-backend: 174 | driver: bridge 175 | -------------------------------------------------------------------------------- /docker/docker-compose-triton.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | 3 | services: 4 | zookeeper: 5 | image: bitnami/zookeeper:3.7.0 6 | container_name: clearml-serving-zookeeper 7 | # ports: 8 | # - "2181:2181" 9 | environment: 10 | - ALLOW_ANONYMOUS_LOGIN=yes 11 | networks: 12 | - clearml-serving-backend 13 | 14 | kafka: 15 | image: bitnami/kafka:3.1.1 16 | container_name: clearml-serving-kafka 17 | # ports: 18 | # - "9092:9092" 19 | environment: 20 | - KAFKA_BROKER_ID=1 21 | - 
KAFKA_CFG_LISTENERS=PLAINTEXT://clearml-serving-kafka:9092 22 | - KAFKA_CFG_ADVERTISED_LISTENERS=PLAINTEXT://clearml-serving-kafka:9092 23 | - KAFKA_CFG_ZOOKEEPER_CONNECT=clearml-serving-zookeeper:2181 24 | - ALLOW_PLAINTEXT_LISTENER=yes 25 | - KAFKA_CREATE_TOPICS="topic_test:1:1" 26 | depends_on: 27 | - zookeeper 28 | networks: 29 | - clearml-serving-backend 30 | 31 | prometheus: 32 | image: prom/prometheus:v2.34.0 33 | container_name: clearml-serving-prometheus 34 | volumes: 35 | - ./prometheus.yml:/prometheus.yml 36 | command: 37 | - '--config.file=/prometheus.yml' 38 | - '--storage.tsdb.path=/prometheus' 39 | - '--web.console.libraries=/etc/prometheus/console_libraries' 40 | - '--web.console.templates=/etc/prometheus/consoles' 41 | - '--storage.tsdb.retention.time=200h' 42 | - '--web.enable-lifecycle' 43 | restart: unless-stopped 44 | # ports: 45 | # - "9090:9090" 46 | depends_on: 47 | - clearml-serving-statistics 48 | networks: 49 | - clearml-serving-backend 50 | 51 | alertmanager: 52 | image: prom/alertmanager:v0.23.0 53 | container_name: clearml-serving-alertmanager 54 | restart: unless-stopped 55 | # ports: 56 | # - "9093:9093" 57 | depends_on: 58 | - prometheus 59 | - grafana 60 | networks: 61 | - clearml-serving-backend 62 | 63 | grafana: 64 | image: grafana/grafana:8.4.4-ubuntu 65 | container_name: clearml-serving-grafana 66 | volumes: 67 | - './datasource.yml:/etc/grafana/provisioning/datasources/datasource.yaml' 68 | restart: unless-stopped 69 | ports: 70 | - "3000:3000" 71 | depends_on: 72 | - prometheus 73 | networks: 74 | - clearml-serving-backend 75 | 76 | 77 | clearml-serving-inference: 78 | image: allegroai/clearml-serving-inference:latest 79 | container_name: clearml-serving-inference 80 | restart: unless-stopped 81 | # optimize perforamnce 82 | security_opt: 83 | - seccomp:unconfined 84 | ports: 85 | - "8080:8080" 86 | environment: 87 | CLEARML_WEB_HOST: ${CLEARML_WEB_HOST:-https://app.clear.ml} 88 | CLEARML_API_HOST: ${CLEARML_API_HOST:-https://api.clear.ml} 89 | CLEARML_FILES_HOST: ${CLEARML_FILES_HOST:-https://files.clear.ml} 90 | CLEARML_API_ACCESS_KEY: ${CLEARML_API_ACCESS_KEY} 91 | CLEARML_API_SECRET_KEY: ${CLEARML_API_SECRET_KEY} 92 | CLEARML_SERVING_TASK_ID: ${CLEARML_SERVING_TASK_ID:-} 93 | CLEARML_SERVING_PORT: ${CLEARML_SERVING_PORT:-8080} 94 | CLEARML_SERVING_POLL_FREQ: ${CLEARML_SERVING_POLL_FREQ:-1.0} 95 | CLEARML_DEFAULT_BASE_SERVE_URL: ${CLEARML_DEFAULT_BASE_SERVE_URL:-http://127.0.0.1:8080/serve} 96 | CLEARML_DEFAULT_KAFKA_SERVE_URL: ${CLEARML_DEFAULT_KAFKA_SERVE_URL:-clearml-serving-kafka:9092} 97 | CLEARML_DEFAULT_TRITON_GRPC_ADDR: ${CLEARML_DEFAULT_TRITON_GRPC_ADDR:-clearml-serving-triton:8001} 98 | CLEARML_USE_GUNICORN: ${CLEARML_USE_GUNICORN:-} 99 | CLEARML_SERVING_NUM_PROCESS: ${CLEARML_SERVING_NUM_PROCESS:-} 100 | CLEARML_EXTRA_PYTHON_PACKAGES: ${CLEARML_EXTRA_PYTHON_PACKAGES:-} 101 | AWS_ACCESS_KEY_ID: ${AWS_ACCESS_KEY_ID:-} 102 | AWS_SECRET_ACCESS_KEY: ${AWS_SECRET_ACCESS_KEY:-} 103 | AWS_DEFAULT_REGION: ${AWS_DEFAULT_REGION:-} 104 | GOOGLE_APPLICATION_CREDENTIALS: ${GOOGLE_APPLICATION_CREDENTIALS:-} 105 | AZURE_STORAGE_ACCOUNT: ${AZURE_STORAGE_ACCOUNT:-} 106 | AZURE_STORAGE_KEY: ${AZURE_STORAGE_KEY:-} 107 | depends_on: 108 | - kafka 109 | - clearml-serving-triton 110 | networks: 111 | - clearml-serving-backend 112 | 113 | clearml-serving-triton: 114 | image: allegroai/clearml-serving-triton:latest 115 | container_name: clearml-serving-triton 116 | restart: unless-stopped 117 | # optimize perforamnce 118 | security_opt: 119 | - 
seccomp:unconfined 120 | # ports: 121 | # - "8001:8001" 122 | environment: 123 | CLEARML_WEB_HOST: ${CLEARML_WEB_HOST:-https://app.clear.ml} 124 | CLEARML_API_HOST: ${CLEARML_API_HOST:-https://api.clear.ml} 125 | CLEARML_FILES_HOST: ${CLEARML_FILES_HOST:-https://files.clear.ml} 126 | CLEARML_API_ACCESS_KEY: ${CLEARML_API_ACCESS_KEY} 127 | CLEARML_API_SECRET_KEY: ${CLEARML_API_SECRET_KEY} 128 | CLEARML_SERVING_TASK_ID: ${CLEARML_SERVING_TASK_ID:-} 129 | CLEARML_TRITON_POLL_FREQ: ${CLEARML_TRITON_POLL_FREQ:-1.0} 130 | CLEARML_TRITON_METRIC_FREQ: ${CLEARML_TRITON_METRIC_FREQ:-1.0} 131 | CLEARML_EXTRA_PYTHON_PACKAGES: ${CLEARML_EXTRA_PYTHON_PACKAGES:-} 132 | AWS_ACCESS_KEY_ID: ${AWS_ACCESS_KEY_ID:-} 133 | AWS_SECRET_ACCESS_KEY: ${AWS_SECRET_ACCESS_KEY:-} 134 | AWS_DEFAULT_REGION: ${AWS_DEFAULT_REGION:-} 135 | GOOGLE_APPLICATION_CREDENTIALS: ${GOOGLE_APPLICATION_CREDENTIALS:-} 136 | AZURE_STORAGE_ACCOUNT: ${AZURE_STORAGE_ACCOUNT:-} 137 | AZURE_STORAGE_KEY: ${AZURE_STORAGE_KEY:-} 138 | depends_on: 139 | - kafka 140 | networks: 141 | - clearml-serving-backend 142 | 143 | clearml-serving-statistics: 144 | image: allegroai/clearml-serving-statistics:latest 145 | container_name: clearml-serving-statistics 146 | restart: unless-stopped 147 | # optimize perforamnce 148 | security_opt: 149 | - seccomp:unconfined 150 | # ports: 151 | # - "9999:9999" 152 | environment: 153 | CLEARML_WEB_HOST: ${CLEARML_WEB_HOST:-https://app.clear.ml} 154 | CLEARML_API_HOST: ${CLEARML_API_HOST:-https://api.clear.ml} 155 | CLEARML_FILES_HOST: ${CLEARML_FILES_HOST:-https://files.clear.ml} 156 | CLEARML_API_ACCESS_KEY: ${CLEARML_API_ACCESS_KEY} 157 | CLEARML_API_SECRET_KEY: ${CLEARML_API_SECRET_KEY} 158 | CLEARML_SERVING_TASK_ID: ${CLEARML_SERVING_TASK_ID:-} 159 | CLEARML_DEFAULT_KAFKA_SERVE_URL: ${CLEARML_DEFAULT_KAFKA_SERVE_URL:-clearml-serving-kafka:9092} 160 | CLEARML_SERVING_POLL_FREQ: ${CLEARML_SERVING_POLL_FREQ:-1.0} 161 | depends_on: 162 | - kafka 163 | networks: 164 | - clearml-serving-backend 165 | 166 | 167 | networks: 168 | clearml-serving-backend: 169 | driver: bridge 170 | -------------------------------------------------------------------------------- /docker/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | 3 | services: 4 | zookeeper: 5 | image: bitnami/zookeeper:3.7.0 6 | container_name: clearml-serving-zookeeper 7 | # ports: 8 | # - "2181:2181" 9 | environment: 10 | - ALLOW_ANONYMOUS_LOGIN=yes 11 | networks: 12 | - clearml-serving-backend 13 | 14 | kafka: 15 | image: bitnami/kafka:3.1.1 16 | container_name: clearml-serving-kafka 17 | # ports: 18 | # - "9092:9092" 19 | environment: 20 | - KAFKA_BROKER_ID=1 21 | - KAFKA_CFG_LISTENERS=PLAINTEXT://clearml-serving-kafka:9092 22 | - KAFKA_CFG_ADVERTISED_LISTENERS=PLAINTEXT://clearml-serving-kafka:9092 23 | - KAFKA_CFG_ZOOKEEPER_CONNECT=clearml-serving-zookeeper:2181 24 | - ALLOW_PLAINTEXT_LISTENER=yes 25 | - KAFKA_CREATE_TOPICS="topic_test:1:1" 26 | depends_on: 27 | - zookeeper 28 | networks: 29 | - clearml-serving-backend 30 | 31 | prometheus: 32 | image: prom/prometheus:v2.34.0 33 | container_name: clearml-serving-prometheus 34 | volumes: 35 | - ./prometheus.yml:/prometheus.yml 36 | command: 37 | - '--config.file=/prometheus.yml' 38 | - '--storage.tsdb.path=/prometheus' 39 | - '--web.console.libraries=/etc/prometheus/console_libraries' 40 | - '--web.console.templates=/etc/prometheus/consoles' 41 | - '--storage.tsdb.retention.time=200h' 42 | - '--web.enable-lifecycle' 43 | restart: 
unless-stopped 44 | # ports: 45 | # - "9090:9090" 46 | depends_on: 47 | - clearml-serving-statistics 48 | networks: 49 | - clearml-serving-backend 50 | 51 | alertmanager: 52 | image: prom/alertmanager:v0.23.0 53 | container_name: clearml-serving-alertmanager 54 | restart: unless-stopped 55 | # ports: 56 | # - "9093:9093" 57 | depends_on: 58 | - prometheus 59 | - grafana 60 | networks: 61 | - clearml-serving-backend 62 | 63 | grafana: 64 | image: grafana/grafana:8.4.4-ubuntu 65 | container_name: clearml-serving-grafana 66 | volumes: 67 | - './datasource.yml:/etc/grafana/provisioning/datasources/datasource.yaml' 68 | restart: unless-stopped 69 | ports: 70 | - "3000:3000" 71 | depends_on: 72 | - prometheus 73 | networks: 74 | - clearml-serving-backend 75 | 76 | 77 | clearml-serving-inference: 78 | image: allegroai/clearml-serving-inference:latest 79 | container_name: clearml-serving-inference 80 | restart: unless-stopped 81 | # optimize perforamnce 82 | security_opt: 83 | - seccomp:unconfined 84 | ports: 85 | - "8080:8080" 86 | environment: 87 | CLEARML_WEB_HOST: ${CLEARML_WEB_HOST:-https://app.clear.ml} 88 | CLEARML_API_HOST: ${CLEARML_API_HOST:-https://api.clear.ml} 89 | CLEARML_FILES_HOST: ${CLEARML_FILES_HOST:-https://files.clear.ml} 90 | CLEARML_API_ACCESS_KEY: ${CLEARML_API_ACCESS_KEY} 91 | CLEARML_API_SECRET_KEY: ${CLEARML_API_SECRET_KEY} 92 | CLEARML_SERVING_TASK_ID: ${CLEARML_SERVING_TASK_ID:-} 93 | CLEARML_SERVING_PORT: ${CLEARML_SERVING_PORT:-8080} 94 | CLEARML_SERVING_POLL_FREQ: ${CLEARML_SERVING_POLL_FREQ:-1.0} 95 | CLEARML_DEFAULT_BASE_SERVE_URL: ${CLEARML_DEFAULT_BASE_SERVE_URL:-http://127.0.0.1:8080/serve} 96 | CLEARML_DEFAULT_KAFKA_SERVE_URL: ${CLEARML_DEFAULT_KAFKA_SERVE_URL:-clearml-serving-kafka:9092} 97 | CLEARML_DEFAULT_TRITON_GRPC_ADDR: ${CLEARML_DEFAULT_TRITON_GRPC_ADDR:-} 98 | CLEARML_USE_GUNICORN: ${CLEARML_USE_GUNICORN:-} 99 | CLEARML_SERVING_NUM_PROCESS: ${CLEARML_SERVING_NUM_PROCESS:-} 100 | CLEARML_EXTRA_PYTHON_PACKAGES: ${CLEARML_EXTRA_PYTHON_PACKAGES:-} 101 | AWS_ACCESS_KEY_ID: ${AWS_ACCESS_KEY_ID:-} 102 | AWS_SECRET_ACCESS_KEY: ${AWS_SECRET_ACCESS_KEY:-} 103 | AWS_DEFAULT_REGION: ${AWS_DEFAULT_REGION:-} 104 | GOOGLE_APPLICATION_CREDENTIALS: ${GOOGLE_APPLICATION_CREDENTIALS:-} 105 | AZURE_STORAGE_ACCOUNT: ${AZURE_STORAGE_ACCOUNT:-} 106 | AZURE_STORAGE_KEY: ${AZURE_STORAGE_KEY:-} 107 | depends_on: 108 | - kafka 109 | networks: 110 | - clearml-serving-backend 111 | 112 | clearml-serving-statistics: 113 | image: allegroai/clearml-serving-statistics:latest 114 | container_name: clearml-serving-statistics 115 | restart: unless-stopped 116 | # optimize perforamnce 117 | security_opt: 118 | - seccomp:unconfined 119 | # ports: 120 | # - "9999:9999" 121 | environment: 122 | CLEARML_WEB_HOST: ${CLEARML_WEB_HOST:-https://app.clear.ml} 123 | CLEARML_API_HOST: ${CLEARML_API_HOST:-https://api.clear.ml} 124 | CLEARML_FILES_HOST: ${CLEARML_FILES_HOST:-https://files.clear.ml} 125 | CLEARML_API_ACCESS_KEY: ${CLEARML_API_ACCESS_KEY} 126 | CLEARML_API_SECRET_KEY: ${CLEARML_API_SECRET_KEY} 127 | CLEARML_SERVING_TASK_ID: ${CLEARML_SERVING_TASK_ID:-} 128 | CLEARML_DEFAULT_KAFKA_SERVE_URL: ${CLEARML_DEFAULT_KAFKA_SERVE_URL:-clearml-serving-kafka:9092} 129 | CLEARML_SERVING_POLL_FREQ: ${CLEARML_SERVING_POLL_FREQ:-1.0} 130 | depends_on: 131 | - kafka 132 | networks: 133 | - clearml-serving-backend 134 | 135 | 136 | networks: 137 | clearml-serving-backend: 138 | driver: bridge 139 | -------------------------------------------------------------------------------- /docker/example.env: 
-------------------------------------------------------------------------------- 1 | CLEARML_WEB_HOST="https://app.clear.ml" 2 | CLEARML_API_HOST="https://api.clear.ml" 3 | CLEARML_FILES_HOST="https://files.clear.ml" 4 | CLEARML_API_ACCESS_KEY="" 5 | CLEARML_API_SECRET_KEY="" 6 | CLEARML_SERVING_TASK_ID="" 7 | -------------------------------------------------------------------------------- /docker/prometheus.yml: -------------------------------------------------------------------------------- 1 | global: 2 | scrape_interval: 15s # By default, scrape targets every 15 seconds. 3 | evaluation_interval: 15s # By default, scrape targets every 15 seconds. 4 | external_labels: 5 | monitor: 'clearml-serving' 6 | 7 | scrape_configs: 8 | # The job name is added as a label `job=` to any timeseries scraped from this config. 9 | - job_name: 'prometheus' 10 | 11 | scrape_interval: 5s 12 | 13 | static_configs: 14 | - targets: ['localhost:9090'] 15 | 16 | # The job name is added as a label `job=` to any timeseries scraped from this config. 17 | - job_name: 'clearml-inference-stats' 18 | 19 | scrape_interval: 5s 20 | 21 | static_configs: 22 | - targets: ['clearml-serving-statistics:9999'] 23 | -------------------------------------------------------------------------------- /docs/design_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clearml/clearml-serving/1def0a6901617767687f2b747aaffdc060f96046/docs/design_diagram.png -------------------------------------------------------------------------------- /docs/grafana_screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clearml/clearml-serving/1def0a6901617767687f2b747aaffdc060f96046/docs/grafana_screenshot.png -------------------------------------------------------------------------------- /docs/webapp_screenshots.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clearml/clearml-serving/1def0a6901617767687f2b747aaffdc060f96046/docs/webapp_screenshots.gif -------------------------------------------------------------------------------- /examples/custom/preprocess.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Callable, Optional 2 | 3 | import joblib 4 | import numpy as np 5 | 6 | 7 | # Notice Preprocess class Must be named "Preprocess" 8 | class Preprocess(object): 9 | """ 10 | Notice the execution flows is synchronous as follows: 11 | 12 | 1. RestAPI(...) -> body: dict 13 | 2. preprocess(body: dict, ...) -> data: Any 14 | 3. process(data: Any, ...) -> data: Any 15 | 4. postprocess(data: Any, ...) -> result: dict 16 | 5. RestAPI(result: dict) -> returned request 17 | """ 18 | def __init__(self): 19 | """ 20 | Set any initial property on the Task (usually model object) 21 | Notice these properties will be accessed from multiple threads. 22 | If you need a stateful (per request) data, use the `state` dict argument passed to pre/post/process functions 23 | """ 24 | # set internal state, this will be called only once. (i.e. 
not per request) 25 | self._model = None 26 | 27 | def load(self, local_file_name: str) -> Optional[Any]: # noqa 28 | """ 29 | Optional: provide loading method for the model 30 | useful if we need to load a model in a specific way for the prediction engine to work 31 | :param local_file_name: file name / path to load the model from 32 | :return: Object that will be called with .predict() method for inference 33 | """ 34 | 35 | # Example: now let's load the actual model 36 | 37 | self._model = joblib.load(local_file_name) 38 | 39 | def preprocess(self, body: dict, state: dict, collect_custom_statistics_fn=None) -> Any: 40 | """ 41 | Optional: do something with the request data, return any type of object. 42 | The returned object will be passed as is to the inference engine 43 | 44 | :param body: dictionary as received from the RestAPI 45 | :param state: Use state dict to store data passed to the post-processing function call. 46 | This is a per-request state dict (meaning a new empty dict will be passed per request) 47 | Usage example: 48 | >>> def preprocess(..., state): 49 | state['preprocess_aux_data'] = [1,2,3] 50 | >>> def postprocess(..., state): 51 | print(state['preprocess_aux_data']) 52 | :param collect_custom_statistics_fn: Optional, if provided allows to send a custom set of key/values 53 | to the statistics collector service. 54 | None is passed if the statistics collector is not configured, or if the current request should not be 55 | collected 56 | 57 | Usage example: 58 | >>> print(body) 59 | {"x0": 1, "x1": 2} 60 | >>> if collect_custom_statistics_fn: 61 | >>> collect_custom_statistics_fn({"x0": 1, "x1": 2}) 62 | 63 | :return: Object to be passed directly to the model inference 64 | """ 65 | 66 | # we expect to get a feature vector in the `features` entry of the dict 67 | return np.array(body.get("features", []), dtype=float) 68 | 69 | def process( 70 | self, 71 | data: Any, 72 | state: dict, 73 | collect_custom_statistics_fn: Optional[Callable[[dict], None]], 74 | ) -> Any: # noqa 75 | """ 76 | Optional: do something with the actual data, return any type of object. 77 | The returned object will be passed as is to the postprocess function engine 78 | 79 | :param data: object as received from the preprocessing function 80 | :param state: Use state dict to store data passed to the post-processing function call. 81 | This is a per-request state dict (meaning a dict instance per request) 82 | Usage example: 83 | >>> def preprocess(..., state): 84 | state['preprocess_aux_data'] = [1,2,3] 85 | >>> def postprocess(..., state): 86 | print(state['preprocess_aux_data']) 87 | :param collect_custom_statistics_fn: Optional, if provided allows to send a custom set of key/values 88 | to the statistics collector service. 89 | None is passed if the statistics collector is not configured, or if the current request should not be collected 90 | 91 | Usage example: 92 | >>> if collect_custom_statistics_fn: 93 | >>> collect_custom_statistics_fn({"type": "classification"}) 94 | 95 | :return: Object to be passed to the post-processing function 96 | """ 97 | 98 | # this is where we do the heavy lifting, i.e. run our model. 
99 | # notice we know data is a numpy array of type float, because this is what we prepared in preprocessing function 100 | data = self._model.predict(np.atleast_2d(data)) 101 | # data is also a numpy array, as returned from our fit function 102 | return data 103 | 104 | def postprocess(self, data: Any, state: dict, collect_custom_statistics_fn=None) -> dict: 105 | """ 106 | Optional: post process the data returned from the model inference engine 107 | returned dict will be passed back as the request result as is. 108 | 109 | :param data: object as recieved from the inference model function 110 | :param state: Use state dict to store data passed to the post-processing function call. 111 | This is a per-request state dict (meaning a dict instance per request) 112 | Usage example: 113 | >>> def preprocess(..., state): 114 | state['preprocess_aux_data'] = [1,2,3] 115 | >>> def postprocess(..., state): 116 | print(state['preprocess_aux_data']) 117 | :param collect_custom_statistics_fn: Optional, if provided allows to send a custom set of key/values 118 | to the statictics collector servicd. 119 | None is passed if statiscs collector is not configured, or if the current request should not be 120 | collected 121 | 122 | Usage example: 123 | >>> if collect_custom_statistics_fn: 124 | >>> collect_custom_statistics_fn({"y": 1}) 125 | 126 | :return: Dictionary passed directly as the returned result of the RestAPI 127 | """ 128 | 129 | # Now we take the result numpy (predicted) and create a list of values to 130 | # send back as the restapi return value 131 | # data is the return value from model.predict we will put is inside a return value as Y 132 | return dict(predict=data.tolist()) 133 | -------------------------------------------------------------------------------- /examples/custom/readme.md: -------------------------------------------------------------------------------- 1 | # Train and Deploy custom model 2 | 3 | ## training mock custom model 4 | 5 | Run the mock python training code 6 | ```bash 7 | pip install -r examples/custom/requirements.txt 8 | python examples/custom/train_model.py 9 | ``` 10 | 11 | The output will be a model created on the project "serving examples", by the name "custom train model" 12 | 13 | ## setting up the serving service 14 | 15 | 1. Create serving Service: `clearml-serving create --name "serving example"` (write down the service ID) 16 | 2. Make sure to add any required additional packages (for your custom model) to the [docker-compose.yml](https://github.com/allegroai/clearml-serving/blob/826f503cf4a9b069b89eb053696d218d1ce26f47/docker/docker-compose.yml#L97) (or as environment variable to the `clearml-serving-inference` container), by defining for example: `CLEARML_EXTRA_PYTHON_PACKAGES="scikit-learn numpy"` 17 | 3. Create model endpoint: 18 | `clearml-serving --id model add --engine custom --endpoint "test_model_custom" --preprocess "examples/custom/preprocess.py" --name "custom train model" --project "serving examples"` 19 | 20 | Or auto update 21 | 22 | `clearml-serving --id model auto-update --engine custom --endpoint "test_model_custom_auto" --preprocess "examples/custom/preprocess.py" --name "custom train model" --project "serving examples" --max-versions 2` 23 | 24 | Or add Canary endpoint 25 | 26 | `clearml-serving --id model canary --endpoint "test_model_custom_auto" --weights 0.1 0.9 --input-endpoint-prefix test_model_custom_auto` 27 | 28 | 4. 
If you already have the `clearml-serving` docker-compose running, it might take it a minute or two to sync with the new endpoint. 29 | 30 | Or you can run the clearml-serving container independently `docker run -v ~/clearml.conf:/root/clearml.conf -p 8080:8080 -e CLEARML_SERVING_TASK_ID= clearml-serving:latest` 31 | 32 | 5. Test new endpoint (do notice the first call will trigger the model pulling, so it might take longer, from here on, it's all in memory): `curl -X POST "http://127.0.0.1:8080/serve/test_model_custom" -H "accept: application/json" -H "Content-Type: application/json" -d '{"features": [1, 2, 3]}'` 33 | 34 | 35 | > **_Notice:_** You can also change the serving service while it is already running! 36 | This includes adding/removing endpoints, adding canary model routing etc. 37 | by default new endpoints/models will be automatically updated after 1 minute 38 | -------------------------------------------------------------------------------- /examples/custom/requirements.txt: -------------------------------------------------------------------------------- 1 | clearml >= 1.1.6 2 | scikit-learn 3 | pillow>=10.0.1 # not directly required, pinned by Snyk to avoid a vulnerability 4 | -------------------------------------------------------------------------------- /examples/custom/train_model.py: -------------------------------------------------------------------------------- 1 | from sklearn.linear_model import LogisticRegression 2 | from sklearn.datasets import make_blobs 3 | from joblib import dump 4 | from clearml import Task 5 | 6 | task = Task.init(project_name="serving examples", task_name="custom train model", output_uri=True) 7 | 8 | # generate 2d classification dataset 9 | X, y = make_blobs(n_samples=100, centers=2, n_features=3, random_state=1) 10 | # fit final model 11 | model = LogisticRegression() 12 | model.fit(X, y) 13 | 14 | dump(model, filename="custom-model.pkl", compress=9) 15 | 16 | -------------------------------------------------------------------------------- /examples/ensemble/preprocess.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | import numpy as np 4 | 5 | 6 | # Notice Preprocess class Must be named "Preprocess" 7 | class Preprocess(object): 8 | def __init__(self): 9 | # set internal state, this will be called only once. (i.e. 
not per request) 10 | pass 11 | 12 | def preprocess(self, body: dict, state: dict, collect_custom_statistics_fn=None) -> Any: 13 | # we expect to get two valid on the dict x0, and x1 14 | return [[body.get("x0", None), body.get("x1", None)], ] 15 | 16 | def postprocess(self, data: Any, state: dict, collect_custom_statistics_fn=None) -> dict: 17 | # post process the data returned from the model inference engine 18 | # data is the return value from model.predict we will put is inside a return value as Y 19 | return dict(y=data.tolist() if isinstance(data, np.ndarray) else data) 20 | -------------------------------------------------------------------------------- /examples/ensemble/readme.md: -------------------------------------------------------------------------------- 1 | # Train and Deploy Scikit-Learn model ensemble 2 | 3 | ## training mock voting regression model 4 | 5 | Run the mock python training code 6 | ```bash 7 | pip install -r examples/ensemble/requirements.txt 8 | python examples/ensemble/train_ensemble.py 9 | ``` 10 | 11 | The output will be a model created on the project "serving examples", by the name "train model ensemble" 12 | 13 | ## setting up the serving service 14 | 15 | 1. Create serving Service: `clearml-serving create --name "serving example"` (write down the service ID) 16 | 2. Create model endpoint: 17 | `clearml-serving --id model add --engine sklearn --endpoint "test_model_ensemble" --preprocess "examples/ensemble/preprocess.py" --name "train model ensemble" --project "serving examples"` 18 | 19 | Or auto update 20 | 21 | `clearml-serving --id model auto-update --engine sklearn --endpoint "test_model_ensemble_auto" --preprocess "examples/ensemble/preprocess.py" --name "train model ensemble" --project "serving examples" --max-versions 2` 22 | 23 | Or add Canary endpoint 24 | 25 | `clearml-serving --id model canary --endpoint "test_model_ensemble_auto" --weights 0.1 0.9 --input-endpoint-prefix test_model_ensemble_auto` 26 | 27 | 3. If you already have the `clearml-serving` docker-compose running, it might take it a minute or two to sync with the new endpoint. 28 | 29 | Or you can run the clearml-serving container independently `docker run -v ~/clearml.conf:/root/clearml.conf -p 8080:8080 -e CLEARML_SERVING_TASK_ID= clearml-serving:latest` 30 | 31 | 4. Test new endpoint (do notice the first call will trigger the model pulling, so it might take longer, from here on, it's all in memory): `curl -X POST "http://127.0.0.1:8080/serve/test_model_ensemble" -H "accept: application/json" -H "Content-Type: application/json" -d '{"x0": 1, "x1": 2}'` 32 | 33 | > **_Notice:_** You can also change the serving service while it is already running! 34 | This includes adding/removing endpoints, adding canary model routing etc. 
35 | by default new endpoints/models will be automatically updated after 1 minute 36 | -------------------------------------------------------------------------------- /examples/ensemble/requirements.txt: -------------------------------------------------------------------------------- 1 | clearml >= 1.1.6 2 | scikit-learn >= 1.0.2 3 | numpy>=1.22.2 # not directly required, pinned by Snyk to avoid a vulnerability 4 | pillow>=10.0.1 # not directly required, pinned by Snyk to avoid a vulnerability 5 | -------------------------------------------------------------------------------- /examples/ensemble/train_ensemble.py: -------------------------------------------------------------------------------- 1 | from sklearn.neighbors import KNeighborsRegressor 2 | from sklearn.ensemble import RandomForestRegressor 3 | from sklearn.ensemble import VotingRegressor 4 | from sklearn.datasets import make_blobs 5 | from joblib import dump 6 | from clearml import Task 7 | 8 | task = Task.init(project_name="serving examples", task_name="train model ensemble", output_uri=True) 9 | 10 | # generate 2d classification dataset 11 | X, y = make_blobs(n_samples=100, centers=2, n_features=2, random_state=1) 12 | 13 | knn = KNeighborsRegressor(n_neighbors=5) 14 | knn.fit(X, y) 15 | 16 | rf = RandomForestRegressor(n_estimators=50) 17 | rf.fit(X, y) 18 | 19 | estimators = [("knn", knn), ("rf", rf), ] 20 | ensemble = VotingRegressor(estimators) 21 | ensemble.fit(X, y) 22 | 23 | dump(ensemble, filename="ensemble-vr.pkl", compress=9) 24 | -------------------------------------------------------------------------------- /examples/huggingface/docker-compose-override.yml: -------------------------------------------------------------------------------- 1 | services: 2 | clearml-serving-triton: 3 | image: allegroai/clearml-serving-triton:1.2.0-22.07 -------------------------------------------------------------------------------- /examples/huggingface/example_payload.json: -------------------------------------------------------------------------------- 1 | {"text": "This is a ClearML example to show how Triton binaries are deployed."} -------------------------------------------------------------------------------- /examples/huggingface/preprocess.py: -------------------------------------------------------------------------------- 1 | """Hugginface preprocessing module for ClearML Serving.""" 2 | from typing import Any 3 | from transformers import AutoTokenizer, PreTrainedTokenizer, TensorType 4 | 5 | 6 | # Notice Preprocess class Must be named "Preprocess" 7 | class Preprocess: 8 | """Processing class will be run by the ClearML inference services before and after each request.""" 9 | 10 | def __init__(self): 11 | """Set internal state, this will be called only once. (i.e. 
not per request).""" 12 | self.tokenizer: PreTrainedTokenizer = AutoTokenizer.from_pretrained("philschmid/MiniLM-L6-H384-uncased-sst2") 13 | 14 | def preprocess(self, body: dict, state: dict, collect_custom_statistics_fn=None) -> Any: 15 | """Will be run when a request comes into the ClearML inference service.""" 16 | tokens = self.tokenizer( 17 | text=body['text'], 18 | max_length=16, 19 | truncation=True, 20 | return_tensors=TensorType.NUMPY, 21 | ) 22 | 23 | return [tokens["input_ids"].tolist(), tokens["token_type_ids"].tolist(), tokens["attention_mask"].tolist()] 24 | 25 | def postprocess(self, data: Any, state: dict, collect_custom_statistics_fn=None) -> dict: 26 | """Will be run when a request comes back from the Triton Engine.""" 27 | # post process the data returned from the model inference engine 28 | # data is the raw model output returned by Triton; we wrap it in a dict before returning 29 | return {'data': data.tolist()} 30 | -------------------------------------------------------------------------------- /examples/huggingface/readme.md: -------------------------------------------------------------------------------- 1 | # Example Huggingface on ClearML Serving 2 | 3 | Technically, the underlying NVIDIA Triton inference engine can handle almost any type of model, including PyTorch models, which is how many Huggingface models are shipped out of the box. 4 | 5 | But in order to get better serving speeds, check out this [repository](https://github.com/ELS-RD/transformer-deploy), their [docs](https://els-rd.github.io/transformer-deploy/) and the excellent accompanying [blogpost](https://medium.com/towards-data-science/hugging-face-transformer-inference-under-1-millisecond-latency-e1be0057a51c) to convert huggingface models first into ONNX and then into TensorRT optimized binaries. 6 | 7 | ## Model vs Tokenizer 8 | 9 | Most Huggingface NLP models ship with a tokenizer as well. We don’t want to leave it to the end user to embed their own inputs. The blogpost above uses an ensemble endpoint in Triton that first runs some python code that contains the tokenizer and then sends the result to a second endpoint which contains the actual model. 10 | 11 | This is a good approach, but the tokenizer is CPU based and not independently scalable from the GPU based transformer model. With ClearML serving, we can move the tokenization step to the preprocessing script that we provide to the ClearML serving inference container, which makes this step completely autoscalable. 12 | 13 | ## Getting the right TensorRT <> Triton versions 14 | 15 | Chances are very high that the transformer-deploy image has a different Triton version than what ClearML serving uses, which will cause issues later on. Triton is very strict about its version requirements. Please check the Triton version we are using in `clearml_serving/engines/triton/Dockerfile` and compare it to the main Dockerfile from the `transformer-deploy` repo. Check [this](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html) page for more information about which TensorRT version is shipped in which Triton container. 16 | 17 | If they don't match up, either rebuild the ClearML Triton image locally with the right Triton version and make sure it is picked up by compose, or build the `transformer-deploy` image locally with the correct version and use it to run the model conversion. Your model has to be optimized using the exact same TensorRT version or it will not serve!
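For example, one quick way to compare the two is to grep both Dockerfiles for their Triton base image. This is only a sketch: it assumes both repositories are checked out side by side and that each Dockerfile pins an `nvcr.io/nvidia/tritonserver:<release>` base image.

```bash
# Triton release used by the clearml-serving engine image
grep -i "tritonserver" clearml_serving/engines/triton/Dockerfile

# Triton / TensorRT stack used by transformer-deploy (adjust the path to your checkout)
grep -iE "tritonserver|tensorrt" ../transformer-deploy/Dockerfile
```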
18 | 19 | ## Setting up for the example 20 | 21 | At the time of this writing, compiling a huggingface model from the `transformers-deploy` main branch means it is compiled using tensorRT version 8.4.1, which corresponds to Triton version 22.07. 22 | 23 | To get ClearML running on 22.07, all we need to do is change the base image name in the `docker-compose-triton-gpu.yml` file, the the correct version. 24 | 25 | ```diff 26 | ... 27 | clearml-serving-triton: 28 | - image: allegroai/clearml-serving-triton:latest 29 | + image: allegroai/clearml-serving-triton:1.2.0-22.07 30 | container_name: clearml-serving-triton 31 | restart: unless-stopped 32 | # optimize perforamnce 33 | security_opt: 34 | - seccomp:unconfined 35 | ... 36 | ``` 37 | Or you can build your own correct version by adapting the dockerfile in `clearml_serving/engines/triton/Dockerfile`, building it and making sure the triton compose yaml uses it instead. 38 | 39 | 40 | ## Setting up the serving service 41 | 42 | ### Get the repository (with the example) 43 | 44 | Clone the serving repository if you haven’t already. 45 | 46 | ```bash 47 | git clone https://github.com/allegroai/clearml-serving.git 48 | cd clearml-serving 49 | ``` 50 | 51 | ### Launch the serving task to clearml 52 | 53 | Install `clearml-serving` either via pip or from the repository. Create serving Service: 54 | 55 | ```bash 56 | clearml-serving create --name "huggingface serving example" 57 | ``` 58 | 59 | (write down the service ID, this is the service ID that is in your env file as well) 60 | 61 | ### Setting up the docker-compose serving stack 62 | Setup the `docker/example.env` file with your ClearML credentials, then add an extra line to install 3rd party packages. In this case, we want to also install the `transformers` package because we’re going to run the tokenizer in the inference container 63 | 64 | ```bash 65 | CLEARML_WEB_HOST="https://app.clear.ml" 66 | CLEARML_API_HOST="https://api.clear.ml" 67 | CLEARML_FILES_HOST="https://files.clear.ml" 68 | CLEARML_API_ACCESS_KEY="<>" 69 | CLEARML_API_SECRET_KEY="<>" 70 | CLEARML_SERVING_TASK_ID="<>" 71 | # Add this to install necessary packages 72 | CLEARML_EXTRA_PYTHON_PACKAGES=transformers 73 | # Change this depending on your machine and performance needs 74 | CLEARML_USE_GUNICORN=1 75 | CLEARML_SERVING_NUM_PROCESS=8 76 | ``` 77 | 78 | Huggingface models require Triton engine support, please use `docker-compose-triton.yml` / `docker-compose-triton-gpu.yml` or if running on Kubernetes, the matching helm chart to set things up. Check the repository main readme documentation if you need help. 79 | 80 | To run with the correct version of Triton for this example, do: 81 | ```bash 82 | docker compose --env-file docker/example.env -f docker/docker-compose-triton-gpu.yml -f examples/huggingface/docker-compose-override.yml up --force-recreate 83 | ``` 84 | This should get you a running ClearML stack with Triton which is reporting to a ClearML task in a project called `DevOps`. 85 | 86 | ### Getting the sample model 87 | 88 | If you didn’t use the transformers-deploy repository on your own model, you can run this single command to get a tensorRT binary of an example classification model. 89 | 90 | Please make sure you have properly installed docker and nvidia-container-toolkit, so it can be run on GPU. The command will download a `model.bin` file to the local directory for you to serve. 
91 | 92 | ```bash 93 | curl https://clearml-public.s3.amazonaws.com/models/model_onnx.bin -o model.bin 94 | ``` 95 | 96 | ### Setup 97 | 98 | 1. Upload the TensorRT model (write down the model ID) 99 | 100 | ```bash 101 | clearml-serving --id model upload --name "Transformer ONNX" --project "Hugginface Serving" --path model.bin 102 | ``` 103 | 104 | 2. Create a model endpoint: 105 | 106 | ```bash 107 | # Without dynamic batching 108 | clearml-serving --id model add --engine triton --endpoint "transformer_model" --model-id --preprocess examples/huggingface/preprocess.py --input-size "[-1, -1]" "[-1, -1]" "[-1, -1]" --input-type int32 int32 int32 --input-name "input_ids" "token_type_ids" "attention_mask" --output-size "[-1, 2]" --output-type float32 --output-name "output" --aux-config platform=\"tensorrt_plan\" default_model_filename=\"model.bin\" 109 | 110 | # With dynamic batching 111 | clearml-serving --id model add --engine triton --endpoint "transformer_model" --model-id --preprocess examples/huggingface/preprocess.py --input-size "[-1]" "[-1]" "[-1]" --input-type int32 int32 int32 --input-name "input_ids" "token_type_ids" "attention_mask" --output-size "[2]" --output-type float32 --output-name "output" --aux-config platform=\"onnxruntime_onnx\" default_model_filename=\"model.bin\" dynamic_batching.preferred_batch_size="[1,2,4,8,16,32,64]" dynamic_batching.max_queue_delay_microseconds=5000000 max_batch_size=64 112 | ``` 113 | 114 | > Note the backslashes for string values! `platform=\"tensorrt_plan\" default_model_filename=\"model.bin\"` 115 | 116 | > **INFO**: the model input and output parameters are usually in a `config.pbtxt` file next to the model itself. 117 | 118 | 1. Make sure you have the `clearml-serving` `docker-compose-triton.yml` (or `docker-compose-triton-gpu.yml`) running, it might take it a minute or two to sync with the new endpoint. 119 | 2. Test new endpoint (do notice the first call will trigger the model pulling, so it might take longer, from here on, it's all in memory): 120 | 121 | > ***Notice:*** 122 |  You can also change the serving service while it is already running! This includes adding/removing endpoints, adding canary model routing etc. by default new endpoints/models will be automatically updated after 1 minute 123 | > 124 | 125 | ## Running Inference 126 | 127 | After waiting a little bit for the stack to detect your new endpoint and load it, you can use curl to send a request: 128 | 129 | ```bash 130 | curl -X POST "http://127.0.0.1:8080/serve/transformer_model" -H "accept: application/json" -H "Content-Type: application/json" -d '{"text": "This is a ClearML example to show how Triton binaries are deployed."}' 131 | ``` 132 | 133 | Or use the notebook in this example folder to run it using python `requests` 134 | 135 | The inference request will be sent to the ClearML inference service first, which will run the raw request through the `preprocessing.py` file, which takes out the `text` value, runs it through the tokenizer and then sends the result to Triton, which runs the model and sends the output back to the same `preprocessing.py` file but in the postprocessing function this time, whose result is returned to the user. 
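If you prefer Python over `curl`, a minimal `requests` sketch of the same call looks like this (assuming the stack above is reachable on `127.0.0.1:8080` and the endpoint is registered as `transformer_model`):

```python
import requests

# same payload as examples/huggingface/example_payload.json
payload = {"text": "This is a ClearML example to show how Triton binaries are deployed."}

response = requests.post(
    "http://127.0.0.1:8080/serve/transformer_model",
    json=payload,
    timeout=30,
)
response.raise_for_status()
# the postprocess function wraps the raw model output in a dict, e.g. {"data": [[...]]}
print(response.json())
```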
136 | 137 | ## Benchmarking 138 | 139 | To run a load test on your endpoint to check its performance, use the following commands: 140 | ```bash 141 | ab -l -n 8000 -c 128 -H "accept: application/json" -H "Content-Type: application/json" -T "application/json" -p examples/huggingface/example_payload.json "http://127.0.0.1:8080/serve/transformer_model" 142 | ``` -------------------------------------------------------------------------------- /examples/huggingface/requirements.txt: -------------------------------------------------------------------------------- 1 | clearml-serving 2 | pillow>=10.2.0 # not directly required, pinned by Snyk to avoid a vulnerability -------------------------------------------------------------------------------- /examples/keras/preprocess.py: -------------------------------------------------------------------------------- 1 | import io 2 | from typing import Any, Union 3 | 4 | import numpy as np 5 | from PIL import Image, ImageOps 6 | 7 | 8 | from clearml import StorageManager 9 | 10 | 11 | # Notice Preprocess class Must be named "Preprocess" 12 | class Preprocess(object): 13 | def __init__(self): 14 | # set internal state, this will be called only once. (i.e. not per request) 15 | pass 16 | 17 | def preprocess(self, body: Union[bytes, dict], state: dict, collect_custom_statistics_fn=None) -> Any: 18 | # we expect to get two valid on the dict x0, and x1 19 | if isinstance(body, bytes): 20 | # we expect to get a stream of encoded image bytes 21 | try: 22 | image = Image.open(io.BytesIO(body)).convert("RGB") 23 | except Exception: 24 | # value error would return 404, we want to return 500 so any other exception 25 | raise RuntimeError("Image could not be decoded") 26 | 27 | if isinstance(body, dict) and "url" in body.keys(): 28 | # image is given as url, and is fetched 29 | url = body.get("url") 30 | local_file = StorageManager.get_local_copy(remote_url=url) 31 | image = Image.open(local_file) 32 | 33 | image = ImageOps.grayscale(image).resize((28, 28)) 34 | return np.array([np.array(image)]) 35 | 36 | def postprocess(self, data: Any, state: dict, collect_custom_statistics_fn=None) -> dict: 37 | # post process the data returned from the model inference engine 38 | # data is the return value from model.predict we will put is inside a return value as Y 39 | if not isinstance(data, np.ndarray): 40 | # this should not happen 41 | return dict(digit=-1) 42 | 43 | # data is returned as probability per class (10 class/digits) 44 | return dict(digit=int(data.flatten().argmax())) 45 | -------------------------------------------------------------------------------- /examples/keras/readme.md: -------------------------------------------------------------------------------- 1 | # Train and Deploy Keras model with Nvidia Triton Engine 2 | 3 | ## training mnist digit classifier model 4 | 5 | Run the mock python training code 6 | ```bash 7 | pip install -r examples/keras/requirements.txt 8 | python examples/keras/train_keras_mnist.py 9 | ``` 10 | 11 | The output will be a model created on the project "serving examples", by the name "train keras model" 12 | 13 | ## setting up the serving service 14 | 15 | Prerequisites, Keras/Tensorflow models require Triton engine support, please use `docker-compose-triton.yml` / `docker-compose-triton-gpu.yml` or if running on Kubernetes, the matching helm chart. 16 | 17 | 1. Create serving Service: `clearml-serving create --name "serving example"` (write down the service ID) 18 | 2. 
Create model endpoint: 19 | 20 | `clearml-serving --id model add --engine triton --endpoint "test_model_keras" --preprocess "examples/keras/preprocess.py" --name "train keras model - serving_model" --project "serving examples" --input-size 1 784 --input-name "dense_input" --input-type float32 --output-size -1 10 --output-name "activation_2" --output-type float32 21 | ` 22 | 23 | Or auto update 24 | 25 | `clearml-serving --id model auto-update --engine triton --endpoint "test_model_auto" --preprocess "examples/keras/preprocess.py" --name "train keras model - serving_model" --project "serving examples" --max-versions 2 26 | --input-size 1 784 --input-name "dense_input" --input-type float32 27 | --output-size -1 10 --output-name "activation_2" --output-type float32` 28 | 29 | Or add Canary endpoint 30 | 31 | `clearml-serving --id model canary --endpoint "test_model_auto" --weights 0.1 0.9 --input-endpoint-prefix test_model_auto` 32 | 33 | 3. Make sure you have the `clearml-serving` `docker-compose-triton.yml` (or `docker-compose-triton-gpu.yml`) running, it might take it a minute or two to sync with the new endpoint. 34 | 35 | 4. Test new endpoint (do notice the first call will trigger the model pulling, so it might take longer, from here on, it's all in memory): \ 36 | `curl -X POST "http://127.0.0.1:8080/serve/test_model_keras" -H "accept: application/json" -H "Content-Type: application/json" -d '{"url": "https://raw.githubusercontent.com/allegroai/clearml-serving/main/examples/pytorch/5.jpg"}'` 37 | \ 38 | or send a local file to be classified with \ 39 | `curl -X POST "http://127.0.0.1:8080/serve/test_model_keras" -H "Content-Type: image/jpeg" --data-binary "@5.jpg"` 40 | 41 | > **_Notice:_** You can also change the serving service while it is already running! 42 | This includes adding/removing endpoints, adding canary model routing etc. 43 | by default new endpoints/models will be automatically updated after 1 minute -------------------------------------------------------------------------------- /examples/keras/requirements.txt: -------------------------------------------------------------------------------- 1 | tensorflow>=2.0 2 | clearml 3 | Pillow -------------------------------------------------------------------------------- /examples/keras/train_keras_mnist.py: -------------------------------------------------------------------------------- 1 | # ClearML - Keras with Tensorboard example code, automatic logging model and Tensorboard outputs 2 | # 3 | # Train a simple deep NN on the MNIST dataset. 
4 | # Then store a model to be served by clearml-serving 5 | import argparse 6 | import os 7 | import tempfile 8 | 9 | import numpy as np 10 | import tensorflow as tf 11 | from pathlib import Path 12 | from tensorflow.keras import utils as np_utils 13 | from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard 14 | from tensorflow.keras.datasets import mnist 15 | from tensorflow.keras.layers import Activation, Dense 16 | from tensorflow.keras.models import Sequential 17 | from tensorflow.keras.optimizers import RMSprop 18 | 19 | from clearml import Task 20 | 21 | 22 | class TensorBoardImage(TensorBoard): 23 | @staticmethod 24 | def make_image(tensor): 25 | from PIL import Image 26 | import io 27 | tensor = np.stack((tensor, tensor, tensor), axis=2) 28 | height, width, channels = tensor.shape 29 | image = Image.fromarray(tensor) 30 | output = io.BytesIO() 31 | image.save(output, format='PNG') 32 | image_string = output.getvalue() 33 | output.close() 34 | return tf.Summary.Image(height=height, 35 | width=width, 36 | colorspace=channels, 37 | encoded_image_string=image_string) 38 | 39 | def on_epoch_end(self, epoch, logs=None): 40 | if logs is None: 41 | logs = {} 42 | super(TensorBoardImage, self).on_epoch_end(epoch, logs) 43 | images = self.validation_data[0] # 0 - data; 1 - labels 44 | img = (255 * images[0].reshape(28, 28)).astype('uint8') 45 | 46 | image = self.make_image(img) 47 | summary = tf.Summary(value=[tf.Summary.Value(tag='image', image=image)]) 48 | self.writer.add_summary(summary, epoch) 49 | 50 | 51 | def main(): 52 | parser = argparse.ArgumentParser(description='Keras MNIST Example - training CNN classification model') 53 | parser.add_argument('--batch-size', type=int, default=128, help='input batch size for training (default: 128)') 54 | parser.add_argument('--epochs', type=int, default=1, help='number of epochs to train (default: 6)') 55 | args = parser.parse_args() 56 | 57 | # the data, shuffled and split between train and test sets 58 | nb_classes = 10 59 | (X_train, y_train), (X_test, y_test) = mnist.load_data() 60 | 61 | X_train = X_train.reshape(60000, 784).astype('float32') / 255. 62 | X_test = X_test.reshape(10000, 784).astype('float32') / 255. 
63 | print(X_train.shape[0], 'train samples') 64 | print(X_test.shape[0], 'test samples') 65 | 66 | # convert class vectors to binary class matrices 67 | Y_train = np_utils.to_categorical(y_train, nb_classes) 68 | Y_test = np_utils.to_categorical(y_test, nb_classes) 69 | 70 | model = Sequential() 71 | model.add(Dense(512, input_shape=(784,))) 72 | model.add(Activation('relu')) 73 | # model.add(Dropout(0.2)) 74 | model.add(Dense(512)) 75 | model.add(Activation('relu')) 76 | # model.add(Dropout(0.2)) 77 | model.add(Dense(10)) 78 | model.add(Activation('softmax')) 79 | 80 | model2 = Sequential() 81 | model2.add(Dense(512, input_shape=(784,))) 82 | model2.add(Activation('relu')) 83 | 84 | model.summary() 85 | 86 | model.compile( 87 | loss='categorical_crossentropy', 88 | optimizer=RMSprop(), 89 | metrics=['accuracy'] 90 | ) 91 | 92 | # Connecting ClearML with the current process, 93 | # from here on everything is logged automatically 94 | task = Task.init(project_name='serving examples', task_name='train keras model', output_uri=True) 95 | 96 | # Advanced: setting model class enumeration 97 | labels = dict(('digit_%d' % i, i) for i in range(10)) 98 | task.set_model_label_enumeration(labels) 99 | 100 | output_folder = os.path.join(tempfile.gettempdir(), 'keras_example_new_temp_now') 101 | 102 | board = TensorBoard(histogram_freq=1, log_dir=output_folder, write_images=False) 103 | model_store = ModelCheckpoint(filepath=os.path.join(output_folder, 'weight.{epoch}.hdf5')) 104 | 105 | # load previous model, if it is there 106 | # noinspection PyBroadException 107 | try: 108 | model.load_weights(os.path.join(output_folder, 'weight.1.hdf5')) 109 | except Exception: 110 | pass 111 | 112 | model.fit( 113 | X_train, Y_train, 114 | batch_size=args.batch_size, epochs=args.epochs, 115 | callbacks=[board, model_store], 116 | verbose=1, validation_data=(X_test, Y_test) 117 | ) 118 | score = model.evaluate(X_test, Y_test, verbose=0) 119 | 120 | # store the model in a format that can be served 121 | model.save('serving_model', include_optimizer=False) 122 | 123 | print('Test score: {}'.format(score[0])) 124 | print('Test accuracy: {}'.format(score[1])) 125 | 126 | 127 | if __name__ == '__main__': 128 | main() 129 | -------------------------------------------------------------------------------- /examples/lightgbm/preprocess.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | import numpy as np 4 | 5 | 6 | # Notice Preprocess class Must be named "Preprocess" 7 | class Preprocess(object): 8 | def __init__(self): 9 | # set internal state, this will be called only once. (i.e. 
not per request) 10 | pass 11 | 12 | def preprocess(self, body: dict, state: dict, collect_custom_statistics_fn=None) -> Any: 13 | # we expect to get four valid numbers on the dict: x0, x1, x2, x3 14 | return np.array( 15 | [[body.get("x0", None), body.get("x1", None), body.get("x2", None), body.get("x3", None)], ], 16 | dtype=np.float32 17 | ) 18 | 19 | def postprocess(self, data: Any, state: dict, collect_custom_statistics_fn=None) -> dict: 20 | # post process the data returned from the model inference engine 21 | # data is the return value from model.predict we will put is inside a return value as Y 22 | # we pick the most probably class and return the class index (argmax) 23 | return dict(y=int(np.argmax(data)) if isinstance(data, np.ndarray) else data) 24 | -------------------------------------------------------------------------------- /examples/lightgbm/readme.md: -------------------------------------------------------------------------------- 1 | # Train and Deploy LightGBM model 2 | 3 | ## training iris classifier model 4 | 5 | Run the mock python training code 6 | ```bash 7 | pip install -r examples/lightgbm/requirements.txt 8 | python examples/lightgbm/train_model.py 9 | ``` 10 | 11 | The output will be a model created on the project "serving examples", by the name "train lightgbm model" 12 | 13 | ## setting up the serving service 14 | 15 | 1. Create serving Service: `clearml-serving create --name "serving example"` (write down the service ID) 16 | 17 | 2. Create model endpoint: 18 | 19 | `clearml-serving --id model add --engine lightgbm --endpoint "test_model_lgbm" --preprocess "examples/lightgbm/preprocess.py" --name "train lightgbm model - lgbm_model" --project "serving examples"` 20 | 21 | Or auto-update 22 | 23 | `clearml-serving --id model auto-update --engine lightgbm --endpoint "test_model_auto" --preprocess "examples/lightgbm/preprocess.py" --name "train lightgbm model - lgbm_model" --project "serving examples" --max-versions 2` 24 | 25 | Or add Canary endpoint 26 | 27 | `clearml-serving --id model canary --endpoint "test_model_auto" --weights 0.1 0.9 --input-endpoint-prefix test_model_auto` 28 | 29 | 3. If you already have the `clearml-serving` docker-compose running, it might take it a minute or two to sync with the new endpoint. 30 | 31 | Or you can run the clearml-serving container independently `docker run -v ~/clearml.conf:/root/clearml.conf -p 8080:8080 -e CLEARML_SERVING_TASK_ID= clearml-serving:latest` 32 | 33 | 4. Test new endpoint (do notice the first call will trigger the model pulling, so it might take longer, from here on, it's all in memory): `curl -X POST "http://127.0.0.1:8080/serve/test_model_lgbm" -H "accept: application/json" -H "Content-Type: application/json" -d '{"x0": 1, "x1": 2, "x2": 3, "x3": 4}'` 34 | 35 | > **_Notice:_** You can also change the serving service while it is already running! 36 | This includes adding/removing endpoints, adding canary model routing etc. 
37 | -------------------------------------------------------------------------------- /examples/lightgbm/requirements.txt: -------------------------------------------------------------------------------- 1 | clearml >= 1.1.6 2 | lightgbm 3 | 4 | pillow>=10.0.1 # not directly required, pinned by Snyk to avoid a vulnerability 5 | -------------------------------------------------------------------------------- /examples/lightgbm/train_model.py: -------------------------------------------------------------------------------- 1 | import lightgbm as lgb 2 | from sklearn.datasets import load_iris 3 | from sklearn.model_selection import train_test_split 4 | 5 | from clearml import Task 6 | 7 | task = Task.init(project_name="serving examples", task_name="train lightgbm model", output_uri=True) 8 | 9 | iris = load_iris() 10 | y = iris['target'] 11 | X = iris['data'] 12 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1) 13 | dtrain = lgb.Dataset(X_train, label=y_train) 14 | 15 | params = { 16 | 'objective': 'multiclass', 17 | 'metric': 'softmax', 18 | 'num_class': 3 19 | } 20 | lgb_model = lgb.train(params=params, train_set=dtrain) 21 | 22 | lgb_model.save_model("lgbm_model") 23 | -------------------------------------------------------------------------------- /examples/pipeline/async_preprocess.py: -------------------------------------------------------------------------------- 1 | from typing import Any, List 2 | 3 | 4 | # register with --engine custom_async 5 | # Notice Preprocess class Must be named "Preprocess" 6 | class Preprocess(object): 7 | def __init__(self): 8 | pass 9 | 10 | async def preprocess(self, body: dict, state: dict, collect_custom_statistics_fn=None) -> Any: 11 | # we expect to get two valid on the dict x0, and x1 12 | return body 13 | 14 | async def postprocess(self, data: List[dict], state: dict, collect_custom_statistics_fn=None) -> dict: 15 | # we will here average the results and return the new value 16 | # assume data is a list of dicts greater than 1 17 | 18 | # average result 19 | return dict(y=0.5 * data[0]['y'][0] + 0.5 * data[1]['y'][0]) 20 | 21 | async def process(self, data: Any, state: dict, collect_custom_statistics_fn=None) -> Any: 22 | """ 23 | do something with the actual data, return any type of object. 24 | The returned object will be passed as is to the postprocess function engine 25 | """ 26 | predict_a = self.send_request(endpoint="/test_model_sklearn_a/", version=None, data=data) 27 | predict_b = self.send_request(endpoint="/test_model_sklearn_b/", version=None, data=data) 28 | 29 | predict_a = await predict_a 30 | predict_b = await predict_b 31 | 32 | if not predict_b or not predict_a: 33 | raise ValueError("Error requesting inference endpoint test_model_sklearn a/b") 34 | 35 | return [predict_a, predict_b] 36 | 37 | async def send_request(self, endpoint, version, data) -> List[dict]: 38 | # Mock Function! 39 | # replaced by real send request function when constructed by the inference service 40 | pass 41 | -------------------------------------------------------------------------------- /examples/pipeline/preprocess.py: -------------------------------------------------------------------------------- 1 | from concurrent.futures import ThreadPoolExecutor 2 | from typing import Any, List 3 | 4 | 5 | # Notice Preprocess class Must be named "Preprocess" 6 | class Preprocess(object): 7 | def __init__(self): 8 | # set internal state, this will be called only once. (i.e. 
not per request) 9 | self.executor = ThreadPoolExecutor(max_workers=32) 10 | 11 | def postprocess(self, data: List[dict], state: dict, collect_custom_statistics_fn=None) -> dict: 12 | # we will here average the results and return the new value 13 | # assume data is a list of dicts greater than 1 14 | 15 | # average result 16 | return dict(y=0.5 * data[0]['y'][0] + 0.5 * data[1]['y'][0]) 17 | 18 | def process(self, data: Any, state: dict, collect_custom_statistics_fn=None) -> Any: 19 | """ 20 | do something with the actual data, return any type of object. 21 | The returned object will be passed as is to the postprocess function engine 22 | """ 23 | predict_a = self.executor.submit(self.send_request, endpoint="/test_model_sklearn_a/", version=None, data=data) 24 | predict_b = self.executor.submit(self.send_request, endpoint="/test_model_sklearn_b/", version=None, data=data) 25 | 26 | predict_a = predict_a.result() 27 | predict_b = predict_b.result() 28 | 29 | if not predict_b or not predict_a: 30 | raise ValueError("Error requesting inference endpoint test_model_sklearn a/b") 31 | 32 | return [predict_a, predict_b] 33 | 34 | def send_request(self, endpoint, version, data) -> List[dict]: 35 | # Mock Function! 36 | # replaced by real send request function when constructed by the inference service 37 | pass 38 | -------------------------------------------------------------------------------- /examples/pipeline/readme.md: -------------------------------------------------------------------------------- 1 | # Deploy a model inference pipeline 2 | 3 | ## prerequisites 4 | 5 | Training a scikit-learn model (see example/sklearn) 6 | 7 | ## setting up the serving service 8 | 9 | 1. Create serving Service (if not already running): 10 | `clearml-serving create --name "serving example"` (write down the service ID) 11 | 12 | 2. Create model base two endpoints: 13 | `clearml-serving --id model add --engine sklearn --endpoint "test_model_sklearn_a" --preprocess "examples/sklearn/preprocess.py" --name "train sklearn model" --project "serving examples"` 14 | 15 | `clearml-serving --id model add --engine sklearn --endpoint "test_model_sklearn_b" --preprocess "examples/sklearn/preprocess.py" --name "train sklearn model" --project "serving examples"` 16 | 17 | 3. Create pipeline model endpoint: 18 | 19 | Threaded version 20 | 21 | `clearml-serving --id model add --engine custom --endpoint "test_model_pipeline" --preprocess "examples/pipeline/preprocess.py"` 22 | 23 | AsyncIO version 24 | 25 | `clearml-serving --id model add --engine custom_async --endpoint "test_model_pipeline" --preprocess "examples/pipeline/async_preprocess.py"` 26 | 27 | 4. If you already have the `clearml-serving` docker-compose running, it might take it a minute or two to sync with the new endpoint. 28 | 29 | Or you can run the clearml-serving container independently `docker run -v ~/clearml.conf:/root/clearml.conf -p 8080:8080 -e CLEARML_SERVING_TASK_ID= clearml-serving:latest` 30 | 31 | 5. Test new endpoint (do notice the first call will trigger the model pulling, so it might take longer, from here on, it's all in memory): `curl -X POST "http://127.0.0.1:8080/serve/test_model_pipeline" -H "accept: application/json" -H "Content-Type: application/json" -d '{"x0": 1, "x1": 2}'` 32 | 33 | 34 | > **_Notice:_** You can also change the serving service while it is already running! 35 | This includes adding/removing endpoints, adding canary model routing etc. 
36 | by default new endpoints/models will be automatically updated after 1 minute 37 | -------------------------------------------------------------------------------- /examples/preprocess_template/preprocess_template.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Optional, Callable, Union 2 | 3 | 4 | # Preprocess class Must be named "Preprocess" 5 | # No need to inherit or to implement all methods 6 | class Preprocess(object): 7 | """ 8 | Preprocess class Must be named "Preprocess" 9 | Otherwise there are No limitations, No need to inherit or to implement all methods 10 | Notice! This is not thread safe! the same instance may be accessed from multiple threads simultaneously 11 | to store data in a safe way push it into the `state` dict argument of preprocessing/postprocessing functions 12 | 13 | Notice the execution flow is synchronous, as follows: 14 | 15 | 1. RestAPI(...) -> body: Union[bytes, dict] 2. preprocess(body: Union[bytes, dict], ...) -> data: Any 3. process(data: Any, ...) -> data: Any 4. postprocess(data: Any, ...) -> result: dict 5. RestAPI(result: dict) -> returned request 16 | """ 17 | 18 | def __init__(self): 19 | # set internal state, this will be called only once. (i.e. not per request) 20 | # it will also set the internal model_endpoint to reference the specific model endpoint object being served 21 | self.model_endpoint = None # type: clearml_serving.serving.endpoints.ModelEndpoint 22 | 23 | def load(self, local_file_name: str) -> Any: # noqa 24 | """ 25 | Optional: provide loading method for the model 26 | useful if we need to load a model in a specific way for the prediction engine to work 27 | 28 | Notice! When used with specific engines (i.e. not Custom) 29 | The returned object will be passed as is to the inference engine, 30 | this means it must not be None, otherwise the endpoint will be ignored! 31 | 32 | :param local_file_name: file name / path to load the model from 33 | 34 | :return: Object that will be called with .predict() method for inference. 35 | """ 36 | pass 37 | 38 | def preprocess( 39 | self, 40 | body: Union[bytes, dict], 41 | state: dict, 42 | collect_custom_statistics_fn: Optional[Callable[[dict], None]], 43 | ) -> Any: # noqa 44 | """ 45 | Optional: do something with the request data, return any type of object. 46 | The returned object will be passed as is to the inference engine 47 | 48 | :param body: dictionary or bytes as received from the RestAPI 49 | :param state: Use state dict to store data passed to the post-processing function call. 50 | This is a per-request state dict (meaning a new empty dict will be passed per request) 51 | Usage example: 52 | >>> def preprocess(..., state): 53 | state['preprocess_aux_data'] = [1,2,3] 54 | >>> def postprocess(..., state): 55 | print(state['preprocess_aux_data']) 56 | :param collect_custom_statistics_fn: Optional, if provided allows to send a custom set of key/values 57 | to the statistics collector service.
62 | None is passed if the statistics collector is not configured, or if the current request should not be collected 63 | 64 | Usage example: 65 | >>> print(body) 66 | {"x0": 1, "x1": 2} 67 | >>> if collect_custom_statistics_fn: 68 | >>> collect_custom_statistics_fn({"x0": 1, "x1": 2}) 69 | 70 | :return: Object to be passed directly to the model inference 71 | """ 72 | return body 73 | 74 | def postprocess( 75 | self, 76 | data: Any, 77 | state: dict, 78 | collect_custom_statistics_fn: Optional[Callable[[dict], None]], 79 | ) -> dict: # noqa 80 | """ 81 | Optional: post process the data returned from the model inference engine 82 | returned dict will be passed back as the request result as is. 83 | 84 | :param data: object as received from the inference model function 85 | :param state: Use state dict to store data passed to the post-processing function call. 86 | This is a per-request state dict (meaning a dict instance per request) 87 | Usage example: 88 | >>> def preprocess(..., state): 89 | state['preprocess_aux_data'] = [1,2,3] 90 | >>> def postprocess(..., state): 91 | print(state['preprocess_aux_data']) 92 | :param collect_custom_statistics_fn: Optional, if provided allows to send a custom set of key/values 93 | to the statistics collector service. 94 | None is passed if the statistics collector is not configured, or if the current request should not be collected 95 | 96 | Usage example: 97 | >>> if collect_custom_statistics_fn: 98 | >>> collect_custom_statistics_fn({"y": 1}) 99 | 100 | :return: Dictionary passed directly as the returned result of the RestAPI 101 | """ 102 | return data 103 | 104 | def process( 105 | self, 106 | data: Any, 107 | state: dict, 108 | collect_custom_statistics_fn: Optional[Callable[[dict], None]], 109 | ) -> Any: # noqa 110 | """ 111 | Optional: do something with the actual data, return any type of object. 112 | The returned object will be passed as is to the postprocess function engine 113 | 114 | :param data: object as received from the preprocessing function 115 | :param state: Use state dict to store data passed to the post-processing function call. 116 | This is a per-request state dict (meaning a dict instance per request) 117 | Usage example: 118 | >>> def preprocess(..., state): 119 | state['preprocess_aux_data'] = [1,2,3] 120 | >>> def postprocess(..., state): 121 | print(state['preprocess_aux_data']) 122 | :param collect_custom_statistics_fn: Optional, if provided allows to send a custom set of key/values 123 | to the statistics collector service. 124 | None is passed if the statistics collector is not configured, or if the current request should not be collected 125 | 126 | Usage example: 127 | >>> if collect_custom_statistics_fn: 128 | >>> collect_custom_statistics_fn({"type": "classification"}) 129 | 130 | :return: Object to be passed to the post-processing function 131 | """ 132 | return data 133 | 134 | def send_request( # noqa 135 | self, 136 | endpoint: str, 137 | version: Optional[str] = None, 138 | data: Optional[dict] = None 139 | ) -> Optional[dict]: 140 | """ 141 | NOTICE: This method will be replaced at runtime by the inference service 142 | 143 | Helper method to send model inference requests to the inference service itself. 144 | This is designed to help with model ensemble, model pipelines, etc.
145 | On request error return None, otherwise the request result data dictionary 146 | 147 | Usage example: 148 | 149 | >>> x0, x1 = 1, 2 150 | >>> result = self.send_request(endpoint="test_model_sklearn", version="1", data={"x0": x0, "x1": x1}) 151 | >>> y = result["y"] 152 | """ 153 | pass 154 | -------------------------------------------------------------------------------- /examples/pytorch/5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clearml/clearml-serving/1def0a6901617767687f2b747aaffdc060f96046/examples/pytorch/5.jpg -------------------------------------------------------------------------------- /examples/pytorch/preprocess.py: -------------------------------------------------------------------------------- 1 | import io 2 | from typing import Any, Union 3 | 4 | import numpy as np 5 | from PIL import Image, ImageOps 6 | 7 | 8 | from clearml import StorageManager 9 | 10 | 11 | # Notice Preprocess class Must be named "Preprocess" 12 | class Preprocess(object): 13 | def __init__(self): 14 | # set internal state, this will be called only once. (i.e. not per request) 15 | pass 16 | 17 | def preprocess(self, body: Union[bytes, dict], state: dict, collect_custom_statistics_fn=None) -> Any: 18 | # we expect either a stream of encoded image bytes or a dict containing an image "url" 19 | if isinstance(body, bytes): 20 | # we expect to get a stream of encoded image bytes 21 | try: 22 | image = Image.open(io.BytesIO(body)).convert("RGB") 23 | except Exception: 24 | # a ValueError would be returned as 404, raise a RuntimeError so decoding failures are reported as 500 25 | raise RuntimeError("Image could not be decoded") 26 | 27 | if isinstance(body, dict) and "url" in body.keys(): 28 | # image is given as a url, and is fetched 29 | url = body.get("url") 30 | local_file = StorageManager.get_local_copy(remote_url=url) 31 | image = Image.open(local_file) 32 | 33 | image = ImageOps.grayscale(image).resize((28, 28)) 34 | return np.array([np.array(image)]) 35 | 36 | def postprocess(self, data: Any, state: dict, collect_custom_statistics_fn=None) -> dict: 37 | # post process the data returned from the model inference engine 38 | # data is the return value from model.predict; we wrap it in the returned result dict 39 | if not isinstance(data, np.ndarray): 40 | # this should not happen 41 | return dict(digit=-1) 42 | 43 | # data is returned as probability per class (10 classes/digits) 44 | return dict(digit=int(data.flatten().argmax())) 45 | -------------------------------------------------------------------------------- /examples/pytorch/readme.md: -------------------------------------------------------------------------------- 1 | # Train and Deploy Pytorch model with Nvidia Triton Engine 2 | 3 | ## training mnist digit classifier model 4 | 5 | Run the mock python training code 6 | ```bash 7 | pip install -r examples/pytorch/requirements.txt 8 | python examples/pytorch/train_pytorch_mnist.py 9 | ``` 10 | 11 | The output will be a model created on the project "serving examples", by the name "train pytorch model" 12 | *Notice* Only TorchScript models are supported by the Triton server 13 | 14 | ## setting up the serving service 15 | 16 | 17 | Prerequisites: PyTorch models require Triton engine support, please use `docker-compose-triton.yml` / `docker-compose-triton-gpu.yml` or, if running on Kubernetes, the matching helm chart. 18 | 19 | 1. Create serving Service: `clearml-serving create --name "serving example"` (write down the service ID) 20 | 2. 
Create model endpoint: 21 | 22 | `clearml-serving --id model add --engine triton --endpoint "test_model_pytorch" --preprocess "examples/pytorch/preprocess.py" --name "train pytorch model" --project "serving examples" 23 | --input-size 1 28 28 --input-name "INPUT__0" --input-type float32 24 | --output-size -1 10 --output-name "OUTPUT__0" --output-type float32 25 | ` 26 | 27 | Or auto update 28 | 29 | `clearml-serving --id model auto-update --engine triton --endpoint "test_model_pytorch_auto" --preprocess "examples/pytorch/preprocess.py" --name "train pytorch model" --project "serving examples" --max-versions 2 30 | --input-size 1 28 28 --input-name "INPUT__0" --input-type float32 31 | --output-size -1 10 --output-name "OUTPUT__0" --output-type float32` 32 | 33 | Or add Canary endpoint 34 | 35 | `clearml-serving --id model canary --endpoint "test_model_pytorch_auto" --weights 0.1 0.9 --input-endpoint-prefix test_model_pytorch_auto` 36 | 37 | 3. Make sure you have the `clearml-serving` `docker-compose-triton.yml` (or `docker-compose-triton-gpu.yml`) running, it might take it a minute or two to sync with the new endpoint. 38 | 39 | 4. Test new endpoint (do notice the first call will trigger the model pulling, so it might take longer, from here on, it's all in memory): \ 40 | `curl -X POST "http://127.0.0.1:8080/serve/test_model_pytorch" -H "accept: application/json" -H "Content-Type: application/json" -d '{"url": "https://raw.githubusercontent.com/allegroai/clearml-serving/main/examples/pytorch/5.jpg"}'` \ 41 | or send a local file to be classified with \ 42 | `curl -X POST "http://127.0.0.1:8080/serve/test_model_pytorch" -H "Content-Type: image/jpeg" --data-binary "@5.jpg"` 43 | 44 | > **_Notice:_** You can also change the serving service while it is already running! 45 | This includes adding/removing endpoints, adding canary model routing etc. 
46 | by default new endpoints/models will be automatically updated after 1 minute 47 | -------------------------------------------------------------------------------- /examples/pytorch/requirements.txt: -------------------------------------------------------------------------------- 1 | torchvision 2 | torch 3 | clearml 4 | Pillow 5 | tensorboard 6 | -------------------------------------------------------------------------------- /examples/pytorch/train_pytorch_mnist.py: -------------------------------------------------------------------------------- 1 | # ClearML - Example of pytorch with tensorboard>=v1.14 2 | # 3 | from __future__ import print_function 4 | 5 | import argparse 6 | import os 7 | from tempfile import gettempdir 8 | 9 | import torch 10 | import torch.nn as nn 11 | import torch.nn.functional as F 12 | import torch.optim as optim 13 | from torchvision import datasets, transforms 14 | from torch.autograd import Variable 15 | from torch.utils.tensorboard import SummaryWriter 16 | 17 | from clearml import Task, OutputModel 18 | 19 | 20 | class Net(nn.Module): 21 | 22 | def __init__(self): 23 | super(Net, self).__init__() 24 | self.conv1 = nn.Conv2d(1, 10, kernel_size=5) 25 | self.conv2 = nn.Conv2d(10, 20, kernel_size=5) 26 | self.conv2_drop = nn.Dropout2d() 27 | self.fc1 = nn.Linear(320, 50) 28 | self.fc2 = nn.Linear(50, 10) 29 | 30 | def forward(self, x): 31 | x = F.relu(F.max_pool2d(self.conv1(x), 2)) 32 | x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2)) 33 | x = x.view(-1, 320) 34 | x = F.relu(self.fc1(x)) 35 | x = F.dropout(x, training=self.training) 36 | x = self.fc2(x) 37 | return F.log_softmax(x, dim=1) 38 | 39 | 40 | def train(model, epoch, train_loader, args, optimizer, writer): 41 | model.train() 42 | for batch_idx, (data, target) in enumerate(train_loader): 43 | if args.cuda: 44 | data, target = data.cuda(), target.cuda() 45 | data, target = Variable(data), Variable(target) 46 | optimizer.zero_grad() 47 | output = model(data) 48 | loss = F.nll_loss(output, target) 49 | loss.backward() 50 | optimizer.step() 51 | if batch_idx % args.log_interval == 0: 52 | print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format( 53 | epoch, batch_idx * len(data), len(train_loader.dataset), 54 | 100. * batch_idx / len(train_loader), loss.data.item())) 55 | niter = epoch*len(train_loader)+batch_idx 56 | writer.add_scalar('Train/Loss', loss.data.item(), niter) 57 | 58 | 59 | def test(model, test_loader, args, optimizer, writer): 60 | model.eval() 61 | test_loss = 0 62 | correct = 0 63 | for niter, (data, target) in enumerate(test_loader): 64 | if args.cuda: 65 | data, target = data.cuda(), target.cuda() 66 | data, target = Variable(data), Variable(target) 67 | output = model(data) 68 | test_loss += F.nll_loss(output, target, reduction='sum').data.item() # sum up batch loss 69 | pred = output.data.max(1)[1] # get the index of the max log-probability 70 | pred = pred.eq(target.data).cpu().sum() 71 | writer.add_scalar('Test/Loss', pred, niter) 72 | correct += pred 73 | if niter % 100 == 0: 74 | writer.add_image('test', data[0, :, :, :], niter) 75 | 76 | test_loss /= len(test_loader.dataset) 77 | print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format( 78 | test_loss, correct, len(test_loader.dataset), 79 | 100. 
* correct / len(test_loader.dataset))) 80 | 81 | 82 | def main(): 83 | # Training settings 84 | parser = argparse.ArgumentParser(description='PyTorch MNIST Example') 85 | parser.add_argument('--batch-size', type=int, default=64, metavar='N', 86 | help='input batch size for training (default: 64)') 87 | parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N', 88 | help='input batch size for testing (default: 1000)') 89 | parser.add_argument('--epochs', type=int, default=10, metavar='N', 90 | help='number of epochs to train (default: 10)') 91 | parser.add_argument('--lr', type=float, default=0.01, metavar='LR', 92 | help='learning rate (default: 0.01)') 93 | parser.add_argument('--momentum', type=float, default=0.5, metavar='M', 94 | help='SGD momentum (default: 0.5)') 95 | parser.add_argument('--no-cuda', action='store_true', default=False, 96 | help='disables CUDA training') 97 | parser.add_argument('--seed', type=int, default=1, metavar='S', 98 | help='random seed (default: 1)') 99 | parser.add_argument('--log-interval', type=int, default=10, metavar='N', 100 | help='how many batches to wait before logging training status') 101 | args = parser.parse_args() 102 | 103 | # Connecting ClearML with the current process, 104 | # from here on everything is logged automatically 105 | task = Task.init(project_name='serving examples', task_name='train pytorch model', output_uri=True) # noqa: F841 106 | writer = SummaryWriter('runs') 107 | writer.add_text('TEXT', 'This is some text', 0) 108 | args.cuda = not args.no_cuda and torch.cuda.is_available() 109 | 110 | torch.manual_seed(args.seed) 111 | if args.cuda: 112 | torch.cuda.manual_seed(args.seed) 113 | 114 | kwargs = {'num_workers': 4, 'pin_memory': True} if args.cuda else {} 115 | train_loader = torch.utils.data.DataLoader(datasets.MNIST('./data', train=True, download=True, 116 | transform=transforms.Compose([ 117 | transforms.ToTensor(), 118 | transforms.Normalize((0.1307,), (0.3081,))])), 119 | batch_size=args.batch_size, shuffle=True, **kwargs) 120 | test_loader = torch.utils.data.DataLoader(datasets.MNIST('./data', train=False, 121 | transform=transforms.Compose([ 122 | transforms.ToTensor(), 123 | transforms.Normalize((0.1307,), (0.3081,))])), 124 | batch_size=args.test_batch_size, shuffle=True, **kwargs) 125 | 126 | model = Net() 127 | if args.cuda: 128 | model.cuda() 129 | 130 | optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum) 131 | 132 | for epoch in range(1, args.epochs + 1): 133 | train(model, epoch, train_loader, args, optimizer, writer) 134 | 135 | # store in a way we can easily load into triton without having to have the model class 136 | torch.jit.script(model).save('serving_model.pt') 137 | OutputModel().update_weights('serving_model.pt') 138 | test(model, test_loader, args, optimizer, writer) 139 | 140 | 141 | if __name__ == "__main__": 142 | main() 143 | -------------------------------------------------------------------------------- /examples/sklearn/preprocess.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | import numpy as np 4 | 5 | 6 | # Notice Preprocess class Must be named "Preprocess" 7 | class Preprocess(object): 8 | def __init__(self): 9 | # set internal state, this will be called only once. (i.e. 
not per request) 10 | pass 11 | 12 | def preprocess(self, body: dict, state: dict, collect_custom_statistics_fn=None) -> Any: 13 | # we expect to get two valid values on the dict: x0 and x1 14 | return [[body.get("x0", None), body.get("x1", None)], ] 15 | 16 | def postprocess(self, data: Any, state: dict, collect_custom_statistics_fn=None) -> dict: 17 | # post process the data returned from the model inference engine 18 | # data is the return value from model.predict; we wrap it in the returned dict as y 19 | return dict(y=data.tolist() if isinstance(data, np.ndarray) else data) 20 | -------------------------------------------------------------------------------- /examples/sklearn/readme.md: -------------------------------------------------------------------------------- 1 | # Train and Deploy Scikit-Learn model 2 | 3 | ## training mock logistic regression model 4 | 5 | Run the mock python training code 6 | ```bash 7 | pip install -r examples/sklearn/requirements.txt 8 | python examples/sklearn/train_model.py 9 | ``` 10 | 11 | The output will be a model created on the project "serving examples", by the name "train sklearn model" 12 | 13 | ## setting up the serving service 14 | 15 | 1. Create serving Service: `clearml-serving create --name "serving example"` (write down the service ID) 16 | 2. Create model endpoint: 17 | `clearml-serving --id model add --engine sklearn --endpoint "test_model_sklearn" --preprocess "examples/sklearn/preprocess.py" --name "train sklearn model - sklearn-model" --project "serving examples"` 18 | 19 | Or auto update 20 | 21 | `clearml-serving --id model auto-update --engine sklearn --endpoint "test_model_sklearn_auto" --preprocess "examples/sklearn/preprocess.py" --name "train sklearn model - sklearn-model" --project "serving examples" --max-versions 2` 22 | 23 | Or add Canary endpoint 24 | 25 | `clearml-serving --id model canary --endpoint "test_model_sklearn_auto" --weights 0.1 0.9 --input-endpoint-prefix test_model_sklearn_auto` 26 | 27 | 3. If you already have the `clearml-serving` docker-compose running, it might take a minute or two to sync with the new endpoint. 28 | 29 | Or you can run the clearml-serving container independently `docker run -v ~/clearml.conf:/root/clearml.conf -p 8080:8080 -e CLEARML_SERVING_TASK_ID= clearml-serving:latest` 30 | 31 | 4. Test the new endpoint (note that the first call will trigger the model pulling, so it might take longer; from here on, it's all in memory): `curl -X POST "http://127.0.0.1:8080/serve/test_model_sklearn" -H "accept: application/json" -H "Content-Type: application/json" -d '{"x0": 1, "x1": 2}'` 32 | 33 | 34 | > **_Notice:_** You can also change the serving service while it is already running! 35 | This includes adding/removing endpoints, adding canary model routing etc.
36 | by default new endpoints/models will be automatically updated after 1 minute 37 | -------------------------------------------------------------------------------- /examples/sklearn/requirements.txt: -------------------------------------------------------------------------------- 1 | clearml >= 1.1.6 2 | scikit-learn 3 | pillow>=10.0.1 # not directly required, pinned by Snyk to avoid a vulnerability 4 | -------------------------------------------------------------------------------- /examples/sklearn/train_model.py: -------------------------------------------------------------------------------- 1 | from sklearn.linear_model import LogisticRegression 2 | from sklearn.datasets import make_blobs 3 | from joblib import dump 4 | from clearml import Task 5 | 6 | task = Task.init(project_name="serving examples", task_name="train sklearn model", output_uri=True) 7 | 8 | # generate 2d classification dataset 9 | X, y = make_blobs(n_samples=100, centers=2, n_features=2, random_state=1) 10 | # fit final model 11 | model = LogisticRegression() 12 | model.fit(X, y) 13 | 14 | dump(model, filename="sklearn-model.pkl", compress=9) 15 | 16 | -------------------------------------------------------------------------------- /examples/xgboost/preprocess.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | import numpy as np 4 | import xgboost as xgb 5 | 6 | 7 | # Notice Preprocess class Must be named "Preprocess" 8 | class Preprocess(object): 9 | def __init__(self): 10 | # set internal state, this will be called only once. (i.e. not per request) 11 | pass 12 | 13 | def preprocess(self, body: dict, state: dict, collect_custom_statistics_fn=None) -> Any: 14 | # we expect to get four valid numbers on the dict: x0, x1, x2, x3 15 | return xgb.DMatrix( 16 | [[body.get("x0", None), body.get("x1", None), body.get("x2", None), body.get("x3", None)]]) 17 | 18 | def postprocess(self, data: Any, state: dict, collect_custom_statistics_fn=None) -> dict: 19 | # post process the data returned from the model inference engine 20 | # data is the return value from model.predict we will put is inside a return value as Y 21 | return dict(y=data.tolist() if isinstance(data, np.ndarray) else data) 22 | -------------------------------------------------------------------------------- /examples/xgboost/readme.md: -------------------------------------------------------------------------------- 1 | # Train and Deploy XGBoost model 2 | 3 | ## training iris classifier model 4 | 5 | Run the mock python training code 6 | ```bash 7 | pip install -r examples/xgboost/requirements.txt 8 | python examples/xgboost/train_model.py 9 | ``` 10 | 11 | The output will be a model created on the project "serving examples", by the name "train xgboost model" 12 | 13 | ## setting up the serving service 14 | 15 | 1. Create serving Service: `clearml-serving create --name "serving example"` (write down the service ID) 16 | 2. Create model endpoint: 17 | 18 | 3. 
`clearml-serving --id model add --engine xgboost --endpoint "test_model_xgb" --preprocess "examples/xgboost/preprocess.py" --name "train xgboost model - xgb_model" --project "serving examples"` 19 | 20 | Or auto update 21 | 22 | `clearml-serving --id model auto-update --engine xgboost --endpoint "test_model_xgb_auto" --preprocess "examples/xgboost/preprocess.py" --name "train xgboost model - xgb_model" --project "serving examples" --max-versions 2` 23 | 24 | Or add Canary endpoint 25 | 26 | `clearml-serving --id model canary --endpoint "test_model_xgb_auto" --weights 0.1 0.9 --input-endpoint-prefix test_model_xgb_auto` 27 | 28 | 3. If you already have the `clearml-serving` docker-compose running, it might take it a minute or two to sync with the new endpoint. 29 | 30 | Or you can run the clearml-serving container independently `docker run -v ~/clearml.conf:/root/clearml.conf -p 8080:8080 -e CLEARML_SERVING_TASK_ID= clearml-serving:latest` 31 | 32 | 4. Test new endpoint (do notice the first call will trigger the model pulling, so it might take longer, from here on, it's all in memory): `curl -X POST "http://127.0.0.1:8080/serve/test_model_xgb" -H "accept: application/json" -H "Content-Type: application/json" -d '{"x0": 1, "x1": 2, "x2": 3, "x3": 4}'` 33 | 34 | > **_Notice:_** You can also change the serving service while it is already running! 35 | This includes adding/removing endpoints, adding canary model routing etc. 36 | -------------------------------------------------------------------------------- /examples/xgboost/requirements.txt: -------------------------------------------------------------------------------- 1 | clearml >= 1.1.6 2 | xgboost 3 | 4 | pillow>=10.0.1 # not directly required, pinned by Snyk to avoid a vulnerability 5 | -------------------------------------------------------------------------------- /examples/xgboost/train_model.py: -------------------------------------------------------------------------------- 1 | import xgboost as xgb 2 | from sklearn.datasets import load_iris 3 | from sklearn.model_selection import train_test_split 4 | 5 | from clearml import Task 6 | 7 | task = Task.init(project_name="serving examples", task_name="train xgboost model", output_uri=True) 8 | 9 | X, y = load_iris(return_X_y=True) 10 | X_train, X_test, y_train, y_test = train_test_split( 11 | X, y, test_size=0.2, random_state=100 12 | ) 13 | 14 | dtrain = xgb.DMatrix(X_train, label=y_train) 15 | dtest = xgb.DMatrix(X_test, label=y_test) 16 | 17 | params = {"objective": "reg:squarederror", "eval_metric": "rmse"} 18 | 19 | 20 | bst = xgb.train( 21 | params, 22 | dtrain, 23 | num_boost_round=100, 24 | evals=[(dtrain, "train"), (dtest, "test")], 25 | verbose_eval=0, 26 | ) 27 | 28 | bst.save_model("xgb_model") 29 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | clearml >= 1.3.1 2 | pillow>=10.0.1 # not directly required, pinned by Snyk to avoid a vulnerability 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | """ 2 | `clearml-serving` - Model-Serving Orchestration and Repository Solution 3 | https://github.com/clearml/clearml-serving 4 | """ 5 | 6 | import os.path 7 | # Always prefer setuptools over distutils 8 | from setuptools import setup, find_packages 9 | 10 | 11 | def read_text(filepath): 12 | with open(filepath, "r", 
encoding="utf-8") as f: 13 | return f.read() 14 | 15 | 16 | here = os.path.dirname(__file__) 17 | # Get the long description from the README file 18 | long_description = read_text(os.path.join(here, 'README.md')) 19 | 20 | 21 | def read_version_string(version_file): 22 | for line in read_text(version_file).splitlines(): 23 | if line.startswith('__version__'): 24 | delim = '"' if '"' in line else "'" 25 | return line.split(delim)[1] 26 | else: 27 | raise RuntimeError("Unable to find version string.") 28 | 29 | 30 | version = read_version_string("clearml_serving/version.py") 31 | 32 | requirements = read_text(os.path.join(here, 'requirements.txt')).splitlines() 33 | 34 | setup( 35 | name='clearml-serving', 36 | version=version, 37 | description='clearml-serving - Model-Serving Orchestration and Repository Solution', 38 | long_description=long_description, 39 | long_description_content_type='text/markdown', 40 | # The project's main homepage. 41 | url='https://github.com/clearml/clearml-serving.git', 42 | author='ClearML', 43 | author_email='support@clear.ml', 44 | license='Apache License 2.0', 45 | classifiers=[ 46 | 'Development Status :: 5 - Production/Stable', 47 | 'Intended Audience :: Developers', 48 | 'Intended Audience :: Science/Research', 49 | 'Operating System :: POSIX :: Linux', 50 | 'Operating System :: MacOS :: MacOS X', 51 | 'Operating System :: Microsoft', 52 | 'Topic :: Scientific/Engineering :: Artificial Intelligence', 53 | 'Topic :: Software Development', 54 | 'Topic :: Software Development :: Version Control', 55 | 'Topic :: System :: Logging', 56 | 'Topic :: System :: Monitoring', 57 | 'Programming Language :: Python :: 3.6', 58 | 'Programming Language :: Python :: 3.7', 59 | 'Programming Language :: Python :: 3.8', 60 | 'Programming Language :: Python :: 3.9', 61 | 'Programming Language :: Python :: 3.10', 62 | 'Programming Language :: Python :: 3.11', 63 | 'License :: OSI Approved :: Apache Software License', 64 | ], 65 | keywords='clearml mlops devops trains development machine deep learning version control machine-learning ' 66 | 'machinelearning deeplearning deep-learning model-serving', 67 | packages=find_packages(exclude=['contrib', 'docs', 'data', 'examples', 'tests']), 68 | install_requires=requirements, 69 | # To provide executable scripts, use entry points in preference to the 70 | # "scripts" keyword. Entry points provide cross-platform support and allow 71 | # pip to create the appropriate form of executable for the target platform. 72 | entry_points={ 73 | 'console_scripts': [ 74 | 'clearml-serving = clearml_serving.__main__:main', 75 | ], 76 | }, 77 | ) 78 | --------------------------------------------------------------------------------
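Appended note (not part of the repository files above): the example readmes exercise the serving endpoints with `curl`. Below is a minimal Python sketch of the same calls, assuming the default serving address `http://127.0.0.1:8080` and the example endpoint names (`test_model_sklearn`, `test_model_pytorch`) used in the readmes; the `requests` dependency is an assumption and is not listed in the repository requirements.

```python
# Minimal client sketch (illustrative only, not part of the repository):
# mirrors the curl examples in the readmes above, assuming the default
# serving address 127.0.0.1:8080 and the example endpoint names.
import requests

BASE_URL = "http://127.0.0.1:8080/serve"

# JSON endpoint (e.g. the sklearn example): post the two feature values, read back "y"
response = requests.post(f"{BASE_URL}/test_model_sklearn", json={"x0": 1, "x1": 2})
response.raise_for_status()
print(response.json())  # e.g. {"y": [...]}

# Binary endpoint (e.g. the pytorch example): post a local JPEG as raw bytes
with open("examples/pytorch/5.jpg", "rb") as f:
    response = requests.post(
        f"{BASE_URL}/test_model_pytorch",
        data=f.read(),
        headers={"Content-Type": "image/jpeg"},
    )
response.raise_for_status()
print(response.json())  # e.g. {"digit": 5}
```

As with the `curl` examples, the first call to each endpoint triggers the model download, so it may take noticeably longer than subsequent requests.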