├── VERSION ├── tests ├── __init__.py ├── pipelines │ ├── operator_with_model.yaml │ ├── operator_with_services.yaml │ └── nullpipeline.yaml ├── test_main.py ├── test_topology_sort.py ├── test_container.py ├── test_clarac_utils.py ├── test_cli.py ├── test_utils.py ├── test_triton_utils.py └── test_pipeline_utils.py ├── .gitignore ├── requirements-dev.txt ├── requirements.txt ├── setup.cfg ├── src ├── __init__.py ├── constants.py ├── main.py ├── topology_sort.py ├── cli.py ├── clarac_utils.py ├── container.py ├── utils.py ├── triton_utils.py └── pipeline_utils.py ├── .github ├── ISSUE_TEMPLATE │ ├── feature_request.md │ └── bug_report.md └── workflows │ ├── build.yml │ ├── codeql-analysis.yml │ ├── release-staging.yml │ └── release.yml ├── ngc └── overview.md ├── setup.py ├── CONTRIBUTING.md ├── LICENSE └── README.md /VERSION: -------------------------------------------------------------------------------- 1 | 0.8.1 -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | venv/ 2 | .vscode/ 3 | *.csv 4 | demo/ 5 | .coverage 6 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | autopep8==1.5.7 2 | coverage==5.5 3 | flake8==3.9.2 4 | isort==5.9.2 5 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | psutil==5.8.0 2 | pytest==6.2.4 3 | PyYAML==5.4.1 4 | requests==2.25.1 5 | tabulate==0.8.9 6 | -------------------------------------------------------------------------------- /setup.cfg: 
-------------------------------------------------------------------------------- 1 | [coverage:run] 2 | source = . 3 | omit = venv/* 4 | 5 | [coverage:report] 6 | show_missing = True 7 | include = 8 | src/* 9 | 10 | 11 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 NVIDIA Corporation 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 
21 | -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: 'Build' 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | branches: [ main ] 8 | 9 | jobs: 10 | build: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v2 14 | - name: Set up Python 3.8 15 | uses: actions/setup-python@v2 16 | with: 17 | python-version: 3.8 18 | - name: Install dependencies 19 | run: | 20 | python -m pip install --upgrade pip wheel 21 | pip install -r requirements.txt 22 | - name: Build 23 | run: | 24 | python3 setup.py sdist bdist_wheel 25 | - name: Install 26 | run: | 27 | pip install ./dist/*.whl 28 | - name: Test 29 | run: python3 -m pytest tests 30 | -------------------------------------------------------------------------------- /.github/workflows/codeql-analysis.yml: -------------------------------------------------------------------------------- 1 | name: 'CodeQL' 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | branches: [ main ] 8 | schedule: 9 | - cron: '37 23 * * 0' 10 | 11 | jobs: 12 | analyze: 13 | name: Analyze 14 | runs-on: ubuntu-latest 15 | permissions: 16 | actions: read 17 | contents: read 18 | security-events: write 19 | 20 | strategy: 21 | fail-fast: false 22 | matrix: 23 | language: [ 'python' ] 24 | 25 | steps: 26 | - name: Checkout repository 27 | uses: actions/checkout@v2 28 | 29 | # Initializes the CodeQL tools for scanning. 
30 | - name: Initialize CodeQL 31 | uses: github/codeql-action/init@v1 32 | with: 33 | languages: ${{ matrix.language }} 34 | 35 | - name: Autobuild 36 | uses: github/codeql-action/autobuild@v1 37 | 38 | - name: Perform CodeQL Analysis 39 | uses: github/codeql-action/analyze@v1 40 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | ### Description 11 | A clear and concise description of what the bug is. 12 | 13 | ### Steps to reproduce 14 | Please share a clear and concise description of the problem. 15 | 16 | ... 17 | 18 | ### Expected behavior 19 | A clear and concise description of what you expected to happen. 20 | 21 | ### Actual behavior 22 | A clear and concise description of what actually happened. 23 | 24 | ### Configuration 25 | 26 | * CPOST version: (PIP package version or branch name if building from source) 27 | * Python version: 28 | * OS and version (distro if applicable): 29 | 30 | ### Regression? 31 | Did this work in the previous build or release of the playbook? If you can try a previous release or build to find out, that can help us narrow down the problem. If you don't know, that's OK. 32 | 33 | ### Other information 34 | (Please attach any relevant stdout / logs if available and remember to anonymize any PHI before sharing). 
CPOST is a tool that helps you run your pipeline locally and provides the CPU and memory usage of each operator run for the given input payload. Operators are run one at a time while their CPU and memory usage are sampled. The CPU and memory usage metrics are provided in a .csv format which allows further data analytics as needed.
5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | api-version: 0.4.0 15 | name: null-pipeline 16 | orchestrator: Clara 17 | pull-secrets: 18 | - ngc-clara 19 | operators: 20 | - name: null-reader 21 | variables: 22 | CLARA_TRACE: 2 23 | container: 24 | image: null-pipeline/operator-py 25 | tag: 0.8.1 26 | command: ["python", "register.py", "--agent", "renderserver"] 27 | input: 28 | - path: /input 29 | output: 30 | - path: /output 31 | models: 32 | # change the following line to match the name created for the model 33 | - name: segmentation_ct_spleen_v1 34 | - name: segmentation_ct_liver_v1 35 | -------------------------------------------------------------------------------- /tests/pipelines/operator_with_services.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2021 NVIDIA Corporation 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | api-version: 0.4.0 15 | name: null-pipeline 16 | orchestrator: Clara 17 | pull-secrets: 18 | - ngc-clara 19 | operators: 20 | - name: null-reader 21 | variables: 22 | CLARA_TRACE: 2 23 | container: 24 | image: null-pipeline/operator-py 25 | tag: 0.8.1 26 | input: 27 | - path: /input 28 | output: 29 | - path: /output 30 | services: 31 | - name: trtis 32 | # Triton Inference Server, required by this AI application. 33 | container: 34 | image: nvcr.io/nvidia/tritonserver 35 | tag: latest 36 | command: ["some", "command"] 37 | connections: 38 | http: 39 | - name: NVIDIA_CLARA_TRTISURI 40 | port: 8000 41 | -------------------------------------------------------------------------------- /src/constants.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 NVIDIA Corporation 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
import os
import sys
from pathlib import Path

# Bytes-to-megabytes conversion factor (decimal MB, not MiB).
B_MB_FACTOR = 1e6

# Root of the cgroup filesystem used to read container CPU/memory stats.
SYSFS_PATH = Path("/sys/fs/cgroup")

# True when running on a POSIX platform (cgroup sampling requires POSIX).
ON_POSIX = 'posix' in sys.builtin_module_names

# Nanoseconds per second, used when converting cgroup CPU counters.
NS_PER_S = 1e9
# Kernel scheduler clock ticks per second on this host.
CLOCK_TICKS_PER_S = os.sysconf(os.sysconf_names['SC_CLK_TCK'])
# Number of CPUs currently online on this host.
ONLINE_CPUS = os.sysconf(os.sysconf_names['SC_NPROCESSORS_ONLN'])

# How long to wait for a container ID to appear before giving up.
ID_WAITING_TIME_SECONDS = 15
METRIC_SAMPLING_PERIOD_SECONDS = 0.2  # i.e. 200 ms between metric samples


# Triton Inference Server image and the timeouts/ports used when cpost
# provisions its own Triton instance for model-serving operators.
TRITON_IMAGE_TAG = "nvcr.io/nvidia/tritonserver:20.07-v1-py3"
TRITON_READY_TIMEOUT_SECONDS = 30
TRITON_WAIT_TIME_SECONDS = 15
TRITON_WAIT_SLEEP_TIME_SECONDS = 1
# Environment variables injected into operator containers so they can find
# the Triton endpoints (current and legacy variable names).
TRITON_HTTP_ENV_VAR = "NVIDIA_TRITON_HTTPURI"
TRITON_HTTP_PORT = 8000
TRITON_GRPC_ENV_VAR = "NVIDIA_TRITON_GRPCURI"
TRITON_GRPC_PORT = 8001
LEGACY_TRTIS_HTTP_ENV_VAR = "NVIDIA_CLARA_TRTISURI"
LEGACY_TRITON_HTTP_ENV_VAR = "CLARA_TRITON_URI"
actions/upload-artifact@v2 37 | with: 38 | path: ./dist/nvidia_clara_cpost-*.whl 39 | 40 | - if: matrix.python-version == '3.8' 41 | name: Check artifacts 42 | run: | 43 | ls -al dist/ 44 | rm dist/nvidia-clara-cpost-*.tar.gz 45 | ls -al dist/ 46 | 47 | - if: matrix.python-version == '3.8' 48 | name: Publish to Test PyPI 49 | uses: pypa/gh-action-pypi-publish@master 50 | with: 51 | password: ${{ secrets.TEST_PYPI_TOKEN }} 52 | repository_url: https://test.pypi.org/legacy/ -------------------------------------------------------------------------------- /tests/pipelines/nullpipeline.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2021 NVIDIA Corporation 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | api-version: 0.4.0 15 | name: null-pipeline 16 | orchestrator: Clara 17 | pull-secrets: 18 | - ngc-clara 19 | # This pipeline is intended to emulate the traditional 3-stage pipeline used by our 20 | # reference pipelines: reader -> inference -> writer. 
21 | operators: 22 | - name: null-reader 23 | variables: 24 | CLARA_TRACE: 2 25 | container: 26 | image: null-pipeline/operator-py 27 | tag: 0.8.1 28 | input: 29 | - path: /input 30 | output: 31 | - path: /output 32 | - name: null-inference 33 | variables: 34 | CLARA_TRACE: 2 35 | container: 36 | image: null-pipeline/operator-py 37 | tag: 0.8.1 38 | input: 39 | - from: null-reader 40 | path: /input 41 | output: 42 | - path: /output 43 | requests: 44 | gpu: 1 # Request a GPU to better emulate GPU enabled inference workloads. 45 | - name: null-writer 46 | variables: 47 | CLARA_TRACE: 2 48 | container: 49 | image: null-pipeline/operator-py 50 | tag: 0.8.1 51 | input: 52 | - from: null-inference 53 | path: /input 54 | output: 55 | - path: /output 56 | -------------------------------------------------------------------------------- /tests/test_main.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 NVIDIA Corporation 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
# NOTE: @patch decorators are applied bottom-up, so the mock arguments below
# arrive in reverse order: mock_parse corresponds to the bottom decorator
# ("main.parse_args") and mock_run to the top one ("main.run_pipeline").
@patch("main.run_pipeline")
@patch("main.topo_sort_pipeline")
@patch("main.check_images_and_tags")
@patch("main.run_clarac")
@patch("main.assert_installed")
@patch("main.set_up_logging")
@patch("main.parse_args")
def test_main(mock_parse, mock_set_logging, mock_assert_install, mock_run_clarac, mock_check, mock_sort, mock_run):
    """Verify main() wires its collaborators together in the expected order."""

    # Simulate parsed CLI args and a compiled pipeline config.
    mock_parse.return_value = MagicMock(**{"verbose": 2, "pipeline_path": "some_path"})
    mock_run_clarac.return_value = MagicMock(**{"operators": "operators"})
    main()
    mock_set_logging.assert_called_with(2)
    # Once for "clarac" and once for "docker".
    assert mock_assert_install.call_count == 2
    mock_run_clarac.assert_called_with("some_path")
    mock_check.assert_called_with("operators")
    mock_sort.assert_called_with("operators")
    mock_run.assert_called_once()
name: Install 32 | run: | 33 | pip install ./dist/*.whl 34 | - name: Test 35 | run: python3 -m pytest tests 36 | 37 | - if: matrix.python-version == '3.8' && startsWith(github.ref, 'refs/tags/') 38 | name: Upload artifacts 39 | uses: actions/upload-artifact@v2 40 | with: 41 | path: ./dist/nvidia_clara_cpost-*.whl 42 | 43 | - if: matrix.python-version == '3.8' && startsWith(github.ref, 'refs/tags/') 44 | name: Check artifacts 45 | run: | 46 | ls -al dist/ 47 | rm ./dist/nvidia-clara-cpost-*.tar.gz 48 | ls -al dist/ 49 | 50 | - if: matrix.python-version == '3.8' && startsWith(github.ref, 'refs/tags/') 51 | name: Publish to Production PyPI 52 | uses: pypa/gh-action-pypi-publish@master 53 | with: 54 | user: __token__ 55 | password: ${{ secrets.PYPI_TOKEN }} 56 | 57 | -------------------------------------------------------------------------------- /src/main.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 NVIDIA Corporation 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
def main():
    """Command-line entry point for cpost.

    Parses CLI arguments, configures logging, verifies the external tool
    dependencies, compiles the pipeline definition, topologically sorts the
    operators, and runs them in order.
    """
    parsed_args = parse_args(sys.argv[1:])

    set_up_logging(parsed_args.verbose)

    # cpost shells out to both the Clara compiler and Docker; fail fast if
    # either is missing.
    assert_installed("clarac")
    assert_installed("docker")
    logging.info("All software dependencies are fulfilled.")

    # Compile the pipeline definition into a pipeline config object.
    pipeline_config = run_clarac(parsed_args.pipeline_path)

    # Ensure each operator's container image:tag is available before running.
    check_images_and_tags(pipeline_config.operators)

    # Operators must execute in an order that respects input dependencies.
    execution_order = topo_sort_pipeline(pipeline_config.operators)

    run_pipeline(execution_order, parsed_args.input_dir, parsed_args.metrics_dir,
                 parsed_args.models_dir, parsed_args.force)
import os

import setuptools

# Long description for PyPI is taken verbatim from the README.
with open("README.md", "r") as fh:
    long_description = fh.read()

# Install required packages from requirements.txt file.
# Default to an empty list so setup() does not crash with a NameError when
# requirements.txt is absent (e.g. when building from a stripped sdist).
install_requires = []
requirements_relative_path = "/requirements.txt"
package_folder = os.path.dirname(os.path.realpath(__file__))
requirements_path = package_folder + requirements_relative_path
if os.path.isfile(requirements_path):
    with open(requirements_path) as f:
        install_requires = f.read().splitlines()

# Extract version number from VERSION file; fall back to a placeholder.
release_version = "0.0.0"
if os.path.exists('VERSION'):
    with open('VERSION') as version_file:
        release_version = version_file.read().strip()

setuptools.setup(
    name="nvidia-clara-cpost",
    author="NVIDIA Clara Deploy",
    version=release_version,
    description="Python package to run Clara Pipeline Operator Sizing Tool (cpost)",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://gitlab-master.nvidia.com/Clara/sdk/-/tree/main/Tools/cpost",
    install_requires=install_requires,
    packages=setuptools.find_packages('.'),
    entry_points={
        'console_scripts': [
            'cpost = src.main:main'
        ]
    },
    classifiers=[
        "Programming Language :: Python :: 3",
        "Operating System :: OS Independent",
    ],
    python_requires='>=3.8',
)
The sign-off is a simple line at the end of the explanation for the patch. Your 9 | signature certifies that you wrote the patch or otherwise have the right to pass 10 | it on as an open-source patch. The rules are pretty simple: if you can certify 11 | the below (from [developercertificate.org](http://developercertificate.org/)): 12 | 13 | ``` 14 | Developer Certificate of Origin 15 | Version 1.1 16 | 17 | Copyright (C) 2004, 2006 The Linux Foundation and its contributors. 18 | 19 | Everyone is permitted to copy and distribute verbatim copies of this 20 | license document, but changing it is not allowed. 21 | 22 | 23 | Developer's Certificate of Origin 1.1 24 | 25 | By making a contribution to this project, I certify that: 26 | 27 | (a) The contribution was created in whole or in part by me and I 28 | have the right to submit it under the open source license 29 | indicated in the file; or 30 | 31 | (b) The contribution is based upon previous work that, to the best 32 | of my knowledge, is covered under an appropriate open source 33 | license and I have the right under that license to submit that 34 | work with modifications, whether created in whole or in part 35 | by me, under the same open source license (unless I am 36 | permitted to submit under a different license), as indicated 37 | in the file; or 38 | 39 | (c) The contribution was provided directly to me by some other 40 | person who certified (a), (b) or (c) and I have not modified 41 | it. 42 | 43 | (d) I understand and agree that this project and the contribution 44 | are public and that a record of the contribution (including all 45 | personal information I submit with it, including my sign-off) is 46 | maintained indefinitely and may be redistributed consistent with 47 | this project or the open source license(s) involved. 
Then you just add a line to every git commit message:

    Signed-off-by: Joe Smith <joe.smith@example.com>
import os
import sys

import pytest

sys.path.append("{}/{}".format(os.path.dirname(os.path.realpath(__file__)), "../src"))
from clarac_utils import OperatorConfig  # nopep8 # noqa: E402
from topology_sort import PipelineDAG, topo_sort_pipeline  # nopep8 # noqa: E402


def test_topo_sort():
    """Sort a DAG with multiple roots and verify the exact Kahn order."""
    g = PipelineDAG()
    g.add_input_edge(2, 5)
    g.add_input_edge(0, 5)
    g.add_input_edge(0, 4)
    g.add_input_edge(1, 4)
    g.add_input_edge(3, 2)
    g.add_input_edge(1, 3)
    assert g.topological_sort() == [5, 4, 2, 0, 3, 1]


def test_topo_sort_2():
    """Sort a simple chain 4 -> 3 -> 2 -> 1."""
    g = PipelineDAG()
    g.add_input_edge(2, 1)
    g.add_input_edge(3, 2)
    g.add_input_edge(4, 3)
    assert g.topological_sort() == [1, 2, 3, 4]


def test_topo_sort_error():
    """A cyclic graph must raise RuntimeError."""
    g = PipelineDAG()
    g.add_input_edge(2, 1)
    g.add_input_edge(3, 2)
    g.add_input_edge(1, 3)
    with pytest.raises(RuntimeError):
        g.topological_sort()


def test_a_pipeline():
    """Three chained operators are returned in dependency order."""
    op1 = OperatorConfig("Input1", "tag", None, None, [{"path": "/input"}], None)
    op2 = OperatorConfig("Input2", "tag", None, None, [{"from": "Input1", "path": "/input"}], None)
    op3 = OperatorConfig("Input3", "tag", None, None, [{"from": "Input2", "path": "/input"}], None)

    sequence = topo_sort_pipeline([op2, op3, op1])
    assert sequence == [op1, op2, op3]


def test_a_single_operator_pipeline():
    """A single-operator pipeline is returned unchanged."""
    op1 = OperatorConfig("Input1", "tag", None, None, [{"path": "/input"}], None)

    sequence = topo_sort_pipeline([op1])
    assert sequence == [op1]


def test_two_operator_pipeline():
    """Two operators given out of order are sorted by their dependency."""
    op1 = OperatorConfig("Input1", "tag", None, None, [{"path": "/input"}], None)
    op2 = OperatorConfig("Input2", "tag", None, None, [{"from": "Input1", "path": "/input"}], None)

    sequence = topo_sort_pipeline([op2, op1])
    assert sequence == [op1, op2]
import logging
from collections import defaultdict, deque


class PipelineDAG:
    """Directed acyclic graph of pipeline operators used for sorting."""

    def __init__(self):
        # node -> number of unsatisfied inputs (in-degree)
        self.input_deg_graph = defaultdict(lambda: 0)
        self.output_graph = defaultdict(list)  # dictionary containing adjacency List

    def add_node(self, node):
        """Register a node that may have no edges at all.

        Ensures isolated nodes (operators without dependencies or dependents)
        still participate in the topological sort.

        Args:
            node: Node to be added

        Returns:
            None
        """
        self.input_deg_graph[node] += 0
        self.output_graph[node]  # defaultdict access materializes the key

    def add_input_edge(self, node, input_node):
        """Add the node by giving its input node.

        Args:
            node: Node to be added
            input_node: One of its dependency nodes

        Returns:
            None
        """
        self.output_graph[input_node].append(node)
        # Update the in-degree bookkeeping for both endpoints as we add edges
        self.input_deg_graph[input_node] += 0
        self.input_deg_graph[node] += 1

    def topological_sort(self):
        """Topologically sort the given graph based on Kahn's algorithm.

        Returns:
            A list that is the topological order of the current graph

        Raises:
            RuntimeError: if the graph contains a cycle
        """
        visited_count = 0
        topo_order = []
        # Seed the queue with all zero-in-degree nodes; deque gives O(1)
        # pops from the left, unlike list.pop(0) which is O(n).
        zero_indegree = deque(node for node, deg in self.input_deg_graph.items() if deg == 0)

        # Repeatedly take a zero-in-degree node and release its dependents
        while zero_indegree:
            cur_node = zero_indegree.popleft()
            topo_order.append(cur_node)

            # Each outgoing edge satisfied lowers the child's in-degree by 1
            for child in self.output_graph[cur_node]:
                self.input_deg_graph[child] -= 1
                if self.input_deg_graph[child] == 0:
                    zero_indegree.append(child)

            visited_count += 1

        # input_deg_graph holds every known node; if any were never visited,
        # they are part of a cycle.
        if visited_count != len(self.input_deg_graph):
            raise RuntimeError("There exists a cycle in the given graph")

        return topo_order


def topo_sort_pipeline(operators):
    """Topologically sort the given operators.

    Args:
        operators: List of OperatorConfig objects

    Returns:
        A list of OperatorConfig objects in topological order
    """
    logging.debug(f"Topologically order the given input: {operators}")
    if len(operators) == 1:
        result = operators.copy()
    else:
        # Construct a dictionary from operators so that we can convert names back to OperatorConfigs later
        op_dict = {op.name: op for op in operators}
        dag = PipelineDAG()
        for op in operators:
            # Register every operator explicitly so operators with no "from"
            # dependencies (isolated nodes) are not dropped from the result.
            dag.add_node(op.name)
            for input_path in op.inputs:
                if input_path.get("from"):
                    dag.add_input_edge(op.name, input_path.get("from"))
        sequence = dag.topological_sort()
        result = [op_dict[op_name] for op_name in sequence]
    logging.debug(f"Topologically order result is: {result}")
    return result


import argparse
import sys
from enum import IntEnum
from pathlib import Path


class ContinueOptions(IntEnum):
    """What to do when an operator fails: ask the user, keep going, or stop."""
    NONE = 0  # prompt user y/n
    CONT = 1  # continue execution
    STOP = 2  # stop execution

    # Helpers so the enum renders nicely in argparse help and error messages.

    def __str__(self):
        return self.name.lower()

    def __repr__(self):
        return str(self)

    @staticmethod
    def argparse(s):
        try:
            return ContinueOptions[s.upper()]
        except KeyError:
            # Hand the raw string back so argparse's `choices` check rejects it.
            return s


class MyParser(argparse.ArgumentParser):
    """ArgumentParser variant that prints the full help text on errors."""

    def error(self, message):
        """Write the error plus help to stderr, then exit with status 2."""
        sys.stderr.write(f'error: {message}\n')
        self.print_help(sys.stderr)
        self.exit(2)


def valid_file(path):
    """Helper method for parse_args to convert to Path and verify if the file path exists.

    Args:
        path: path to file from parse_args()

    Returns:
        The absolute path of the given file path if it exists

    Raises:
        argparse.ArgumentTypeError if the file given does not exist
    """
    candidate = Path(path)
    if not (candidate.exists() and candidate.is_file()):
        raise argparse.ArgumentTypeError(f"No such file or the given path is not a file: '{candidate}'")
    return candidate.absolute()


def valid_dir(path):
    """Helper method for parse_args to convert to Path and verify if the directory exists.

    Args:
        path: path to directory from parse_args()

    Returns:
        The absolute path of the given directory if it exists

    Raises:
        argparse.ArgumentTypeError if the directory given does not exist or if not a directory
    """
    candidate = Path(path)
    if not (candidate.exists() and candidate.is_dir()):
        raise argparse.ArgumentTypeError(f"No such directory or the given path is not a directory: '{candidate}'")
    return candidate.absolute()


def parse_args(args):
    """Create an argument parser and parse the command-line arguments.

    Args:
        args: A list of arguments to parse

    Returns:
        A parser object containing parsed arguments
    """

    cli_parser = MyParser(prog="cpost", description="Clara Pipeline Sizing Tool CLI")

    # Required positionals: pipeline definition and payload directory.
    cli_parser.add_argument("pipeline_path", metavar="<pipeline_path>",
                            type=valid_file, help="pipeline definition file path")

    cli_parser.add_argument("input_dir", metavar="<input_dir>", type=valid_dir, help="input payload directory")

    cli_parser.add_argument("--metrics_dir", type=valid_dir,
                            help="metrics output directory, if not specified, write to stdout")

    cli_parser.add_argument("--models_dir", type=valid_dir,
                            help="directory for Triton models, required if pipeline uses Triton")

    cli_parser.add_argument(
        "-v", "--verbose", action='store_true',
        help="verbose output (DEBUG level). If not specified, default output is INFO level.")

    cli_parser.add_argument(
        "--force", default=ContinueOptions.NONE, const=ContinueOptions.NONE, nargs='?', type=ContinueOptions.argparse,
        choices=list(ContinueOptions),
        help='force continue or stop when operator failure occurs. \
            (default: %(default)s, which will prompt the user for each failure).')

    return cli_parser.parse_args(args)
def run_clarac(source_file: str) -> PipelineConfig:
    """Run Clara Compiler in a subprocess using the given pipeline definition and parse the results.

    Args:
        source_file: path to the pipeline definition file

    Returns:
        A PipelineConfig object

    Raises:
        SystemExit: If the compiler exits non-zero or produces invalid YAML
    """
    def _extract_services(services):
        """Extract services section in pipeline definition into list of ServiceConfig."""
        result = []
        for service in services:
            container = service["container"]
            command = container.get("command")
            if command:
                # cpost mounts the service data at the container root, so the
                # placeholder prefix is stripped from each command token.
                command = [c.replace("$(NVIDIA_CLARA_SERVICE_DATA_PATH)", "") for c in command]
            result.append(ServiceConfig(
                name=service["name"],
                image_n_tag=container["image"] + ":" + container["tag"],
                command=command,
                http_connections={con["name"]: con["port"] for con in service["connections"].get("http")}))
        return result

    logging.debug("Running Clara Compiler to validate the pipeline definition ...")
    with NamedTemporaryFile() as result_file:
        cmd = ["clarac", "-p", source_file, "-o", result_file.name, "--resolve-imports"]
        # FIX: capture stdout/stderr. Without capture_output=True the
        # CompletedProcess's stdout/stderr attributes are always None, so the
        # log statements below logged nothing useful.
        proc = subproc_run(cmd, capture_output=True)
        if proc.returncode != 0:
            logging.error(proc.stderr)
            sys.exit(proc.returncode)
        logging.debug(f"stdout from Clara Compiler: {proc.stdout}")
        logging.debug(f"Clara Compiler returned with error code {proc.returncode}, loading result as python object")

        try:
            # The compiler wrote to result_file.name; this handle is still at
            # position 0, so yaml reads the whole document.
            config = yaml.load(result_file, yaml.FullLoader)
        except yaml.YAMLError as exc:
            logging.error(f"Error in configuration file from Clara Compiler: {exc}")
            sys.exit(2)
        logging.debug(f"The content loaded from Clara Compiler is: {config}")

        operators = []
        # Get the objects of interest, construct a list, and return it
        for op in config["operators"]:
            # Names of Triton models and attached services used by this operator.
            models = op.get("models")
            op_models = [model_dict["name"] for model_dict in models] if models else None
            services = op.get("services")
            op_services = _extract_services(services) if services else None

            container = op["container"]
            operator = OperatorConfig(
                name=op["name"],
                image_n_tag=container["image"] + ":" + container["tag"],
                command=container.get("command"),
                variables=op.get("variables"),
                inputs=op["input"],
                outputs=op.get("output"),
                models=op_models,
                services=op_services)
            operators.append(operator)

        return PipelineConfig(name=config["name"], operators=operators)


import os
import sys
import tempfile
from pathlib import Path
from unittest.mock import patch

import pytest

sys.path.append("{}/{}".format(os.path.dirname(os.path.realpath(__file__)), "../src"))
from container import Container, Metrics, RawMetrics  # nopep8 # noqa: E402


def is_empty(any_structure):
    """Return True when the given structure is empty (falsy)."""
    return not any_structure


TEMP_DIR = Path(tempfile.gettempdir())
TEST_SYS_FS = TEMP_DIR / "test_sys_fs"


@patch("container.SYSFS_PATH", TEST_SYS_FS)
class TestContainer:
    """Unit tests for the Container metrics-sampling class."""

    def test_init_container(self):
        # A fresh Container starts with no id, no samples, and no paths.
        fresh = Container()
        assert isinstance(fresh, Container)
        assert is_empty(fresh.id)
        assert is_empty(fresh.raw_metrics)
        assert is_empty(fresh.metric_paths)

    def test_create_metrics_path_no_id(self):
        # Building cgroup paths without an id must fail loudly.
        with pytest.raises(RuntimeError):
            Container().construct_metrics_path()

    def test_create_metrics_path_with_id(self):
        tracked = Container()
        tracked.id = "testID1"
        tracked.construct_metrics_path()

        cpu_dir = TEST_SYS_FS / "cpuacct" / "docker" / tracked.id
        mem_dir = TEST_SYS_FS / "memory" / "docker" / tracked.id
        assert tracked.metric_paths == (
            cpu_dir / "cpuacct.usage",
            cpu_dir / "cpuacct.usage_percpu",
            mem_dir / "memory.usage_in_bytes",
        )

    def test_metrics_path_exists(self, tmp_path):
        tracked = Container()
        paths = (tmp_path / "p1", tmp_path / "p2", tmp_path / "p3")
        tracked.metric_paths = paths

        # The check only passes once all three files exist.
        for path in paths:
            assert not tracked.metrics_path_exists()
            path.touch()
        assert tracked.metrics_path_exists()

    @patch("container.psutil.cpu_times")
    def test_read_raw_metrics(self, mock_cpu, tmp_path):
        fake_cpu_times = [10, 20, 10, 20, 10, 20, 10, 20]
        mock_cpu.return_value = fake_cpu_times
        tracked = Container()
        cpu_file, per_cpu_file, mem_file = (tmp_path / name for name in ("p1", "p2", "p3"))
        cpu_file.write_bytes(b'123')
        per_cpu_file.write_bytes(b'456')
        mem_file.write_bytes(b'789')
        tracked.metric_paths = (cpu_file, per_cpu_file, mem_file)

        sample = tracked._read_raw_metrics()
        assert isinstance(sample, RawMetrics)
        assert isinstance(sample.timestamp, float)
        assert sample.cpu == float(b'123')
        assert sample.per_cpu == b'456'
        # Only the first seven psutil cpu_times fields count toward system time.
        assert sample.sys_cpu == sum(fake_cpu_times[:7])
        assert sample.memory == float(b'789')

    def test_sample_metrics_no_path(self):
        # Sampling before the metric paths are built must fail loudly.
        with pytest.raises(RuntimeError):
            Container().sample_metrics()

    @patch("container.Container._read_raw_metrics")
    @patch("container.Container._process_raw_data")
    def test_sample_metrics(self, mock_process_data, mock_read_metrics):
        tracked = Container()
        tracked.metric_paths = (1, 2, 3)
        mock_read_metrics.side_effect = [1, 2, 3]
        # Pretend processing simply sums the two adjacent raw samples.
        mock_process_data.side_effect = lambda prev, cur: prev + cur

        # Each call appends one raw sample; processed metrics start at the
        # second call since they need a pair of raw samples.
        expected = [([1], []), ([1, 2], [3]), ([1, 2, 3], [3, 5])]
        for raw_expected, processed_expected in expected:
            tracked.sample_metrics()
            assert tracked.raw_metrics == raw_expected
            assert tracked.metrics == processed_expected

    @patch("container.ONLINE_CPUS", 4)
    def test_process_raw_data(self):
        tracked = Container()
        earlier = RawMetrics(
            timestamp=2.0, cpu=800000.0, per_cpu=b'300000 0 0 500000 \n',
            sys_cpu=14000000.00, memory=6500000)
        later = RawMetrics(
            timestamp=3.0, cpu=1000000.0, per_cpu=b'500000 0 0 500000 \n',
            sys_cpu=14000000.60, memory=8500000)

        result = tracked._process_raw_data(earlier, later)

        cpu_delta = (later.cpu - earlier.cpu) / 1e9
        sys_delta = later.sys_cpu - earlier.sys_cpu
        assert result == Metrics(
            timestamp=2.5,
            cpu_percent=(cpu_delta / sys_delta) * 4 * 100,
            memory=7.50,
        )


import os
import sys
from pathlib import Path
from unittest.mock import MagicMock, patch

import pytest

sys.path.append("{}/{}".format(os.path.dirname(os.path.realpath(__file__)), "../src"))
from clarac_utils import OperatorConfig, PipelineConfig, ServiceConfig, run_clarac  # nopep8 # noqa: E402

# Location of the pipeline definition fixtures and the image tag they share.
PIPELINES_DIR = Path(__file__).parent / "pipelines"
OPERATOR_IMAGE_N_TAG = "null-pipeline/operator-py:0.8.1"


@pytest.mark.parametrize("og_variables, exp_variables",
                         [(None, {"a": 1, "b": 2}),
                          ({"c": 3}, {"c": 3, "a": 1, "b": 2})])
def test_op_config_update_variables(og_variables, exp_variables):
    """Merging variables keeps existing keys and adds the missing ones."""
    op = OperatorConfig("op1", "image_tag", None, og_variables, None, None)
    op.update_variables({"a": 1, "b": 2})
    assert op.variables == exp_variables


@patch("clarac_utils.subproc_run")
def test_run_clarac_subproc_error(mock_subproc_run, tmp_path):
    """A non-zero compiler exit code must abort with SystemExit."""
    mock_subproc_run.return_value = MagicMock(returncode=1, stderr="some error")
    with pytest.raises(SystemExit):
        run_clarac(tmp_path)


@patch("clarac_utils.NamedTemporaryFile")
@patch("clarac_utils.subproc_run")
def test_run_clarac_yaml_error(mock_subproc_run, mock_temp_file, tmp_path):
    """Malformed YAML from the compiler must abort with SystemExit."""
    mock_subproc_run.return_value = MagicMock(returncode=0, stdout="some output")
    bad_yaml = tmp_path / "bad.yaml"
    bad_yaml.touch()
    bad_yaml.write_text("api-version: '0.4.0'\n name: null-pipeline")
    with open(bad_yaml) as bad_yaml_obj:
        mock_temp_file.return_value.__enter__.return_value = bad_yaml_obj

        with pytest.raises(SystemExit):
            run_clarac(tmp_path)


def _assert_null_operator(op, name, input_from=None):
    """Shared assertions for the operators of the null-pipeline fixture."""
    expected_input = {"name": None, "path": "/input"}
    if input_from is not None:
        expected_input["from"] = input_from
    assert op.name == name
    assert op.image_n_tag == OPERATOR_IMAGE_N_TAG
    assert op.command is None
    assert op.variables == {"CLARA_TRACE": 2}
    assert op.inputs == [expected_input]
    assert op.outputs == [{"name": None, "path": "/output"}]
    assert op.models is None
    assert op.services is None


@pytest.mark.skip("Skipping due to pipeline setup for clarac is incomplete")
def test_run_clarac():
    config = run_clarac(PIPELINES_DIR / "nullpipeline.yaml")
    assert isinstance(config, PipelineConfig)
    assert config.name == "null-pipeline"
    assert len(config.operators) == 3
    _assert_null_operator(config.operators[0], "null-reader")
    _assert_null_operator(config.operators[1], "null-inference", input_from="null-reader")
    _assert_null_operator(config.operators[2], "null-writer", input_from="null-inference")


@pytest.mark.skip("Skipping due to pipeline setup for clarac is incomplete")
def test_run_clarac_with_triton_models():
    config = run_clarac(PIPELINES_DIR / "operator_with_model.yaml")
    assert isinstance(config, PipelineConfig)
    assert config.name == "null-pipeline"
    assert len(config.operators) == 1
    op = config.operators[0]
    assert op.name == "null-reader"
    assert op.image_n_tag == OPERATOR_IMAGE_N_TAG
    assert op.inputs == [{"name": None, "path": "/input"}]
    assert op.outputs == [{"name": None, "path": "/output"}]
    assert op.command == ["python", "register.py", "--agent", "renderserver"]
    assert op.models == ["segmentation_ct_spleen_v1", "segmentation_ct_liver_v1"]
    assert op.services is None


@pytest.mark.skip("Skipping due to pipeline setup for clarac is incomplete")
def test_run_clarac_with_pipeline_services():
    config = run_clarac(PIPELINES_DIR / "operator_with_services.yaml")
    assert isinstance(config, PipelineConfig)
    assert config.name == "null-pipeline"
    assert len(config.operators) == 1
    op = config.operators[0]
    assert op.name == "null-reader"
    assert op.image_n_tag == OPERATOR_IMAGE_N_TAG
    assert op.inputs == [{"name": None, "path": "/input"}]
    assert op.outputs == [{"name": None, "path": "/output"}]
    assert op.command is None
    assert op.models is None
    assert len(op.services) == 1
    service = op.services[0]
    assert isinstance(service, ServiceConfig)
    assert service.name == "trtis"
    assert service.image_n_tag == "nvcr.io/nvidia/tritonserver:latest"
    assert service.command == ["some", "command"]
    assert service.http_connections == {"NVIDIA_CLARA_TRTISURI": 8000}
# See the License for the specific language governing permissions and
# limitations under the License.


import os
import re
import sys
from argparse import ArgumentTypeError

import pytest

sys.path.append("{}/{}".format(os.path.dirname(os.path.realpath(__file__)), "../src"))
from cli import ContinueOptions, parse_args  # nopep8 # noqa: E402


@pytest.fixture(scope="function")
def file_maker(tmp_path):
    """A function scoped pytest fixture to return the path of a temporary file."""
    pipeline_file = tmp_path / "pipeline_defn"
    pipeline_file.touch()
    return str(pipeline_file)


def swap_pattern(pattern, substitute, args):
    """Substitute every occurrence of pattern in each argument string."""
    return [re.sub(pattern, substitute, arg) for arg in args]


def test_swap_pattern():
    original = ["%tmp_file%", "some_input_dir", "%tmp%", "hello", "%tmp%"]
    assert swap_pattern("%tmp%", "abc", original) == [
        "%tmp_file%", "some_input_dir", "abc", "hello", "abc"]


@pytest.mark.parametrize("input_args", [["%tmp_file%"], [], ["-x"], ["-v"]])
def test_missing_required_args(input_args, file_maker, capsys):
    input_args = swap_pattern(r'%tmp_file%', file_maker, input_args)

    with pytest.raises(SystemExit) as pytest_wrapped_e:
        parse_args(input_args)
    out, err = capsys.readouterr()

    # The custom parser prints nothing to stdout, and help plus the error to stderr.
    assert out == ""
    assert "error: the following arguments are required" in err
    assert "usage: cpost" in err
    assert pytest_wrapped_e.value.code == 2


@pytest.mark.parametrize("input_args, error",
                         [(["some_pipeline_path", "some_input_dir"], ArgumentTypeError),
                          (["/tmp", "/tmp"], ArgumentTypeError),
                          (["%tmp_file%", "some_input_dir"], ArgumentTypeError),
                          (["%tmp_file%", "%tmp_file%"], ArgumentTypeError),
                          (["%tmp_file%", "/tmp", "--metrics_dir", "some_dir"], ArgumentTypeError),
                          (["%tmp_file%", "/tmp", "--metrics_dir", "%tmp_file%"], ArgumentTypeError),
                          (["%tmp_file%", "/tmp", "--models_dir", "some_dir"], ArgumentTypeError),
                          (["%tmp_file%", "/tmp", "--models_dir", "%tmp_file%"], ArgumentTypeError)])
def test_invalid_path(input_args, error, file_maker):
    input_args = swap_pattern(r'%tmp_file%', file_maker, input_args)
    with pytest.raises(SystemExit) as pytest_wrapped_e:
        with pytest.raises(error) as excinfo:
            parse_args(input_args)
        assert "No such" in str(excinfo.value)
    assert pytest_wrapped_e.value.code == 2


@pytest.mark.parametrize("optional_dir_specified", [True, False])
def test_valid_path(optional_dir_specified, tmp_path, file_maker):
    input_dir = tmp_path / test_valid_path.__name__
    input_dir.mkdir()
    pipeline = file_maker

    if optional_dir_specified:
        metrics_dir = tmp_path / "test_output_metrics"
        metrics_dir.mkdir()
        models_dir = tmp_path / "model_repo"
        models_dir.mkdir()
        parsed = parse_args([pipeline, str(input_dir),
                             "--metrics_dir", str(metrics_dir),
                             "--models_dir", str(models_dir)])
        assert parsed.metrics_dir == metrics_dir
        assert parsed.models_dir == models_dir
    else:
        parsed = parse_args([pipeline, str(input_dir)])
        assert parsed.metrics_dir is None
        assert parsed.models_dir is None

    # Common expectations for both variants.
    assert parsed.input_dir == input_dir
    assert str(parsed.pipeline_path) == pipeline
    assert parsed.force == ContinueOptions.NONE


@pytest.mark.parametrize("force_args, exp_option",
                         [(["--force", "cont"], ContinueOptions.CONT),
                          (["--force=cont"], ContinueOptions.CONT),
                          ([], ContinueOptions.NONE),
                          (["--force", "none"], ContinueOptions.NONE),
                          (["--force", "stop"], ContinueOptions.STOP)])
def test_parse_force_options(force_args, exp_option, tmp_path, file_maker):
    input_dir = tmp_path / test_parse_force_options.__name__
    input_dir.mkdir()
    pipeline = file_maker

    parsed = parse_args(force_args + [pipeline, str(input_dir)])

    assert parsed.input_dir == input_dir
    assert str(parsed.pipeline_path) == pipeline
    assert parsed.metrics_dir is None
    assert parsed.models_dir is None
    assert parsed.force == exp_option


@pytest.mark.parametrize("force_args, err_msg",
                         [(["--force", "continue"], "argument --force: invalid choice: 'continue'"),
                          (["--force"], "argument --force: invalid choice:"),
                          (["--force", "aaaa"], "argument --force: invalid choice: 'aaaa'")])
def test_parse_force_options_error(force_args, err_msg, tmp_path, capsys, file_maker):
    input_dir = tmp_path / test_parse_force_options_error.__name__
    input_dir.mkdir()
    pipeline = file_maker

    with pytest.raises(SystemExit):
        parse_args(force_args + [pipeline, str(input_dir)])

    out, err = capsys.readouterr()
    assert err_msg in err
# See the License for the specific language governing permissions and
# limitations under the License.


import io
import os
import sys
from dataclasses import dataclass
from unittest.mock import MagicMock, patch

import pytest

sys.path.append("{}/{}".format(os.path.dirname(os.path.realpath(__file__)), "../src"))
from clarac_utils import OperatorConfig  # nopep8 # noqa: E402
from utils import (assert_installed, check_images_and_tags, convert_percent_to_cores,  # nopep8 # noqa: E402
                   prompt_yes_or_no, round_up_to_multiple, subproc_run_wrapper, write_to_csv)


@pytest.mark.parametrize("data_in, base, data_out",
                         [(3, 2, 4),
                          (4, 2, 4),
                          (15.5, 5, 20),
                          (148.05, 256, 256),
                          (256.05, 256, 512)])
def test_round_up_to_multiple(data_in, base, data_out):
    """Values round up to the next multiple of base; exact multiples stay put."""
    assert round_up_to_multiple(data_in, base) == data_out


@pytest.mark.parametrize("data_in, data_out",
                         [(100.05, 2),
                          (1343.5, 14),
                          (50.55, 1)])
def test_convert_percent_to_cores(data_in, data_out):
    assert convert_percent_to_cores(data_in) == data_out


@pytest.mark.parametrize("program, exist", [("echo", True), ("clara", True), ("claraabc", False)])
def test_assert_installed(program, exist):
    if program == "clara":
        pytest.skip()
    if exist:
        assert assert_installed(program) is None
    else:
        with pytest.raises(SystemExit) as exc:
            assert_installed(program)
        assert exc.value.code == 1


@pytest.mark.parametrize("mocked_return, run_called_count", [
    pytest.param(MagicMock(stdout=b'tag1\n'), 2, id="exists_locally"),
    pytest.param(MagicMock(stdout=b'', returncode=0), 4, id="can_be_pulled"),
    pytest.param(MagicMock(stdout=b'', returncode=1, stderr=b'error message'), 2, id="pull_failed"),
])
@patch("utils.subproc_run")
def test_check_images_and_tags(mock_subproc_run, mocked_return, run_called_count):
    mock_subproc_run.return_value = mocked_return
    mock_service = [MagicMock(image_n_tag="tag1")]
    op1 = OperatorConfig("Input1", "tag1", None, None, [{"path": "/input"}], None, None, mock_service)
    if mocked_return.returncode == 1:
        with pytest.raises(SystemExit):
            check_images_and_tags([op1])
    else:
        check_images_and_tags([op1])
    assert mock_subproc_run.call_count == run_called_count


@patch("utils.TRITON_IMAGE_TAG", "triton-tag")
@pytest.mark.parametrize("mocked_return, expect_exit, run_called_count",
                         [pytest.param([MagicMock(stdout=b'triton-tag\n')],
                                       False, 2, id="exists_locally"),
                          pytest.param([MagicMock(stdout=b''),
                                        MagicMock(stdout=b'', returncode=0)],
                                       False, 3, id="can_be_pulled"),
                          pytest.param([MagicMock(stdout=b''),
                                        MagicMock(stdout=b'', returncode=1, stderr=b'error message')],
                                       True, 3, id="pull_failed")])
@patch("utils.subproc_run")
def test_check_images_and_tags_with_triton(mock_subproc_run, mocked_return, expect_exit, run_called_count):
    # The first call resolves the operator image; the rest resolve Triton's.
    mock_subproc_run.side_effect = [MagicMock(stdout=b'tag1\n')] + mocked_return
    op1 = OperatorConfig("Input1", "tag1", None, None, [{"path": "/input"}], None, ["model1"])
    if expect_exit:
        with pytest.raises(SystemExit):
            check_images_and_tags([op1])
    else:
        check_images_and_tags([op1])
    assert mock_subproc_run.call_count == run_called_count


@pytest.mark.parametrize("mocked_return", [
    pytest.param(MagicMock(stdout=b'container_id\n', returncode=0), id="all_good"),
    pytest.param(MagicMock(stderr=b'error message', returncode=1), id="error")
])
@patch("utils.subproc_run")
def test_subproc_run_wrapper(mock_subproc_run, mocked_return):
    mock_subproc_run.return_value = mocked_return
    if mocked_return.returncode == 1:
        with pytest.raises(SystemExit):
            subproc_run_wrapper(["some", "cmd"])
    else:
        assert subproc_run_wrapper(["some", "cmd"]) == "container_id"


@pytest.mark.parametrize("choice, expected_result", [
    ("y", True),
    ("Y", True),
    ("yes", True),
    ("YES", True),
    ("yup", True),
    ("n", False),
    ("N", False),
    ("no", False),
    ("NO", False),
    ("nope", False),
    ("j\nx\nyeeee", True),
    ("exxxy\nadsfa\nnaaah", False),
    ("\nx\ny", True)
])
def test_prompt_yes_or_no(choice, expected_result):
    # Feed canned keystrokes to the prompt via stdin.
    sys.stdin = io.StringIO(choice)
    assert prompt_yes_or_no("Please give your response") == expected_result


def test_write_to_csv(tmp_path):

    @dataclass
    class MockMetric:
        field1: str
        field2: int

    # None is skipped, 0 terminates; the two MockMetrics become data rows.
    mock_q = MagicMock()
    mock_q.get.side_effect = [None, MockMetric("abc", 12), MockMetric("fdvc", 15), 0]
    output_dir = tmp_path / "sub_dir" / "test_write_to_csv"
    write_to_csv(mock_q, ["field1", "field2"], output_dir)

    assert output_dir.read_text() == "field1,field2\nabc,12\nfdvc,15\n"
# You may obtain a copy of the License at

# http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import errno
from dataclasses import dataclass
from dataclasses import fields as data_fields
from datetime import datetime, timezone

import psutil
from constants import B_MB_FACTOR, NS_PER_S, ONLINE_CPUS, SYSFS_PATH


@dataclass
class Metrics:
    """Processed metrics for one sampling interval (averaged between two raw samples)."""
    timestamp: float
    cpu_percent: float
    memory: float  # in MB


# CSV header derived from the Metrics field names, in declaration order.
METRICS_HEADER = [obj.name for obj in data_fields(Metrics)]


@dataclass
class RawMetrics:
    """Raw cgroup/psutil readings taken at a single instant."""
    timestamp: float
    cpu: float
    per_cpu: bytes
    sys_cpu: tuple
    memory: float  # in bytes


class Container:
    """Tracks one docker container's cgroup metric files and its sampled metrics."""

    def __init__(self) -> None:
        """Initializes the Container object with id, metric_paths, raw_metrics, and metrics.

        Args:
            None

        Returns:
            None
        """
        self.id = ""
        self.metric_paths = ()  # Tuple[Path, Path, Path]
        self.raw_metrics = []  # List[RawMetrics]
        self.metrics = []  # List[Metrics]

    def construct_metrics_path(self):
        """Constructs metrics reading paths in a tuple based on self.id attribute.

        Args:
            None

        Returns:
            None

        Raises:
            RuntimeError if id is not set when this is called
        """
        if not self.id:
            raise RuntimeError("Container ID is not set when creating paths")
        cpu_path = SYSFS_PATH / "cpuacct" / "docker" / self.id / "cpuacct.usage"
        per_cpu_path = SYSFS_PATH / "cpuacct" / "docker" / self.id / "cpuacct.usage_percpu"
        mem_path = SYSFS_PATH / "memory" / "docker" / self.id / "memory.usage_in_bytes"
        self.metric_paths = (cpu_path, per_cpu_path, mem_path)

    def metrics_path_exists(self) -> bool:
        """Checks if all the paths in the container.metric_paths attribute exist.

        Args:
            None

        Returns:
            A boolean value for whether all metric_paths exist on the system.
            False when the paths have not been constructed yet.
        """
        # Fix: previously an empty metric_paths tuple raised IndexError here.
        if not self.metric_paths:
            return False
        return all(path.exists() for path in self.metric_paths)

    def _read_raw_metrics(self) -> RawMetrics:
        """Reads raw metrics data based on the self.metric_paths and timestamps it.

        Args:
            None

        Returns:
            A RawMetrics object
        """
        # Fix: datetime.utcnow().timestamp() interprets the naive UTC datetime as
        # *local* time, skewing the epoch by the UTC offset; use an aware datetime.
        timestamp = datetime.now(timezone.utc).timestamp()
        # Rationale for raw_sys_cpu arithmetic: getSystemCPUUsage() in docker/daemon/stats_collector_unix.go
        # in https://github.com/rancher/docker
        raw_sys_cpu = sum(psutil.cpu_times()[:7])  # in seconds
        # Note: Converting to float takes an extra 1000ns
        raw_cpu = float(self.metric_paths[0].read_bytes())
        # If we know this len is the same as the system cpu num, then we don't need per_cpu anymore
        raw_per_cpu = self.metric_paths[1].read_bytes()
        raw_mem = float(self.metric_paths[2].read_bytes())
        return RawMetrics(timestamp, raw_cpu, raw_per_cpu, raw_sys_cpu, raw_mem)

    def sample_metrics(self) -> None:
        """Samples raw metrics data and append to self.raw_metrics list.

        FileNotFoundError and OSError errno 19 (ENODEV) imply that the file no
        longer exists (the container has exited) and thus these are bypassed.

        Args:
            None

        Returns:
            None or metric, which is a Metrics object

        Raises:
            RuntimeError if self.metric_paths is not set when this is called
        """
        if not self.metric_paths:
            raise RuntimeError("Metrics paths must be constructed before sampling.")
        try:
            self.raw_metrics.append(self._read_raw_metrics())
        except FileNotFoundError:
            return None
        except OSError as err:
            if err.errno == errno.ENODEV:  # no such device: cgroup files vanished
                return None
            # Fix: bare raise preserves the original traceback (raise(err) reset it).
            raise
        # A delta needs two samples; process metrics starting at the second item.
        if len(self.raw_metrics) < 2:
            return None
        metric = self._process_raw_data(self.raw_metrics[-2], self.raw_metrics[-1])
        self.metrics.append(metric)
        return metric

    @staticmethod
    def _process_raw_data(prev, cur):
        """Process the given data and convert units.

        Computation according to https://docs.docker.com/engine/api/v1.41/#operation/ContainerStats

        Args:
            prev: the prior RawMetrics object
            cur: the current RawMetrics object

        Returns:
            A Metrics object averaged over the [prev, cur] interval
        """
        ts_avg = (prev.timestamp + cur.timestamp) / 2.0
        cpu_percent = 0.0
        # Convert from nanoseconds to seconds
        cpu_delta = (cur.cpu - prev.cpu) / NS_PER_S
        # Below does not need div by CLOCK_TICKS_PER_S because it has been done in psutils
        sys_cpu_delta = cur.sys_cpu - prev.sys_cpu

        if cpu_delta > 0.0 and sys_cpu_delta > 0.0:
            cpu_percent = (cpu_delta / sys_cpu_delta) * ONLINE_CPUS * 100.0

        # Since we're averaging the cpu, we also need to average the memory to match the averaged timestamp
        memory_avg = (prev.memory + cur.memory) / 2.0 / B_MB_FACTOR

        return Metrics(ts_avg, cpu_percent=cpu_percent, memory=memory_avg)
import csv
import dataclasses
import logging
import math
import shutil
import sys
from pathlib import Path
from subprocess import PIPE, Popen
from subprocess import run as subproc_run
from typing import List

from clarac_utils import OperatorConfig
from constants import ON_POSIX, TRITON_IMAGE_TAG


def round_up_to_multiple(x, base):
    """Round up the given number to the nearest multiple of the given base number."""
    return math.ceil(float(x) / float(base)) * base


def convert_percent_to_cores(x):
    """Convert the given CPU usage percentage to a whole number of CPU cores."""
    return int(math.ceil(x / 100.0))


def assert_installed(prog: str):
    """Check if the given program is installed, terminate if not.

    Args:
        prog: Name of the commandline program

    Returns:
        None. If program is not installed, sys.exit(1)
    """
    logging.debug(f"Checking for dependency {prog} ...")
    if not shutil.which(prog):
        sys.stderr.write(f"error: {prog} not installed, please install {prog}\n")
        sys.exit(1)
    logging.debug(f"Dependency {prog} fulfilled")


def set_up_logging(verbose):
    """Setup logging for cpost to standard out.

    Args:
        verbose: Boolean value indicating whether log level will be debug or not

    Returns:
        None.
    """
    if verbose:  # pragma: no cover
        level = logging.DEBUG
    else:  # pragma: no cover
        level = logging.INFO
    # logging config are default to StreamHandlers
    logging.basicConfig(format='%(message)s', level=level)  # pragma: no cover


def check_images_and_tags(operators: List[OperatorConfig]):
    """For the image and tag of each operator, examine local images and pull if not found locally.

    Args:
        operators: List of OperatorConfig objects

    Returns:
        None

    Raises:
        sys.exit if the docker pull command errorred out
    """
    uses_triton_model_repo = False
    logging.info("Checking for container images and tags needed for the pipeline...")

    def _check_image_exists_locally(image_and_tag):
        # `docker images <repo:tag>` echoes the repo:tag only when present locally.
        logging.debug(f"Checking if `{image_and_tag}` are in local images...")
        local_check_proc = subproc_run(
            ["docker", "images", image_and_tag, "--format", "{{.Repository}}:{{.Tag}}"],
            capture_output=True)
        result = local_check_proc.stdout.decode('UTF-8')
        if image_and_tag in result:
            logging.debug(f"`{image_and_tag}` found.")
            return True
        return False

    def _pull_image(image_and_tag):
        logging.debug(f"`{image_and_tag}` not found, try pulling from registry ...")
        pull_proc = subproc_run(["docker", "pull", image_and_tag], capture_output=True)
        if pull_proc.returncode == 0:
            logging.debug(f"Docker pull command for `{image_and_tag}` returned with code {pull_proc.returncode}")
            logging.debug(f"stdout is: \n{pull_proc.stdout.decode('UTF-8').strip()}")
        else:
            logging.error(f"Docker pull command for `{image_and_tag}` returned with code {pull_proc.returncode}")
            logging.error(f"stdout is: {pull_proc.stdout.decode('UTF-8')}")
            logging.error(f"stderr is: {pull_proc.stderr.decode('UTF-8')}")
            sys.exit("Please verify docker access and the pipeline definition")

    for operator in operators:
        if not _check_image_exists_locally(operator.image_n_tag):
            _pull_image(operator.image_n_tag)
        if operator.models:
            uses_triton_model_repo = True
        if operator.services:
            for op_service in operator.services:
                if not _check_image_exists_locally(op_service.image_n_tag):
                    _pull_image(op_service.image_n_tag)
    # The Triton image is only needed when some operator declares models.
    if uses_triton_model_repo:
        if not _check_image_exists_locally(TRITON_IMAGE_TAG):
            _pull_image(TRITON_IMAGE_TAG)

    logging.info("All container images are ready to be used.")


def subproc_run_wrapper(cmd, **kwargs):
    """Run the given command in a subprocess and return its stripped stdout.

    Args:
        cmd: List of command line arguments to execute
        **kwargs: Extra keyword arguments forwarded to subprocess.run

    Returns:
        The command's stdout, decoded and stripped

    Raises:
        SystemExit if the command returned a non-zero exit code
    """
    sub_proc = subproc_run(cmd, capture_output=True, **kwargs)
    if sub_proc.returncode != 0:
        logging.error(
            f"Running {cmd} returned with {sub_proc.returncode} with error {sub_proc.stderr}")
        # Fix: drop the misleading `return` -- sys.exit raises SystemExit and never returns.
        sys.exit(f"Failed to run subprocess with command {cmd}")
    std_out = sub_proc.stdout.decode('UTF-8').strip()
    logging.debug(f"Subprocess returned with stdout {std_out}")
    return std_out


def prompt_yes_or_no(condition: str):
    """Prompt the user with a question and waits for the y/n input.

    Args:
        condition: Condition that needs user's input

    Returns:
        Boolean value corresponding to yes or no
    """
    # Re-prompt until a reply starting with 'y' or 'n' (case-insensitive) arrives.
    while True:
        reply = input(condition + ' (y/n): ').lower().strip()
        if reply:
            if reply[0] == 'y':
                return True
            if reply[0] == 'n':
                return False


def write_to_csv(que, field_names, output_file):
    """Write data in que to the output file in csv format.

    None entries in the queue are skipped; the integer 0 is the end-of-stream
    sentinel that closes the queue and stops writing.

    Args:
        que: a multiprocess.Queue contains the data to be written
        field_names: Header for the csv file
        output_file: String or Path of the output file location

    Returns:
        None
    """
    output_file = Path(output_file)
    # Fix: exist_ok avoids the check-then-create race of exists()/mkdir().
    output_file.parent.mkdir(parents=True, exist_ok=True)

    with open(output_file, "w") as f:
        csv_writer = csv.DictWriter(f, fieldnames=field_names)
        csv_writer.writeheader()
        while True:
            item = que.get()
            if item is None:
                continue
            if item == 0:  # end-of-stream sentinel
                que.close()
                break
            csv_writer.writerow(dataclasses.asdict(item))
            # Flush per row so partial results survive an abrupt shutdown.
            f.flush()
    logging.info(f"Results are stored in {output_file}")
import logging
import sys
import time
from contextlib import contextmanager
from enum import Enum, auto
from typing import List

import requests
from clarac_utils import OperatorConfig
from constants import (TRITON_HTTP_PORT, TRITON_IMAGE_TAG, TRITON_READY_TIMEOUT_SECONDS,
                       TRITON_WAIT_SLEEP_TIME_SECONDS, TRITON_WAIT_TIME_SECONDS)
from utils import subproc_run_wrapper


class RUN_MODE(Enum):
    """How cpost should provide inference serving for the pipeline."""
    NO_INFERENCE_SERVER = auto()
    MODEL_REPO = auto()
    PIPELINE_SERVICES = auto()


def _extract_models_from_configs(op_configs: List[OperatorConfig]):
    """Helper method to obtain models from list of OperatorConfig.

    Args:
        op_configs: List of OperatorConfigs to extract information from

    Returns:
        List of string which represents the names of each model with no repeating models
    """
    logging.debug("Extracting models from pipeline definition")
    result = list(set([model for op in op_configs if op.models for model in op.models]))
    logging.debug(f"The models present are `{result}`")
    return result


def check_models_directory(op_configs, models_dir) -> List[str]:
    """Checks if the model directory contains the models needed in the pipeline.

    Args:
        op_configs: List of OperatorConfigs to extract information from
        models_dir: A directory that contains Triton models

    Returns:
        model_names: List of model names used by this pipeline

    Raises:
        SystemExit if a required model or the directory itself is missing, or
        a model's config name disagrees with its directory name
    """
    logging.info("Checking model directory for dependent models ...")
    required_models = _extract_models_from_configs(op_configs)
    if not required_models:
        logging.debug("Pipeline did not specify any Triton models, skipping check for models_dir")
        return []

    logging.debug("Examining model directory ...")
    if models_dir is None:
        sys.exit(f"Model directory must be provided since your pipeline uses: {required_models}")

    # The directory can contain more models than what's needed
    model_names = []
    for model_name in required_models:
        logging.debug(f"Checking for model `{model_name}` ...")
        matching_config = list(models_dir.glob(f"{model_name}/config.pbtxt"))
        if len(matching_config) == 0:
            sys.exit(f"Model `{model_name}` is missing in the models directory")
        elif len(matching_config) > 1:
            logging.warning(
                f"Found more than one matching config file for model `{model_name}`. Using the first occurrence.")
        model_path = matching_config[0]
        # First line of config.pbtxt is expected to be: name: "<model_name>"
        with open(model_path) as f:
            name_in_file = f.readline().split(":")[1].strip()[1:-1]
            if name_in_file != model_path.parent.name:
                sys.exit(
                    f"Expected name in config {name_in_file} to be equal to directory name {model_path.parent.name}")
            model_names.append(model_path.parent.name)

    logging.info("All model directory checks are complete!")
    return model_names


def decide_method_to_run_triton(op_configs) -> RUN_MODE:
    """Decide how to run triton based on the given op_configs.

    Args:
        op_configs: List of OperatorConfig objects

    Return:
        RUN_MODE.MODEL_REPO, RUN_MODE.PIPELINE_SERVICES or RUN_MODE.NO_INFERENCE_SERVER

    Raises:
        SystemExit if both models and services are present in the op_config
    """
    model_repo = False
    services = False
    for op in op_configs:
        if op.models:
            model_repo = True
        if op.services:
            services = True
    if model_repo and services:
        sys.exit("CPOST does not support model_repository and pipeline services at the same time")
    if model_repo:
        return RUN_MODE.MODEL_REPO
    elif services:
        return RUN_MODE.PIPELINE_SERVICES
    return RUN_MODE.NO_INFERENCE_SERVER


def check_triton_status(triton_models_names=None, host="localhost", port=TRITON_HTTP_PORT):
    """Check status of Triton server via http.

    Kwargs:
        triton_models_names: list of triton model names to verify, default: None (no models)
        host: ip address of triton, default: localhost
        port: the port to query http status, default: "8000"

    Returns:
        None

    Raises:
        SystemExit if requests.get returned with a non-200 status
        TimeoutError if Triton could not be reached before the timeout
    """
    # Fix: avoid a mutable default argument; None means "no models to verify".
    triton_models_names = triton_models_names or []
    logging.debug("Waiting and checking Triton status ...")
    time.sleep(TRITON_WAIT_TIME_SECONDS)
    start_time = time.perf_counter()
    while time.perf_counter() - start_time < TRITON_READY_TIMEOUT_SECONDS:
        time.sleep(TRITON_WAIT_SLEEP_TIME_SECONDS)
        try:
            ready = requests.get(f"http://{host}:{port}/api/status")
            if ready.status_code != 200:
                sys.exit(f"Triton is not working, status code = {ready.status_code} with message {ready.text}")
            break
        except requests.ConnectionError:
            # Server not accepting connections yet; retry until the deadline.
            continue
    else:
        raise TimeoutError("Timeout when waiting for triton to be ready.")

    # Verify that each model is ready
    for model_name in triton_models_names:
        ready = requests.get(
            f"http://{host}:{port}/api/status/{model_name}", timeout=TRITON_READY_TIMEOUT_SECONDS)
        if ready.status_code != 200:
            sys.exit(f"Error: {ready.status_code} {ready.reason}, {ready.headers}")
    logging.debug("Triton is ready to be used")


def inspect_ip_address(container_name):
    """Inspect and obtain the IP address for the given container.

    Args:
        container_name: docker name or docker container ID

    Returns:
        network_ip: the IP address of the container
    """
    cmd = ["docker", "inspect", "--format='{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}'", container_name]
    output = subproc_run_wrapper(cmd)
    network_ip = output[1:-1]  # Strip away the quotes around the returned IP address
    logging.debug(f"{container_name} can be communicated on address {network_ip}")
    return network_ip


def start_triton(models_dir, command, image_tag=TRITON_IMAGE_TAG, triton_models_names=None):
    """Starts triton container and wait for it to be ready.

    Args:
        models_dir: Absolute path of models_directory
        command: list of commands to run for the container

    Kwargs:
        image_tag: The image and tag for the container, e.g. image:tag, default to TRITON_IMAGE_TAG
        triton_models_names: List of triton model names to load, default: None (load none)

    Returns:
        triton_container_id, ip_address: Tuple of string
    """
    # Fix: avoid a mutable default argument; None behaves like an empty list.
    triton_models_names = triton_models_names or []
    # build triton command
    loading_models = [f"--load-model={name}" for name in triton_models_names]
    cmd = ["docker", "run", "--gpus=1", "--rm", "-d", "-p8000:8000", "-p8001:8001", "-p8002:8002",
           "-v", f"{models_dir}:/models", image_tag] + command + loading_models
    logging.debug(f"Spinning up Triton with {cmd}")
    triton_container_id = subproc_run_wrapper(cmd)
    ip_address = inspect_ip_address(triton_container_id)
    check_triton_status(triton_models_names=triton_models_names, host=ip_address)
    return triton_container_id, ip_address


@contextmanager
def run_triton_model_repo(execution_order, models_dir):
    """Run Triton in a context manager if pipeline requires Triton.

    Args:
        execution_order: List of OperatorConfigs to extract information from
        models_dir: Absolute path of models_directory

    Yields:
        ip_address
    """
    triton_container_id = None
    try:
        triton_models_names = check_models_directory(execution_order, models_dir)
        command = ["tritonserver", "--model-repository=/models", "--model-control-mode=explicit"]
        triton_container_id, ip_address = start_triton(models_dir, command, triton_models_names=triton_models_names)
        yield ip_address
    finally:
        # Fix: only clean up when the container actually started; previously a
        # failure before start_triton returned raised NameError here, masking
        # the original error.
        if triton_container_id:
            logging.debug("Stopping Triton ...")
            subproc_run_wrapper(["docker", "kill", triton_container_id])
            logging.debug("Finished cleaning up Triton")
# You may obtain a copy of the License at

# http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import logging
import os
import re
import sys
from dataclasses import dataclass
from typing import List
from unittest.mock import MagicMock, patch

import pytest

sys.path.append("{}/{}".format(os.path.dirname(os.path.realpath(__file__)), "../src"))
from triton_utils import (RUN_MODE, _extract_models_from_configs, check_models_directory,  # nopep8 # noqa: E402
                          check_triton_status, decide_method_to_run_triton, inspect_ip_address, run_triton_model_repo,
                          start_triton)


@pytest.fixture(scope="function")
def create_triton_models_dir(tmp_path):
    """Pytest fixture yielding a factory that mocks a Triton models directory.

    The factory takes ``model_names`` (list of str) and creates
    ``<tmp_path>/models/<name>/config.pbtxt`` for each name, with the model
    name written on the first line as Triton expects.
    """
    def _func(model_names):
        # Create the folders needed and some extra models in that directory
        for dir_name in model_names:
            config_file = tmp_path / "models" / dir_name / "config.pbtxt"
            config_file.parent.mkdir(parents=True, exist_ok=True)
            file_content = f'name: "{dir_name}"\n'
            config_file.write_text(file_content)
    yield _func


def test_fixture_create_models_dir(tmp_path, create_triton_models_dir):
    """Sanity-check the fixture itself: it must create one directory per model."""
    names = ["liver", "heart"]
    create_triton_models_dir(names)
    assert sorted(os.listdir(str(tmp_path / "models"))) == sorted(names)


@dataclass
class MockConfig:
    """Minimal stand-in for OperatorConfig exposing only the models attribute."""
    models: List[str] = None


@pytest.mark.parametrize("configs, expected", [
    ([MockConfig(), MockConfig(["m1", "m2"]), MockConfig(["m3"]), MockConfig(), MockConfig(["m4", "m5", "m6"])],
     ["m1", "m2", "m3", "m4", "m5", "m6"]),
    ([MockConfig(), MockConfig()], []),
    ([MockConfig(["m1", "m2"]), MockConfig(["m1"])], ["m1", "m2"])
])
def test_extract_models_from_configs(configs, expected):
    result = _extract_models_from_configs(configs)
    assert sorted(result) == expected


@patch("triton_utils._extract_models_from_configs")
def test_check_model_repository_no_models_needed(mock_models, tmp_path):
    mock_models.return_value = []
    mock_configs = MagicMock()
    result = check_models_directory(mock_configs, tmp_path)
    assert result == []


@patch("triton_utils._extract_models_from_configs")
def test_check_model_repository_no_model_dir(mock_models):
    mock_models.return_value = ["liver", "spleen", "heart"]
    mock_configs = MagicMock()
    with pytest.raises(SystemExit):
        check_models_directory(mock_configs, None)


@pytest.mark.parametrize("mock_models, dir_name, file_content", [
    pytest.param(["liver"], "liver", 'name: "segmentation_liver_v1"\n', id="content_not_match"),
    pytest.param(["liver"], "liver_seg", 'name: "liver"\n', id="dir_name_not_match"),
    pytest.param(["liver", "heart"], "liver", 'name: "liver"\n', id="missing_model")
])
@patch("triton_utils._extract_models_from_configs")
def test_check_model_repository_bad_input(mock_func, mock_models, dir_name, file_content, tmp_path):
    mock_func.return_value = mock_models
    mock_configs = MagicMock()
    config_file = tmp_path / "models" / dir_name / "config.pbtxt"
    config_file.parent.mkdir(parents=True)
    config_file.write_text(file_content)
    with pytest.raises(SystemExit):
        check_models_directory(mock_configs, config_file.parents[1])


@pytest.mark.parametrize("mock_models", [
    pytest.param(["liver"], id="one_model"),
    pytest.param(["liver", "spleen", "heart"], id="three_models"),
])
@patch("triton_utils._extract_models_from_configs")
def test_check_model_repository_good_input(mock_func, mock_models, tmp_path, create_triton_models_dir):
    mock_func.return_value = mock_models
    mock_configs = MagicMock()

    # The directory may contain extra models beyond what the pipeline needs.
    create_triton_models_dir(mock_models + ["eyes", "lung"])

    result = check_models_directory(mock_configs, tmp_path / "models")
    assert sorted(result) == sorted(mock_models)


@pytest.mark.parametrize("mock_configs, exp_mode", [
    pytest.param([MagicMock(**{"models": True, "services": None})], RUN_MODE.MODEL_REPO, id="model_repo"),
    pytest.param([MagicMock(**{"models": None, "services": True})], RUN_MODE.PIPELINE_SERVICES, id="services"),
    pytest.param([MagicMock(**{"models": None, "services": None})], RUN_MODE.NO_INFERENCE_SERVER, id="neither"),
])
def test_decide_method_to_run_triton(mock_configs, exp_mode):
    assert decide_method_to_run_triton(mock_configs) == exp_mode


def test_decide_method_to_run_triton_error():
    mock_configs = [MagicMock(**{"models": True, "services": True})]
    with pytest.raises(SystemExit):
        decide_method_to_run_triton(mock_configs)


@pytest.mark.parametrize(
    "model_names, mock_responses",
    [
        pytest.param(
            [],
            [MagicMock(**{"status_code": 200})],
            id="no_model_names"),
        pytest.param(
            ["model1"],
            [MagicMock(**{"status_code": 200, "text": None}), MagicMock(**{"status_code": 200, "text": None})],
            id="1_model_name"),
    ]
)
@patch("triton_utils.TRITON_WAIT_SLEEP_TIME_SECONDS", 0)
@patch("triton_utils.TRITON_WAIT_TIME_SECONDS", 0)
@patch("triton_utils.requests")
def test_check_triton_status_200(mock_requests, model_names, mock_responses):
    mock_requests.configure_mock(**{"ConnectionError": ValueError})
    mock_requests.get.side_effect = mock_responses
    check_triton_status(triton_models_names=model_names, host="some_host", port="1234")
    # Fix: the original assertion used an f-string with no placeholders (F541).
    assert "http://some_host:1234" in mock_requests.get.call_args.args[0]


@pytest.mark.parametrize(
    "model_names, mock_responses, exp_msg",
    [
        pytest.param(
            [],
            [MagicMock(**{"status_code": 400, "text": "some msg"})],
            "Triton is not working", id="no_model_names"),
        pytest.param(
            ["model1"],
            [MagicMock(**{"status_code": 200, "text": None}), MagicMock(**{"status_code": 400, "text": "some msg"})],
            "Error:", id="1_model_name"),
    ]
)
@patch("triton_utils.TRITON_WAIT_SLEEP_TIME_SECONDS", 0)
@patch("triton_utils.TRITON_WAIT_TIME_SECONDS", 0)
@patch("triton_utils.requests")
def test_check_triton_status_error(mock_requests, model_names, mock_responses, exp_msg):
    mock_requests.configure_mock(**{"ConnectionError": ValueError})
    mock_requests.get.side_effect = mock_responses
    with pytest.raises(SystemExit) as exc:
        check_triton_status(triton_models_names=model_names)
    assert exp_msg in str(exc.value)


@patch("triton_utils.subproc_run_wrapper")
def test_inspect_ip_address(mock_subproc_run_wrapper):
    mock_subproc_run_wrapper.return_value = "'125.12.199.0'"
    result = inspect_ip_address("container_name")
    assert result == "125.12.199.0"


@pytest.mark.parametrize("model_names", [["spleen", "arm", "legs"], []])
@patch("triton_utils.check_triton_status")
@patch("triton_utils.inspect_ip_address")
@patch("triton_utils.subproc_run_wrapper")
def test_start_triton(mock_subproc_run_wrapper, mock_inspect, mock_check_triton_status, model_names):
    mock_subproc_run_wrapper.return_value = "container_id"
    mock_inspect.return_value = "ip_address"
    result = start_triton("models", ["some", "command"], triton_models_names=model_names)
    assert result == ("container_id", "ip_address")

    # Check that all the models used are listed in the docker run command
    if model_names != []:
        for name in model_names:
            assert f"--load-model={name}" in mock_subproc_run_wrapper.call_args_list[0].args[0]


@patch("triton_utils.subproc_run_wrapper")
@patch("triton_utils.check_models_directory")
@patch("triton_utils.start_triton")
def test_run_triton_model_repo(mock_start_triton, mock_check_dir, mock_subproc_run_wrapper):
    """On context exit, the started container must be killed exactly once."""
    triton_models_names = ["spleen", "arm", "legs"]
    mock_check_dir.return_value = triton_models_names
    mock_start_triton.return_value = ("container_id", "ip_address")

    with run_triton_model_repo([], "some_dir"):
        pass

    mock_subproc_run_wrapper.assert_called_once()
    assert "container_id" in mock_subproc_run_wrapper.call_args_list[0].args[0]
-------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 
40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. 
We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![License](https://img.shields.io/badge/License-Apache_2.0-lightgrey.svg)](https://opensource.org/licenses/Apache-2.0) 2 | 3 | [![NVIDIA](https://github.com/NVIDIA/clara-platform-python-client/blob/main/ext/NVIDIA_horo_white.png?raw=true)](https://docs.nvidia.com/clara/deploy/index.html) 4 | 5 | # CPOST (Clara Pipeline Operator Sizing Tool) 6 | ## Tool to measure resource usage of Clara Platform pipeline operators 7 | 8 | Cpost is a tool that will help you run your pipeline locally and provides you with the CPU and memory usage of each operators ran for the given input payload. Opeartors are ran one at a time and CPU and memory usage are sampled. The CPU and memory usage metrics are provided in a .csv format which allows further data analytics as needed. 
9 | 10 | ## System Requirements 11 | * Clara Compiler (downloadable from [NGC](https://ngc.nvidia.com/catalog/resources/nvidia:clara:clara_cli)) 12 | * Docker 20.10 or higher due to cgroup v2 constraints 13 | * System must be using cgroup v2 (See [Docker Control Groups](https://docs.docker.com/config/containers/runmetrics/#control-groups) for more information) 14 | * Python 3.8.0 or higher 15 | *Do not have a Triton instance running on the same machine that CPOST is running on. CPOST will provision it's own Triton instance and the two instances could conflict and cause failures. 16 | 17 | ## Usage 18 | The following is the help message of cpost: 19 | ``` 20 | usage: cpost [-h] [--metrics_dir METRICS_DIR] [--models_dir MODELS_DIR] [-v] [--force [{none,cont,stop}]] 21 | 22 | Clara Pipeline Sizing Tool CLI 23 | 24 | positional arguments: 25 | pipeline definition file path 26 | input payload directory 27 | 28 | optional arguments: 29 | -h, --help show this help message and exit 30 | --metrics_dir METRICS_DIR 31 | metrics output directory, if not specified, write to stdout 32 | --models_dir MODELS_DIR 33 | directory for Triton models, required if pipeline uses Triton 34 | -v, --verbose verbose output (DEBUG level). If not specified, default output is INFO level. 35 | --force [{none,cont,stop}] 36 | force continue or stop when operator failure occurs. (default: none, which will prompt the user for each failure). 37 | ``` 38 | 39 | ## Quick Start Guide 40 | 41 | ### Download CPOST 42 | #### Method 1: Install from Pypi as a PIP package 43 | Run `pip install nvidia-clara-cpost` 44 | 45 | #### Method 2: Build from Source Repository 46 | 1. Clone this repository. 47 | 2. In the source folder, run `python3 setup.py sdist bdist_wheel` and you should see a wheel file in `./dist`. Use this file to `pip install` in your desired virtual environment. 
For example: 48 | ``` 49 | $ ls 50 | CONTRIBUTING.md demo dist LICENSE README.md requirements-dev.txt requirements.txt setup.cfg setup.py src tests 51 | $ ls dist 52 | nvidia_clara_cpost-0.0.0-py3-none-any.whl nvidia-clara-cpost-0.0.0.tar.gz 53 | ``` 54 | 55 | ### Run CPOST in a virtual environment (recommended) 56 | After you have downloaded the wheel from [Download CPOST](#download-cpost), create a virtual environment to work with. 57 | ``` 58 | $ mkdir ./demo 59 | $ cd demo 60 | $ python3.8 -m venv venv 61 | $ source venv/bin/activate 62 | $ pip install -U pip 63 | $ pip install ../dist/nvidia_clara_cpost-0.0.0-py3-none-any.whl # or any other path to the wheel file 64 | ``` 65 | After pip install has completed, run `cpost` and you should see the help message. 66 | 67 | ### Prepare Pipeline Data 68 | 69 | Let's prepare some source data to work with. We will use the AI Spleen Segementation Pipeline as an example 70 | 71 | Download the [Clara AI Spleen Segmentation Pipeline](https://ngc.nvidia.com/catalog/resources/nvidia:clara:clara_ai_spleen_pipeline) to a directory (e.g. `./demo`). Download instructions are available on the linked page 72 | 73 | Once we have the spleen downloaded, go into the folder and unzip the model and input data. 74 | ``` 75 | $ cd clara_ai_spleen_pipeline_v${VERSION_ON_NGC} 76 | $ ls clara_ai_spleen_pipeline_v${VERSION_ON_NGC} 77 | app_spleen-input_v1.zip app_spleen-model_v1.zip source.zip spleen-pipeline-model-repo.yaml spleen-pipeline.yaml 78 | $ unzip app_spleen-input_v1.zip -d app_spleen-input_v1 79 | $ unzip app_spleen-model_v1.zip -d app_spleen-model_v1 80 | ``` 81 | Now we're ready to run cpost! 82 | 83 | The simplest way to run `cpost` is to provide a pipeline definition file and input payload data as shown below. The resulting metrics and console logs are written to standard output directly. 
In the demo folder: 84 | ``` 85 | $ cpost --models_dir clara_ai_spleen_pipeline_v${VERSION_ON_NGC}/app_spleen-model_v1 clara_ai_spleen_pipeline_v${VERSION_ON_NGC}/spleen-pipeline.yaml clara_ai_spleen_pipeline_v${VERSION_ON_NGC}/app_spleen-input_v1 86 | ``` 87 | 88 | If raw metrics are desired, then a valid directory can be specified with `--metrics_dir` and the resulting metrics csv files will be stored in the given directory for each executed operator. 89 | ``` 90 | $ mkdir metrics 91 | $ cpost--metrics_dir metrics --models_dir clara_ai_spleen_pipeline_v${VERSION_ON_NGC}/app_spleen-model_v1 clara_ai_spleen_pipeline_v${VERSION_ON_NGC}/spleen-pipeline.yaml clara_ai_spleen_pipeline_v${VERSION_ON_NGC}/app_spleen-input_v1 92 | ``` 93 | 94 | ### Interpreting the Result 95 | After running the above command, you should see below as output: 96 | 97 | ``` 98 | All software dependencies are fullfilled. 99 | 100 | ______________Executing Operator dicom-reader_______________ 101 | Running operator ... 102 | The container id is: 47ca2626929006154a5515eba841755993df3f298de0abcdc5b9b951971470ca 103 | Results are stored in /home/magzhang/code/sdk/Tools/cpost/demo/metrics/dicom-reader_final_result.csv 104 | _______________Operator dicom-reader Summary________________ 105 | +--------+-----------+------------+-------------+ 106 | | Metric | Average | Maximum | Resource | 107 | +--------+-----------+------------+-------------+ 108 | | CPU | 124.714 % | 1097.941 % | cpu: 11 | 109 | | Memory | 91.057 MB | 405.242 MB | memory: 512 | 110 | +--------+-----------+------------+-------------+ 111 | 112 | ___________Executing Operator spleen-segmentation___________ 113 | Running operator ... 
114 | The container id is: 270f486475aa4584b4fb5911a0db23a10b4eaf0eb26a14daa3fa8951c6a77c95 115 | Results are stored in /home/magzhang/code/sdk/Tools/cpost/demo/metrics/spleen-segmentation_final_result.csv 116 | ____________Operator spleen-segmentation Summary____________ 117 | +--------+-------------+-------------+--------------+ 118 | | Metric | Average | Maximum | Resource | 119 | +--------+-------------+-------------+--------------+ 120 | | CPU | 150.649 % | 1134.358 % | cpu: 12 | 121 | | Memory | 1630.311 MB | 4455.412 MB | memory: 4608 | 122 | +--------+-------------+-------------+--------------+ 123 | 124 | ______________Executing Operator dicom-writer_______________ 125 | Running operator ... 126 | The container id is: 32cf46da42111c75dfa1856ec35e4724e22d9e6d246e64ab3089fc212f049a4a 127 | Results are stored in /home/magzhang/code/sdk/Tools/cpost/demo/metrics/dicom-writer_final_result.csv 128 | _______________Operator dicom-writer Summary________________ 129 | +--------+------------+------------+-------------+ 130 | | Metric | Average | Maximum | Resource | 131 | +--------+------------+------------+-------------+ 132 | | CPU | 190.224 % | 1017.747 % | cpu: 11 | 133 | | Memory | 278.678 MB | 552.313 MB | memory: 768 | 134 | +--------+------------+------------+-------------+ 135 | 136 | __Executing Operator register-volume-images-for-rendering___ 137 | Running operator ... 
138 | The container id is: 2ad135d27cd827de8f687791c9c70ca88229d5eec912be1d20c1a66993ecbb1a 139 | Results are stored in /home/magzhang/code/sdk/Tools/cpost/demo/metrics/register-volume-images-for-rendering_final_result.csv 140 | Operator failed with exitcode is: 126 141 | ___Operator register-volume-images-for-rendering Summary____ 142 | +--------+----------+----------+-------------+ 143 | | Metric | Average | Maximum | Resource | 144 | +--------+----------+----------+-------------+ 145 | | CPU | 12.667 % | 14.923 % | cpu: 1 | 146 | | Memory | 2.633 MB | 3.783 MB | memory: 256 | 147 | +--------+----------+----------+-------------+ 148 | Operator register-volume-images-for-rendering failed with exitcode 126 149 | +--------------------------------------+----------+-------------+-------------+--------------+ 150 | | Operator | Metric | Average | Maximum | Resource | 151 | +======================================+==========+=============+=============+==============+ 152 | | dicom-reader | CPU | 124.714 % | 1097.941 % | cpu: 11 | 153 | | | Memory | 91.057 MB | 405.242 MB | memory: 512 | 154 | +--------------------------------------+----------+-------------+-------------+--------------+ 155 | | spleen-segmentation | CPU | 150.649 % | 1134.358 % | cpu: 12 | 156 | | | Memory | 1630.311 MB | 4455.412 MB | memory: 4608 | 157 | +--------------------------------------+----------+-------------+-------------+--------------+ 158 | | dicom-writer | CPU | 190.224 % | 1017.747 % | cpu: 11 | 159 | | | Memory | 278.678 MB | 552.313 MB | memory: 768 | 160 | +--------------------------------------+----------+-------------+-------------+--------------+ 161 | | register-volume-images-for-rendering | CPU | 12.667 % | 14.923 % | cpu: 1 | 162 | | (Non-zero exitcode) | Memory | 2.633 MB | 3.783 MB | memory: 256 | 163 | +--------------------------------------+----------+-------------+-------------+--------------+ 164 | ``` 165 | The last column in the last table is what you can put into the 
pipeline definition file's `requests`. 166 | Please note that there maybe some small differences between each execution. You can run multiple times to see what are the best numbers to fill. 167 | 168 | 169 | ## Troubleshooting 170 | ### Docker pull error 171 | ``` 172 | Docker pull command for `nvcr.io/nvstaging/clara/dicom-reader:0.8.1-2108.1` returned with code 1 173 | stdout is: 174 | stderr is: Error response from daemon: unauthorized: authentication required 175 | 176 | Please verify docker access and the pipeline definition 177 | ``` 178 | **Resolution**: CPOST performs a local check to match with the given image and tag. If this fails, CPOST performs a docker pull. Thus, please do a `docker login` to the correct registry or ensure that you have the correct docker image locally. 179 | 180 | ### Docker network error 181 | ``` 182 | Error response from daemon: network with name cpost_net already exists 183 | 184 | cpost_net already exist, please remove the network and rerun cpost 185 | ``` 186 | **Resolution**: This occurs because the docker network with name "cpost_net" already exist, which could either because you happen to have this network or because CPOST failed to clean up in one of the previous runs. Please do a `docker network rm cpost_net` and `docker network ls` to ensure this network is cleaned up. 187 | 188 | For all other problems, please submit an issue in the repository and we will resolve this as soon as possible. 189 | 190 | ### Warning from container ID timeout 191 | ``` 192 | Running operator ... 193 | Obtaining docker ID timed out. Operator spleen-segmentation failed 194 | Operator spleen-segmentation failed with exitcode -15 195 | ``` 196 | **Resolution**: This occurs when CPOST tries to run the container in detached mode and times out during when waiting for the container ID to return. The exitcode `-15` means that cpost terminated the docker container because it speculates that something has gone wrong. 
This could happen due to a lot of reasons, and you can run in `-v` (verbose) mode to see the full `docker run` command and run it yourself and hopefully this will provides you some insights on why CPOST couldn't obtain a docker ID. 197 | 198 | ## Running from Source Code During Development 199 | 200 | The environment must have Python 3.8 installed and should have the necessary packages required by cpost installed. The `requirements.txt` contains all the necessary packages and can be used to install them. The tools used for development can be found in `requirements-dev.txt` 201 | 202 | Once virtual environment are created successfully and have been activated. Install the `requirements.txt` with `pip` or `conda`, etc.. The following command can be run directly as cpost: 203 | ``` 204 | python src/main.py 205 | ``` 206 | 207 | ### Test Coverage 208 | 209 | To see test coverage, activate the virtual environment and install the development tools from `requirements-dev.txt`. 210 | From the root of repository, run the command below will provide the unittest coverage report. 211 | ``` 212 | coverage run -m pytest tests && coverage report 213 | ``` 214 | -------------------------------------------------------------------------------- /tests/test_pipeline_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 NVIDIA Corporation 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | 16 | import logging 17 | import os 18 | import re 19 | import sys 20 | import time 21 | from multiprocessing import Manager, Queue 22 | from random import uniform as rand_float 23 | from unittest.mock import MagicMock, call, patch 24 | 25 | import pytest 26 | from src.cli import ContinueOptions 27 | 28 | sys.path.append("{}/{}".format(os.path.dirname(os.path.realpath(__file__)), "../src")) 29 | from clarac_utils import OperatorConfig, ServiceConfig # nopep8 # noqa: E402 30 | from container import Metrics # nopep8 # noqa: E402 31 | from pipeline_utils import (_enqueue_output, build_operator_cmd, clean_up_containers, # nopep8 # noqa: E402 32 | get_output_writers, print_operator_summary, print_pipeline_summary, run_pipeline, 33 | run_pipeline_alone, run_pipeline_with_services, sample_operator, start_operator, 34 | start_pipeline_services) 35 | from triton_utils import RUN_MODE # nopep8 # noqa: E402 36 | 37 | 38 | def test_enqueue_output(tmp_path): 39 | file_path = tmp_path / "test_enqueue" 40 | data = b"1255\n1233\n" 41 | file_path.write_bytes(data) 42 | q = Queue() 43 | opened_file = file_path.open("rb") 44 | _enqueue_output(opened_file, q) 45 | assert q.get(timeout=1) == b"1255\n" 46 | assert q.get(timeout=1) == b"1233\n" 47 | 48 | 49 | @patch("pipeline_utils.Popen") 50 | def test_start_operator(mock_popen): 51 | raw_container_id = b'8c0b4110ae930dbe26b258de9bc34a03f98056ed6f27f991d32919bfe401d7c5\n' 52 | actual_container_id = raw_container_id.decode('utf-8').strip() 53 | mock_popen.return_value = MagicMock(**{"returncode": 0, 54 | "poll.return_value": None, 55 | "stdout.readline.return_value": raw_container_id, 56 | "stdout.close.return_value": None}) 57 | 58 | manager = Manager() 59 | expected_container_id = manager.Value('c_wchar_p', '') 60 | mock_event = MagicMock() 61 | cmd = ["some", "docker", "run", "command"] 62 | 63 | start_operator(expected_container_id, mock_event, cmd) 64 | 65 | assert actual_container_id == expected_container_id.value 66 | 
mock_event.set.assert_called_once() 67 | 68 | 69 | @patch("pipeline_utils.Process") 70 | @patch("pipeline_utils.Popen") 71 | def test_start_operator_popen_error(mock_popen, mock_multi_process): 72 | mock_exit_msg = "exiting because of error" 73 | mock_popen.return_value = MagicMock(**{"returncode": 1, "stderr.read.return_value": mock_exit_msg.encode("UTF-8")}) 74 | 75 | manager = Manager() 76 | expected_container_id = manager.Value('c_wchar_p', '') 77 | mock_event = MagicMock() 78 | cmd = ['some', 'docker', 'run', 'command'] 79 | 80 | with pytest.raises(SystemExit) as exc: 81 | start_operator(expected_container_id, mock_event, cmd) 82 | assert mock_exit_msg in str(exc.value) 83 | mock_event.set.assert_not_called() 84 | 85 | 86 | @pytest.mark.parametrize("mock_exitcode, expected_code", [(b'0\n', None), (b'125\n', 125), (b'error\n', 1)]) 87 | @patch("pipeline_utils.Queue") 88 | @patch("pipeline_utils.subproc_run") 89 | @patch("pipeline_utils.Process") 90 | @patch("pipeline_utils.Popen") 91 | def test_start_operator_docker_error( 92 | mock_popen, mock_multi_process, mock_subproc_run, mock_q, mock_exitcode, expected_code): 93 | mock_popen.return_value = MagicMock(**{"returncode": None, "poll.return_value": None}) 94 | 95 | mock_q.return_value = MagicMock( 96 | **{"get_nowait.return_value": b'8c0b4110ae930dbe26b258de9bc34a03f98056ed6f27f991d32919bfe401d7c5\n'}) 97 | 98 | mock_subproc_run.return_value = MagicMock(**{"returncode": None, "stdout": mock_exitcode}) 99 | 100 | manager = Manager() 101 | expected_container_id = manager.Value('c_wchar_p', '') 102 | mock_event = MagicMock() 103 | cmd = ['some', 'docker', 'run', 'command'] 104 | 105 | if mock_exitcode == b'0\n': 106 | start_operator(expected_container_id, mock_event, cmd) 107 | else: 108 | with pytest.raises(SystemExit) as exc: 109 | start_operator(expected_container_id, mock_event, cmd) 110 | assert exc.value.code == expected_code 111 | mock_event.set.assert_called_once() 112 | 113 | 114 | def 
test_sample_operator_logic(): 115 | mock_q = MagicMock() 116 | mock_container = MagicMock() 117 | mock_container.metrics_path_exists.side_effect = [0, 1, 1, 0] 118 | mock_container.sample_metrics.return_value = None 119 | sample_operator(mock_container, mock_q) 120 | assert mock_container.method_calls == [ 121 | call.metrics_path_exists(), 122 | call.metrics_path_exists(), 123 | call.metrics_path_exists(), 124 | call.sample_metrics(), 125 | call.metrics_path_exists(), 126 | ] 127 | 128 | assert mock_q.put.call_count == 2 129 | assert mock_q.put.call_args_list == [call(None), call(0)] 130 | 131 | 132 | @pytest.mark.parametrize("sampling_time,expected", [(rand_float(0.0001, 0.19), [0.2]), (0.3, [0.3])]) 133 | def test_sample_operator_sampling_rate(sampling_time, expected): 134 | mock_q = MagicMock() 135 | mock_container = MagicMock() 136 | sampling_num = 10 137 | mock_container.metrics_path_exists.side_effect = [0, 1] + [1] * sampling_num + [0] 138 | result_timestamps = [] 139 | 140 | def mock_sample(): 141 | """Mock sampling function that appends a timestamp to a list.""" 142 | timestamp = time.perf_counter() 143 | time.sleep(sampling_time) 144 | result_timestamps.append(timestamp) 145 | 146 | mock_container.sample_metrics = mock_sample 147 | sample_operator(mock_container, mock_q) 148 | assert len(result_timestamps) == sampling_num, "The number of samples does not match with expected." 
149 | 150 | result_diffs = [round(j - i, 1) for i, j in zip(result_timestamps[:-1], result_timestamps[1:])] 151 | assert result_diffs == expected * (sampling_num - 1), "Something is wrong with the accuracy of time.sleep()" 152 | 153 | 154 | # autopep8: off 155 | @pytest.mark.parametrize( 156 | "op_config, expected_args", 157 | [ 158 | pytest.param( 159 | OperatorConfig("op_name", "image:tag", None, {"VAR0": 2, "VAR1": "hi"}, [{"path": "/input"}], [{"path": "/output"}]), 160 | ["--env", "VAR0=2", "--env", "VAR1=hi", "-v", "%tmp%/app_data:/input", "-v", "%tmp%/op_name:/output", "image:tag"], id="with_ENV_VAR" 161 | ), 162 | pytest.param( 163 | OperatorConfig("op_name", "image:tag", None, None, None, None), 164 | ["image:tag"], id="no_input_output" 165 | ), 166 | pytest.param( 167 | OperatorConfig("op_name", "image:tag", None, None, [{"path": "/input"}], [{"path": "/output"}]), 168 | ["-v", "%tmp%/app_data:/input", "-v", "%tmp%/op_name:/output", "image:tag"], id="min_input_output" 169 | ), 170 | pytest.param( 171 | OperatorConfig("op_name", "image:tag", None, None, [{"from": "liver", "path": "/input"}], None), 172 | ["-v", "%tmp%/liver:/input", "image:tag"], id="input_contains_from" 173 | ), 174 | pytest.param( 175 | OperatorConfig("op_name", "image:tag", None, None, [{"from": "liver", "name": "classification", "path": "/input"}, {"path": "/dcm"}], None), 176 | ["-v", "%tmp%/liver/classification:/input", "-v", "%tmp%/app_data:/dcm", "image:tag"], id="double_inputs" 177 | ), 178 | pytest.param( 179 | OperatorConfig("op_name", "image:tag", None, None, [{"path": "/input"}], [{"name": "logs", "path": "/output"}]), 180 | ["-v", "%tmp%/app_data:/input", "-v", "%tmp%/op_name/logs:/output", "image:tag"], id="named_output" 181 | ), 182 | pytest.param( 183 | OperatorConfig("op_name", "image:tag", ["some", "command"], None, [{"path": "/input"}], [{"path": "/output"}]), 184 | ["-v", "%tmp%/app_data:/input", "-v", "%tmp%/op_name:/output", "image:tag", "some", "command"], 
id="image_with_command" 185 | ), 186 | pytest.param( 187 | OperatorConfig("op_name", "image:tag", None, None, None, None, ["model1"]), 188 | ["--env", "NVIDIA_TRITON_HTTPURI=localhost:8000", "--env", "CLARA_TRITON_URI=localhost:8000", "--env", "NVIDIA_CLARA_TRTISURI=localhost:8000", "--env", "NVIDIA_TRITON_GRPCURI=localhost:8001", "image:tag"], id="model_repo" 189 | ), 190 | pytest.param( 191 | OperatorConfig("op_name", "image:tag", None, None, None, None, None, [ServiceConfig("name", "it", None, None)]), 192 | ["image:tag"], id="pipeline_services" 193 | ), 194 | pytest.param( 195 | OperatorConfig("op_name", "image:tag", ["some", "command"], {"VAR0": 2}, 196 | [{"from": "liver", "name": "classification", "path": "/input"}, {"path": "/dcm"}], 197 | [{"name": "dicom", "path": "/output"}, {"name": "logs", "path": "/logs"}]), 198 | ["--env", "VAR0=2", "-v", "%tmp%/liver/classification:/input", "-v", "%tmp%/app_data:/dcm", 199 | "-v", "%tmp%/op_name/dicom:/output", "-v", "%tmp%/op_name/logs:/logs", "image:tag", "some", "command"], 200 | id="all_in_one" 201 | ), 202 | ], 203 | ) 204 | # autopep8: on 205 | def test_build_operator_cmd(tmp_path, op_config, expected_args): 206 | input_path = tmp_path / "app_data" 207 | 208 | def swap_tmp(temp_dir, args): 209 | return [re.sub(r'%tmp%', temp_dir, i) for i in args] 210 | expected_args = swap_tmp(str(tmp_path), expected_args) 211 | config = op_config 212 | 213 | result_cmd = build_operator_cmd(input_path, tmp_path, config, "localhost") 214 | 215 | assert (tmp_path / "op_name").is_dir() 216 | 217 | assert result_cmd == ["docker", "run", "-d", "--rm", "--env", "NVIDIA_CLARA_NOSYNCLOCK=1"] + expected_args 218 | 219 | 220 | def test_print_operator_summary(caplog): 221 | metrics = [Metrics(1.5, 10, 20), Metrics(1.5, 20, 20), Metrics(1.5, 30, 25)] 222 | with caplog.at_level(logging.INFO): 223 | print_operator_summary(metrics, "opeartor_name") 224 | # [1] only gets the table section 225 | messages = [rec.getMessage() for rec in 
caplog.records][1] 226 | 227 | messages = messages.split("\n") 228 | cpu_line = messages[3] 229 | mem_line = messages[4] 230 | assert "CPU" in cpu_line 231 | assert "20" in cpu_line 232 | assert "30" in cpu_line 233 | assert "Memory" in mem_line 234 | assert "21.6" in mem_line 235 | assert "25" in mem_line 236 | 237 | 238 | @pytest.mark.parametrize("run_mode", [RUN_MODE.NO_INFERENCE_SERVER, RUN_MODE.MODEL_REPO, RUN_MODE.PIPELINE_SERVICES]) 239 | @patch("pipeline_utils.run_pipeline_with_services") 240 | @patch("pipeline_utils.run_pipeline_alone") 241 | @patch("pipeline_utils.run_triton_model_repo") 242 | @patch("pipeline_utils.decide_method_to_run_triton") 243 | def test_run_pipeline(mock_decide, mock_run_triton, mock_run_alone, mock_run_services, run_mode): 244 | mock_decide.return_value = run_mode 245 | mock_run_triton.return_value.__enter__.return_value = MagicMock() 246 | run_pipeline([], None, None, None, ContinueOptions.NONE) 247 | if run_mode == RUN_MODE.NO_INFERENCE_SERVER: 248 | mock_run_triton.assert_not_called() 249 | mock_run_alone.assert_called_once() 250 | mock_run_services.assert_not_called() 251 | elif run_mode == RUN_MODE.MODEL_REPO: 252 | mock_run_triton.assert_called_once() 253 | mock_run_alone.assert_called_once() 254 | mock_run_services.assert_not_called() 255 | elif run_mode == RUN_MODE.PIPELINE_SERVICES: 256 | mock_run_triton.assert_not_called() 257 | mock_run_alone.assert_not_called() 258 | mock_run_services.assert_called_once() 259 | 260 | 261 | def test_get_output_writers(tmp_path): 262 | mock_writer = MagicMock(**{"join.return_value": None}) 263 | with get_output_writers(tmp_path) as writers: 264 | assert writers == [] 265 | writers.append(mock_writer) 266 | assert mock_writer.join.call_count == 1 267 | 268 | 269 | def test_get_no_output_writers(): 270 | with get_output_writers(None) as writers: 271 | assert writers is None 272 | 273 | 274 | @patch("pipeline_utils.build_operator_cmd") 275 | @patch("pipeline_utils.run_operator") 276 | 
@patch("pipeline_utils.TemporaryDirectory") 277 | def test_run_pipeline_alone(mock_temp_file, mock_run_operator, mock_build_cmd, tmp_path): 278 | mock_temp_file.return_value.__enter__.return_value = "tmp_file_name" 279 | mock_run_operator.side_effect = [None, True, None] 280 | m1, m2, m3 = MagicMock(**{"name": "1"}), MagicMock(**{"name": "2"}), MagicMock(**{"name": "3"}) 281 | execution_order = [m1, m2, m3] 282 | run_pipeline_alone(execution_order, tmp_path, None, ContinueOptions.NONE, None) 283 | assert len(mock_run_operator.call_args_list) == 2 284 | assert m1 in mock_run_operator.call_args_list[0].args 285 | assert m2 in mock_run_operator.call_args_list[1].args 286 | 287 | 288 | @patch("pipeline_utils.subproc_run_wrapper") 289 | def test_clean_up_containers(mock_subproc_run_wrapper): 290 | running_containers = {"image1": ("ID1", "ip_address")} 291 | clean_up_containers(running_containers) 292 | assert mock_subproc_run_wrapper.call_args.args[0] == ["docker", "kill", "ID1"] 293 | assert running_containers == {} 294 | 295 | 296 | @patch("pipeline_utils.start_triton") 297 | @patch("pipeline_utils.clean_up_containers") 298 | def test_start_pipeline_services(mock_clean_up_containers, mock_start_triton): 299 | container_info = ("container_id_123", "ip_address") 300 | mock_start_triton.return_value = container_info 301 | 302 | service_config_1 = ServiceConfig("trtis", "image_tag", ["some", "cmd"], {"VAR": "port_num"}) 303 | op_config_1 = OperatorConfig("name", None, None, None, None, None, None, [service_config_1]) 304 | services_dict = {} 305 | start_pipeline_services(op_config_1, services_dict, "some-dir") 306 | assert services_dict["image_tag some cmd"] == container_info 307 | assert op_config_1.variables == {"VAR": "ip_address:port_num"} 308 | assert mock_start_triton.call_count == 1 309 | 310 | # Same service -> no new services created 311 | start_pipeline_services(op_config_1, services_dict, "some-dir") 312 | assert services_dict["image_tag some cmd"] == 
container_info 313 | assert op_config_1.variables == {"VAR": "ip_address:port_num"} 314 | assert mock_start_triton.call_count == 1 315 | 316 | # Different service -> new service created 317 | service_config_2 = ServiceConfig("trtis", "image_tag2", ["some", "cmd"], {"VAR": "port_num2"}) 318 | op_config_2 = OperatorConfig("name", None, None, None, None, None, None, [service_config_2]) 319 | start_pipeline_services(op_config_2, services_dict, "some-dir") 320 | mock_clean_up_containers.assert_called_once() 321 | assert services_dict["image_tag2 some cmd"] == container_info 322 | assert op_config_2.variables == {"VAR": "ip_address:port_num2"} 323 | assert mock_start_triton.call_count == 2 324 | 325 | 326 | @patch("pipeline_utils.start_triton") 327 | @patch("pipeline_utils.clean_up_containers") 328 | def test_start_service_not_supported(mock_clean_up_containers, mock_start_triton, caplog): 329 | service_config_1 = ServiceConfig("other service", "image_tag", ["some", "cmd"], {"VAR": "value"}) 330 | op_config_1 = OperatorConfig("name", None, None, None, None, None, None, [service_config_1]) 331 | services_dict = {} 332 | 333 | with caplog.at_level(logging.WARNING): 334 | start_pipeline_services(op_config_1, services_dict, "some-dir") 335 | messages = [rec.getMessage() for rec in caplog.records] 336 | mock_clean_up_containers.assert_not_called() 337 | mock_start_triton.assert_not_called() 338 | assert "does not support" in messages[0] 339 | assert "Skipping `other service`" in messages[1] 340 | 341 | 342 | @patch("pipeline_utils.clean_up_containers") 343 | @patch("pipeline_utils.build_operator_cmd") 344 | @patch("pipeline_utils.start_pipeline_services") 345 | @patch("pipeline_utils.run_operator") 346 | @patch("pipeline_utils.TemporaryDirectory") 347 | def test_run_pipeline_with_services( 348 | mock_temp_file, mock_run_operator, mock_start_pipeline_services, mock_build_cmd, mock_clean_up_containers, 349 | tmp_path): 350 | 351 | def mock_add_dict(op, services_dict, *args): 
352 | services_dict["name"] = "cont_id" 353 | mock_start_pipeline_services.side_effect = mock_add_dict 354 | 355 | mock_temp_file.return_value.__enter__.return_value = "tmp_file_name" 356 | mock_run_operator.side_effect = [None, True, None] 357 | mock_config1 = MagicMock(**{"services": True}) 358 | mock_config2 = MagicMock(**{"services": False}) 359 | execution_order = [mock_config1, mock_config2, mock_config2] 360 | run_pipeline_with_services(execution_order, tmp_path, None, tmp_path, ContinueOptions.NONE) 361 | assert len(mock_run_operator.call_args_list) == 2 362 | mock_start_pipeline_services.assert_called_once() 363 | assert mock_build_cmd.call_count == 2 364 | mock_clean_up_containers.assert_called_once() 365 | 366 | 367 | @patch("pipeline_utils.tabulate") 368 | def test_print_pipeline_summary(mock_tabulate): 369 | raw_data = { 370 | 'dicom-reader': 371 | [['CPU', '130.407 %', '732.975 %', 'cpu: 8'], 372 | ['Memory', '109.309 MB', '431.407 MB', 'memory: 512']], 373 | 'spleen-segmentation': 374 | [['CPU', '126.747 %', '1144.132 %', 'cpu: 12'], 375 | ['Memory', '1403.712 MB', '4339.55 MB', 'memory: 8192']], 376 | 'dicom-writer': 377 | [['CPU', '168.027 %', '676.498 %', 'cpu: 7'], 378 | ['Memory', '481.506 MB', '866.976 MB', 'memory: 1024']], 379 | 'register-dicom-output\n(Non-zero exitcode)': 380 | [['CPU', '14.524 %', '18.102 %', 'cpu: 1'], 381 | ['Memory', '2.074 MB', '2.589 MB', 'memory: 4']]} 382 | 383 | print_pipeline_summary(raw_data) 384 | # This format is desired to keep the display result from tabulate clean 385 | assert mock_tabulate.call_args.args[0] == [ 386 | ['dicom-reader', 'CPU\nMemory', '130.407 %\n109.309 MB', '732.975 %\n431.407 MB', 'cpu: 8\nmemory: 512'], 387 | ['spleen-segmentation', 'CPU\nMemory', '126.747 %\n1403.712 MB', '1144.132 %\n4339.55 MB', 'cpu: 12\nmemory: 8192'], 388 | ['dicom-writer', 'CPU\nMemory', '168.027 %\n481.506 MB', '676.498 %\n866.976 MB', 'cpu: 7\nmemory: 1024'], 389 | ['register-dicom-output\n(Non-zero exitcode)', 
'CPU\nMemory', '14.524 %\n2.074 MB', '18.102 %\n2.589 MB', 390 | 'cpu: 1\nmemory: 4']] 391 | -------------------------------------------------------------------------------- /src/pipeline_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 NVIDIA Corporation 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | import logging 17 | import sys 18 | import time 19 | from contextlib import contextmanager 20 | from dataclasses import astuple 21 | from multiprocessing import Manager, Process, Queue 22 | from pathlib import Path 23 | from queue import Empty 24 | from subprocess import PIPE, Popen 25 | from subprocess import run as subproc_run 26 | from tempfile import TemporaryDirectory 27 | 28 | from clarac_utils import OperatorConfig 29 | from constants import (ID_WAITING_TIME_SECONDS, LEGACY_TRITON_HTTP_ENV_VAR, LEGACY_TRTIS_HTTP_ENV_VAR, 30 | METRIC_SAMPLING_PERIOD_SECONDS, ON_POSIX, TRITON_GRPC_ENV_VAR, TRITON_GRPC_PORT, 31 | TRITON_HTTP_ENV_VAR, TRITON_HTTP_PORT) 32 | from container import METRICS_HEADER, Container 33 | from tabulate import tabulate 34 | from triton_utils import (RUN_MODE, check_triton_status, decide_method_to_run_triton, inspect_ip_address, 35 | run_triton_model_repo, start_triton) 36 | from utils import convert_percent_to_cores, prompt_yes_or_no, round_up_to_multiple, subproc_run_wrapper, write_to_csv 37 | 38 | from cli import ContinueOptions 39 | 40 
| 41 | def _enqueue_output(out, queue): 42 | """Reads the file content, add to queue, and close the file handler when done. 43 | 44 | Args: 45 | out: opened file handler or stdout 46 | queue: multiprocessing.Queue object 47 | 48 | Returns: 49 | None 50 | """ 51 | for line in iter(out.readline, b''): 52 | queue.put(line) 53 | out.close() 54 | 55 | 56 | def start_operator(container_id, id_returned_event, cmd): 57 | """Runs the given docker command and assign docker ID to the given shared value. 58 | 59 | Args: 60 | container_id: A multiprocessing.Value object to allow sharing of values. 61 | id_returned_event: A multiprocess.Event object, set when container_id is assigned. 62 | cmd: The full docker command to run an image. 63 | 64 | Returns: 65 | None 66 | """ 67 | cmd_proc = Popen(cmd, stdout=PIPE, stderr=PIPE, close_fds=ON_POSIX) 68 | logging.info("Running operator ...") 69 | q = Queue() 70 | 71 | checker = Process(target=_enqueue_output, args=(cmd_proc.stdout, q), daemon=True) 72 | checker.start() 73 | 74 | while cmd_proc.poll() is None: 75 | try: 76 | raw_id = q.get_nowait() 77 | except Empty: 78 | continue 79 | else: 80 | # Validate the result, expect length to be 64 + 1 from '\n' 81 | if len(raw_id) == 65: 82 | container_id.value = raw_id.decode('utf-8').strip() 83 | logging.info(f"The container id is: {container_id.value}") 84 | id_returned_event.set() 85 | break 86 | else: 87 | sys.exit(f"The output of docker should be the 64 bit container ID, got {raw_id} instead.") 88 | else: 89 | if cmd_proc.returncode != 0: 90 | checker.terminate() 91 | checker.join() 92 | # This means that cmd_proc has errorred and terminated. 
Log the error and return 93 | logging.warning(f"Operator failed to start with returncode {cmd_proc.returncode}") 94 | sys.exit(f"The operator failed with stderr:\n{cmd_proc.stderr.read().decode('UTF-8')}") 95 | 96 | checker.terminate() 97 | checker.join() 98 | if cmd_proc.returncode is None: 99 | logging.debug("Operator is running...") 100 | # We need to know if docker exited correctly 101 | docker_wait_proc = subproc_run(["docker", "wait", container_id.value], capture_output=True) 102 | returned_str = docker_wait_proc.stdout.decode('UTF-8').strip() 103 | if returned_str == "0": 104 | logging.debug(f"Operator finished successfully with exitcode {returned_str}") 105 | else: 106 | logging.error(f"Operator failed with exitcode is: {returned_str}") 107 | try: 108 | return_code = int(returned_str) 109 | sys.exit(return_code) 110 | except ValueError: 111 | sys.exit(1) 112 | else: 113 | logging.debug(f"Docker run command returned with {cmd_proc.returncode}") 114 | 115 | 116 | def sample_operator(container, que): 117 | """Samples and writes metrics for the given operator as long as its metrics paths exist. 118 | Sampling frequency is determined by METRIC_SAMPLING_PERIOD_SECONDS. 119 | 120 | Args: 121 | container: Container object. 122 | que: None or a multiprocessing.Queue object to store data that needs to be written to csv. 
123 | 124 | Returns: 125 | None 126 | """ 127 | # Waits for the files to be created by docker 128 | while not container.metrics_path_exists(): 129 | continue 130 | 131 | # Samples until the files disappear 132 | logging.debug("Starts sampling container ...") 133 | before_sample = time.perf_counter() 134 | while container.metrics_path_exists(): 135 | metric = container.sample_metrics() 136 | if que: 137 | que.put(metric) 138 | after_sample = time.perf_counter() 139 | sleep_time = METRIC_SAMPLING_PERIOD_SECONDS - (after_sample - before_sample) 140 | sleep_time = sleep_time if sleep_time > 0 else 0 141 | if sleep_time == 0: 142 | logging.info( 143 | f"Sampling taking longer than sampling period with time of {(after_sample - before_sample)} seconds") 144 | # NOTE: Due to the inaccurate nature of time.sleep(), our sampling will not be extremely precise 145 | time.sleep(sleep_time) 146 | before_sample = time.perf_counter() 147 | 148 | # Signal the end of que 149 | if que: 150 | que.put(0) 151 | logging.debug("Finished sampling container.") 152 | 153 | 154 | def build_operator_cmd(input_dir: Path, data_folder_name: str, op_config: OperatorConfig, triton_ip: str = None): 155 | """Constructs the docker command used to run operator. 
156 | 157 | Args: 158 | input_dir: A Path object for the input payload data in local system 159 | data_folder_name: Name of the data folder to store temporary data 160 | op_config: A OperatorConfig object containing information about the operator 161 | triton_ip: None, or Triton's IP address 162 | 163 | Returns: 164 | cmd: A list of string representing the docker command that can be used to run the operator 165 | """ 166 | logging.debug(f"Constructing commands for operator {op_config.name} ...") 167 | op_output_dir = Path(data_folder_name) / op_config.name 168 | op_output_dir.mkdir() 169 | 170 | cmd = ["docker", "run", "-d", "--rm", "--env", "NVIDIA_CLARA_NOSYNCLOCK=1"] 171 | 172 | # If models is present, then we supply Triton ports to this 173 | if op_config.models: 174 | cmd.extend(["--env", f"{TRITON_HTTP_ENV_VAR}={triton_ip}:{TRITON_HTTP_PORT}"]) 175 | cmd.extend(["--env", f"{LEGACY_TRITON_HTTP_ENV_VAR}={triton_ip}:{TRITON_HTTP_PORT}"]) 176 | cmd.extend(["--env", f"{LEGACY_TRTIS_HTTP_ENV_VAR}={triton_ip}:{TRITON_HTTP_PORT}"]) 177 | cmd.extend(["--env", f"{TRITON_GRPC_ENV_VAR}={triton_ip}:{TRITON_GRPC_PORT}"]) 178 | 179 | # Add operator specific environment variables 180 | if op_config.variables: 181 | for key, value in op_config.variables.items(): 182 | cmd.extend(["--env", f"{key}={value}"]) 183 | 184 | # Mount input and output volumes 185 | def build_volume_mount(local, remote): 186 | return ["-v", ":".join([local, remote])] 187 | 188 | # Mount input volumes 189 | if op_config.inputs: 190 | for input_obj in op_config.inputs: 191 | # If `from` is not present, we use the input payload directory 192 | if input_obj.get("from") is None: 193 | cmd.extend(build_volume_mount(str(input_dir), input_obj["path"])) 194 | # If `from` is specified, we use the specified operator's output directory as the input for this operator 195 | else: 196 | op_input_dir = op_output_dir.parent / input_obj["from"] 197 | # If `name` is specified, then find the subdirectory and use this as 
the input 198 | if input_obj.get("name"): 199 | cmd.extend(build_volume_mount(str((op_input_dir / input_obj["name"])), input_obj["path"])) 200 | else: 201 | cmd.extend(build_volume_mount(str(op_input_dir), input_obj["path"])) 202 | 203 | # Mount output volumes 204 | if op_config.outputs: 205 | for output_obj in op_config.outputs: 206 | # If `name` is specified, create a subdirectory with this name 207 | if output_obj.get("name"): 208 | sub_dir = Path(op_output_dir / output_obj["name"]) 209 | sub_dir.mkdir(parents=True) 210 | cmd.extend(build_volume_mount(str(sub_dir), output_obj["path"])) 211 | else: 212 | cmd.extend(build_volume_mount(str(op_output_dir), output_obj["path"])) 213 | 214 | # Add the image and tag, and command last 215 | cmd.append(op_config.image_n_tag) 216 | if op_config.command: 217 | cmd.extend(op_config.command) 218 | logging.debug(f"Docker command for operator {op_config.name} is: {cmd}") 219 | return cmd 220 | 221 | 222 | def print_operator_metrics(metrics, metrics_header, op_name): 223 | """Logs the metrics to console in a table format. 224 | 225 | Args: 226 | metrics: list of Metrics object 227 | metrics_header: Header of the metrics data 228 | op_name: Name of the operator 229 | 230 | Returns: 231 | None 232 | """ 233 | logging.info("{:_^60}".format(f"Operator {op_name} Metrics Data")) # pragma: no cover 234 | data = [astuple(metric) for metric in metrics] # pragma: no cover 235 | logging.info(tabulate(data, metrics_header, tablefmt="pretty")) # pragma: no cover 236 | 237 | 238 | def print_operator_summary(metrics, op_name): 239 | """Calculate and logs the metrics statistics in a readable format. 
240 | 241 | Args: 242 | metrics: list of Metrics object 243 | op_name: Name of the operator 244 | 245 | Returns: 246 | None 247 | """ 248 | logging.info("{:_^60}".format(f"Operator {op_name} Summary")) 249 | # Calculate metrics for CPU and memory 250 | cpu_data = [metric.cpu_percent for metric in metrics] 251 | cpu_avg = round(sum(cpu_data)/len(metrics), 3) 252 | cpu_max = round(max(cpu_data), 3) 253 | 254 | memory_data = [metric.memory for metric in metrics] 255 | memory_avg = round(sum(memory_data)/len(metrics), 3) 256 | memory_max = round(max(memory_data), 3) 257 | 258 | recommended_cpu = convert_percent_to_cores(cpu_max) 259 | # Add 100MB of buffer memory and round to multiple of base 256 260 | recommended_memory = round_up_to_multiple(memory_max + 100.0, 256) 261 | 262 | # Log it onto console 263 | data = [["CPU", f"{cpu_avg} %", f"{cpu_max} %", f"cpu: {recommended_cpu}"], [ 264 | "Memory", f"{memory_avg} MB", f"{memory_max} MB", f"memory: {recommended_memory}"]] 265 | logging.info( 266 | tabulate( 267 | data, ["Metric", "Average", "Maximum", "Resource"], 268 | tablefmt="pretty")) 269 | return data 270 | 271 | 272 | def print_pipeline_summary(pipeline_metrics_dict): 273 | """Display the pipeline summary table. 274 | 275 | Args: 276 | pipeline_metrics_dict: Dictionary with key being operator name and values are metrics 277 | 278 | Returns: 279 | None 280 | """ 281 | pipeline_data = [] 282 | for op_name, op_summary in pipeline_metrics_dict.items(): 283 | p_sumamry = [op_name] + ["\n".join([str(row1), str(row2)]) for row1, row2 in zip(op_summary[0], op_summary[1])] 284 | pipeline_data.append(p_sumamry) 285 | logging.info( 286 | tabulate( 287 | pipeline_data, ["Operator", "Metric", "Average", "Maximum", "Resource"], 288 | tablefmt="grid", numalign="right")) 289 | 290 | 291 | def run_operator( 292 | op_config, docker_cmd, output_writers, metrics_output, continue_option, 293 | pipeline_summary_dict): 294 | """Run the operator using the directories given. 
295 | 296 | Args: 297 | op_config: a OperatorConfig object 298 | docker_cmd: List of docker commands to run the operator 299 | output_writers: List of writers or None 300 | metrics_output: A Path object for the metrics directory or None 301 | continue_option: A ContinueOptions Enum object 302 | pipeline_summary_dict: Dictionary with key being operator name and values are metrics 303 | 304 | Returns: 305 | True when the operator failed and user wants to stop, otherwise None 306 | """ 307 | container = Container() 308 | manager = Manager() 309 | container_id = manager.Value('c_wchar_p', '') 310 | id_returned_event = manager.Event() 311 | 312 | if output_writers is not None: 313 | write_que = Queue() 314 | writer_process = Process( 315 | target=write_to_csv, 316 | args=(write_que, METRICS_HEADER, (metrics_output / f"{op_config.name}_final_result.csv"))) 317 | writer_process.start() 318 | output_writers.append(writer_process) 319 | else: 320 | write_que = None 321 | 322 | p_start = Process(target=start_operator, args=(container_id, id_returned_event, docker_cmd)) 323 | before_id = time.perf_counter() # timing 324 | p_start.start() 325 | 326 | if id_returned_event.wait(timeout=ID_WAITING_TIME_SECONDS): 327 | # Event.wait() returns true if it has been set 328 | after_id = time.perf_counter() # timing 329 | container.id = container_id.value 330 | container.construct_metrics_path() 331 | sample_operator(container, write_que) 332 | end = time.perf_counter() # timing 333 | logging.debug(f"Time it takes to get container ID: {after_id-before_id} s") 334 | logging.debug(f"Waiting and Sampling Time: {end-after_id} s") 335 | 336 | p_start.join() 337 | 338 | # print metrics to console if not written to csv 339 | if output_writers is None: 340 | print_operator_metrics(container.metrics, METRICS_HEADER, op_config.name) 341 | operator_summary = print_operator_summary(container.metrics, op_config.name) 342 | pipeline_summary_dict[op_config.name] = operator_summary 343 | 344 | else: 
345 | logging.warning(f"Obtaining docker ID timed out. Operator {op_config.name} failed") 346 | p_start.terminate() 347 | p_start.join() 348 | if output_writers is not None: 349 | writer_process.terminate() 350 | 351 | if p_start.exitcode != 0: # i.e. container_id timed out 352 | logging.warning(f"Operator {op_config.name} failed with exitcode {p_start.exitcode}") 353 | if pipeline_summary_dict.get(op_config.name): 354 | new_key = f"{op_config.name}\n(Non-zero exitcode)" 355 | pipeline_summary_dict[new_key] = pipeline_summary_dict.pop(op_config.name) 356 | if continue_option == ContinueOptions.CONT: 357 | return 358 | if continue_option == ContinueOptions.STOP: 359 | return True 360 | if not prompt_yes_or_no( 361 | "Would you like to continue execution at the risk of the rest of pipeline failing (y)? If (n), cpost will stop and cleanup."): 362 | # When user says no, we exit the for-loop and return 363 | return True 364 | 365 | 366 | def run_pipeline(execution_order, input_data_dir, metrics_output, models_dir, continue_option): 367 | """Run the pipeline operators in the given execution_order using the directories given. 
368 | 369 | Args: 370 | execution_order: List of OperatorConfig objects in the order of execution 371 | input_data_dir: Path to the input payload directory 372 | metrics_output: A Path object for the metrics directory or stdout 373 | models_dir: A directory that contains Triton models 374 | continue_option: A ContinueOptions Enum object 375 | 376 | Returns: 377 | None 378 | """ 379 | 380 | triton_mode = decide_method_to_run_triton(execution_order) 381 | 382 | if triton_mode == RUN_MODE.NO_INFERENCE_SERVER: 383 | return run_pipeline_alone(execution_order, input_data_dir, metrics_output, continue_option) 384 | if triton_mode == RUN_MODE.MODEL_REPO: 385 | with run_triton_model_repo(execution_order, models_dir) as triton_ip: 386 | run_pipeline_alone(execution_order, input_data_dir, metrics_output, continue_option, triton_ip) 387 | else: # PIPELINE_SERVICES 388 | run_pipeline_with_services(execution_order, input_data_dir, metrics_output, 389 | models_dir, continue_option) 390 | 391 | 392 | @contextmanager 393 | def get_output_writers(metrics_output): 394 | """Context manager for keeping a list of output writers and cleaning up. 395 | The list is used to keep output_writer processes which are threads/multiprocessing.Process. 396 | 397 | Args: 398 | metrics_output: a pathlib.Path object or None 399 | 400 | Yields: 401 | None if metrics_output is None. Empty list if metrics_output is Path 402 | """ 403 | try: 404 | write_csv_flag = True if isinstance(metrics_output, Path) else False 405 | if write_csv_flag: 406 | output_writers = [] 407 | yield output_writers 408 | else: 409 | yield None 410 | 411 | finally: 412 | if write_csv_flag: 413 | for writer in output_writers: 414 | writer.join() 415 | 416 | 417 | def run_pipeline_alone(execution_order, input_data_dir, metrics_output, continue_option, triton_ip=None): 418 | """Run the pipeline operators in the given execution_order using the directories given. 
419 | 420 | Args: 421 | execution_order: List of OperatorConfig objects in the order of execution 422 | input_data_dir: Path to the input payload directory 423 | metrics_output: A Path object for the metrics directory or stdout 424 | continue_option: A ContinueOptions Enum object 425 | triton_ip: None, or Triton's IP address 426 | 427 | Returns: 428 | None 429 | """ 430 | with TemporaryDirectory() as data_folder_name: 431 | with get_output_writers(metrics_output) as output_writers: 432 | pipeline_summary_dict = {} 433 | for op_config in execution_order: 434 | logging.info("\n{:_^60}".format(f"Executing Operator {op_config.name}")) 435 | docker_cmd = build_operator_cmd(input_data_dir, data_folder_name, op_config, triton_ip) 436 | exit = run_operator(op_config, docker_cmd, output_writers, 437 | metrics_output, continue_option, pipeline_summary_dict) 438 | if exit: 439 | break 440 | print_pipeline_summary(pipeline_summary_dict) 441 | 442 | 443 | def clean_up_containers(running_dict): 444 | """Kill the containers in the given dictionary and remove the item from the dictionary. 445 | 446 | Args: 447 | running_dict: Dictionary where key is image name and value is (container ID, ip_address) 448 | 449 | Returns: 450 | None 451 | """ 452 | for old_key, container_info in running_dict.items(): 453 | logging.debug(f"Tear down unused services {old_key}") 454 | if container_info: 455 | subproc_run_wrapper(["docker", "kill", container_info[0]]) 456 | running_dict.clear() 457 | 458 | 459 | def start_pipeline_services(op_config, running_dict, models_dir): 460 | """Start the pipeline services for the given op_config. 
461 | 462 | Args: 463 | op_config: A OperatorConfig object 464 | running_dict: Dictionary for keep track of currently running services 465 | models_dir: A directory that contains Triton models 466 | 467 | Return: 468 | None 469 | """ 470 | for service in op_config.services: 471 | logging.debug(f"Checking service with name {service.name}") 472 | key = service.image_n_tag + " " + " ".join(service.command) 473 | if running_dict.get(key): 474 | # Add the connection variables 475 | ip_address = running_dict[key][1] 476 | http_connections_dict = {k: f"{ip_address}:{v}" for k, v in service.http_connections.items()} 477 | op_config.update_variables(http_connections_dict) 478 | logging.debug("Found running services that suit the needs") 479 | else: 480 | logging.debug("Didn't find matching service, starting new service") 481 | if len(running_dict) != 0: # tear down current services before spin up another one 482 | clean_up_containers(running_dict) 483 | if "trtis" in service.name or "triton" in service.name: 484 | triton_container_id, ip_address = start_triton(models_dir, service.command, service.image_n_tag) 485 | running_dict[key] = (triton_container_id, ip_address) 486 | http_connections_dict = {k: f"{ip_address}:{v}" for k, v in service.http_connections.items()} 487 | op_config.update_variables(http_connections_dict) 488 | else: 489 | logging.warning("CPOST currently does not support services other than triton or trtis.") 490 | logging.warning(f"Skipping `{service.name}`, operator may fail because of this.") 491 | 492 | 493 | def run_pipeline_with_services( 494 | execution_order, input_data_dir, metrics_output, models_dir, continue_option): 495 | """Run the pipeline operators in the given execution_order using the directories given. 
496 | 497 | Args: 498 | execution_order: List of OperatorConfig objects in the order of execution 499 | input_data_dir: Path to the input payload directory 500 | metrics_output: A Path object for the metrics directory or stdout 501 | models_dir: A directory that contains Triton models 502 | continue_option: A ContinueOptions Enum object 503 | 504 | Returns: 505 | None 506 | """ 507 | with TemporaryDirectory() as data_folder_name: 508 | with get_output_writers(metrics_output) as output_writers: 509 | try: 510 | running_services = {} 511 | pipeline_summary_dict = {} 512 | for op_config in execution_order: 513 | if op_config.services: 514 | start_pipeline_services(op_config, running_services, models_dir) 515 | logging.info("\n{:_^60}".format(f"Executing Operator {op_config.name}")) 516 | docker_cmd = build_operator_cmd(input_data_dir, data_folder_name, op_config) 517 | exit = run_operator(op_config, docker_cmd, output_writers, 518 | metrics_output, continue_option, pipeline_summary_dict) 519 | if exit: 520 | break 521 | print_pipeline_summary(pipeline_summary_dict) 522 | finally: 523 | # Stop any currently running services 524 | clean_up_containers(running_services) 525 | --------------------------------------------------------------------------------