├── VERSION ├── tests ├── __init__.py ├── pipelines │ ├── operator_with_model.yaml │ ├── operator_with_services.yaml │ └── nullpipeline.yaml ├── test_main.py ├── test_topology_sort.py ├── test_container.py ├── test_clarac_utils.py ├── test_cli.py ├── test_utils.py ├── test_triton_utils.py └── test_pipeline_utils.py ├── .gitignore ├── requirements-dev.txt ├── requirements.txt ├── setup.cfg ├── src ├── __init__.py ├── constants.py ├── main.py ├── topology_sort.py ├── cli.py ├── clarac_utils.py ├── container.py ├── utils.py ├── triton_utils.py └── pipeline_utils.py ├── .github ├── ISSUE_TEMPLATE │ ├── feature_request.md │ └── bug_report.md └── workflows │ ├── build.yml │ ├── codeql-analysis.yml │ ├── release-staging.yml │ └── release.yml ├── ngc └── overview.md ├── setup.py ├── CONTRIBUTING.md ├── LICENSE └── README.md /VERSION: -------------------------------------------------------------------------------- 1 | 0.8.1 -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | venv/ 2 | .vscode/ 3 | *.csv 4 | demo/ 5 | .coverage 6 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | autopep8==1.5.7 2 | coverage==5.5 3 | flake8==3.9.2 4 | isort==5.9.2 5 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | psutil==5.8.0 2 | pytest==6.2.4 3 | PyYAML==5.4.1 4 | requests==2.25.1 5 | tabulate==0.8.9 6 | -------------------------------------------------------------------------------- /setup.cfg: 
-------------------------------------------------------------------------------- 1 | [coverage:run] 2 | source = . 3 | omit = venv/* 4 | 5 | [coverage:report] 6 | show_missing = True 7 | include = 8 | src/* 9 | 10 | 11 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 NVIDIA Corporation 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 
21 | -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: 'Build' 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | branches: [ main ] 8 | 9 | jobs: 10 | build: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v2 14 | - name: Set up Python 3.8 15 | uses: actions/setup-python@v2 16 | with: 17 | python-version: 3.8 18 | - name: Install dependencies 19 | run: | 20 | python -m pip install --upgrade pip wheel 21 | pip install -r requirements.txt 22 | - name: Build 23 | run: | 24 | python3 setup.py sdist bdist_wheel 25 | - name: Install 26 | run: | 27 | pip install ./dist/*.whl 28 | - name: Test 29 | run: python3 -m pytest tests 30 | -------------------------------------------------------------------------------- /.github/workflows/codeql-analysis.yml: -------------------------------------------------------------------------------- 1 | name: 'CodeQL' 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | branches: [ main ] 8 | schedule: 9 | - cron: '37 23 * * 0' 10 | 11 | jobs: 12 | analyze: 13 | name: Analyze 14 | runs-on: ubuntu-latest 15 | permissions: 16 | actions: read 17 | contents: read 18 | security-events: write 19 | 20 | strategy: 21 | fail-fast: false 22 | matrix: 23 | language: [ 'python' ] 24 | 25 | steps: 26 | - name: Checkout repository 27 | uses: actions/checkout@v2 28 | 29 | # Initializes the CodeQL tools for scanning. 
30 | - name: Initialize CodeQL 31 | uses: github/codeql-action/init@v1 32 | with: 33 | languages: ${{ matrix.language }} 34 | 35 | - name: Autobuild 36 | uses: github/codeql-action/autobuild@v1 37 | 38 | - name: Perform CodeQL Analysis 39 | uses: github/codeql-action/analyze@v1 40 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | ### Description 11 | A clear and concise description of what the bug is. 12 | 13 | ### Steps to reproduce 14 | Please share a clear and concise description of the problem. 15 | 16 | ... 17 | 18 | ### Expected behavior 19 | A clear and concise description of what you expected to happen. 20 | 21 | ### Actual behavior 22 | A clear and concise description of what actually happened. 23 | 24 | ### Configuration 25 | 26 | * CPOST version: (PIP package version or branch name if building from source) 27 | * Python version: 28 | * OS and version (distro if applicable): 29 | 30 | ### Regression? 31 | Did this work in the previous build or release of the playbook? If you can try a previous release or build to find out, that can help us narrow down the problem. If you don't know, that's OK. 32 | 33 | ### Other information 34 | (Please attach any relevant stdout / logs if available and remember to anonymize any PHI before sharing). 
CPOST is a tool that helps you run your pipeline locally and provides the CPU and memory usage of each operator run for the given input payload. Operators are run one at a time while their CPU and memory usage are sampled. The CPU and memory usage metrics are provided in a .csv format which allows further data analytics as needed.
5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | api-version: 0.4.0 15 | name: null-pipeline 16 | orchestrator: Clara 17 | pull-secrets: 18 | - ngc-clara 19 | operators: 20 | - name: null-reader 21 | variables: 22 | CLARA_TRACE: 2 23 | container: 24 | image: null-pipeline/operator-py 25 | tag: 0.8.1 26 | command: ["python", "register.py", "--agent", "renderserver"] 27 | input: 28 | - path: /input 29 | output: 30 | - path: /output 31 | models: 32 | # change the following line to match the name created for the model 33 | - name: segmentation_ct_spleen_v1 34 | - name: segmentation_ct_liver_v1 35 | -------------------------------------------------------------------------------- /tests/pipelines/operator_with_services.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2021 NVIDIA Corporation 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | api-version: 0.4.0 15 | name: null-pipeline 16 | orchestrator: Clara 17 | pull-secrets: 18 | - ngc-clara 19 | operators: 20 | - name: null-reader 21 | variables: 22 | CLARA_TRACE: 2 23 | container: 24 | image: null-pipeline/operator-py 25 | tag: 0.8.1 26 | input: 27 | - path: /input 28 | output: 29 | - path: /output 30 | services: 31 | - name: trtis 32 | # Triton Inference Server, required by this AI application. 33 | container: 34 | image: nvcr.io/nvidia/tritonserver 35 | tag: latest 36 | command: ["some", "command"] 37 | connections: 38 | http: 39 | - name: NVIDIA_CLARA_TRTISURI 40 | port: 8000 41 | -------------------------------------------------------------------------------- /src/constants.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 NVIDIA Corporation 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
import os
import sys
from pathlib import Path

# Bytes-to-megabytes conversion factor (decimal MB, not MiB).
B_MB_FACTOR = 1e6

# Root of the cgroup filesystem used to read container CPU/memory stats.
SYSFS_PATH = Path("/sys/fs/cgroup")

# True when running on a POSIX platform (cgroup sampling requires POSIX).
ON_POSIX = 'posix' in sys.builtin_module_names

# Nanoseconds per second, used when converting cgroup CPU counters.
NS_PER_S = 1e9
# Kernel scheduler clock ticks per second on this host.
CLOCK_TICKS_PER_S = os.sysconf(os.sysconf_names['SC_CLK_TCK'])
# Number of CPUs currently online on this host.
ONLINE_CPUS = os.sysconf(os.sysconf_names['SC_NPROCESSORS_ONLN'])

# How long to wait for a container ID to appear before giving up.
ID_WAITING_TIME_SECONDS = 15
METRIC_SAMPLING_PERIOD_SECONDS = 0.2  # i.e. 200 ms between metric samples


# Triton Inference Server image and the timeouts/ports used when cpost
# provisions its own Triton instance for model-serving operators.
TRITON_IMAGE_TAG = "nvcr.io/nvidia/tritonserver:20.07-v1-py3"
TRITON_READY_TIMEOUT_SECONDS = 30
TRITON_WAIT_TIME_SECONDS = 15
TRITON_WAIT_SLEEP_TIME_SECONDS = 1
# Environment variables injected into operator containers so they can find
# the Triton endpoints (current and legacy variable names).
TRITON_HTTP_ENV_VAR = "NVIDIA_TRITON_HTTPURI"
TRITON_HTTP_PORT = 8000
TRITON_GRPC_ENV_VAR = "NVIDIA_TRITON_GRPCURI"
TRITON_GRPC_PORT = 8001
LEGACY_TRTIS_HTTP_ENV_VAR = "NVIDIA_CLARA_TRTISURI"
LEGACY_TRITON_HTTP_ENV_VAR = "CLARA_TRITON_URI"
actions/upload-artifact@v2 37 | with: 38 | path: ./dist/nvidia_clara_cpost-*.whl 39 | 40 | - if: matrix.python-version == '3.8' 41 | name: Check artifacts 42 | run: | 43 | ls -al dist/ 44 | rm dist/nvidia-clara-cpost-*.tar.gz 45 | ls -al dist/ 46 | 47 | - if: matrix.python-version == '3.8' 48 | name: Publish to Test PyPI 49 | uses: pypa/gh-action-pypi-publish@master 50 | with: 51 | password: ${{ secrets.TEST_PYPI_TOKEN }} 52 | repository_url: https://test.pypi.org/legacy/ -------------------------------------------------------------------------------- /tests/pipelines/nullpipeline.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2021 NVIDIA Corporation 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | api-version: 0.4.0 15 | name: null-pipeline 16 | orchestrator: Clara 17 | pull-secrets: 18 | - ngc-clara 19 | # This pipeline is intended to emulate the traditional 3-stage pipeline used by our 20 | # reference pipelines: reader -> inference -> writer. 
21 | operators: 22 | - name: null-reader 23 | variables: 24 | CLARA_TRACE: 2 25 | container: 26 | image: null-pipeline/operator-py 27 | tag: 0.8.1 28 | input: 29 | - path: /input 30 | output: 31 | - path: /output 32 | - name: null-inference 33 | variables: 34 | CLARA_TRACE: 2 35 | container: 36 | image: null-pipeline/operator-py 37 | tag: 0.8.1 38 | input: 39 | - from: null-reader 40 | path: /input 41 | output: 42 | - path: /output 43 | requests: 44 | gpu: 1 # Request a GPU to better emulate GPU enabled inference workloads. 45 | - name: null-writer 46 | variables: 47 | CLARA_TRACE: 2 48 | container: 49 | image: null-pipeline/operator-py 50 | tag: 0.8.1 51 | input: 52 | - from: null-inference 53 | path: /input 54 | output: 55 | - path: /output 56 | -------------------------------------------------------------------------------- /tests/test_main.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 NVIDIA Corporation 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
# NOTE: @patch decorators are applied bottom-up, so the mock arguments below
# arrive in reverse order: mock_parse corresponds to the bottom decorator
# ("main.parse_args") and mock_run to the top one ("main.run_pipeline").
@patch("main.run_pipeline")
@patch("main.topo_sort_pipeline")
@patch("main.check_images_and_tags")
@patch("main.run_clarac")
@patch("main.assert_installed")
@patch("main.set_up_logging")
@patch("main.parse_args")
def test_main(mock_parse, mock_set_logging, mock_assert_install, mock_run_clarac, mock_check, mock_sort, mock_run):
    """Verify main() wires its collaborators together in the expected order."""

    # Simulate parsed CLI args and a compiled pipeline config.
    mock_parse.return_value = MagicMock(**{"verbose": 2, "pipeline_path": "some_path"})
    mock_run_clarac.return_value = MagicMock(**{"operators": "operators"})
    main()
    mock_set_logging.assert_called_with(2)
    # Once for "clarac" and once for "docker".
    assert mock_assert_install.call_count == 2
    mock_run_clarac.assert_called_with("some_path")
    mock_check.assert_called_with("operators")
    mock_sort.assert_called_with("operators")
    mock_run.assert_called_once()
name: Install 32 | run: | 33 | pip install ./dist/*.whl 34 | - name: Test 35 | run: python3 -m pytest tests 36 | 37 | - if: matrix.python-version == '3.8' && startsWith(github.ref, 'refs/tags/') 38 | name: Upload artifacts 39 | uses: actions/upload-artifact@v2 40 | with: 41 | path: ./dist/nvidia_clara_cpost-*.whl 42 | 43 | - if: matrix.python-version == '3.8' && startsWith(github.ref, 'refs/tags/') 44 | name: Check artifacts 45 | run: | 46 | ls -al dist/ 47 | rm ./dist/nvidia-clara-cpost-*.tar.gz 48 | ls -al dist/ 49 | 50 | - if: matrix.python-version == '3.8' && startsWith(github.ref, 'refs/tags/') 51 | name: Publish to Production PyPI 52 | uses: pypa/gh-action-pypi-publish@master 53 | with: 54 | user: __token__ 55 | password: ${{ secrets.PYPI_TOKEN }} 56 | 57 | -------------------------------------------------------------------------------- /src/main.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 NVIDIA Corporation 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
def main():
    """Command-line entry point for cpost.

    Parses CLI arguments, configures logging, verifies the external tool
    dependencies, compiles the pipeline definition, topologically sorts the
    operators, and runs them in order.
    """
    parsed_args = parse_args(sys.argv[1:])

    set_up_logging(parsed_args.verbose)

    # cpost shells out to both the Clara compiler and Docker; fail fast if
    # either is missing.
    assert_installed("clarac")
    assert_installed("docker")
    logging.info("All software dependencies are fulfilled.")

    # Compile the pipeline definition into a pipeline config object.
    pipeline_config = run_clarac(parsed_args.pipeline_path)

    # Ensure each operator's container image:tag is available before running.
    check_images_and_tags(pipeline_config.operators)

    # Operators must execute in an order that respects input dependencies.
    execution_order = topo_sort_pipeline(pipeline_config.operators)

    run_pipeline(execution_order, parsed_args.input_dir, parsed_args.metrics_dir,
                 parsed_args.models_dir, parsed_args.force)
import os

import setuptools

# Long description for PyPI is taken verbatim from the README.
with open("README.md", "r") as fh:
    long_description = fh.read()

# Install required packages from requirements.txt file.
# Default to an empty list so setup() does not crash with a NameError when
# requirements.txt is absent (e.g. when building from a stripped sdist).
install_requires = []
requirements_relative_path = "/requirements.txt"
package_folder = os.path.dirname(os.path.realpath(__file__))
requirements_path = package_folder + requirements_relative_path
if os.path.isfile(requirements_path):
    with open(requirements_path) as f:
        install_requires = f.read().splitlines()

# Extract version number from VERSION file; fall back to a placeholder.
release_version = "0.0.0"
if os.path.exists('VERSION'):
    with open('VERSION') as version_file:
        release_version = version_file.read().strip()

setuptools.setup(
    name="nvidia-clara-cpost",
    author="NVIDIA Clara Deploy",
    version=release_version,
    description="Python package to run Clara Pipeline Operator Sizing Tool (cpost)",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://gitlab-master.nvidia.com/Clara/sdk/-/tree/main/Tools/cpost",
    install_requires=install_requires,
    packages=setuptools.find_packages('.'),
    entry_points={
        'console_scripts': [
            'cpost = src.main:main'
        ]
    },
    classifiers=[
        "Programming Language :: Python :: 3",
        "Operating System :: OS Independent",
    ],
    python_requires='>=3.8',
)
The sign-off is a simple line at the end of the explanation for the patch. Your 9 | signature certifies that you wrote the patch or otherwise have the right to pass 10 | it on as an open-source patch. The rules are pretty simple: if you can certify 11 | the below (from [developercertificate.org](http://developercertificate.org/)): 12 | 13 | ``` 14 | Developer Certificate of Origin 15 | Version 1.1 16 | 17 | Copyright (C) 2004, 2006 The Linux Foundation and its contributors. 18 | 19 | Everyone is permitted to copy and distribute verbatim copies of this 20 | license document, but changing it is not allowed. 21 | 22 | 23 | Developer's Certificate of Origin 1.1 24 | 25 | By making a contribution to this project, I certify that: 26 | 27 | (a) The contribution was created in whole or in part by me and I 28 | have the right to submit it under the open source license 29 | indicated in the file; or 30 | 31 | (b) The contribution is based upon previous work that, to the best 32 | of my knowledge, is covered under an appropriate open source 33 | license and I have the right under that license to submit that 34 | work with modifications, whether created in whole or in part 35 | by me, under the same open source license (unless I am 36 | permitted to submit under a different license), as indicated 37 | in the file; or 38 | 39 | (c) The contribution was provided directly to me by some other 40 | person who certified (a), (b) or (c) and I have not modified 41 | it. 42 | 43 | (d) I understand and agree that this project and the contribution 44 | are public and that a record of the contribution (including all 45 | personal information I submit with it, including my sign-off) is 46 | maintained indefinitely and may be redistributed consistent with 47 | this project or the open source license(s) involved. 
Then you just add a line to every git commit message:

    Signed-off-by: Joe Smith <joe.smith@example.com>
import os
import sys

import pytest

sys.path.append("{}/{}".format(os.path.dirname(os.path.realpath(__file__)), "../src"))
from clarac_utils import OperatorConfig  # nopep8 # noqa: E402
from topology_sort import PipelineDAG, topo_sort_pipeline  # nopep8 # noqa: E402


def test_topo_sort():
    """Sort a DAG with multiple roots and verify the exact Kahn order."""
    g = PipelineDAG()
    g.add_input_edge(2, 5)
    g.add_input_edge(0, 5)
    g.add_input_edge(0, 4)
    g.add_input_edge(1, 4)
    g.add_input_edge(3, 2)
    g.add_input_edge(1, 3)
    assert g.topological_sort() == [5, 4, 2, 0, 3, 1]


def test_topo_sort_2():
    """Sort a simple chain 4 -> 3 -> 2 -> 1."""
    g = PipelineDAG()
    g.add_input_edge(2, 1)
    g.add_input_edge(3, 2)
    g.add_input_edge(4, 3)
    assert g.topological_sort() == [1, 2, 3, 4]


def test_topo_sort_error():
    """A cyclic graph must raise RuntimeError."""
    g = PipelineDAG()
    g.add_input_edge(2, 1)
    g.add_input_edge(3, 2)
    g.add_input_edge(1, 3)
    with pytest.raises(RuntimeError):
        g.topological_sort()


def test_a_pipeline():
    """Three chained operators are returned in dependency order."""
    op1 = OperatorConfig("Input1", "tag", None, None, [{"path": "/input"}], None)
    op2 = OperatorConfig("Input2", "tag", None, None, [{"from": "Input1", "path": "/input"}], None)
    op3 = OperatorConfig("Input3", "tag", None, None, [{"from": "Input2", "path": "/input"}], None)

    sequence = topo_sort_pipeline([op2, op3, op1])
    assert sequence == [op1, op2, op3]


def test_a_single_operator_pipeline():
    """A single-operator pipeline is returned unchanged."""
    op1 = OperatorConfig("Input1", "tag", None, None, [{"path": "/input"}], None)

    sequence = topo_sort_pipeline([op1])
    assert sequence == [op1]


def test_two_operator_pipeline():
    """Two operators given out of order are sorted by their dependency."""
    op1 = OperatorConfig("Input1", "tag", None, None, [{"path": "/input"}], None)
    op2 = OperatorConfig("Input2", "tag", None, None, [{"from": "Input1", "path": "/input"}], None)

    sequence = topo_sort_pipeline([op2, op1])
    assert sequence == [op1, op2]
import logging
from collections import defaultdict, deque


class PipelineDAG:
    """Directed acyclic graph of pipeline operators used for sorting."""

    def __init__(self):
        # node -> number of unsatisfied inputs (in-degree)
        self.input_deg_graph = defaultdict(lambda: 0)
        self.output_graph = defaultdict(list)  # dictionary containing adjacency List

    def add_node(self, node):
        """Register a node that may have no edges at all.

        Ensures isolated nodes (operators without dependencies or dependents)
        still participate in the topological sort.

        Args:
            node: Node to be added

        Returns:
            None
        """
        self.input_deg_graph[node] += 0
        self.output_graph[node]  # defaultdict access materializes the key

    def add_input_edge(self, node, input_node):
        """Add the node by giving its input node.

        Args:
            node: Node to be added
            input_node: One of its dependency nodes

        Returns:
            None
        """
        self.output_graph[input_node].append(node)
        # Update the in-degree bookkeeping for both endpoints as we add edges
        self.input_deg_graph[input_node] += 0
        self.input_deg_graph[node] += 1

    def topological_sort(self):
        """Topologically sort the given graph based on Kahn's algorithm.

        Returns:
            A list that is the topological order of the current graph

        Raises:
            RuntimeError: if the graph contains a cycle
        """
        visited_count = 0
        topo_order = []
        # Seed the queue with all zero-in-degree nodes; deque gives O(1)
        # pops from the left, unlike list.pop(0) which is O(n).
        zero_indegree = deque(node for node, deg in self.input_deg_graph.items() if deg == 0)

        # Repeatedly take a zero-in-degree node and release its dependents
        while zero_indegree:
            cur_node = zero_indegree.popleft()
            topo_order.append(cur_node)

            # Each outgoing edge satisfied lowers the child's in-degree by 1
            for child in self.output_graph[cur_node]:
                self.input_deg_graph[child] -= 1
                if self.input_deg_graph[child] == 0:
                    zero_indegree.append(child)

            visited_count += 1

        # input_deg_graph holds every known node; if any were never visited,
        # they are part of a cycle.
        if visited_count != len(self.input_deg_graph):
            raise RuntimeError("There exists a cycle in the given graph")

        return topo_order


def topo_sort_pipeline(operators):
    """Topologically sort the given operators.

    Args:
        operators: List of OperatorConfig objects

    Returns:
        A list of OperatorConfig objects in topological order
    """
    logging.debug(f"Topologically order the given input: {operators}")
    if len(operators) == 1:
        result = operators.copy()
    else:
        # Construct a dictionary from operators so that we can convert names back to OperatorConfigs later
        op_dict = {op.name: op for op in operators}
        dag = PipelineDAG()
        for op in operators:
            # Register every operator explicitly so operators with no "from"
            # dependencies (isolated nodes) are not dropped from the result.
            dag.add_node(op.name)
            for input_path in op.inputs:
                if input_path.get("from"):
                    dag.add_input_edge(op.name, input_path.get("from"))
        sequence = dag.topological_sort()
        result = [op_dict[op_name] for op_name in sequence]
    logging.debug(f"Topologically order result is: {result}")
    return result


import argparse
import sys
from enum import IntEnum
from pathlib import Path


class ContinueOptions(IntEnum):
    """What to do when an operator fails: ask the user, keep going, or stop."""
    NONE = 0  # prompt user y/n
    CONT = 1  # continue execution
    STOP = 2  # stop execution

    # Helpers so the enum renders nicely in argparse help and error messages.

    def __str__(self):
        return self.name.lower()

    def __repr__(self):
        return str(self)

    @staticmethod
    def argparse(s):
        try:
            return ContinueOptions[s.upper()]
        except KeyError:
            # Hand the raw string back so argparse's `choices` check rejects it.
            return s


class MyParser(argparse.ArgumentParser):
    """ArgumentParser variant that prints the full help text on errors."""

    def error(self, message):
        """Write the error plus help to stderr, then exit with status 2."""
        sys.stderr.write(f'error: {message}\n')
        self.print_help(sys.stderr)
        self.exit(2)


def valid_file(path):
    """Helper method for parse_args to convert to Path and verify if the file path exists.

    Args:
        path: path to file from parse_args()

    Returns:
        The absolute path of the given file path if it exists

    Raises:
        argparse.ArgumentTypeError if the file given does not exist
    """
    candidate = Path(path)
    if not (candidate.exists() and candidate.is_file()):
        raise argparse.ArgumentTypeError(f"No such file or the given path is not a file: '{candidate}'")
    return candidate.absolute()


def valid_dir(path):
    """Helper method for parse_args to convert to Path and verify if the directory exists.

    Args:
        path: path to directory from parse_args()

    Returns:
        The absolute path of the given directory if it exists

    Raises:
        argparse.ArgumentTypeError if the directory given does not exist or if not a directory
    """
    candidate = Path(path)
    if not (candidate.exists() and candidate.is_dir()):
        raise argparse.ArgumentTypeError(f"No such directory or the given path is not a directory: '{candidate}'")
    return candidate.absolute()


def parse_args(args):
    """Create an argument parser and parse the command-line arguments.

    Args:
        args: A list of arguments to parse

    Returns:
        A parser object containing parsed arguments
    """

    cli_parser = MyParser(prog="cpost", description="Clara Pipeline Sizing Tool CLI")

    # Required positionals: pipeline definition and payload directory.
    cli_parser.add_argument("pipeline_path", metavar="<pipeline_path>",
                            type=valid_file, help="pipeline definition file path")

    cli_parser.add_argument("input_dir", metavar="<input_dir>", type=valid_dir, help="input payload directory")

    cli_parser.add_argument("--metrics_dir", type=valid_dir,
                            help="metrics output directory, if not specified, write to stdout")

    cli_parser.add_argument("--models_dir", type=valid_dir,
                            help="directory for Triton models, required if pipeline uses Triton")

    cli_parser.add_argument(
        "-v", "--verbose", action='store_true',
        help="verbose output (DEBUG level). If not specified, default output is INFO level.")

    cli_parser.add_argument(
        "--force", default=ContinueOptions.NONE, const=ContinueOptions.NONE, nargs='?', type=ContinueOptions.argparse,
        choices=list(ContinueOptions),
        help='force continue or stop when operator failure occurs. \
            (default: %(default)s, which will prompt the user for each failure).')

    return cli_parser.parse_args(args)
def run_clarac(source_file: str) -> PipelineConfig:
    """Run Clara Compiler in a subprocess using the given pipeline definition and parse the results.

    Args:
        source_file: path to the pipeline definition file

    Returns:
        A PipelineConfig object

    Raises:
        SystemExit: If the compiler exits non-zero or produces invalid YAML
    """
    def _extract_services(services):
        """Extract services section in pipeline definition into list of ServiceConfig."""
        result = []
        for service in services:
            container = service["container"]
            command = container.get("command")
            if command:
                # cpost mounts the service data at the container root, so the
                # placeholder prefix is stripped from each command token.
                command = [c.replace("$(NVIDIA_CLARA_SERVICE_DATA_PATH)", "") for c in command]
            result.append(ServiceConfig(
                name=service["name"],
                image_n_tag=container["image"] + ":" + container["tag"],
                command=command,
                http_connections={con["name"]: con["port"] for con in service["connections"].get("http")}))
        return result

    logging.debug("Running Clara Compiler to validate the pipeline definition ...")
    with NamedTemporaryFile() as result_file:
        cmd = ["clarac", "-p", source_file, "-o", result_file.name, "--resolve-imports"]
        # FIX: capture stdout/stderr. Without capture_output=True the
        # CompletedProcess's stdout/stderr attributes are always None, so the
        # log statements below logged nothing useful.
        proc = subproc_run(cmd, capture_output=True)
        if proc.returncode != 0:
            logging.error(proc.stderr)
            sys.exit(proc.returncode)
        logging.debug(f"stdout from Clara Compiler: {proc.stdout}")
        logging.debug(f"Clara Compiler returned with error code {proc.returncode}, loading result as python object")

        try:
            # The compiler wrote to result_file.name; this handle is still at
            # position 0, so yaml reads the whole document.
            config = yaml.load(result_file, yaml.FullLoader)
        except yaml.YAMLError as exc:
            logging.error(f"Error in configuration file from Clara Compiler: {exc}")
            sys.exit(2)
        logging.debug(f"The content loaded from Clara Compiler is: {config}")

        operators = []
        # Get the objects of interest, construct a list, and return it
        for op in config["operators"]:
            # Names of Triton models and attached services used by this operator.
            models = op.get("models")
            op_models = [model_dict["name"] for model_dict in models] if models else None
            services = op.get("services")
            op_services = _extract_services(services) if services else None

            container = op["container"]
            operator = OperatorConfig(
                name=op["name"],
                image_n_tag=container["image"] + ":" + container["tag"],
                command=container.get("command"),
                variables=op.get("variables"),
                inputs=op["input"],
                outputs=op.get("output"),
                models=op_models,
                services=op_services)
            operators.append(operator)

        return PipelineConfig(name=config["name"], operators=operators)


import os
import sys
import tempfile
from pathlib import Path
from unittest.mock import patch

import pytest

sys.path.append("{}/{}".format(os.path.dirname(os.path.realpath(__file__)), "../src"))
from container import Container, Metrics, RawMetrics  # nopep8 # noqa: E402


def is_empty(any_structure):
    """Return True when the given structure is empty (falsy)."""
    return not any_structure


TEMP_DIR = Path(tempfile.gettempdir())
TEST_SYS_FS = TEMP_DIR / "test_sys_fs"


@patch("container.SYSFS_PATH", TEST_SYS_FS)
class TestContainer:
    """Unit tests for the Container metrics-sampling class."""

    def test_init_container(self):
        # A fresh Container starts with no id, no samples, and no paths.
        fresh = Container()
        assert isinstance(fresh, Container)
        assert is_empty(fresh.id)
        assert is_empty(fresh.raw_metrics)
        assert is_empty(fresh.metric_paths)

    def test_create_metrics_path_no_id(self):
        # Building cgroup paths without an id must fail loudly.
        with pytest.raises(RuntimeError):
            Container().construct_metrics_path()

    def test_create_metrics_path_with_id(self):
        tracked = Container()
        tracked.id = "testID1"
        tracked.construct_metrics_path()

        cpu_dir = TEST_SYS_FS / "cpuacct" / "docker" / tracked.id
        mem_dir = TEST_SYS_FS / "memory" / "docker" / tracked.id
        assert tracked.metric_paths == (
            cpu_dir / "cpuacct.usage",
            cpu_dir / "cpuacct.usage_percpu",
            mem_dir / "memory.usage_in_bytes",
        )

    def test_metrics_path_exists(self, tmp_path):
        tracked = Container()
        paths = (tmp_path / "p1", tmp_path / "p2", tmp_path / "p3")
        tracked.metric_paths = paths

        # The check only passes once all three files exist.
        for path in paths:
            assert not tracked.metrics_path_exists()
            path.touch()
        assert tracked.metrics_path_exists()

    @patch("container.psutil.cpu_times")
    def test_read_raw_metrics(self, mock_cpu, tmp_path):
        fake_cpu_times = [10, 20, 10, 20, 10, 20, 10, 20]
        mock_cpu.return_value = fake_cpu_times
        tracked = Container()
        cpu_file, per_cpu_file, mem_file = (tmp_path / name for name in ("p1", "p2", "p3"))
        cpu_file.write_bytes(b'123')
        per_cpu_file.write_bytes(b'456')
        mem_file.write_bytes(b'789')
        tracked.metric_paths = (cpu_file, per_cpu_file, mem_file)

        sample = tracked._read_raw_metrics()
        assert isinstance(sample, RawMetrics)
        assert isinstance(sample.timestamp, float)
        assert sample.cpu == float(b'123')
        assert sample.per_cpu == b'456'
        # Only the first seven psutil cpu_times fields count toward system time.
        assert sample.sys_cpu == sum(fake_cpu_times[:7])
        assert sample.memory == float(b'789')

    def test_sample_metrics_no_path(self):
        # Sampling before the metric paths are built must fail loudly.
        with pytest.raises(RuntimeError):
            Container().sample_metrics()

    @patch("container.Container._read_raw_metrics")
    @patch("container.Container._process_raw_data")
    def test_sample_metrics(self, mock_process_data, mock_read_metrics):
        tracked = Container()
        tracked.metric_paths = (1, 2, 3)
        mock_read_metrics.side_effect = [1, 2, 3]
        # Pretend processing simply sums the two adjacent raw samples.
        mock_process_data.side_effect = lambda prev, cur: prev + cur

        # Each call appends one raw sample; processed metrics start at the
        # second call since they need a pair of raw samples.
        expected = [([1], []), ([1, 2], [3]), ([1, 2, 3], [3, 5])]
        for raw_expected, processed_expected in expected:
            tracked.sample_metrics()
            assert tracked.raw_metrics == raw_expected
            assert tracked.metrics == processed_expected

    @patch("container.ONLINE_CPUS", 4)
    def test_process_raw_data(self):
        tracked = Container()
        earlier = RawMetrics(
            timestamp=2.0, cpu=800000.0, per_cpu=b'300000 0 0 500000 \n',
            sys_cpu=14000000.00, memory=6500000)
        later = RawMetrics(
            timestamp=3.0, cpu=1000000.0, per_cpu=b'500000 0 0 500000 \n',
            sys_cpu=14000000.60, memory=8500000)

        result = tracked._process_raw_data(earlier, later)

        cpu_delta = (later.cpu - earlier.cpu) / 1e9
        sys_delta = later.sys_cpu - earlier.sys_cpu
        assert result == Metrics(
            timestamp=2.5,
            cpu_percent=(cpu_delta / sys_delta) * 4 * 100,
            memory=7.50,
        )


import os
import sys
from pathlib import Path
from unittest.mock import MagicMock, patch

import pytest

sys.path.append("{}/{}".format(os.path.dirname(os.path.realpath(__file__)), "../src"))
from clarac_utils import OperatorConfig, PipelineConfig, ServiceConfig, run_clarac  # nopep8 # noqa: E402

# Location of the pipeline definition fixtures and the image tag they share.
PIPELINES_DIR = Path(__file__).parent / "pipelines"
OPERATOR_IMAGE_N_TAG = "null-pipeline/operator-py:0.8.1"


@pytest.mark.parametrize("og_variables, exp_variables",
                         [(None, {"a": 1, "b": 2}),
                          ({"c": 3}, {"c": 3, "a": 1, "b": 2})])
def test_op_config_update_variables(og_variables, exp_variables):
    """Merging variables keeps existing keys and adds the missing ones."""
    op = OperatorConfig("op1", "image_tag", None, og_variables, None, None)
    op.update_variables({"a": 1, "b": 2})
    assert op.variables == exp_variables


@patch("clarac_utils.subproc_run")
def test_run_clarac_subproc_error(mock_subproc_run, tmp_path):
    """A non-zero compiler exit code must abort with SystemExit."""
    mock_subproc_run.return_value = MagicMock(returncode=1, stderr="some error")
    with pytest.raises(SystemExit):
        run_clarac(tmp_path)


@patch("clarac_utils.NamedTemporaryFile")
@patch("clarac_utils.subproc_run")
def test_run_clarac_yaml_error(mock_subproc_run, mock_temp_file, tmp_path):
    """Malformed YAML from the compiler must abort with SystemExit."""
    mock_subproc_run.return_value = MagicMock(returncode=0, stdout="some output")
    bad_yaml = tmp_path / "bad.yaml"
    bad_yaml.touch()
    bad_yaml.write_text("api-version: '0.4.0'\n name: null-pipeline")
    with open(bad_yaml) as bad_yaml_obj:
        mock_temp_file.return_value.__enter__.return_value = bad_yaml_obj

        with pytest.raises(SystemExit):
            run_clarac(tmp_path)


def _assert_null_operator(op, name, input_from=None):
    """Shared assertions for the operators of the null-pipeline fixture."""
    expected_input = {"name": None, "path": "/input"}
    if input_from is not None:
        expected_input["from"] = input_from
    assert op.name == name
    assert op.image_n_tag == OPERATOR_IMAGE_N_TAG
    assert op.command is None
    assert op.variables == {"CLARA_TRACE": 2}
    assert op.inputs == [expected_input]
    assert op.outputs == [{"name": None, "path": "/output"}]
    assert op.models is None
    assert op.services is None


@pytest.mark.skip("Skipping due to pipeline setup for clarac is incomplete")
def test_run_clarac():
    config = run_clarac(PIPELINES_DIR / "nullpipeline.yaml")
    assert isinstance(config, PipelineConfig)
    assert config.name == "null-pipeline"
    assert len(config.operators) == 3
    _assert_null_operator(config.operators[0], "null-reader")
    _assert_null_operator(config.operators[1], "null-inference", input_from="null-reader")
    _assert_null_operator(config.operators[2], "null-writer", input_from="null-inference")


@pytest.mark.skip("Skipping due to pipeline setup for clarac is incomplete")
def test_run_clarac_with_triton_models():
    config = run_clarac(PIPELINES_DIR / "operator_with_model.yaml")
    assert isinstance(config, PipelineConfig)
    assert config.name == "null-pipeline"
    assert len(config.operators) == 1
    op = config.operators[0]
    assert op.name == "null-reader"
    assert op.image_n_tag == OPERATOR_IMAGE_N_TAG
    assert op.inputs == [{"name": None, "path": "/input"}]
    assert op.outputs == [{"name": None, "path": "/output"}]
    assert op.command == ["python", "register.py", "--agent", "renderserver"]
    assert op.models == ["segmentation_ct_spleen_v1", "segmentation_ct_liver_v1"]
    assert op.services is None


@pytest.mark.skip("Skipping due to pipeline setup for clarac is incomplete")
def test_run_clarac_with_pipeline_services():
    config = run_clarac(PIPELINES_DIR / "operator_with_services.yaml")
    assert isinstance(config, PipelineConfig)
    assert config.name == "null-pipeline"
    assert len(config.operators) == 1
    op = config.operators[0]
    assert op.name == "null-reader"
    assert op.image_n_tag == OPERATOR_IMAGE_N_TAG
    assert op.inputs == [{"name": None, "path": "/input"}]
    assert op.outputs == [{"name": None, "path": "/output"}]
    assert op.command is None
    assert op.models is None
    assert len(op.services) == 1
    service = op.services[0]
    assert isinstance(service, ServiceConfig)
    assert service.name == "trtis"
    assert service.image_n_tag == "nvcr.io/nvidia/tritonserver:latest"
    assert service.command == ["some", "command"]
    assert service.http_connections == {"NVIDIA_CLARA_TRTISURI": 8000}
# See the License for the specific language governing permissions and
# limitations under the License.


import os
import re
import sys
from argparse import ArgumentTypeError

import pytest

sys.path.append("{}/{}".format(os.path.dirname(os.path.realpath(__file__)), "../src"))
from cli import ContinueOptions, parse_args  # nopep8 # noqa: E402


@pytest.fixture(scope="function")
def file_maker(tmp_path):
    """A function scoped pytest fixture to return the path of a temporary file."""
    pipeline_file = tmp_path / "pipeline_defn"
    pipeline_file.touch()
    return str(pipeline_file)


def swap_pattern(pattern, substitute, args):
    """Substitute every occurrence of pattern in each argument string."""
    return [re.sub(pattern, substitute, arg) for arg in args]


def test_swap_pattern():
    original = ["%tmp_file%", "some_input_dir", "%tmp%", "hello", "%tmp%"]
    assert swap_pattern("%tmp%", "abc", original) == [
        "%tmp_file%", "some_input_dir", "abc", "hello", "abc"]


@pytest.mark.parametrize("input_args", [["%tmp_file%"], [], ["-x"], ["-v"]])
def test_missing_required_args(input_args, file_maker, capsys):
    input_args = swap_pattern(r'%tmp_file%', file_maker, input_args)

    with pytest.raises(SystemExit) as pytest_wrapped_e:
        parse_args(input_args)
    out, err = capsys.readouterr()

    # The custom parser prints nothing to stdout, and help plus the error to stderr.
    assert out == ""
    assert "error: the following arguments are required" in err
    assert "usage: cpost" in err
    assert pytest_wrapped_e.value.code == 2


@pytest.mark.parametrize("input_args, error",
                         [(["some_pipeline_path", "some_input_dir"], ArgumentTypeError),
                          (["/tmp", "/tmp"], ArgumentTypeError),
                          (["%tmp_file%", "some_input_dir"], ArgumentTypeError),
                          (["%tmp_file%", "%tmp_file%"], ArgumentTypeError),
                          (["%tmp_file%", "/tmp", "--metrics_dir", "some_dir"], ArgumentTypeError),
                          (["%tmp_file%", "/tmp", "--metrics_dir", "%tmp_file%"], ArgumentTypeError),
                          (["%tmp_file%", "/tmp", "--models_dir", "some_dir"], ArgumentTypeError),
                          (["%tmp_file%", "/tmp", "--models_dir", "%tmp_file%"], ArgumentTypeError)])
def test_invalid_path(input_args, error, file_maker):
    input_args = swap_pattern(r'%tmp_file%', file_maker, input_args)
    with pytest.raises(SystemExit) as pytest_wrapped_e:
        with pytest.raises(error) as excinfo:
            parse_args(input_args)
        assert "No such" in str(excinfo.value)
    assert pytest_wrapped_e.value.code == 2


@pytest.mark.parametrize("optional_dir_specified", [True, False])
def test_valid_path(optional_dir_specified, tmp_path, file_maker):
    input_dir = tmp_path / test_valid_path.__name__
    input_dir.mkdir()
    pipeline = file_maker

    if optional_dir_specified:
        metrics_dir = tmp_path / "test_output_metrics"
        metrics_dir.mkdir()
        models_dir = tmp_path / "model_repo"
        models_dir.mkdir()
        parsed = parse_args([pipeline, str(input_dir),
                             "--metrics_dir", str(metrics_dir),
                             "--models_dir", str(models_dir)])
        assert parsed.metrics_dir == metrics_dir
        assert parsed.models_dir == models_dir
    else:
        parsed = parse_args([pipeline, str(input_dir)])
        assert parsed.metrics_dir is None
        assert parsed.models_dir is None

    # Common expectations for both variants.
    assert parsed.input_dir == input_dir
    assert str(parsed.pipeline_path) == pipeline
    assert parsed.force == ContinueOptions.NONE


@pytest.mark.parametrize("force_args, exp_option",
                         [(["--force", "cont"], ContinueOptions.CONT),
                          (["--force=cont"], ContinueOptions.CONT),
                          ([], ContinueOptions.NONE),
                          (["--force", "none"], ContinueOptions.NONE),
                          (["--force", "stop"], ContinueOptions.STOP)])
def test_parse_force_options(force_args, exp_option, tmp_path, file_maker):
    input_dir = tmp_path / test_parse_force_options.__name__
    input_dir.mkdir()
    pipeline = file_maker

    parsed = parse_args(force_args + [pipeline, str(input_dir)])

    assert parsed.input_dir == input_dir
    assert str(parsed.pipeline_path) == pipeline
    assert parsed.metrics_dir is None
    assert parsed.models_dir is None
    assert parsed.force == exp_option


@pytest.mark.parametrize("force_args, err_msg",
                         [(["--force", "continue"], "argument --force: invalid choice: 'continue'"),
                          (["--force"], "argument --force: invalid choice:"),
                          (["--force", "aaaa"], "argument --force: invalid choice: 'aaaa'")])
def test_parse_force_options_error(force_args, err_msg, tmp_path, capsys, file_maker):
    input_dir = tmp_path / test_parse_force_options_error.__name__
    input_dir.mkdir()
    pipeline = file_maker

    with pytest.raises(SystemExit):
        parse_args(force_args + [pipeline, str(input_dir)])

    out, err = capsys.readouterr()
    assert err_msg in err
# See the License for the specific language governing permissions and
# limitations under the License.


import io
import os
import sys
from dataclasses import dataclass
from unittest.mock import MagicMock, patch

import pytest

sys.path.append("{}/{}".format(os.path.dirname(os.path.realpath(__file__)), "../src"))
from clarac_utils import OperatorConfig  # nopep8 # noqa: E402
from utils import (assert_installed, check_images_and_tags, convert_percent_to_cores,  # nopep8 # noqa: E402
                   prompt_yes_or_no, round_up_to_multiple, subproc_run_wrapper, write_to_csv)


@pytest.mark.parametrize("data_in, base, data_out",
                         [(3, 2, 4),
                          (4, 2, 4),
                          (15.5, 5, 20),
                          (148.05, 256, 256),
                          (256.05, 256, 512)])
def test_round_up_to_multiple(data_in, base, data_out):
    """Values round up to the next multiple of base; exact multiples stay put."""
    assert round_up_to_multiple(data_in, base) == data_out


@pytest.mark.parametrize("data_in, data_out",
                         [(100.05, 2),
                          (1343.5, 14),
                          (50.55, 1)])
def test_convert_percent_to_cores(data_in, data_out):
    assert convert_percent_to_cores(data_in) == data_out


@pytest.mark.parametrize("program, exist", [("echo", True), ("clara", True), ("claraabc", False)])
def test_assert_installed(program, exist):
    if program == "clara":
        pytest.skip()
    if exist:
        assert assert_installed(program) is None
    else:
        with pytest.raises(SystemExit) as exc:
            assert_installed(program)
        assert exc.value.code == 1


@pytest.mark.parametrize("mocked_return, run_called_count", [
    pytest.param(MagicMock(stdout=b'tag1\n'), 2, id="exists_locally"),
    pytest.param(MagicMock(stdout=b'', returncode=0), 4, id="can_be_pulled"),
    pytest.param(MagicMock(stdout=b'', returncode=1, stderr=b'error message'), 2, id="pull_failed"),
])
@patch("utils.subproc_run")
def test_check_images_and_tags(mock_subproc_run, mocked_return, run_called_count):
    mock_subproc_run.return_value = mocked_return
    mock_service = [MagicMock(image_n_tag="tag1")]
    op1 = OperatorConfig("Input1", "tag1", None, None, [{"path": "/input"}], None, None, mock_service)
    if mocked_return.returncode == 1:
        with pytest.raises(SystemExit):
            check_images_and_tags([op1])
    else:
        check_images_and_tags([op1])
    assert mock_subproc_run.call_count == run_called_count


@patch("utils.TRITON_IMAGE_TAG", "triton-tag")
@pytest.mark.parametrize("mocked_return, expect_exit, run_called_count",
                         [pytest.param([MagicMock(stdout=b'triton-tag\n')],
                                       False, 2, id="exists_locally"),
                          pytest.param([MagicMock(stdout=b''),
                                        MagicMock(stdout=b'', returncode=0)],
                                       False, 3, id="can_be_pulled"),
                          pytest.param([MagicMock(stdout=b''),
                                        MagicMock(stdout=b'', returncode=1, stderr=b'error message')],
                                       True, 3, id="pull_failed")])
@patch("utils.subproc_run")
def test_check_images_and_tags_with_triton(mock_subproc_run, mocked_return, expect_exit, run_called_count):
    # The first call resolves the operator image; the rest resolve Triton's.
    mock_subproc_run.side_effect = [MagicMock(stdout=b'tag1\n')] + mocked_return
    op1 = OperatorConfig("Input1", "tag1", None, None, [{"path": "/input"}], None, ["model1"])
    if expect_exit:
        with pytest.raises(SystemExit):
            check_images_and_tags([op1])
    else:
        check_images_and_tags([op1])
    assert mock_subproc_run.call_count == run_called_count


@pytest.mark.parametrize("mocked_return", [
    pytest.param(MagicMock(stdout=b'container_id\n', returncode=0), id="all_good"),
    pytest.param(MagicMock(stderr=b'error message', returncode=1), id="error")
])
@patch("utils.subproc_run")
def test_subproc_run_wrapper(mock_subproc_run, mocked_return):
    mock_subproc_run.return_value = mocked_return
    if mocked_return.returncode == 1:
        with pytest.raises(SystemExit):
            subproc_run_wrapper(["some", "cmd"])
    else:
        assert subproc_run_wrapper(["some", "cmd"]) == "container_id"


@pytest.mark.parametrize("choice, expected_result", [
    ("y", True),
    ("Y", True),
    ("yes", True),
    ("YES", True),
    ("yup", True),
    ("n", False),
    ("N", False),
    ("no", False),
    ("NO", False),
    ("nope", False),
    ("j\nx\nyeeee", True),
    ("exxxy\nadsfa\nnaaah", False),
    ("\nx\ny", True)
])
def test_prompt_yes_or_no(choice, expected_result):
    # Feed canned keystrokes to the prompt via stdin.
    sys.stdin = io.StringIO(choice)
    assert prompt_yes_or_no("Please give your response") == expected_result


def test_write_to_csv(tmp_path):

    @dataclass
    class MockMetric:
        field1: str
        field2: int

    # None is skipped, 0 terminates; the two MockMetrics become data rows.
    mock_q = MagicMock()
    mock_q.get.side_effect = [None, MockMetric("abc", 12), MockMetric("fdvc", 15), 0]
    output_dir = tmp_path / "sub_dir" / "test_write_to_csv"
    write_to_csv(mock_q, ["field1", "field2"], output_dir)

    assert output_dir.read_text() == "field1,field2\nabc,12\nfdvc,15\n"
# You may obtain a copy of the License at

# http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import errno
from dataclasses import dataclass
from dataclasses import fields as data_fields
from datetime import datetime, timezone

import psutil
from constants import B_MB_FACTOR, NS_PER_S, ONLINE_CPUS, SYSFS_PATH


@dataclass
class Metrics:
    """Processed metrics for one sampling interval (averaged between two raw samples)."""
    timestamp: float
    cpu_percent: float
    memory: float  # in MB


# CSV header derived from the Metrics field names, in declaration order.
METRICS_HEADER = [obj.name for obj in data_fields(Metrics)]


@dataclass
class RawMetrics:
    """Raw cgroup/psutil readings taken at a single instant."""
    timestamp: float
    cpu: float
    per_cpu: bytes
    sys_cpu: tuple
    memory: float  # in bytes


class Container:
    """Tracks one docker container's cgroup metric files and its sampled metrics."""

    def __init__(self) -> None:
        """Initializes the Container object with id, metric_paths, raw_metrics, and metrics.

        Args:
            None

        Returns:
            None
        """
        self.id = ""
        self.metric_paths = ()  # Tuple[Path, Path, Path]
        self.raw_metrics = []  # List[RawMetrics]
        self.metrics = []  # List[Metrics]

    def construct_metrics_path(self):
        """Constructs metrics reading paths in a tuple based on self.id attribute.

        Args:
            None

        Returns:
            None

        Raises:
            RuntimeError if id is not set when this is called
        """
        if not self.id:
            raise RuntimeError("Container ID is not set when creating paths")
        cpu_path = SYSFS_PATH / "cpuacct" / "docker" / self.id / "cpuacct.usage"
        per_cpu_path = SYSFS_PATH / "cpuacct" / "docker" / self.id / "cpuacct.usage_percpu"
        mem_path = SYSFS_PATH / "memory" / "docker" / self.id / "memory.usage_in_bytes"
        self.metric_paths = (cpu_path, per_cpu_path, mem_path)

    def metrics_path_exists(self) -> bool:
        """Checks if all the paths in the container.metric_paths attribute exist.

        Args:
            None

        Returns:
            A boolean value for whether all metric_paths exist on the system.
            False when the paths have not been constructed yet.
        """
        # Fix: previously an empty metric_paths tuple raised IndexError here.
        if not self.metric_paths:
            return False
        return all(path.exists() for path in self.metric_paths)

    def _read_raw_metrics(self) -> RawMetrics:
        """Reads raw metrics data based on the self.metric_paths and timestamps it.

        Args:
            None

        Returns:
            A RawMetrics object
        """
        # Fix: datetime.utcnow().timestamp() interprets the naive UTC datetime as
        # *local* time, skewing the epoch by the UTC offset; use an aware datetime.
        timestamp = datetime.now(timezone.utc).timestamp()
        # Rationale for raw_sys_cpu arithmetic: getSystemCPUUsage() in docker/daemon/stats_collector_unix.go
        # in https://github.com/rancher/docker
        raw_sys_cpu = sum(psutil.cpu_times()[:7])  # in seconds
        # Note: Converting to float takes an extra 1000ns
        raw_cpu = float(self.metric_paths[0].read_bytes())
        # If we know this len is the same as the system cpu num, then we don't need per_cpu anymore
        raw_per_cpu = self.metric_paths[1].read_bytes()
        raw_mem = float(self.metric_paths[2].read_bytes())
        return RawMetrics(timestamp, raw_cpu, raw_per_cpu, raw_sys_cpu, raw_mem)

    def sample_metrics(self) -> None:
        """Samples raw metrics data and append to self.raw_metrics list.

        FileNotFoundError and OSError errno 19 (ENODEV) imply that the file no
        longer exists (the container has exited) and thus these are bypassed.

        Args:
            None

        Returns:
            None or metric, which is a Metrics object

        Raises:
            RuntimeError if self.metric_paths is not set when this is called
        """
        if not self.metric_paths:
            raise RuntimeError("Metrics paths must be constructed before sampling.")
        try:
            self.raw_metrics.append(self._read_raw_metrics())
        except FileNotFoundError:
            return None
        except OSError as err:
            if err.errno == errno.ENODEV:  # no such device: cgroup files vanished
                return None
            # Fix: bare raise preserves the original traceback (raise(err) reset it).
            raise
        # A delta needs two samples; process metrics starting at the second item.
        if len(self.raw_metrics) < 2:
            return None
        metric = self._process_raw_data(self.raw_metrics[-2], self.raw_metrics[-1])
        self.metrics.append(metric)
        return metric

    @staticmethod
    def _process_raw_data(prev, cur):
        """Process the given data and convert units.

        Computation according to https://docs.docker.com/engine/api/v1.41/#operation/ContainerStats

        Args:
            prev: the prior RawMetrics object
            cur: the current RawMetrics object

        Returns:
            A Metrics object averaged over the [prev, cur] interval
        """
        ts_avg = (prev.timestamp + cur.timestamp) / 2.0
        cpu_percent = 0.0
        # Convert from nanoseconds to seconds
        cpu_delta = (cur.cpu - prev.cpu) / NS_PER_S
        # Below does not need div by CLOCK_TICKS_PER_S because it has been done in psutils
        sys_cpu_delta = cur.sys_cpu - prev.sys_cpu

        if cpu_delta > 0.0 and sys_cpu_delta > 0.0:
            cpu_percent = (cpu_delta / sys_cpu_delta) * ONLINE_CPUS * 100.0

        # Since we're averaging the cpu, we also need to average the memory to match the averaged timestamp
        memory_avg = (prev.memory + cur.memory) / 2.0 / B_MB_FACTOR

        return Metrics(ts_avg, cpu_percent=cpu_percent, memory=memory_avg)
import csv
import dataclasses
import logging
import math
import shutil
import sys
from pathlib import Path
from subprocess import PIPE, Popen
from subprocess import run as subproc_run
from typing import List

from clarac_utils import OperatorConfig
from constants import ON_POSIX, TRITON_IMAGE_TAG


def round_up_to_multiple(x, base):
    """Round up the given number to the nearest multiple of the given base number."""
    return math.ceil(float(x) / float(base)) * base


def convert_percent_to_cores(x):
    """Convert the given CPU usage percentage to a whole number of CPU cores."""
    return int(math.ceil(x / 100.0))


def assert_installed(prog: str):
    """Check if the given program is installed, terminate if not.

    Args:
        prog: Name of the commandline program

    Returns:
        None. If program is not installed, sys.exit(1)
    """
    logging.debug(f"Checking for dependency {prog} ...")
    if not shutil.which(prog):
        sys.stderr.write(f"error: {prog} not installed, please install {prog}\n")
        sys.exit(1)
    logging.debug(f"Dependency {prog} fulfilled")


def set_up_logging(verbose):
    """Setup logging for cpost to standard out.

    Args:
        verbose: Boolean value indicating whether log level will be debug or not

    Returns:
        None.
    """
    if verbose:  # pragma: no cover
        level = logging.DEBUG
    else:  # pragma: no cover
        level = logging.INFO
    # logging config are default to StreamHandlers
    logging.basicConfig(format='%(message)s', level=level)  # pragma: no cover


def check_images_and_tags(operators: List[OperatorConfig]):
    """For the image and tag of each operator, examine local images and pull if not found locally.

    Args:
        operators: List of OperatorConfig objects

    Returns:
        None

    Raises:
        sys.exit if the docker pull command errorred out
    """
    uses_triton_model_repo = False
    logging.info("Checking for container images and tags needed for the pipeline...")

    def _check_image_exists_locally(image_and_tag):
        # `docker images <repo:tag>` echoes the repo:tag only when present locally.
        logging.debug(f"Checking if `{image_and_tag}` are in local images...")
        local_check_proc = subproc_run(
            ["docker", "images", image_and_tag, "--format", "{{.Repository}}:{{.Tag}}"],
            capture_output=True)
        result = local_check_proc.stdout.decode('UTF-8')
        if image_and_tag in result:
            logging.debug(f"`{image_and_tag}` found.")
            return True
        return False

    def _pull_image(image_and_tag):
        logging.debug(f"`{image_and_tag}` not found, try pulling from registry ...")
        pull_proc = subproc_run(["docker", "pull", image_and_tag], capture_output=True)
        if pull_proc.returncode == 0:
            logging.debug(f"Docker pull command for `{image_and_tag}` returned with code {pull_proc.returncode}")
            logging.debug(f"stdout is: \n{pull_proc.stdout.decode('UTF-8').strip()}")
        else:
            logging.error(f"Docker pull command for `{image_and_tag}` returned with code {pull_proc.returncode}")
            logging.error(f"stdout is: {pull_proc.stdout.decode('UTF-8')}")
            logging.error(f"stderr is: {pull_proc.stderr.decode('UTF-8')}")
            sys.exit("Please verify docker access and the pipeline definition")

    for operator in operators:
        if not _check_image_exists_locally(operator.image_n_tag):
            _pull_image(operator.image_n_tag)
        if operator.models:
            uses_triton_model_repo = True
        if operator.services:
            for op_service in operator.services:
                if not _check_image_exists_locally(op_service.image_n_tag):
                    _pull_image(op_service.image_n_tag)
    # The Triton image is only needed when some operator declares models.
    if uses_triton_model_repo:
        if not _check_image_exists_locally(TRITON_IMAGE_TAG):
            _pull_image(TRITON_IMAGE_TAG)

    logging.info("All container images are ready to be used.")


def subproc_run_wrapper(cmd, **kwargs):
    """Run the given command in a subprocess and return its stripped stdout.

    Args:
        cmd: List of command line arguments to execute
        **kwargs: Extra keyword arguments forwarded to subprocess.run

    Returns:
        The command's stdout, decoded and stripped

    Raises:
        SystemExit if the command returned a non-zero exit code
    """
    sub_proc = subproc_run(cmd, capture_output=True, **kwargs)
    if sub_proc.returncode != 0:
        logging.error(
            f"Running {cmd} returned with {sub_proc.returncode} with error {sub_proc.stderr}")
        # Fix: drop the misleading `return` -- sys.exit raises SystemExit and never returns.
        sys.exit(f"Failed to run subprocess with command {cmd}")
    std_out = sub_proc.stdout.decode('UTF-8').strip()
    logging.debug(f"Subprocess returned with stdout {std_out}")
    return std_out


def prompt_yes_or_no(condition: str):
    """Prompt the user with a question and waits for the y/n input.

    Args:
        condition: Condition that needs user's input

    Returns:
        Boolean value corresponding to yes or no
    """
    # Re-prompt until a reply starting with 'y' or 'n' (case-insensitive) arrives.
    while True:
        reply = input(condition + ' (y/n): ').lower().strip()
        if reply:
            if reply[0] == 'y':
                return True
            if reply[0] == 'n':
                return False


def write_to_csv(que, field_names, output_file):
    """Write data in que to the output file in csv format.

    None entries in the queue are skipped; the integer 0 is the end-of-stream
    sentinel that closes the queue and stops writing.

    Args:
        que: a multiprocess.Queue contains the data to be written
        field_names: Header for the csv file
        output_file: String or Path of the output file location

    Returns:
        None
    """
    output_file = Path(output_file)
    # Fix: exist_ok avoids the check-then-create race of exists()/mkdir().
    output_file.parent.mkdir(parents=True, exist_ok=True)

    with open(output_file, "w") as f:
        csv_writer = csv.DictWriter(f, fieldnames=field_names)
        csv_writer.writeheader()
        while True:
            item = que.get()
            if item is None:
                continue
            if item == 0:  # end-of-stream sentinel
                que.close()
                break
            csv_writer.writerow(dataclasses.asdict(item))
            # Flush per row so partial results survive an abrupt shutdown.
            f.flush()
    logging.info(f"Results are stored in {output_file}")
import logging
import sys
import time
from contextlib import contextmanager
from enum import Enum, auto
from typing import List

import requests
from clarac_utils import OperatorConfig
from constants import (TRITON_HTTP_PORT, TRITON_IMAGE_TAG, TRITON_READY_TIMEOUT_SECONDS,
                       TRITON_WAIT_SLEEP_TIME_SECONDS, TRITON_WAIT_TIME_SECONDS)
from utils import subproc_run_wrapper


class RUN_MODE(Enum):
    """How cpost should provide inference serving for the pipeline."""
    NO_INFERENCE_SERVER = auto()
    MODEL_REPO = auto()
    PIPELINE_SERVICES = auto()


def _extract_models_from_configs(op_configs: List[OperatorConfig]):
    """Helper method to obtain models from list of OperatorConfig.

    Args:
        op_configs: List of OperatorConfigs to extract information from

    Returns:
        List of string which represents the names of each model with no repeating models
    """
    logging.debug("Extracting models from pipeline definition")
    result = list(set([model for op in op_configs if op.models for model in op.models]))
    logging.debug(f"The models present are `{result}`")
    return result


def check_models_directory(op_configs, models_dir) -> List[str]:
    """Checks if the model directory contains the models needed in the pipeline.

    Args:
        op_configs: List of OperatorConfigs to extract information from
        models_dir: A directory that contains Triton models

    Returns:
        model_names: List of model names used by this pipeline

    Raises:
        SystemExit if a required model or the directory itself is missing, or
        a model's config name disagrees with its directory name
    """
    logging.info("Checking model directory for dependent models ...")
    required_models = _extract_models_from_configs(op_configs)
    if not required_models:
        logging.debug("Pipeline did not specify any Triton models, skipping check for models_dir")
        return []

    logging.debug("Examining model directory ...")
    if models_dir is None:
        sys.exit(f"Model directory must be provided since your pipeline uses: {required_models}")

    # The directory can contain more models than what's needed
    model_names = []
    for model_name in required_models:
        logging.debug(f"Checking for model `{model_name}` ...")
        matching_config = list(models_dir.glob(f"{model_name}/config.pbtxt"))
        if len(matching_config) == 0:
            sys.exit(f"Model `{model_name}` is missing in the models directory")
        elif len(matching_config) > 1:
            logging.warning(
                f"Found more than one matching config file for model `{model_name}`. Using the first occurrence.")
        model_path = matching_config[0]
        # First line of config.pbtxt is expected to be: name: "<model_name>"
        with open(model_path) as f:
            name_in_file = f.readline().split(":")[1].strip()[1:-1]
            if name_in_file != model_path.parent.name:
                sys.exit(
                    f"Expected name in config {name_in_file} to be equal to directory name {model_path.parent.name}")
            model_names.append(model_path.parent.name)

    logging.info("All model directory checks are complete!")
    return model_names


def decide_method_to_run_triton(op_configs) -> RUN_MODE:
    """Decide how to run triton based on the given op_configs.

    Args:
        op_configs: List of OperatorConfig objects

    Return:
        RUN_MODE.MODEL_REPO, RUN_MODE.PIPELINE_SERVICES or RUN_MODE.NO_INFERENCE_SERVER

    Raises:
        SystemExit if both models and services are present in the op_config
    """
    model_repo = False
    services = False
    for op in op_configs:
        if op.models:
            model_repo = True
        if op.services:
            services = True
    if model_repo and services:
        sys.exit("CPOST does not support model_repository and pipeline services at the same time")
    if model_repo:
        return RUN_MODE.MODEL_REPO
    elif services:
        return RUN_MODE.PIPELINE_SERVICES
    return RUN_MODE.NO_INFERENCE_SERVER


def check_triton_status(triton_models_names=None, host="localhost", port=TRITON_HTTP_PORT):
    """Check status of Triton server via http.

    Kwargs:
        triton_models_names: list of triton model names to verify, default: None (no models)
        host: ip address of triton, default: localhost
        port: the port to query http status, default: "8000"

    Returns:
        None

    Raises:
        SystemExit if requests.get returned with a non-200 status
        TimeoutError if Triton could not be reached before the timeout
    """
    # Fix: avoid a mutable default argument; None means "no models to verify".
    triton_models_names = triton_models_names or []
    logging.debug("Waiting and checking Triton status ...")
    time.sleep(TRITON_WAIT_TIME_SECONDS)
    start_time = time.perf_counter()
    while time.perf_counter() - start_time < TRITON_READY_TIMEOUT_SECONDS:
        time.sleep(TRITON_WAIT_SLEEP_TIME_SECONDS)
        try:
            ready = requests.get(f"http://{host}:{port}/api/status")
            if ready.status_code != 200:
                sys.exit(f"Triton is not working, status code = {ready.status_code} with message {ready.text}")
            break
        except requests.ConnectionError:
            # Server not accepting connections yet; retry until the deadline.
            continue
    else:
        raise TimeoutError("Timeout when waiting for triton to be ready.")

    # Verify that each model is ready
    for model_name in triton_models_names:
        ready = requests.get(
            f"http://{host}:{port}/api/status/{model_name}", timeout=TRITON_READY_TIMEOUT_SECONDS)
        if ready.status_code != 200:
            sys.exit(f"Error: {ready.status_code} {ready.reason}, {ready.headers}")
    logging.debug("Triton is ready to be used")


def inspect_ip_address(container_name):
    """Inspect and obtain the IP address for the given container.

    Args:
        container_name: docker name or docker container ID

    Returns:
        network_ip: the IP address of the container
    """
    cmd = ["docker", "inspect", "--format='{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}'", container_name]
    output = subproc_run_wrapper(cmd)
    network_ip = output[1:-1]  # Strip away the quotes around the returned IP address
    logging.debug(f"{container_name} can be communicated on address {network_ip}")
    return network_ip


def start_triton(models_dir, command, image_tag=TRITON_IMAGE_TAG, triton_models_names=None):
    """Starts triton container and wait for it to be ready.

    Args:
        models_dir: Absolute path of models_directory
        command: list of commands to run for the container

    Kwargs:
        image_tag: The image and tag for the container, e.g. image:tag, default to TRITON_IMAGE_TAG
        triton_models_names: List of triton model names to load, default: None (load none)

    Returns:
        triton_container_id, ip_address: Tuple of string
    """
    # Fix: avoid a mutable default argument; None behaves like an empty list.
    triton_models_names = triton_models_names or []
    # build triton command
    loading_models = [f"--load-model={name}" for name in triton_models_names]
    cmd = ["docker", "run", "--gpus=1", "--rm", "-d", "-p8000:8000", "-p8001:8001", "-p8002:8002",
           "-v", f"{models_dir}:/models", image_tag] + command + loading_models
    logging.debug(f"Spinning up Triton with {cmd}")
    triton_container_id = subproc_run_wrapper(cmd)
    ip_address = inspect_ip_address(triton_container_id)
    check_triton_status(triton_models_names=triton_models_names, host=ip_address)
    return triton_container_id, ip_address


@contextmanager
def run_triton_model_repo(execution_order, models_dir):
    """Run Triton in a context manager if pipeline requires Triton.

    Args:
        execution_order: List of OperatorConfigs to extract information from
        models_dir: Absolute path of models_directory

    Yields:
        ip_address
    """
    triton_container_id = None
    try:
        triton_models_names = check_models_directory(execution_order, models_dir)
        command = ["tritonserver", "--model-repository=/models", "--model-control-mode=explicit"]
        triton_container_id, ip_address = start_triton(models_dir, command, triton_models_names=triton_models_names)
        yield ip_address
    finally:
        # Fix: only clean up when the container actually started; previously a
        # failure before start_triton returned raised NameError here, masking
        # the original error.
        if triton_container_id:
            logging.debug("Stopping Triton ...")
            subproc_run_wrapper(["docker", "kill", triton_container_id])
            logging.debug("Finished cleaning up Triton")
# You may obtain a copy of the License at

# http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import logging
import os
import re
import sys
from dataclasses import dataclass
from typing import List
from unittest.mock import MagicMock, patch

import pytest

sys.path.append("{}/{}".format(os.path.dirname(os.path.realpath(__file__)), "../src"))
from triton_utils import (RUN_MODE, _extract_models_from_configs, check_models_directory,  # nopep8 # noqa: E402
                          check_triton_status, decide_method_to_run_triton, inspect_ip_address, run_triton_model_repo,
                          start_triton)


@pytest.fixture(scope="function")
def create_triton_models_dir(tmp_path):
    """Pytest fixture yielding a factory that mocks a Triton models directory.

    The factory takes ``model_names`` (list of str) and creates
    ``<tmp_path>/models/<name>/config.pbtxt`` for each name, with the model
    name written on the first line as Triton expects.
    """
    def _func(model_names):
        # Create the folders needed and some extra models in that directory
        for dir_name in model_names:
            config_file = tmp_path / "models" / dir_name / "config.pbtxt"
            config_file.parent.mkdir(parents=True, exist_ok=True)
            file_content = f'name: "{dir_name}"\n'
            config_file.write_text(file_content)
    yield _func


def test_fixture_create_models_dir(tmp_path, create_triton_models_dir):
    """Sanity-check the fixture itself: it must create one directory per model."""
    names = ["liver", "heart"]
    create_triton_models_dir(names)
    assert sorted(os.listdir(str(tmp_path / "models"))) == sorted(names)


@dataclass
class MockConfig:
    """Minimal stand-in for OperatorConfig exposing only the models attribute."""
    models: List[str] = None


@pytest.mark.parametrize("configs, expected", [
    ([MockConfig(), MockConfig(["m1", "m2"]), MockConfig(["m3"]), MockConfig(), MockConfig(["m4", "m5", "m6"])],
     ["m1", "m2", "m3", "m4", "m5", "m6"]),
    ([MockConfig(), MockConfig()], []),
    ([MockConfig(["m1", "m2"]), MockConfig(["m1"])], ["m1", "m2"])
])
def test_extract_models_from_configs(configs, expected):
    result = _extract_models_from_configs(configs)
    assert sorted(result) == expected


@patch("triton_utils._extract_models_from_configs")
def test_check_model_repository_no_models_needed(mock_models, tmp_path):
    mock_models.return_value = []
    mock_configs = MagicMock()
    result = check_models_directory(mock_configs, tmp_path)
    assert result == []


@patch("triton_utils._extract_models_from_configs")
def test_check_model_repository_no_model_dir(mock_models):
    mock_models.return_value = ["liver", "spleen", "heart"]
    mock_configs = MagicMock()
    with pytest.raises(SystemExit):
        check_models_directory(mock_configs, None)


@pytest.mark.parametrize("mock_models, dir_name, file_content", [
    pytest.param(["liver"], "liver", 'name: "segmentation_liver_v1"\n', id="content_not_match"),
    pytest.param(["liver"], "liver_seg", 'name: "liver"\n', id="dir_name_not_match"),
    pytest.param(["liver", "heart"], "liver", 'name: "liver"\n', id="missing_model")
])
@patch("triton_utils._extract_models_from_configs")
def test_check_model_repository_bad_input(mock_func, mock_models, dir_name, file_content, tmp_path):
    mock_func.return_value = mock_models
    mock_configs = MagicMock()
    config_file = tmp_path / "models" / dir_name / "config.pbtxt"
    config_file.parent.mkdir(parents=True)
    config_file.write_text(file_content)
    with pytest.raises(SystemExit):
        check_models_directory(mock_configs, config_file.parents[1])


@pytest.mark.parametrize("mock_models", [
    pytest.param(["liver"], id="one_model"),
    pytest.param(["liver", "spleen", "heart"], id="three_models"),
])
@patch("triton_utils._extract_models_from_configs")
def test_check_model_repository_good_input(mock_func, mock_models, tmp_path, create_triton_models_dir):
    mock_func.return_value = mock_models
    mock_configs = MagicMock()

    # The directory may contain extra models beyond what the pipeline needs.
    create_triton_models_dir(mock_models + ["eyes", "lung"])

    result = check_models_directory(mock_configs, tmp_path / "models")
    assert sorted(result) == sorted(mock_models)


@pytest.mark.parametrize("mock_configs, exp_mode", [
    pytest.param([MagicMock(**{"models": True, "services": None})], RUN_MODE.MODEL_REPO, id="model_repo"),
    pytest.param([MagicMock(**{"models": None, "services": True})], RUN_MODE.PIPELINE_SERVICES, id="services"),
    pytest.param([MagicMock(**{"models": None, "services": None})], RUN_MODE.NO_INFERENCE_SERVER, id="neither"),
])
def test_decide_method_to_run_triton(mock_configs, exp_mode):
    assert decide_method_to_run_triton(mock_configs) == exp_mode


def test_decide_method_to_run_triton_error():
    mock_configs = [MagicMock(**{"models": True, "services": True})]
    with pytest.raises(SystemExit):
        decide_method_to_run_triton(mock_configs)


@pytest.mark.parametrize(
    "model_names, mock_responses",
    [
        pytest.param(
            [],
            [MagicMock(**{"status_code": 200})],
            id="no_model_names"),
        pytest.param(
            ["model1"],
            [MagicMock(**{"status_code": 200, "text": None}), MagicMock(**{"status_code": 200, "text": None})],
            id="1_model_name"),
    ]
)
@patch("triton_utils.TRITON_WAIT_SLEEP_TIME_SECONDS", 0)
@patch("triton_utils.TRITON_WAIT_TIME_SECONDS", 0)
@patch("triton_utils.requests")
def test_check_triton_status_200(mock_requests, model_names, mock_responses):
    mock_requests.configure_mock(**{"ConnectionError": ValueError})
    mock_requests.get.side_effect = mock_responses
    check_triton_status(triton_models_names=model_names, host="some_host", port="1234")
    # Fix: the original assertion used an f-string with no placeholders (F541).
    assert "http://some_host:1234" in mock_requests.get.call_args.args[0]


@pytest.mark.parametrize(
    "model_names, mock_responses, exp_msg",
    [
        pytest.param(
            [],
            [MagicMock(**{"status_code": 400, "text": "some msg"})],
            "Triton is not working", id="no_model_names"),
        pytest.param(
            ["model1"],
            [MagicMock(**{"status_code": 200, "text": None}), MagicMock(**{"status_code": 400, "text": "some msg"})],
            "Error:", id="1_model_name"),
    ]
)
@patch("triton_utils.TRITON_WAIT_SLEEP_TIME_SECONDS", 0)
@patch("triton_utils.TRITON_WAIT_TIME_SECONDS", 0)
@patch("triton_utils.requests")
def test_check_triton_status_error(mock_requests, model_names, mock_responses, exp_msg):
    mock_requests.configure_mock(**{"ConnectionError": ValueError})
    mock_requests.get.side_effect = mock_responses
    with pytest.raises(SystemExit) as exc:
        check_triton_status(triton_models_names=model_names)
    assert exp_msg in str(exc.value)


@patch("triton_utils.subproc_run_wrapper")
def test_inspect_ip_address(mock_subproc_run_wrapper):
    mock_subproc_run_wrapper.return_value = "'125.12.199.0'"
    result = inspect_ip_address("container_name")
    assert result == "125.12.199.0"


@pytest.mark.parametrize("model_names", [["spleen", "arm", "legs"], []])
@patch("triton_utils.check_triton_status")
@patch("triton_utils.inspect_ip_address")
@patch("triton_utils.subproc_run_wrapper")
def test_start_triton(mock_subproc_run_wrapper, mock_inspect, mock_check_triton_status, model_names):
    mock_subproc_run_wrapper.return_value = "container_id"
    mock_inspect.return_value = "ip_address"
    result = start_triton("models", ["some", "command"], triton_models_names=model_names)
    assert result == ("container_id", "ip_address")

    # Check that all the models used are listed in the docker run command
    if model_names != []:
        for name in model_names:
            assert f"--load-model={name}" in mock_subproc_run_wrapper.call_args_list[0].args[0]


@patch("triton_utils.subproc_run_wrapper")
@patch("triton_utils.check_models_directory")
@patch("triton_utils.start_triton")
def test_run_triton_model_repo(mock_start_triton, mock_check_dir, mock_subproc_run_wrapper):
    """On context exit, the started container must be killed exactly once."""
    triton_models_names = ["spleen", "arm", "legs"]
    mock_check_dir.return_value = triton_models_names
    mock_start_triton.return_value = ("container_id", "ip_address")

    with run_triton_model_repo([], "some_dir"):
        pass

    mock_subproc_run_wrapper.assert_called_once()
    assert "container_id" in mock_subproc_run_wrapper.call_args_list[0].args[0]
-------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 
40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. 
We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![License](https://img.shields.io/badge/License-Apache_2.0-lightgrey.svg)](https://opensource.org/licenses/Apache-2.0) 2 | 3 | [![NVIDIA](https://github.com/NVIDIA/clara-platform-python-client/blob/main/ext/NVIDIA_horo_white.png?raw=true)](https://docs.nvidia.com/clara/deploy/index.html) 4 | 5 | # CPOST (Clara Pipeline Operator Sizing Tool) 6 | ## Tool to measure resource usage of Clara Platform pipeline operators 7 | 8 | Cpost is a tool that will help you run your pipeline locally and provides you with the CPU and memory usage of each operators ran for the given input payload. Opeartors are ran one at a time and CPU and memory usage are sampled. The CPU and memory usage metrics are provided in a .csv format which allows further data analytics as needed. 
9 | 10 | ## System Requirements 11 | * Clara Compiler (downloadable from [NGC](https://ngc.nvidia.com/catalog/resources/nvidia:clara:clara_cli)) 12 | * Docker 20.10 or higher due to cgroup v2 constraints 13 | * System must be using cgroup v2 (See [Docker Control Groups](https://docs.docker.com/config/containers/runmetrics/#control-groups) for more information) 14 | * Python 3.8.0 or higher 15 | *Do not have a Triton instance running on the same machine that CPOST is running on. CPOST will provision it's own Triton instance and the two instances could conflict and cause failures. 16 | 17 | ## Usage 18 | The following is the help message of cpost: 19 | ``` 20 | usage: cpost [-h] [--metrics_dir METRICS_DIR] [--models_dir MODELS_DIR] [-v] [--force [{none,cont,stop}]] 21 | 22 | Clara Pipeline Sizing Tool CLI 23 | 24 | positional arguments: 25 | pipeline definition file path 26 | input payload directory 27 | 28 | optional arguments: 29 | -h, --help show this help message and exit 30 | --metrics_dir METRICS_DIR 31 | metrics output directory, if not specified, write to stdout 32 | --models_dir MODELS_DIR 33 | directory for Triton models, required if pipeline uses Triton 34 | -v, --verbose verbose output (DEBUG level). If not specified, default output is INFO level. 35 | --force [{none,cont,stop}] 36 | force continue or stop when operator failure occurs. (default: none, which will prompt the user for each failure). 37 | ``` 38 | 39 | ## Quick Start Guide 40 | 41 | ### Download CPOST 42 | #### Method 1: Install from Pypi as a PIP package 43 | Run `pip install nvidia-clara-cpost` 44 | 45 | #### Method 2: Build from Source Repository 46 | 1. Clone this repository. 47 | 2. In the source folder, run `python3 setup.py sdist bdist_wheel` and you should see a wheel file in `./dist`. Use this file to `pip install` in your desired virtual environment. 
For example: 48 | ``` 49 | $ ls 50 | CONTRIBUTING.md demo dist LICENSE README.md requirements-dev.txt requirements.txt setup.cfg setup.py src tests 51 | $ ls dist 52 | nvidia_clara_cpost-0.0.0-py3-none-any.whl nvidia-clara-cpost-0.0.0.tar.gz 53 | ``` 54 | 55 | ### Run CPOST in a virtual environment (recommended) 56 | After you have downloaded the wheel from [Download CPOST](#download-cpost), create a virtual environment to work with. 57 | ``` 58 | $ mkdir ./demo 59 | $ cd demo 60 | $ python3.8 -m venv venv 61 | $ source venv/bin/activate 62 | $ pip install -U pip 63 | $ pip install ../dist/nvidia_clara_cpost-0.0.0-py3-none-any.whl # or any other path to the wheel file 64 | ``` 65 | After pip install has completed, run `cpost` and you should see the help message. 66 | 67 | ### Prepare Pipeline Data 68 | 69 | Let's prepare some source data to work with. We will use the AI Spleen Segementation Pipeline as an example 70 | 71 | Download the [Clara AI Spleen Segmentation Pipeline](https://ngc.nvidia.com/catalog/resources/nvidia:clara:clara_ai_spleen_pipeline) to a directory (e.g. `./demo`). Download instructions are available on the linked page 72 | 73 | Once we have the spleen downloaded, go into the folder and unzip the model and input data. 74 | ``` 75 | $ cd clara_ai_spleen_pipeline_v${VERSION_ON_NGC} 76 | $ ls clara_ai_spleen_pipeline_v${VERSION_ON_NGC} 77 | app_spleen-input_v1.zip app_spleen-model_v1.zip source.zip spleen-pipeline-model-repo.yaml spleen-pipeline.yaml 78 | $ unzip app_spleen-input_v1.zip -d app_spleen-input_v1 79 | $ unzip app_spleen-model_v1.zip -d app_spleen-model_v1 80 | ``` 81 | Now we're ready to run cpost! 82 | 83 | The simplest way to run `cpost` is to provide a pipeline definition file and input payload data as shown below. The resulting metrics and console logs are written to standard output directly. 
In the demo folder: 84 | ``` 85 | $ cpost --models_dir clara_ai_spleen_pipeline_v${VERSION_ON_NGC}/app_spleen-model_v1 clara_ai_spleen_pipeline_v${VERSION_ON_NGC}/spleen-pipeline.yaml clara_ai_spleen_pipeline_v${VERSION_ON_NGC}/app_spleen-input_v1 86 | ``` 87 | 88 | If raw metrics are desired, then a valid directory can be specified with `--metrics_dir` and the resulting metrics csv files will be stored in the given directory for each executed operator. 89 | ``` 90 | $ mkdir metrics 91 | $ cpost--metrics_dir metrics --models_dir clara_ai_spleen_pipeline_v${VERSION_ON_NGC}/app_spleen-model_v1 clara_ai_spleen_pipeline_v${VERSION_ON_NGC}/spleen-pipeline.yaml clara_ai_spleen_pipeline_v${VERSION_ON_NGC}/app_spleen-input_v1 92 | ``` 93 | 94 | ### Interpreting the Result 95 | After running the above command, you should see below as output: 96 | 97 | ``` 98 | All software dependencies are fullfilled. 99 | 100 | ______________Executing Operator dicom-reader_______________ 101 | Running operator ... 102 | The container id is: 47ca2626929006154a5515eba841755993df3f298de0abcdc5b9b951971470ca 103 | Results are stored in /home/magzhang/code/sdk/Tools/cpost/demo/metrics/dicom-reader_final_result.csv 104 | _______________Operator dicom-reader Summary________________ 105 | +--------+-----------+------------+-------------+ 106 | | Metric | Average | Maximum | Resource | 107 | +--------+-----------+------------+-------------+ 108 | | CPU | 124.714 % | 1097.941 % | cpu: 11 | 109 | | Memory | 91.057 MB | 405.242 MB | memory: 512 | 110 | +--------+-----------+------------+-------------+ 111 | 112 | ___________Executing Operator spleen-segmentation___________ 113 | Running operator ... 
114 | The container id is: 270f486475aa4584b4fb5911a0db23a10b4eaf0eb26a14daa3fa8951c6a77c95 115 | Results are stored in /home/magzhang/code/sdk/Tools/cpost/demo/metrics/spleen-segmentation_final_result.csv 116 | ____________Operator spleen-segmentation Summary____________ 117 | +--------+-------------+-------------+--------------+ 118 | | Metric | Average | Maximum | Resource | 119 | +--------+-------------+-------------+--------------+ 120 | | CPU | 150.649 % | 1134.358 % | cpu: 12 | 121 | | Memory | 1630.311 MB | 4455.412 MB | memory: 4608 | 122 | +--------+-------------+-------------+--------------+ 123 | 124 | ______________Executing Operator dicom-writer_______________ 125 | Running operator ... 126 | The container id is: 32cf46da42111c75dfa1856ec35e4724e22d9e6d246e64ab3089fc212f049a4a 127 | Results are stored in /home/magzhang/code/sdk/Tools/cpost/demo/metrics/dicom-writer_final_result.csv 128 | _______________Operator dicom-writer Summary________________ 129 | +--------+------------+------------+-------------+ 130 | | Metric | Average | Maximum | Resource | 131 | +--------+------------+------------+-------------+ 132 | | CPU | 190.224 % | 1017.747 % | cpu: 11 | 133 | | Memory | 278.678 MB | 552.313 MB | memory: 768 | 134 | +--------+------------+------------+-------------+ 135 | 136 | __Executing Operator register-volume-images-for-rendering___ 137 | Running operator ... 
138 | The container id is: 2ad135d27cd827de8f687791c9c70ca88229d5eec912be1d20c1a66993ecbb1a 139 | Results are stored in /home/magzhang/code/sdk/Tools/cpost/demo/metrics/register-volume-images-for-rendering_final_result.csv 140 | Operator failed with exitcode is: 126 141 | ___Operator register-volume-images-for-rendering Summary____ 142 | +--------+----------+----------+-------------+ 143 | | Metric | Average | Maximum | Resource | 144 | +--------+----------+----------+-------------+ 145 | | CPU | 12.667 % | 14.923 % | cpu: 1 | 146 | | Memory | 2.633 MB | 3.783 MB | memory: 256 | 147 | +--------+----------+----------+-------------+ 148 | Operator register-volume-images-for-rendering failed with exitcode 126 149 | +--------------------------------------+----------+-------------+-------------+--------------+ 150 | | Operator | Metric | Average | Maximum | Resource | 151 | +======================================+==========+=============+=============+==============+ 152 | | dicom-reader | CPU | 124.714 % | 1097.941 % | cpu: 11 | 153 | | | Memory | 91.057 MB | 405.242 MB | memory: 512 | 154 | +--------------------------------------+----------+-------------+-------------+--------------+ 155 | | spleen-segmentation | CPU | 150.649 % | 1134.358 % | cpu: 12 | 156 | | | Memory | 1630.311 MB | 4455.412 MB | memory: 4608 | 157 | +--------------------------------------+----------+-------------+-------------+--------------+ 158 | | dicom-writer | CPU | 190.224 % | 1017.747 % | cpu: 11 | 159 | | | Memory | 278.678 MB | 552.313 MB | memory: 768 | 160 | +--------------------------------------+----------+-------------+-------------+--------------+ 161 | | register-volume-images-for-rendering | CPU | 12.667 % | 14.923 % | cpu: 1 | 162 | | (Non-zero exitcode) | Memory | 2.633 MB | 3.783 MB | memory: 256 | 163 | +--------------------------------------+----------+-------------+-------------+--------------+ 164 | ``` 165 | The last column in the last table is what you can put into the 
pipeline definition file's `requests`. 166 | Please note that there maybe some small differences between each execution. You can run multiple times to see what are the best numbers to fill. 167 | 168 | 169 | ## Troubleshooting 170 | ### Docker pull error 171 | ``` 172 | Docker pull command for `nvcr.io/nvstaging/clara/dicom-reader:0.8.1-2108.1` returned with code 1 173 | stdout is: 174 | stderr is: Error response from daemon: unauthorized: authentication required 175 | 176 | Please verify docker access and the pipeline definition 177 | ``` 178 | **Resolution**: CPOST performs a local check to match with the given image and tag. If this fails, CPOST performs a docker pull. Thus, please do a `docker login` to the correct registry or ensure that you have the correct docker image locally. 179 | 180 | ### Docker network error 181 | ``` 182 | Error response from daemon: network with name cpost_net already exists 183 | 184 | cpost_net already exist, please remove the network and rerun cpost 185 | ``` 186 | **Resolution**: This occurs because the docker network with name "cpost_net" already exist, which could either because you happen to have this network or because CPOST failed to clean up in one of the previous runs. Please do a `docker network rm cpost_net` and `docker network ls` to ensure this network is cleaned up. 187 | 188 | For all other problems, please submit an issue in the repository and we will resolve this as soon as possible. 189 | 190 | ### Warning from container ID timeout 191 | ``` 192 | Running operator ... 193 | Obtaining docker ID timed out. Operator spleen-segmentation failed 194 | Operator spleen-segmentation failed with exitcode -15 195 | ``` 196 | **Resolution**: This occurs when CPOST tries to run the container in detached mode and times out during when waiting for the container ID to return. The exitcode `-15` means that cpost terminated the docker container because it speculates that something has gone wrong. 
This could happen due to a lot of reasons, and you can run in `-v` (verbose) mode to see the full `docker run` command and run it yourself and hopefully this will provides you some insights on why CPOST couldn't obtain a docker ID. 197 | 198 | ## Running from Source Code During Development 199 | 200 | The environment must have Python 3.8 installed and should have the necessary packages required by cpost installed. The `requirements.txt` contains all the necessary packages and can be used to install them. The tools used for development can be found in `requirements-dev.txt` 201 | 202 | Once virtual environment are created successfully and have been activated. Install the `requirements.txt` with `pip` or `conda`, etc.. The following command can be run directly as cpost: 203 | ``` 204 | python src/main.py 205 | ``` 206 | 207 | ### Test Coverage 208 | 209 | To see test coverage, activate the virtual environment and install the development tools from `requirements-dev.txt`. 210 | From the root of repository, run the command below will provide the unittest coverage report. 211 | ``` 212 | coverage run -m pytest tests && coverage report 213 | ``` 214 | -------------------------------------------------------------------------------- /tests/test_pipeline_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 NVIDIA Corporation 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | 16 | import logging 17 | import os 18 | import re 19 | import sys 20 | import time 21 | from multiprocessing import Manager, Queue 22 | from random import uniform as rand_float 23 | from unittest.mock import MagicMock, call, patch 24 | 25 | import pytest 26 | from src.cli import ContinueOptions 27 | 28 | sys.path.append("{}/{}".format(os.path.dirname(os.path.realpath(__file__)), "../src")) 29 | from clarac_utils import OperatorConfig, ServiceConfig # nopep8 # noqa: E402 30 | from container import Metrics # nopep8 # noqa: E402 31 | from pipeline_utils import (_enqueue_output, build_operator_cmd, clean_up_containers, # nopep8 # noqa: E402 32 | get_output_writers, print_operator_summary, print_pipeline_summary, run_pipeline, 33 | run_pipeline_alone, run_pipeline_with_services, sample_operator, start_operator, 34 | start_pipeline_services) 35 | from triton_utils import RUN_MODE # nopep8 # noqa: E402 36 | 37 | 38 | def test_enqueue_output(tmp_path): 39 | file_path = tmp_path / "test_enqueue" 40 | data = b"1255\n1233\n" 41 | file_path.write_bytes(data) 42 | q = Queue() 43 | opened_file = file_path.open("rb") 44 | _enqueue_output(opened_file, q) 45 | assert q.get(timeout=1) == b"1255\n" 46 | assert q.get(timeout=1) == b"1233\n" 47 | 48 | 49 | @patch("pipeline_utils.Popen") 50 | def test_start_operator(mock_popen): 51 | raw_container_id = b'8c0b4110ae930dbe26b258de9bc34a03f98056ed6f27f991d32919bfe401d7c5\n' 52 | actual_container_id = raw_container_id.decode('utf-8').strip() 53 | mock_popen.return_value = MagicMock(**{"returncode": 0, 54 | "poll.return_value": None, 55 | "stdout.readline.return_value": raw_container_id, 56 | "stdout.close.return_value": None}) 57 | 58 | manager = Manager() 59 | expected_container_id = manager.Value('c_wchar_p', '') 60 | mock_event = MagicMock() 61 | cmd = ["some", "docker", "run", "command"] 62 | 63 | start_operator(expected_container_id, mock_event, cmd) 64 | 65 | assert actual_container_id == expected_container_id.value 66 | 
mock_event.set.assert_called_once() 67 | 68 | 69 | @patch("pipeline_utils.Process") 70 | @patch("pipeline_utils.Popen") 71 | def test_start_operator_popen_error(mock_popen, mock_multi_process): 72 | mock_exit_msg = "exiting because of error" 73 | mock_popen.return_value = MagicMock(**{"returncode": 1, "stderr.read.return_value": mock_exit_msg.encode("UTF-8")}) 74 | 75 | manager = Manager() 76 | expected_container_id = manager.Value('c_wchar_p', '') 77 | mock_event = MagicMock() 78 | cmd = ['some', 'docker', 'run', 'command'] 79 | 80 | with pytest.raises(SystemExit) as exc: 81 | start_operator(expected_container_id, mock_event, cmd) 82 | assert mock_exit_msg in str(exc.value) 83 | mock_event.set.assert_not_called() 84 | 85 | 86 | @pytest.mark.parametrize("mock_exitcode, expected_code", [(b'0\n', None), (b'125\n', 125), (b'error\n', 1)]) 87 | @patch("pipeline_utils.Queue") 88 | @patch("pipeline_utils.subproc_run") 89 | @patch("pipeline_utils.Process") 90 | @patch("pipeline_utils.Popen") 91 | def test_start_operator_docker_error( 92 | mock_popen, mock_multi_process, mock_subproc_run, mock_q, mock_exitcode, expected_code): 93 | mock_popen.return_value = MagicMock(**{"returncode": None, "poll.return_value": None}) 94 | 95 | mock_q.return_value = MagicMock( 96 | **{"get_nowait.return_value": b'8c0b4110ae930dbe26b258de9bc34a03f98056ed6f27f991d32919bfe401d7c5\n'}) 97 | 98 | mock_subproc_run.return_value = MagicMock(**{"returncode": None, "stdout": mock_exitcode}) 99 | 100 | manager = Manager() 101 | expected_container_id = manager.Value('c_wchar_p', '') 102 | mock_event = MagicMock() 103 | cmd = ['some', 'docker', 'run', 'command'] 104 | 105 | if mock_exitcode == b'0\n': 106 | start_operator(expected_container_id, mock_event, cmd) 107 | else: 108 | with pytest.raises(SystemExit) as exc: 109 | start_operator(expected_container_id, mock_event, cmd) 110 | assert exc.value.code == expected_code 111 | mock_event.set.assert_called_once() 112 | 113 | 114 | def 
test_sample_operator_logic(): 115 | mock_q = MagicMock() 116 | mock_container = MagicMock() 117 | mock_container.metrics_path_exists.side_effect = [0, 1, 1, 0] 118 | mock_container.sample_metrics.return_value = None 119 | sample_operator(mock_container, mock_q) 120 | assert mock_container.method_calls == [ 121 | call.metrics_path_exists(), 122 | call.metrics_path_exists(), 123 | call.metrics_path_exists(), 124 | call.sample_metrics(), 125 | call.metrics_path_exists(), 126 | ] 127 | 128 | assert mock_q.put.call_count == 2 129 | assert mock_q.put.call_args_list == [call(None), call(0)] 130 | 131 | 132 | @pytest.mark.parametrize("sampling_time,expected", [(rand_float(0.0001, 0.19), [0.2]), (0.3, [0.3])]) 133 | def test_sample_operator_sampling_rate(sampling_time, expected): 134 | mock_q = MagicMock() 135 | mock_container = MagicMock() 136 | sampling_num = 10 137 | mock_container.metrics_path_exists.side_effect = [0, 1] + [1] * sampling_num + [0] 138 | result_timestamps = [] 139 | 140 | def mock_sample(): 141 | """Mock sampling function that appends a timestamp to a list.""" 142 | timestamp = time.perf_counter() 143 | time.sleep(sampling_time) 144 | result_timestamps.append(timestamp) 145 | 146 | mock_container.sample_metrics = mock_sample 147 | sample_operator(mock_container, mock_q) 148 | assert len(result_timestamps) == sampling_num, "The number of samples does not match with expected." 
149 | 150 | result_diffs = [round(j - i, 1) for i, j in zip(result_timestamps[:-1], result_timestamps[1:])] 151 | assert result_diffs == expected * (sampling_num - 1), "Something is wrong with the accuracy of time.sleep()" 152 | 153 | 154 | # autopep8: off 155 | @pytest.mark.parametrize( 156 | "op_config, expected_args", 157 | [ 158 | pytest.param( 159 | OperatorConfig("op_name", "image:tag", None, {"VAR0": 2, "VAR1": "hi"}, [{"path": "/input"}], [{"path": "/output"}]), 160 | ["--env", "VAR0=2", "--env", "VAR1=hi", "-v", "%tmp%/app_data:/input", "-v", "%tmp%/op_name:/output", "image:tag"], id="with_ENV_VAR" 161 | ), 162 | pytest.param( 163 | OperatorConfig("op_name", "image:tag", None, None, None, None), 164 | ["image:tag"], id="no_input_output" 165 | ), 166 | pytest.param( 167 | OperatorConfig("op_name", "image:tag", None, None, [{"path": "/input"}], [{"path": "/output"}]), 168 | ["-v", "%tmp%/app_data:/input", "-v", "%tmp%/op_name:/output", "image:tag"], id="min_input_output" 169 | ), 170 | pytest.param( 171 | OperatorConfig("op_name", "image:tag", None, None, [{"from": "liver", "path": "/input"}], None), 172 | ["-v", "%tmp%/liver:/input", "image:tag"], id="input_contains_from" 173 | ), 174 | pytest.param( 175 | OperatorConfig("op_name", "image:tag", None, None, [{"from": "liver", "name": "classification", "path": "/input"}, {"path": "/dcm"}], None), 176 | ["-v", "%tmp%/liver/classification:/input", "-v", "%tmp%/app_data:/dcm", "image:tag"], id="double_inputs" 177 | ), 178 | pytest.param( 179 | OperatorConfig("op_name", "image:tag", None, None, [{"path": "/input"}], [{"name": "logs", "path": "/output"}]), 180 | ["-v", "%tmp%/app_data:/input", "-v", "%tmp%/op_name/logs:/output", "image:tag"], id="named_output" 181 | ), 182 | pytest.param( 183 | OperatorConfig("op_name", "image:tag", ["some", "command"], None, [{"path": "/input"}], [{"path": "/output"}]), 184 | ["-v", "%tmp%/app_data:/input", "-v", "%tmp%/op_name:/output", "image:tag", "some", "command"], 
id="image_with_command" 185 | ), 186 | pytest.param( 187 | OperatorConfig("op_name", "image:tag", None, None, None, None, ["model1"]), 188 | ["--env", "NVIDIA_TRITON_HTTPURI=localhost:8000", "--env", "CLARA_TRITON_URI=localhost:8000", "--env", "NVIDIA_CLARA_TRTISURI=localhost:8000", "--env", "NVIDIA_TRITON_GRPCURI=localhost:8001", "image:tag"], id="model_repo" 189 | ), 190 | pytest.param( 191 | OperatorConfig("op_name", "image:tag", None, None, None, None, None, [ServiceConfig("name", "it", None, None)]), 192 | ["image:tag"], id="pipeline_services" 193 | ), 194 | pytest.param( 195 | OperatorConfig("op_name", "image:tag", ["some", "command"], {"VAR0": 2}, 196 | [{"from": "liver", "name": "classification", "path": "/input"}, {"path": "/dcm"}], 197 | [{"name": "dicom", "path": "/output"}, {"name": "logs", "path": "/logs"}]), 198 | ["--env", "VAR0=2", "-v", "%tmp%/liver/classification:/input", "-v", "%tmp%/app_data:/dcm", 199 | "-v", "%tmp%/op_name/dicom:/output", "-v", "%tmp%/op_name/logs:/logs", "image:tag", "some", "command"], 200 | id="all_in_one" 201 | ), 202 | ], 203 | ) 204 | # autopep8: on 205 | def test_build_operator_cmd(tmp_path, op_config, expected_args): 206 | input_path = tmp_path / "app_data" 207 | 208 | def swap_tmp(temp_dir, args): 209 | return [re.sub(r'%tmp%', temp_dir, i) for i in args] 210 | expected_args = swap_tmp(str(tmp_path), expected_args) 211 | config = op_config 212 | 213 | result_cmd = build_operator_cmd(input_path, tmp_path, config, "localhost") 214 | 215 | assert (tmp_path / "op_name").is_dir() 216 | 217 | assert result_cmd == ["docker", "run", "-d", "--rm", "--env", "NVIDIA_CLARA_NOSYNCLOCK=1"] + expected_args 218 | 219 | 220 | def test_print_operator_summary(caplog): 221 | metrics = [Metrics(1.5, 10, 20), Metrics(1.5, 20, 20), Metrics(1.5, 30, 25)] 222 | with caplog.at_level(logging.INFO): 223 | print_operator_summary(metrics, "opeartor_name") 224 | # [1] only gets the table section 225 | messages = [rec.getMessage() for rec in 
caplog.records][1] 226 | 227 | messages = messages.split("\n") 228 | cpu_line = messages[3] 229 | mem_line = messages[4] 230 | assert "CPU" in cpu_line 231 | assert "20" in cpu_line 232 | assert "30" in cpu_line 233 | assert "Memory" in mem_line 234 | assert "21.6" in mem_line 235 | assert "25" in mem_line 236 | 237 | 238 | @pytest.mark.parametrize("run_mode", [RUN_MODE.NO_INFERENCE_SERVER, RUN_MODE.MODEL_REPO, RUN_MODE.PIPELINE_SERVICES]) 239 | @patch("pipeline_utils.run_pipeline_with_services") 240 | @patch("pipeline_utils.run_pipeline_alone") 241 | @patch("pipeline_utils.run_triton_model_repo") 242 | @patch("pipeline_utils.decide_method_to_run_triton") 243 | def test_run_pipeline(mock_decide, mock_run_triton, mock_run_alone, mock_run_services, run_mode): 244 | mock_decide.return_value = run_mode 245 | mock_run_triton.return_value.__enter__.return_value = MagicMock() 246 | run_pipeline([], None, None, None, ContinueOptions.NONE) 247 | if run_mode == RUN_MODE.NO_INFERENCE_SERVER: 248 | mock_run_triton.assert_not_called() 249 | mock_run_alone.assert_called_once() 250 | mock_run_services.assert_not_called() 251 | elif run_mode == RUN_MODE.MODEL_REPO: 252 | mock_run_triton.assert_called_once() 253 | mock_run_alone.assert_called_once() 254 | mock_run_services.assert_not_called() 255 | elif run_mode == RUN_MODE.PIPELINE_SERVICES: 256 | mock_run_triton.assert_not_called() 257 | mock_run_alone.assert_not_called() 258 | mock_run_services.assert_called_once() 259 | 260 | 261 | def test_get_output_writers(tmp_path): 262 | mock_writer = MagicMock(**{"join.return_value": None}) 263 | with get_output_writers(tmp_path) as writers: 264 | assert writers == [] 265 | writers.append(mock_writer) 266 | assert mock_writer.join.call_count == 1 267 | 268 | 269 | def test_get_no_output_writers(): 270 | with get_output_writers(None) as writers: 271 | assert writers is None 272 | 273 | 274 | @patch("pipeline_utils.build_operator_cmd") 275 | @patch("pipeline_utils.run_operator") 276 | 
@patch("pipeline_utils.TemporaryDirectory") 277 | def test_run_pipeline_alone(mock_temp_file, mock_run_operator, mock_build_cmd, tmp_path): 278 | mock_temp_file.return_value.__enter__.return_value = "tmp_file_name" 279 | mock_run_operator.side_effect = [None, True, None] 280 | m1, m2, m3 = MagicMock(**{"name": "1"}), MagicMock(**{"name": "2"}), MagicMock(**{"name": "3"}) 281 | execution_order = [m1, m2, m3] 282 | run_pipeline_alone(execution_order, tmp_path, None, ContinueOptions.NONE, None) 283 | assert len(mock_run_operator.call_args_list) == 2 284 | assert m1 in mock_run_operator.call_args_list[0].args 285 | assert m2 in mock_run_operator.call_args_list[1].args 286 | 287 | 288 | @patch("pipeline_utils.subproc_run_wrapper") 289 | def test_clean_up_containers(mock_subproc_run_wrapper): 290 | running_containers = {"image1": ("ID1", "ip_address")} 291 | clean_up_containers(running_containers) 292 | assert mock_subproc_run_wrapper.call_args.args[0] == ["docker", "kill", "ID1"] 293 | assert running_containers == {} 294 | 295 | 296 | @patch("pipeline_utils.start_triton") 297 | @patch("pipeline_utils.clean_up_containers") 298 | def test_start_pipeline_services(mock_clean_up_containers, mock_start_triton): 299 | container_info = ("container_id_123", "ip_address") 300 | mock_start_triton.return_value = container_info 301 | 302 | service_config_1 = ServiceConfig("trtis", "image_tag", ["some", "cmd"], {"VAR": "port_num"}) 303 | op_config_1 = OperatorConfig("name", None, None, None, None, None, None, [service_config_1]) 304 | services_dict = {} 305 | start_pipeline_services(op_config_1, services_dict, "some-dir") 306 | assert services_dict["image_tag some cmd"] == container_info 307 | assert op_config_1.variables == {"VAR": "ip_address:port_num"} 308 | assert mock_start_triton.call_count == 1 309 | 310 | # Same service -> no new services created 311 | start_pipeline_services(op_config_1, services_dict, "some-dir") 312 | assert services_dict["image_tag some cmd"] == 
container_info 313 | assert op_config_1.variables == {"VAR": "ip_address:port_num"} 314 | assert mock_start_triton.call_count == 1 315 | 316 | # Different service -> new service created 317 | service_config_2 = ServiceConfig("trtis", "image_tag2", ["some", "cmd"], {"VAR": "port_num2"}) 318 | op_config_2 = OperatorConfig("name", None, None, None, None, None, None, [service_config_2]) 319 | start_pipeline_services(op_config_2, services_dict, "some-dir") 320 | mock_clean_up_containers.assert_called_once() 321 | assert services_dict["image_tag2 some cmd"] == container_info 322 | assert op_config_2.variables == {"VAR": "ip_address:port_num2"} 323 | assert mock_start_triton.call_count == 2 324 | 325 | 326 | @patch("pipeline_utils.start_triton") 327 | @patch("pipeline_utils.clean_up_containers") 328 | def test_start_service_not_supported(mock_clean_up_containers, mock_start_triton, caplog): 329 | service_config_1 = ServiceConfig("other service", "image_tag", ["some", "cmd"], {"VAR": "value"}) 330 | op_config_1 = OperatorConfig("name", None, None, None, None, None, None, [service_config_1]) 331 | services_dict = {} 332 | 333 | with caplog.at_level(logging.WARNING): 334 | start_pipeline_services(op_config_1, services_dict, "some-dir") 335 | messages = [rec.getMessage() for rec in caplog.records] 336 | mock_clean_up_containers.assert_not_called() 337 | mock_start_triton.assert_not_called() 338 | assert "does not support" in messages[0] 339 | assert "Skipping `other service`" in messages[1] 340 | 341 | 342 | @patch("pipeline_utils.clean_up_containers") 343 | @patch("pipeline_utils.build_operator_cmd") 344 | @patch("pipeline_utils.start_pipeline_services") 345 | @patch("pipeline_utils.run_operator") 346 | @patch("pipeline_utils.TemporaryDirectory") 347 | def test_run_pipeline_with_services( 348 | mock_temp_file, mock_run_operator, mock_start_pipeline_services, mock_build_cmd, mock_clean_up_containers, 349 | tmp_path): 350 | 351 | def mock_add_dict(op, services_dict, *args): 
352 | services_dict["name"] = "cont_id" 353 | mock_start_pipeline_services.side_effect = mock_add_dict 354 | 355 | mock_temp_file.return_value.__enter__.return_value = "tmp_file_name" 356 | mock_run_operator.side_effect = [None, True, None] 357 | mock_config1 = MagicMock(**{"services": True}) 358 | mock_config2 = MagicMock(**{"services": False}) 359 | execution_order = [mock_config1, mock_config2, mock_config2] 360 | run_pipeline_with_services(execution_order, tmp_path, None, tmp_path, ContinueOptions.NONE) 361 | assert len(mock_run_operator.call_args_list) == 2 362 | mock_start_pipeline_services.assert_called_once() 363 | assert mock_build_cmd.call_count == 2 364 | mock_clean_up_containers.assert_called_once() 365 | 366 | 367 | @patch("pipeline_utils.tabulate") 368 | def test_print_pipeline_summary(mock_tabulate): 369 | raw_data = { 370 | 'dicom-reader': 371 | [['CPU', '130.407 %', '732.975 %', 'cpu: 8'], 372 | ['Memory', '109.309 MB', '431.407 MB', 'memory: 512']], 373 | 'spleen-segmentation': 374 | [['CPU', '126.747 %', '1144.132 %', 'cpu: 12'], 375 | ['Memory', '1403.712 MB', '4339.55 MB', 'memory: 8192']], 376 | 'dicom-writer': 377 | [['CPU', '168.027 %', '676.498 %', 'cpu: 7'], 378 | ['Memory', '481.506 MB', '866.976 MB', 'memory: 1024']], 379 | 'register-dicom-output\n(Non-zero exitcode)': 380 | [['CPU', '14.524 %', '18.102 %', 'cpu: 1'], 381 | ['Memory', '2.074 MB', '2.589 MB', 'memory: 4']]} 382 | 383 | print_pipeline_summary(raw_data) 384 | # This format is desired to keep the display result from tabulate clean 385 | assert mock_tabulate.call_args.args[0] == [ 386 | ['dicom-reader', 'CPU\nMemory', '130.407 %\n109.309 MB', '732.975 %\n431.407 MB', 'cpu: 8\nmemory: 512'], 387 | ['spleen-segmentation', 'CPU\nMemory', '126.747 %\n1403.712 MB', '1144.132 %\n4339.55 MB', 'cpu: 12\nmemory: 8192'], 388 | ['dicom-writer', 'CPU\nMemory', '168.027 %\n481.506 MB', '676.498 %\n866.976 MB', 'cpu: 7\nmemory: 1024'], 389 | ['register-dicom-output\n(Non-zero exitcode)', 
'CPU\nMemory', '14.524 %\n2.074 MB', '18.102 %\n2.589 MB', 390 | 'cpu: 1\nmemory: 4']] 391 | -------------------------------------------------------------------------------- /src/pipeline_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 NVIDIA Corporation 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | import logging 17 | import sys 18 | import time 19 | from contextlib import contextmanager 20 | from dataclasses import astuple 21 | from multiprocessing import Manager, Process, Queue 22 | from pathlib import Path 23 | from queue import Empty 24 | from subprocess import PIPE, Popen 25 | from subprocess import run as subproc_run 26 | from tempfile import TemporaryDirectory 27 | 28 | from clarac_utils import OperatorConfig 29 | from constants import (ID_WAITING_TIME_SECONDS, LEGACY_TRITON_HTTP_ENV_VAR, LEGACY_TRTIS_HTTP_ENV_VAR, 30 | METRIC_SAMPLING_PERIOD_SECONDS, ON_POSIX, TRITON_GRPC_ENV_VAR, TRITON_GRPC_PORT, 31 | TRITON_HTTP_ENV_VAR, TRITON_HTTP_PORT) 32 | from container import METRICS_HEADER, Container 33 | from tabulate import tabulate 34 | from triton_utils import (RUN_MODE, check_triton_status, decide_method_to_run_triton, inspect_ip_address, 35 | run_triton_model_repo, start_triton) 36 | from utils import convert_percent_to_cores, prompt_yes_or_no, round_up_to_multiple, subproc_run_wrapper, write_to_csv 37 | 38 | from cli import ContinueOptions 39 | 40 
| 41 | def _enqueue_output(out, queue): 42 | """Reads the file content, add to queue, and close the file handler when done. 43 | 44 | Args: 45 | out: opened file handler or stdout 46 | queue: multiprocessing.Queue object 47 | 48 | Returns: 49 | None 50 | """ 51 | for line in iter(out.readline, b''): 52 | queue.put(line) 53 | out.close() 54 | 55 | 56 | def start_operator(container_id, id_returned_event, cmd): 57 | """Runs the given docker command and assign docker ID to the given shared value. 58 | 59 | Args: 60 | container_id: A multiprocessing.Value object to allow sharing of values. 61 | id_returned_event: A multiprocess.Event object, set when container_id is assigned. 62 | cmd: The full docker command to run an image. 63 | 64 | Returns: 65 | None 66 | """ 67 | cmd_proc = Popen(cmd, stdout=PIPE, stderr=PIPE, close_fds=ON_POSIX) 68 | logging.info("Running operator ...") 69 | q = Queue() 70 | 71 | checker = Process(target=_enqueue_output, args=(cmd_proc.stdout, q), daemon=True) 72 | checker.start() 73 | 74 | while cmd_proc.poll() is None: 75 | try: 76 | raw_id = q.get_nowait() 77 | except Empty: 78 | continue 79 | else: 80 | # Validate the result, expect length to be 64 + 1 from '\n' 81 | if len(raw_id) == 65: 82 | container_id.value = raw_id.decode('utf-8').strip() 83 | logging.info(f"The container id is: {container_id.value}") 84 | id_returned_event.set() 85 | break 86 | else: 87 | sys.exit(f"The output of docker should be the 64 bit container ID, got {raw_id} instead.") 88 | else: 89 | if cmd_proc.returncode != 0: 90 | checker.terminate() 91 | checker.join() 92 | # This means that cmd_proc has errorred and terminated. 
Log the error and return 93 | logging.warning(f"Operator failed to start with returncode {cmd_proc.returncode}") 94 | sys.exit(f"The operator failed with stderr:\n{cmd_proc.stderr.read().decode('UTF-8')}") 95 | 96 | checker.terminate() 97 | checker.join() 98 | if cmd_proc.returncode is None: 99 | logging.debug("Operator is running...") 100 | # We need to know if docker exited correctly 101 | docker_wait_proc = subproc_run(["docker", "wait", container_id.value], capture_output=True) 102 | returned_str = docker_wait_proc.stdout.decode('UTF-8').strip() 103 | if returned_str == "0": 104 | logging.debug(f"Operator finished successfully with exitcode {returned_str}") 105 | else: 106 | logging.error(f"Operator failed with exitcode is: {returned_str}") 107 | try: 108 | return_code = int(returned_str) 109 | sys.exit(return_code) 110 | except ValueError: 111 | sys.exit(1) 112 | else: 113 | logging.debug(f"Docker run command returned with {cmd_proc.returncode}") 114 | 115 | 116 | def sample_operator(container, que): 117 | """Samples and writes metrics for the given operator as long as its metrics paths exist. 118 | Sampling frequency is determined by METRIC_SAMPLING_PERIOD_SECONDS. 119 | 120 | Args: 121 | container: Container object. 122 | que: None or a multiprocessing.Queue object to store data that needs to be written to csv. 
123 | 124 | Returns: 125 | None 126 | """ 127 | # Waits for the files to be created by docker 128 | while not container.metrics_path_exists(): 129 | continue 130 | 131 | # Samples until the files disappear 132 | logging.debug("Starts sampling container ...") 133 | before_sample = time.perf_counter() 134 | while container.metrics_path_exists(): 135 | metric = container.sample_metrics() 136 | if que: 137 | que.put(metric) 138 | after_sample = time.perf_counter() 139 | sleep_time = METRIC_SAMPLING_PERIOD_SECONDS - (after_sample - before_sample) 140 | sleep_time = sleep_time if sleep_time > 0 else 0 141 | if sleep_time == 0: 142 | logging.info( 143 | f"Sampling taking longer than sampling period with time of {(after_sample - before_sample)} seconds") 144 | # NOTE: Due to the inaccurate nature of time.sleep(), our sampling will not be extremely precise 145 | time.sleep(sleep_time) 146 | before_sample = time.perf_counter() 147 | 148 | # Signal the end of que 149 | if que: 150 | que.put(0) 151 | logging.debug("Finished sampling container.") 152 | 153 | 154 | def build_operator_cmd(input_dir: Path, data_folder_name: str, op_config: OperatorConfig, triton_ip: str = None): 155 | """Constructs the docker command used to run operator. 
156 | 157 | Args: 158 | input_dir: A Path object for the input payload data in local system 159 | data_folder_name: Name of the data folder to store temporary data 160 | op_config: A OperatorConfig object containing information about the operator 161 | triton_ip: None, or Triton's IP address 162 | 163 | Returns: 164 | cmd: A list of string representing the docker command that can be used to run the operator 165 | """ 166 | logging.debug(f"Constructing commands for operator {op_config.name} ...") 167 | op_output_dir = Path(data_folder_name) / op_config.name 168 | op_output_dir.mkdir() 169 | 170 | cmd = ["docker", "run", "-d", "--rm", "--env", "NVIDIA_CLARA_NOSYNCLOCK=1"] 171 | 172 | # If models is present, then we supply Triton ports to this 173 | if op_config.models: 174 | cmd.extend(["--env", f"{TRITON_HTTP_ENV_VAR}={triton_ip}:{TRITON_HTTP_PORT}"]) 175 | cmd.extend(["--env", f"{LEGACY_TRITON_HTTP_ENV_VAR}={triton_ip}:{TRITON_HTTP_PORT}"]) 176 | cmd.extend(["--env", f"{LEGACY_TRTIS_HTTP_ENV_VAR}={triton_ip}:{TRITON_HTTP_PORT}"]) 177 | cmd.extend(["--env", f"{TRITON_GRPC_ENV_VAR}={triton_ip}:{TRITON_GRPC_PORT}"]) 178 | 179 | # Add operator specific environment variables 180 | if op_config.variables: 181 | for key, value in op_config.variables.items(): 182 | cmd.extend(["--env", f"{key}={value}"]) 183 | 184 | # Mount input and output volumes 185 | def build_volume_mount(local, remote): 186 | return ["-v", ":".join([local, remote])] 187 | 188 | # Mount input volumes 189 | if op_config.inputs: 190 | for input_obj in op_config.inputs: 191 | # If `from` is not present, we use the input payload directory 192 | if input_obj.get("from") is None: 193 | cmd.extend(build_volume_mount(str(input_dir), input_obj["path"])) 194 | # If `from` is specified, we use the specified operator's output directory as the input for this operator 195 | else: 196 | op_input_dir = op_output_dir.parent / input_obj["from"] 197 | # If `name` is specified, then find the subdirectory and use this as 
the input 198 | if input_obj.get("name"): 199 | cmd.extend(build_volume_mount(str((op_input_dir / input_obj["name"])), input_obj["path"])) 200 | else: 201 | cmd.extend(build_volume_mount(str(op_input_dir), input_obj["path"])) 202 | 203 | # Mount output volumes 204 | if op_config.outputs: 205 | for output_obj in op_config.outputs: 206 | # If `name` is specified, create a subdirectory with this name 207 | if output_obj.get("name"): 208 | sub_dir = Path(op_output_dir / output_obj["name"]) 209 | sub_dir.mkdir(parents=True) 210 | cmd.extend(build_volume_mount(str(sub_dir), output_obj["path"])) 211 | else: 212 | cmd.extend(build_volume_mount(str(op_output_dir), output_obj["path"])) 213 | 214 | # Add the image and tag, and command last 215 | cmd.append(op_config.image_n_tag) 216 | if op_config.command: 217 | cmd.extend(op_config.command) 218 | logging.debug(f"Docker command for operator {op_config.name} is: {cmd}") 219 | return cmd 220 | 221 | 222 | def print_operator_metrics(metrics, metrics_header, op_name): 223 | """Logs the metrics to console in a table format. 224 | 225 | Args: 226 | metrics: list of Metrics object 227 | metrics_header: Header of the metrics data 228 | op_name: Name of the operator 229 | 230 | Returns: 231 | None 232 | """ 233 | logging.info("{:_^60}".format(f"Operator {op_name} Metrics Data")) # pragma: no cover 234 | data = [astuple(metric) for metric in metrics] # pragma: no cover 235 | logging.info(tabulate(data, metrics_header, tablefmt="pretty")) # pragma: no cover 236 | 237 | 238 | def print_operator_summary(metrics, op_name): 239 | """Calculate and logs the metrics statistics in a readable format. 
240 | 241 | Args: 242 | metrics: list of Metrics object 243 | op_name: Name of the operator 244 | 245 | Returns: 246 | None 247 | """ 248 | logging.info("{:_^60}".format(f"Operator {op_name} Summary")) 249 | # Calculate metrics for CPU and memory 250 | cpu_data = [metric.cpu_percent for metric in metrics] 251 | cpu_avg = round(sum(cpu_data)/len(metrics), 3) 252 | cpu_max = round(max(cpu_data), 3) 253 | 254 | memory_data = [metric.memory for metric in metrics] 255 | memory_avg = round(sum(memory_data)/len(metrics), 3) 256 | memory_max = round(max(memory_data), 3) 257 | 258 | recommended_cpu = convert_percent_to_cores(cpu_max) 259 | # Add 100MB of buffer memory and round to multiple of base 256 260 | recommended_memory = round_up_to_multiple(memory_max + 100.0, 256) 261 | 262 | # Log it onto console 263 | data = [["CPU", f"{cpu_avg} %", f"{cpu_max} %", f"cpu: {recommended_cpu}"], [ 264 | "Memory", f"{memory_avg} MB", f"{memory_max} MB", f"memory: {recommended_memory}"]] 265 | logging.info( 266 | tabulate( 267 | data, ["Metric", "Average", "Maximum", "Resource"], 268 | tablefmt="pretty")) 269 | return data 270 | 271 | 272 | def print_pipeline_summary(pipeline_metrics_dict): 273 | """Display the pipeline summary table. 274 | 275 | Args: 276 | pipeline_metrics_dict: Dictionary with key being operator name and values are metrics 277 | 278 | Returns: 279 | None 280 | """ 281 | pipeline_data = [] 282 | for op_name, op_summary in pipeline_metrics_dict.items(): 283 | p_sumamry = [op_name] + ["\n".join([str(row1), str(row2)]) for row1, row2 in zip(op_summary[0], op_summary[1])] 284 | pipeline_data.append(p_sumamry) 285 | logging.info( 286 | tabulate( 287 | pipeline_data, ["Operator", "Metric", "Average", "Maximum", "Resource"], 288 | tablefmt="grid", numalign="right")) 289 | 290 | 291 | def run_operator( 292 | op_config, docker_cmd, output_writers, metrics_output, continue_option, 293 | pipeline_summary_dict): 294 | """Run the operator using the directories given. 
295 | 296 | Args: 297 | op_config: a OperatorConfig object 298 | docker_cmd: List of docker commands to run the operator 299 | output_writers: List of writers or None 300 | metrics_output: A Path object for the metrics directory or None 301 | continue_option: A ContinueOptions Enum object 302 | pipeline_summary_dict: Dictionary with key being operator name and values are metrics 303 | 304 | Returns: 305 | True when the operator failed and user wants to stop, otherwise None 306 | """ 307 | container = Container() 308 | manager = Manager() 309 | container_id = manager.Value('c_wchar_p', '') 310 | id_returned_event = manager.Event() 311 | 312 | if output_writers is not None: 313 | write_que = Queue() 314 | writer_process = Process( 315 | target=write_to_csv, 316 | args=(write_que, METRICS_HEADER, (metrics_output / f"{op_config.name}_final_result.csv"))) 317 | writer_process.start() 318 | output_writers.append(writer_process) 319 | else: 320 | write_que = None 321 | 322 | p_start = Process(target=start_operator, args=(container_id, id_returned_event, docker_cmd)) 323 | before_id = time.perf_counter() # timing 324 | p_start.start() 325 | 326 | if id_returned_event.wait(timeout=ID_WAITING_TIME_SECONDS): 327 | # Event.wait() returns true if it has been set 328 | after_id = time.perf_counter() # timing 329 | container.id = container_id.value 330 | container.construct_metrics_path() 331 | sample_operator(container, write_que) 332 | end = time.perf_counter() # timing 333 | logging.debug(f"Time it takes to get container ID: {after_id-before_id} s") 334 | logging.debug(f"Waiting and Sampling Time: {end-after_id} s") 335 | 336 | p_start.join() 337 | 338 | # print metrics to console if not written to csv 339 | if output_writers is None: 340 | print_operator_metrics(container.metrics, METRICS_HEADER, op_config.name) 341 | operator_summary = print_operator_summary(container.metrics, op_config.name) 342 | pipeline_summary_dict[op_config.name] = operator_summary 343 | 344 | else: 
345 | logging.warning(f"Obtaining docker ID timed out. Operator {op_config.name} failed") 346 | p_start.terminate() 347 | p_start.join() 348 | if output_writers is not None: 349 | writer_process.terminate() 350 | 351 | if p_start.exitcode != 0: # i.e. container_id timed out 352 | logging.warning(f"Operator {op_config.name} failed with exitcode {p_start.exitcode}") 353 | if pipeline_summary_dict.get(op_config.name): 354 | new_key = f"{op_config.name}\n(Non-zero exitcode)" 355 | pipeline_summary_dict[new_key] = pipeline_summary_dict.pop(op_config.name) 356 | if continue_option == ContinueOptions.CONT: 357 | return 358 | if continue_option == ContinueOptions.STOP: 359 | return True 360 | if not prompt_yes_or_no( 361 | "Would you like to continue execution at the risk of the rest of pipeline failing (y)? If (n), cpost will stop and cleanup."): 362 | # When user says no, we exit the for-loop and return 363 | return True 364 | 365 | 366 | def run_pipeline(execution_order, input_data_dir, metrics_output, models_dir, continue_option): 367 | """Run the pipeline operators in the given execution_order using the directories given. 
368 | 369 | Args: 370 | execution_order: List of OperatorConfig objects in the order of execution 371 | input_data_dir: Path to the input payload directory 372 | metrics_output: A Path object for the metrics directory or stdout 373 | models_dir: A directory that contains Triton models 374 | continue_option: A ContinueOptions Enum object 375 | 376 | Returns: 377 | None 378 | """ 379 | 380 | triton_mode = decide_method_to_run_triton(execution_order) 381 | 382 | if triton_mode == RUN_MODE.NO_INFERENCE_SERVER: 383 | return run_pipeline_alone(execution_order, input_data_dir, metrics_output, continue_option) 384 | if triton_mode == RUN_MODE.MODEL_REPO: 385 | with run_triton_model_repo(execution_order, models_dir) as triton_ip: 386 | run_pipeline_alone(execution_order, input_data_dir, metrics_output, continue_option, triton_ip) 387 | else: # PIPELINE_SERVICES 388 | run_pipeline_with_services(execution_order, input_data_dir, metrics_output, 389 | models_dir, continue_option) 390 | 391 | 392 | @contextmanager 393 | def get_output_writers(metrics_output): 394 | """Context manager for keeping a list of output writers and cleaning up. 395 | The list is used to keep output_writer processes which are threads/multiprocessing.Process. 396 | 397 | Args: 398 | metrics_output: a pathlib.Path object or None 399 | 400 | Yields: 401 | None if metrics_output is None. Empty list if metrics_output is Path 402 | """ 403 | try: 404 | write_csv_flag = True if isinstance(metrics_output, Path) else False 405 | if write_csv_flag: 406 | output_writers = [] 407 | yield output_writers 408 | else: 409 | yield None 410 | 411 | finally: 412 | if write_csv_flag: 413 | for writer in output_writers: 414 | writer.join() 415 | 416 | 417 | def run_pipeline_alone(execution_order, input_data_dir, metrics_output, continue_option, triton_ip=None): 418 | """Run the pipeline operators in the given execution_order using the directories given. 
419 | 420 | Args: 421 | execution_order: List of OperatorConfig objects in the order of execution 422 | input_data_dir: Path to the input payload directory 423 | metrics_output: A Path object for the metrics directory or stdout 424 | continue_option: A ContinueOptions Enum object 425 | triton_ip: None, or Triton's IP address 426 | 427 | Returns: 428 | None 429 | """ 430 | with TemporaryDirectory() as data_folder_name: 431 | with get_output_writers(metrics_output) as output_writers: 432 | pipeline_summary_dict = {} 433 | for op_config in execution_order: 434 | logging.info("\n{:_^60}".format(f"Executing Operator {op_config.name}")) 435 | docker_cmd = build_operator_cmd(input_data_dir, data_folder_name, op_config, triton_ip) 436 | exit = run_operator(op_config, docker_cmd, output_writers, 437 | metrics_output, continue_option, pipeline_summary_dict) 438 | if exit: 439 | break 440 | print_pipeline_summary(pipeline_summary_dict) 441 | 442 | 443 | def clean_up_containers(running_dict): 444 | """Kill the containers in the given dictionary and remove the item from the dictionary. 445 | 446 | Args: 447 | running_dict: Dictionary where key is image name and value is (container ID, ip_address) 448 | 449 | Returns: 450 | None 451 | """ 452 | for old_key, container_info in running_dict.items(): 453 | logging.debug(f"Tear down unused services {old_key}") 454 | if container_info: 455 | subproc_run_wrapper(["docker", "kill", container_info[0]]) 456 | running_dict.clear() 457 | 458 | 459 | def start_pipeline_services(op_config, running_dict, models_dir): 460 | """Start the pipeline services for the given op_config. 
461 | 462 | Args: 463 | op_config: A OperatorConfig object 464 | running_dict: Dictionary for keep track of currently running services 465 | models_dir: A directory that contains Triton models 466 | 467 | Return: 468 | None 469 | """ 470 | for service in op_config.services: 471 | logging.debug(f"Checking service with name {service.name}") 472 | key = service.image_n_tag + " " + " ".join(service.command) 473 | if running_dict.get(key): 474 | # Add the connection variables 475 | ip_address = running_dict[key][1] 476 | http_connections_dict = {k: f"{ip_address}:{v}" for k, v in service.http_connections.items()} 477 | op_config.update_variables(http_connections_dict) 478 | logging.debug("Found running services that suit the needs") 479 | else: 480 | logging.debug("Didn't find matching service, starting new service") 481 | if len(running_dict) != 0: # tear down current services before spin up another one 482 | clean_up_containers(running_dict) 483 | if "trtis" in service.name or "triton" in service.name: 484 | triton_container_id, ip_address = start_triton(models_dir, service.command, service.image_n_tag) 485 | running_dict[key] = (triton_container_id, ip_address) 486 | http_connections_dict = {k: f"{ip_address}:{v}" for k, v in service.http_connections.items()} 487 | op_config.update_variables(http_connections_dict) 488 | else: 489 | logging.warning("CPOST currently does not support services other than triton or trtis.") 490 | logging.warning(f"Skipping `{service.name}`, operator may fail because of this.") 491 | 492 | 493 | def run_pipeline_with_services( 494 | execution_order, input_data_dir, metrics_output, models_dir, continue_option): 495 | """Run the pipeline operators in the given execution_order using the directories given. 
496 | 497 | Args: 498 | execution_order: List of OperatorConfig objects in the order of execution 499 | input_data_dir: Path to the input payload directory 500 | metrics_output: A Path object for the metrics directory or stdout 501 | models_dir: A directory that contains Triton models 502 | continue_option: A ContinueOptions Enum object 503 | 504 | Returns: 505 | None 506 | """ 507 | with TemporaryDirectory() as data_folder_name: 508 | with get_output_writers(metrics_output) as output_writers: 509 | try: 510 | running_services = {} 511 | pipeline_summary_dict = {} 512 | for op_config in execution_order: 513 | if op_config.services: 514 | start_pipeline_services(op_config, running_services, models_dir) 515 | logging.info("\n{:_^60}".format(f"Executing Operator {op_config.name}")) 516 | docker_cmd = build_operator_cmd(input_data_dir, data_folder_name, op_config) 517 | exit = run_operator(op_config, docker_cmd, output_writers, 518 | metrics_output, continue_option, pipeline_summary_dict) 519 | if exit: 520 | break 521 | print_pipeline_summary(pipeline_summary_dict) 522 | finally: 523 | # Stop any currently running services 524 | clean_up_containers(running_services) 525 | --------------------------------------------------------------------------------