├── .gitignore
├── script
│   ├── setup
│   │   ├── .cm
│   │   │   ├── desc.json
│   │   │   ├── meta.json
│   │   │   └── info.json
│   │   ├── setup.ssd-mobilenet.sh
│   │   ├── setup.ssd-resnet.sh
│   │   ├── setup.common.sh
│   │   └── setup.resnet.sh
│   ├── explore-params
│   │   ├── .cm
│   │   │   ├── desc.json
│   │   │   ├── meta.json
│   │   │   └── info.json
│   │   ├── explore.sh
│   │   └── run.sh
│   └── .cm
│       ├── alias-a-setup
│       ├── alias-u-c03619b7c49c5a52
│       ├── alias-a-explore-params
│       └── alias-u-64da79981c7d1099
├── .cm
│   ├── alias-a-program
│   ├── alias-a-script
│   ├── alias-u-84e27ad9dd12e734
│   └── alias-u-b0ac08fe1d3c2615
├── COPYRIGHT.txt
├── program
│   ├── zpp-worker-tensorrt-py
│   │   ├── .cm
│   │   │   ├── desc.json
│   │   │   ├── info.json
│   │   │   └── meta.json
│   │   └── zpp_worker_trt.py
│   ├── object-detection-zpp-hub-py
│   │   ├── .cm
│   │   │   ├── desc.json
│   │   │   ├── info.json
│   │   │   └── meta.json
│   │   └── zpp_hub_detect.py
│   ├── .cm
│   │   ├── alias-a-zpp-worker-tensorrt-py
│   │   ├── alias-u-0b248b2913eb548b
│   │   ├── alias-a-object-detection-zpp-hub-py
│   │   ├── alias-u-1dc4528a1a53c218
│   │   ├── alias-a-image-classification-zpp-hub-py
│   │   ├── alias-u-6495587eb9150c0b
│   │   ├── alias-a-object-detection-zpp-hub-loadgen-py
│   │   ├── alias-u-f497e983b6b2eaaf
│   │   ├── alias-a-image-classification-zpp-hub-loadgen-py
│   │   └── alias-u-c4a80957e3ae1c8f
│   ├── image-classification-zpp-hub-py
│   │   ├── .cm
│   │   │   ├── desc.json
│   │   │   ├── info.json
│   │   │   └── meta.json
│   │   └── zpp_hub_classify.py
│   ├── image-classification-zpp-hub-loadgen-py
│   │   ├── .cm
│   │   │   ├── desc.json
│   │   │   ├── info.json
│   │   │   └── meta.json
│   │   ├── user.conf
│   │   └── zpp_hub_classify_loadgen.py
│   └── object-detection-zpp-hub-loadgen-py
│       ├── .cm
│       │   ├── desc.json
│       │   ├── info.json
│       │   └── meta.json
│       ├── user.conf
│       └── zpp_hub_detect_loadgen.py
├── README.md
├── .ckr.json
└── LICENSE.txt

/.gitignore:
--------------------------------------------------------------------------------
1 | program/*/tmp/*
2 |
--------------------------------------------------------------------------------
/script/setup/.cm/desc.json:
--------------------------------------------------------------------------------
1 | {}
2 |
--------------------------------------------------------------------------------
/script/setup/.cm/meta.json:
--------------------------------------------------------------------------------
1 | {}
2 |
--------------------------------------------------------------------------------
/.cm/alias-a-program:
--------------------------------------------------------------------------------
1 | b0ac08fe1d3c2615
2 |
--------------------------------------------------------------------------------
/.cm/alias-a-script:
--------------------------------------------------------------------------------
1 | 84e27ad9dd12e734
2 |
--------------------------------------------------------------------------------
/.cm/alias-u-84e27ad9dd12e734:
--------------------------------------------------------------------------------
1 | script
2 |
--------------------------------------------------------------------------------
/.cm/alias-u-b0ac08fe1d3c2615:
--------------------------------------------------------------------------------
1 | program
2 |
--------------------------------------------------------------------------------
/script/explore-params/.cm/desc.json:
--------------------------------------------------------------------------------
1 | {}
2 |
--------------------------------------------------------------------------------
/script/explore-params/.cm/meta.json:
--------------------------------------------------------------------------------
1 | {}
2 |
--------------------------------------------------------------------------------
/script/.cm/alias-a-setup:
-------------------------------------------------------------------------------- 1 | c03619b7c49c5a52 2 | -------------------------------------------------------------------------------- /script/.cm/alias-u-c03619b7c49c5a52: -------------------------------------------------------------------------------- 1 | setup 2 | -------------------------------------------------------------------------------- /COPYRIGHT.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2019-2020 dividiti 2 | -------------------------------------------------------------------------------- /program/zpp-worker-tensorrt-py/.cm/desc.json: -------------------------------------------------------------------------------- 1 | {} 2 | -------------------------------------------------------------------------------- /program/object-detection-zpp-hub-py/.cm/desc.json: -------------------------------------------------------------------------------- 1 | {} 2 | -------------------------------------------------------------------------------- /script/.cm/alias-a-explore-params: -------------------------------------------------------------------------------- 1 | 64da79981c7d1099 2 | -------------------------------------------------------------------------------- /script/.cm/alias-u-64da79981c7d1099: -------------------------------------------------------------------------------- 1 | explore-params 2 | -------------------------------------------------------------------------------- /program/.cm/alias-a-zpp-worker-tensorrt-py: -------------------------------------------------------------------------------- 1 | 0b248b2913eb548b 2 | -------------------------------------------------------------------------------- /program/.cm/alias-u-0b248b2913eb548b: -------------------------------------------------------------------------------- 1 | zpp-worker-tensorrt-py 2 | -------------------------------------------------------------------------------- /program/image-classification-zpp-hub-py/.cm/desc.json: -------------------------------------------------------------------------------- 1 | {} 2 | -------------------------------------------------------------------------------- /program/.cm/alias-a-object-detection-zpp-hub-py: -------------------------------------------------------------------------------- 1 | 1dc4528a1a53c218 2 | -------------------------------------------------------------------------------- /program/.cm/alias-u-1dc4528a1a53c218: -------------------------------------------------------------------------------- 1 | object-detection-zpp-hub-py 2 | -------------------------------------------------------------------------------- /program/image-classification-zpp-hub-loadgen-py/.cm/desc.json: -------------------------------------------------------------------------------- 1 | {} 2 | -------------------------------------------------------------------------------- /program/object-detection-zpp-hub-loadgen-py/.cm/desc.json: -------------------------------------------------------------------------------- 1 | {} 2 | -------------------------------------------------------------------------------- /program/.cm/alias-a-image-classification-zpp-hub-py: -------------------------------------------------------------------------------- 1 | 6495587eb9150c0b 2 | -------------------------------------------------------------------------------- /program/.cm/alias-u-6495587eb9150c0b: -------------------------------------------------------------------------------- 1 | image-classification-zpp-hub-py 2 | 
-------------------------------------------------------------------------------- /program/.cm/alias-a-object-detection-zpp-hub-loadgen-py: -------------------------------------------------------------------------------- 1 | f497e983b6b2eaaf 2 | -------------------------------------------------------------------------------- /program/.cm/alias-u-f497e983b6b2eaaf: -------------------------------------------------------------------------------- 1 | object-detection-zpp-hub-loadgen-py 2 | -------------------------------------------------------------------------------- /program/.cm/alias-a-image-classification-zpp-hub-loadgen-py: -------------------------------------------------------------------------------- 1 | c4a80957e3ae1c8f 2 | -------------------------------------------------------------------------------- /program/.cm/alias-u-c4a80957e3ae1c8f: -------------------------------------------------------------------------------- 1 | image-classification-zpp-hub-loadgen-py 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CK workflows for experiments with ZeroMQ, LoadGen and TensorRT 2 | 3 | ```bash 4 | $ ck pull repo --url=https://github.com/dividiti/ck-zeromq 5 | ``` 6 | -------------------------------------------------------------------------------- /script/setup/.cm/info.json: -------------------------------------------------------------------------------- 1 | { 2 | "backup_data_uid": "c03619b7c49c5a52", 3 | "backup_module_uid": "84e27ad9dd12e734", 4 | "backup_module_uoa": "script", 5 | "control": { 6 | "engine": "CK", 7 | "iso_datetime": "2020-03-09T21:14:06.029449", 8 | "version": [ 9 | "1", 10 | "12", 11 | "2" 12 | ] 13 | }, 14 | "data_name": "setup" 15 | } 16 | -------------------------------------------------------------------------------- /script/explore-params/.cm/info.json: -------------------------------------------------------------------------------- 1 | { 2 | "backup_data_uid": "64da79981c7d1099", 3 | "backup_module_uid": "84e27ad9dd12e734", 4 | "backup_module_uoa": "script", 5 | "control": { 6 | "engine": "CK", 7 | "iso_datetime": "2020-01-17T11:01:57.319371", 8 | "version": [ 9 | "1", 10 | "11", 11 | "4" 12 | ] 13 | }, 14 | "data_name": "explore-params" 15 | } 16 | -------------------------------------------------------------------------------- /.ckr.json: -------------------------------------------------------------------------------- 1 | { 2 | "data_uoa": "ck-zeromq", 3 | "data_uid": "aa14f24b90ec9276", 4 | "data_alias": "ck-zeromq", 5 | "data_name": "ck-zeromq", 6 | "dict": { 7 | "shared": "git", 8 | "url": "git@github.com:dividiti/ck-zeromq.git", 9 | "repo_deps": [ 10 | { 11 | "repo_uoa": "ck-tensorrt" 12 | }, 13 | { 14 | "repo_uoa": "ck-mlperf" 15 | } 16 | ] 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /program/object-detection-zpp-hub-py/.cm/info.json: -------------------------------------------------------------------------------- 1 | { 2 | "backup_data_uid": "1dc4528a1a53c218", 3 | "backup_module_uid": "b0ac08fe1d3c2615", 4 | "backup_module_uoa": "program", 5 | "control": { 6 | "engine": "CK", 7 | "iso_datetime": "2020-03-16T11:53:09.534707", 8 | "version": [ 9 | "1", 10 | "12", 11 | "2" 12 | ] 13 | }, 14 | "data_name": "object-detection-zpp-hub-py" 15 | } 16 | -------------------------------------------------------------------------------- 
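The README above only covers pulling the repository. For orientation, here is a minimal sketch of a first-time setup; the `ck find` resolution mirrors the one used by script/explore-params/explore.sh, and the CK_SKIP_* flag semantics follow the comments in script/setup/setup.common.sh (which flags to set on which machine is an assumption drawn from those comments, not a prescription):

```bash
# Pull the repository; per .ckr.json this also pulls ck-tensorrt and ck-mlperf.
ck pull repo --url=https://github.com/dividiti/ck-zeromq

# Locate the setup scripts inside the CK repository.
setup_dir=$(ck find ck-zeromq:script:setup)

# Hub side: Python and LoadGen are required; the NVIDIA stack may be skipped.
CK_SKIP_NVIDIA_SETUP=YES bash "${setup_dir}/setup.common.sh"

# Worker side: the NVIDIA stack is required; LoadGen may be skipped.
CK_SKIP_LOADGEN_SETUP=YES bash "${setup_dir}/setup.common.sh"

# Finally, set up a model/dataset pair, e.g. ResNet and ImageNet.
bash "${setup_dir}/setup.resnet.sh"
```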
/program/zpp-worker-tensorrt-py/.cm/info.json: -------------------------------------------------------------------------------- 1 | { 2 | "backup_data_uid": "0b248b2913eb548b", 3 | "backup_module_uid": "b0ac08fe1d3c2615", 4 | "backup_module_uoa": "program", 5 | "control": { 6 | "engine": "CK", 7 | "iso_datetime": "2019-12-12T12:42:20.202044", 8 | "version": [ 9 | "1", 10 | "11", 11 | "4", 12 | "1" 13 | ] 14 | }, 15 | "data_name": "zpp-worker-tensorrt-py" 16 | } 17 | -------------------------------------------------------------------------------- /program/object-detection-zpp-hub-loadgen-py/.cm/info.json: -------------------------------------------------------------------------------- 1 | { 2 | "backup_data_uid": "f497e983b6b2eaaf", 3 | "backup_module_uid": "b0ac08fe1d3c2615", 4 | "backup_module_uoa": "program", 5 | "control": { 6 | "engine": "CK", 7 | "iso_datetime": "2020-03-17T12:07:30.704594", 8 | "version": [ 9 | "1", 10 | "12", 11 | "2" 12 | ] 13 | }, 14 | "data_name": "object-detection-zpp-hub-loadgen-py" 15 | } 16 | -------------------------------------------------------------------------------- /program/image-classification-zpp-hub-py/.cm/info.json: -------------------------------------------------------------------------------- 1 | { 2 | "backup_data_uid": "6495587eb9150c0b", 3 | "backup_module_uid": "b0ac08fe1d3c2615", 4 | "backup_module_uoa": "program", 5 | "control": { 6 | "engine": "CK", 7 | "iso_datetime": "2019-12-12T12:55:29.646618", 8 | "version": [ 9 | "1", 10 | "11", 11 | "4", 12 | "1" 13 | ] 14 | }, 15 | "data_name": "image-classification-zpp-hub-py" 16 | } 17 | -------------------------------------------------------------------------------- /program/image-classification-zpp-hub-loadgen-py/.cm/info.json: -------------------------------------------------------------------------------- 1 | { 2 | "backup_data_uid": "c4a80957e3ae1c8f", 3 | "backup_module_uid": "b0ac08fe1d3c2615", 4 | "backup_module_uoa": "program", 5 | "control": { 6 | "engine": "CK", 7 | "iso_datetime": "2019-12-27T16:40:01.027933", 8 | "version": [ 9 | "1", 10 | "11", 11 | "4", 12 | "1" 13 | ] 14 | }, 15 | "data_name": "image-classification-zpp-hub-loadgen-py" 16 | } 17 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2019-2020 dividiti 2 | All rights reserved 3 | 4 | Redistribution and use in source and binary forms, with or without modification, 5 | are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, 8 | this list of conditions and the following disclaimer. 9 | 10 | 2. Redistributions in binary form must reproduce the above copyright 11 | notice, this list of conditions and the following disclaimer in the 12 | documentation and/or other materials provided with the distribution. 13 | 14 | 3. Neither the name of the cTuning foundation 15 | nor the names of its contributors may be used to endorse 16 | or promote products derived from this software without 17 | specific prior written permission. 18 | 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 20 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 21 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 22 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 23 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 24 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 25 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 26 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 28 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | -------------------------------------------------------------------------------- /script/setup/setup.ssd-mobilenet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | function exit_if_error() { 5 | if [ "${?}" != "0" ]; then exit 1; fi 6 | } 7 | 8 | 9 | SSD_MODEL_NAME="SSD-MobileNet" 10 | SSD_MODEL_TAGS="ssd-mobilenet" 11 | SSD_MODEL_SIDE=300 12 | 13 | 14 | # Refresh CK-TensorRT and its dependencies. 15 | echo "Refreshing CK-TensorRT ..." 16 | ck pull repo:ck-tensorrt 17 | exit_if_error 18 | 19 | 20 | echo 21 | 22 | 23 | echo "Setting up ${SSD_MODEL_NAME} and COCO ..." 24 | 25 | # Skip SSD model setup: should be NO for hub; should be NO for worker. 26 | skip_ssd_setup=${CK_SKIP_SSD_SETUP:-"NO"} 27 | echo "- skip ${SSD_MODEL_NAME} setup (download): ${skip_ssd_setup}" 28 | 29 | # Skip COCO setup: should be NO for hub; should be YES or NO for worker. 30 | skip_coco_setup=${CK_SKIP_COCO_SETUP:-"NO"} 31 | echo "- skip COCO setup (download and preprocessing): ${skip_coco_setup}" 32 | 33 | 34 | echo 35 | 36 | 37 | if [ "${skip_ssd_setup}" == "NO" ]; then 38 | # Install SSD model generated from NVIDIA's MLPerf Inference v0.5 submission. 39 | # TODO: Xavier only at the moment. 40 | ck install package --tags=model,tensorrt,downloaded,${SSD_MODEL_TAGS} 41 | exit_if_error 42 | fi 43 | 44 | 45 | if [ "${skip_coco_setup}" == "NO" ]; then 46 | # Detect OpenCV in its location in JetPack 4.3. 47 | # TODO: Only works on Jetson machines at the moment. 48 | ck detect soft --tags=python-package,cv2 --cus.version=JetPack \ 49 | --full_path=/usr/lib/python3.6/dist-packages/cv2/__init__.py 50 | exit_if_error 51 | 52 | # Remove training annotations (~765 MB), leaving only 5,000 images (~788 MB) and 53 | # validation annotations (~52 MB). 54 | ck virtual env \ 55 | --tags=object-detection,dataset,coco.2017,val,original,full \ 56 | --shell_cmd='rm -f $CK_ENV_DATASET_COCO_LABELS_DIR/*train2017.json' 57 | 58 | # Preprocess for SSD-MobileNet (300x300 input images, 264 KB each, 1.3 GB in total). 59 | ck install package --tags=dataset,preprocessed,using-opencv,coco.2017,full,side.${SSD_MODEL_SIDE} 60 | exit_if_error 61 | fi 62 | 63 | 64 | echo 65 | 66 | 67 | echo "Done." 68 | -------------------------------------------------------------------------------- /script/setup/setup.ssd-resnet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | function exit_if_error() { 5 | if [ "${?}" != "0" ]; then exit 1; fi 6 | } 7 | 8 | 9 | SSD_MODEL_NAME="SSD-ResNet" 10 | SSD_MODEL_TAGS="ssd-resnet" 11 | SSD_MODEL_SIDE=1200 12 | 13 | 14 | # Refresh CK-TensorRT and its dependencies. 15 | echo "Refreshing CK-TensorRT ..." 16 | ck pull repo:ck-tensorrt 17 | exit_if_error 18 | 19 | 20 | echo 21 | 22 | 23 | echo "Setting up ${SSD_MODEL_NAME} and COCO ..." 24 | 25 | # Skip SSD model setup: should be NO for hub; should be NO for worker. 
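# (An assumed example of using the CK_SKIP_* flags below: a worker node that
# only runs inference and does not score detections locally could invoke
# 'CK_SKIP_COCO_SETUP=YES ./setup.ssd-resnet.sh'.)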
26 | skip_ssd_setup=${CK_SKIP_SSD_SETUP:-"NO"} 27 | echo "- skip ${SSD_MODEL_NAME} setup (download): ${skip_ssd_setup}" 28 | 29 | # Skip COCO setup: should be NO for hub; should be YES or NO for worker. 30 | skip_coco_setup=${CK_SKIP_COCO_SETUP:-"NO"} 31 | echo "- skip COCO setup (download and preprocessing): ${skip_coco_setup}" 32 | 33 | 34 | echo 35 | 36 | 37 | if [ "${skip_ssd_setup}" == "NO" ]; then 38 | # Install SSD models generated from NVIDIA's MLPerf Inference v0.5 submission. 39 | # TODO: Xavier only at the moment. 40 | ck install package --tags=model,tensorrt,downloaded,${SSD_MODEL_TAGS} 41 | ck install package --tags=model,tensorrt,downloaded,${SSD_MODEL_TAGS}.singlestream 42 | exit_if_error 43 | fi 44 | 45 | 46 | if [ "${skip_coco_setup}" == "NO" ]; then 47 | # Detect OpenCV in its location in JetPack 4.3. 48 | # TODO: Only works on Jetson machines at the moment. 49 | ck detect soft --tags=python-package,cv2 --cus.version=JetPack \ 50 | --full_path=/usr/lib/python3.6/dist-packages/cv2/__init__.py 51 | exit_if_error 52 | 53 | # Remove training annotations (~765 MB), leaving only 5,000 images (~788 MB) and 54 | # validation annotations (~52 MB). 55 | ck virtual env \ 56 | --tags=object-detection,dataset,coco.2017,val,original,full \ 57 | --shell_cmd='rm -f $CK_ENV_DATASET_COCO_LABELS_DIR/*train2017.json' 58 | 59 | # Preprocess for SSD-ResNet (1200x1200 input images, 4.2 MB each, 21 GB in total). 60 | ck install package --tags=dataset,preprocessed,using-opencv,coco.2017,full,side.${SSD_MODEL_SIDE} 61 | exit_if_error 62 | fi 63 | 64 | 65 | echo 66 | 67 | 68 | echo "Done." 69 | -------------------------------------------------------------------------------- /program/zpp-worker-tensorrt-py/.cm/meta.json: -------------------------------------------------------------------------------- 1 | { 2 | "backup_data_uid": "0b248b2913eb548b", 3 | "pass_env_to_resolve": "yes", 4 | "build_compiler_vars": {}, 5 | "data_name": "zpp-worker-tensorrt-py", 6 | "main_language": "python", 7 | "no_compile": "yes", 8 | "no_target_file": "yes", 9 | "process_in_tmp": "yes", 10 | "program": "yes", 11 | "run_cmds": { 12 | "default": { 13 | "ignore_return_code": "no", 14 | "run_time": { 15 | "run_cmd_main": "$<>$ ../zpp_worker_trt.py" 16 | } 17 | } 18 | }, 19 | "run_deps": { 20 | "lib-python-numpy": { 21 | "local": "yes", 22 | "name": "Python NumPy library", 23 | "sort": 10, 24 | "tags": "lib,python-package,numpy" 25 | }, 26 | "lib-python-pycuda": { 27 | "local": "yes", 28 | "name": "Python PyCUDA library", 29 | "sort": 20, 30 | "tags": "lib,python-package,pycuda" 31 | }, 32 | "lib-python-tensorrt": { 33 | "local": "yes", 34 | "name": "Python TensorRT library", 35 | "sort": 30, 36 | "tags": "lib,python-package,tensorrt" 37 | }, 38 | "lib-python-zmq": { 39 | "local": "yes", 40 | "name": "Python ZeroMQ library", 41 | "sort": 40, 42 | "tags": "lib,python-package,zmq" 43 | }, 44 | "plugin-nms": { 45 | "enable_if_env": [ { "CK_WORKER_OUTPUT_FORMAT": ["direct_return"] } ], 46 | "local": "yes", 47 | "name": "TensorRT NMS plugin", 48 | "sort": 60, 49 | "tags": "tensorrt,plugin,nms" 50 | }, 51 | "weights": { 52 | "local": "yes", 53 | "name": "TensorRT model", 54 | "sort": 50, 55 | "tags": "tensorrt,model" 56 | } 57 | }, 58 | "run_vars": { 59 | "CK_WORKER_ID": "", 60 | "CK_WORKER_JOB_LIMIT": 0, 61 | "CK_WORKER_OUTPUT_FORMAT": "softmax", 62 | "CK_WORKER_POSTWORK_TIMEOUT_S": "", 63 | "CK_HUB_IP": "localhost", 64 | "CK_ZMQ_FAN_PORT": 5557, 65 | "CK_ZMQ_FUNNEL_PORT": 5558, 66 | "CK_TRANSFER_MODE": "numpy", 67 | 
"CK_TRANSFER_FLOAT": "YES", 68 | "CK_PREPROCESS_ON_GPU": "NO" 69 | }, 70 | "tags": [ 71 | "image-classification", 72 | "zeromq", 73 | "zmq", 74 | "worker", 75 | "lang-python" 76 | ] 77 | } 78 | -------------------------------------------------------------------------------- /program/image-classification-zpp-hub-py/.cm/meta.json: -------------------------------------------------------------------------------- 1 | { 2 | "backup_data_uid": "6495587eb9150c0b", 3 | "build_compiler_vars": {}, 4 | "data_name": "image-classification-zpp-hub-py", 5 | "main_language": "python", 6 | "no_compile": "yes", 7 | "no_target_file": "yes", 8 | "process_in_tmp": "yes", 9 | "program": "yes", 10 | "run_cmds": { 11 | "default": { 12 | "ignore_return_code": "no", 13 | "run_time": { 14 | "fine_grain_timer_file": "tmp-ck-timer.json", 15 | "post_process_via_ck": { 16 | "data_uoa": "b98ee24399ef4c3a", 17 | "module_uoa": "script", 18 | "script_name": "postprocess" 19 | }, 20 | "run_cmd_main": "$<>$ ../zpp_hub_classify.py" 21 | } 22 | } 23 | }, 24 | "run_deps": { 25 | "imagenet-aux": { 26 | "force_target_as_host": "yes", 27 | "local": "yes", 28 | "name": "ImageNet dataset (aux)", 29 | "sort": 10, 30 | "tags": "dataset,imagenet,aux" 31 | }, 32 | "images": { 33 | "force_target_as_host": "yes", 34 | "local": "yes", 35 | "name": "ImageNet dataset (preprocessed subset)", 36 | "sort": 20, 37 | "tags": "dataset,imagenet,preprocessed" 38 | }, 39 | "imagenet-helper": { 40 | "local": "yes", 41 | "name": "Python ImageNet helper functions and metadata", 42 | "sort": 35, 43 | "tags": "lib,python-package,imagenet-helper" 44 | }, 45 | "lib-python-numpy": { 46 | "local": "yes", 47 | "name": "Python NumPy library", 48 | "sort": 30, 49 | "tags": "lib,python-package,numpy" 50 | }, 51 | "lib-python-zeromq": { 52 | "local": "yes", 53 | "name": "Python ZeroMQ library", 54 | "sort": 40, 55 | "tags": "lib,python-package,zmq" 56 | }, 57 | "weights": { 58 | "local": "yes", 59 | "name": "TensorRT model", 60 | "sort": 50, 61 | "tags": "tensorrt,model,image-classification" 62 | } 63 | }, 64 | "run_vars": { 65 | "CK_BATCH_COUNT": 1, 66 | "CK_BATCH_SIZE": 1, 67 | "CK_RESULTS_DIR": "predictions", 68 | "CK_SILENT_MODE": 0, 69 | "CK_SKIP_IMAGES": 0, 70 | "CK_SLEEP_AFTER_SEND_MS": 0, 71 | "CK_ZMQ_FAN_PORT": 5557, 72 | "CK_ZMQ_FUNNEL_PORT": 5558, 73 | "CK_TRANSFER_MODE": "numpy", 74 | "CK_TRANSFER_FLOAT": "YES", 75 | "CK_PREPROCESS_ON_GPU": "NO" 76 | }, 77 | "tags": [ 78 | "image-classification", 79 | "zeromq", 80 | "zmq", 81 | "hub", 82 | "lang-python" 83 | ] 84 | } 85 | -------------------------------------------------------------------------------- /script/setup/setup.common.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | function exit_if_error() { 5 | if [ "${?}" != "0" ]; then exit 1; fi 6 | } 7 | 8 | 9 | echo "Setting up CK-ZeroMQ ..." 10 | 11 | # Skip Python setup: should be NO for hub; should be NO for worker. 12 | skip_python_setup=${CK_SKIP_PYTHON_SETUP:-"NO"} 13 | echo "- skip Python setup: ${skip_python_setup}" 14 | 15 | # Skip NVIDIA setup: can be YES or NO for hub; should be NO for worker. 16 | skip_nvidia_setup=${CK_SKIP_NVIDIA_SETUP:-"NO"} 17 | echo "- skip NVIDIA setup: ${skip_nvidia_setup}" 18 | 19 | # Skip LoadGen setup: should be NO for hub; can be YES or NO for worker. 20 | skip_loadgen_setup=${CK_SKIP_LOADGEN_SETUP:-"NO"} 21 | echo "- skip LoadGen setup: ${skip_loadgen_setup}" 22 | 23 | 24 | echo 25 | 26 | 27 | # Refresh CK-ZeroMQ and its dependencies. 
28 | echo "Refreshing CK-ZeroMQ ..." 29 | ck pull repo:ck-zeromq --url=https://github.com/dividiti/ck-zeromq 30 | exit_if_error 31 | 32 | 33 | echo 34 | 35 | 36 | if [ "${skip_python_setup}" == "NO" ]; then 37 | # Set up Python, NumPy, PyZMQ. 38 | echo "Setting up Python 3 and essential packages ..." 39 | ck detect soft:compiler.python --full_path=`which python3` 40 | exit_if_error 41 | 42 | ck install package --tags=python-package,cython 43 | exit_if_error 44 | 45 | # NB: Building NumPy 1.18.1 requires Cython >= 0.29.14. 46 | ck virtual env --tags=cython --shell_cmd='ck install package --tags=python-package,numpy' 47 | exit_if_error 48 | 49 | ck install package --tags=python-package,zmq 50 | exit_if_error 51 | fi 52 | 53 | 54 | if [ "${skip_nvidia_setup}" == "NO" ]; then 55 | # Detect TensorRT and PyTensorRT. 56 | echo "Setting up TensorRT/PyTensorRT ..." 57 | 58 | ck detect soft:lib.tensorrt --full_path=/usr/lib/aarch64-linux-gnu/libnvinfer.so 59 | exit_if_error 60 | 61 | ck detect soft:lib.python.tensorrt --full_path=/usr/lib/python3.6/dist-packages/tensorrt/__init__.py 62 | exit_if_error 63 | 64 | # Detect GCC/CUDA and install PyCUDA. 65 | echo "Setting up CUDA/PyCUDA ..." 66 | 67 | ck detect soft:compiler.gcc --full_path=`which gcc-7` 68 | exit_if_error 69 | 70 | ck detect soft:compiler.cuda --full_path=/usr/local/cuda-10.0/bin/nvcc 71 | exit_if_error 72 | 73 | ck install package --tags=python-package,pycuda 74 | exit_if_error 75 | fi 76 | 77 | 78 | if [ "${skip_loadgen_setup}" == "NO" ]; then 79 | # Install MLPerf Inference packages. 80 | echo "Setting up MLPerf Inference packages ..." 81 | 82 | ck install package --tags=mlperf,inference,source 83 | exit_if_error 84 | 85 | ck install package --tags=python-package,absl 86 | exit_if_error 87 | 88 | ck install package --tags=mlperf,loadgen,python-package 89 | exit_if_error 90 | fi 91 | 92 | 93 | echo 94 | 95 | 96 | echo "Done." 
97 |
--------------------------------------------------------------------------------
/program/object-detection-zpp-hub-py/.cm/meta.json:
--------------------------------------------------------------------------------
1 | {
2 |   "backup_data_uid": "1dc4528a1a53c218",
3 |   "build_compiler_vars": {},
4 |   "data_name": "object-detection-zpp-hub-py",
5 |   "main_language": "python",
6 |   "no_compile": "yes",
7 |   "no_target_file": "yes",
8 |   "process_in_tmp": "yes",
9 |   "program": "yes",
10 |   "run_cmds": {
11 |     "default": {
12 |       "ignore_return_code": "no",
13 |       "run_time": {
14 |         "fine_grain_timer_file": "tmp-ck-timer.json",
15 |         "post_process_via_ck": {
16 |           "data_uoa": "24c98b0cee248d93",
17 |           "module_uoa": "script",
18 |           "script_name": "iniless_postprocess"
19 |         },
20 |         "run_cmd_main": "$<>$ ../zpp_hub_detect.py"
21 |       }
22 |     }
23 |   },
24 |   "run_deps": {
25 |     "tool-coco": {
26 |       "local": "yes",
27 |       "name": "Python API for COCO",
28 |       "sort": 60,
29 |       "tags": "tool,coco"
30 |     },
31 |     "dataset": {
32 |       "force_target_as_host": "yes",
33 |       "local": "yes",
34 |       "name": "Preprocessed subset of COCO dataset",
35 |       "sort": 20,
36 |       "tags": "dataset,preprocessed,object-detection"
37 |     },
38 |     "coco-helper": {
39 |       "local": "yes",
40 |       "name": "Python COCO helper functions and metadata",
41 |       "sort": 35,
42 |       "tags": "lib,python-package,coco-helper"
43 |     },
44 |     "lib-python-matplotlib": {
45 |       "local": "yes",
46 |       "name": "Python Matplotlib library",
47 |       "sort": 40,
48 |       "tags": "lib,python-package,matplotlib"
49 |     },
50 |     "lib-python-numpy": {
51 |       "local": "yes",
52 |       "name": "Python NumPy library",
53 |       "sort": 30,
54 |       "tags": "lib,python-package,numpy"
55 |     },
56 |     "lib-python-zeromq": {
57 |       "local": "yes",
58 |       "name": "Python ZeroMQ library",
59 |       "sort": 10,
60 |       "tags": "lib,python-package,zmq"
61 |     },
62 |     "weights": {
63 |       "local": "yes",
64 |       "name": "TensorRT model",
65 |       "sort": 50,
66 |       "tags": "tensorrt,model,object-detection"
67 |     }
68 |   },
69 |   "run_vars": {
70 |     "CK_ANNOTATIONS_OUT_DIR": "annotations",
71 |     "CK_DETECTIONS_OUT_DIR": "detections",
72 |     "CK_DETECTION_THRESHOLD": 0.0,
73 |     "CK_PREPROCESSED_OUT_DIR": "preprocessed",
74 |     "CK_RESULTS_OUT_DIR": "results",
75 |     "CK_BATCH_COUNT": 1,
76 |     "CK_BATCH_SIZE": 1,
77 |     "CK_SILENT_MODE": 0,
78 |     "CK_SKIP_IMAGES": 0,
79 |     "CK_TIMER_FILE": "tmp-ck-timer.json",
80 |
81 |     "CK_PREPROCESS_ON_GPU": "NO",
82 |     "CK_SLEEP_AFTER_SEND_MS": 0,
83 |     "CK_TRANSFER_FLOAT": "YES",
84 |     "CK_TRANSFER_MODE": "numpy",
85 |     "CK_ZMQ_FAN_PORT": 5557,
86 |     "CK_ZMQ_FUNNEL_PORT": 5558
87 |   },
88 |   "tags": [
89 |     "object-detection",
90 |     "zeromq",
91 |     "zmq",
92 |     "hub",
93 |     "lang-python"
94 |   ]
95 | }
96 |
--------------------------------------------------------------------------------
/program/object-detection-zpp-hub-loadgen-py/user.conf:
--------------------------------------------------------------------------------
1 | # The format of this config file is 'key = value'.
2 | # The key has the format 'model.scenario.key'. Value is mostly int64_t.
3 | # Model may be '*' as a wildcard. In that case the value applies to all models.
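# (For example, 'resnet50.Server.target_latency = 15' below applies only to the
# resnet50 model in the Server scenario, whereas '*.Server.target_latency = 10'
# applies to every model in that scenario.)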
4 | # All times are in milliseconds
5 |
6 | *.SingleStream.qsl_rng_seed = 3133965575612453542
7 | *.SingleStream.sample_index_rng_seed = 665484352860916858
8 | *.SingleStream.schedule_rng_seed = 3622009729038561421
9 | *.SingleStream.target_latency = 5
10 | *.SingleStream.target_latency_percentile = 90
11 | *.SingleStream.min_duration = 60000
12 | *.SingleStream.min_query_count = 1024
13 | mobilenet.SingleStream.performance_sample_count_override = 1024
14 | resnet50.SingleStream.performance_sample_count_override = 1024
15 | ssd-mobilenet.SingleStream.performance_sample_count_override = 256
16 | ssd-resnet34.SingleStream.performance_sample_count_override = 64
17 | gnmt.SingleStream.performance_sample_count_override = 3903900
18 |
19 | *.MultiStream.qsl_rng_seed = 3133965575612453542
20 | *.MultiStream.sample_index_rng_seed = 665484352860916858
21 | *.MultiStream.schedule_rng_seed = 3622009729038561421
22 | *.MultiStream.target_qps = 20
23 | *.MultiStream.target_latency_percentile = 99
24 | *.MultiStream.samples_per_query = 4
25 | *.MultiStream.max_async_queries = 1
26 | *.MultiStream.target_latency = 50
27 | *.MultiStream.min_duration = 60000
28 | *.MultiStream.min_query_count = 270336
29 | ssd-resnet34.MultiStream.target_qps = 15
30 | ssd-resnet34.MultiStream.target_latency = 66
31 | gnmt.MultiStream.min_query_count = 90112
32 | gnmt.MultiStream.target_latency = 100
33 | gnmt.MultiStream.target_qps = 10
34 | gnmt.MultiStream.target_latency_percentile = 97
35 | mobilenet.MultiStream.performance_sample_count_override = 1024
36 | resnet50.MultiStream.performance_sample_count_override = 1024
37 | ssd-mobilenet.MultiStream.performance_sample_count_override = 256
38 | ssd-resnet34.MultiStream.performance_sample_count_override = 64
39 | gnmt.MultiStream.performance_sample_count_override = 3903900
40 |
41 | *.Server.qsl_rng_seed = 3133965575612453542
42 | *.Server.sample_index_rng_seed = 665484352860916858
43 | *.Server.schedule_rng_seed = 3622009729038561421
44 | *.Server.target_qps = 1.0
45 | *.Server.target_latency = 10
46 | *.Server.target_latency_percentile = 99
47 | *.Server.target_duration = 0
48 | *.Server.min_duration = 60000
49 | *.Server.min_query_count = 270336
50 | resnet50.Server.target_latency = 15
51 | ssd-resnet34.Server.target_latency = 100
52 | gnmt.Server.min_query_count = 90112
53 | gnmt.Server.target_latency = 250
54 | gnmt.Server.target_latency_percentile = 97
55 | mobilenet.Server.performance_sample_count_override = 1024
56 | resnet50.Server.performance_sample_count_override = 1024
57 | ssd-mobilenet.Server.performance_sample_count_override = 256
58 | ssd-resnet34.Server.performance_sample_count_override = 64
59 | gnmt.Server.performance_sample_count_override = 3903900
60 |
61 | *.Offline.qsl_rng_seed = 3133965575612453542
62 | *.Offline.sample_index_rng_seed = 665484352860916858
63 | *.Offline.schedule_rng_seed = 3622009729038561421
64 | *.Offline.target_qps = 70
65 | *.Offline.target_latency_percentile = 90
66 | *.Offline.min_duration = 60000
67 | *.Offline.min_query_count = 1
68 | mobilenet.Offline.performance_sample_count_override = 1024
69 | resnet50.Offline.performance_sample_count_override = 1024
70 | ssd-mobilenet.Offline.performance_sample_count_override = 256
71 | ssd-resnet34.Offline.performance_sample_count_override = 64
72 | gnmt.Offline.performance_sample_count_override = 3903900
73 |
--------------------------------------------------------------------------------
/program/image-classification-zpp-hub-loadgen-py/user.conf:
--------------------------------------------------------------------------------
1 | # The format of this config file is 'key = value'.
2 | # The key has the format 'model.scenario.key'. Value is mostly int64_t.
3 | # Model may be '*' as a wildcard. In that case the value applies to all models.
4 | # All times are in milliseconds
5 |
6 | *.SingleStream.qsl_rng_seed = 3133965575612453542
7 | *.SingleStream.sample_index_rng_seed = 665484352860916858
8 | *.SingleStream.schedule_rng_seed = 3622009729038561421
9 | *.SingleStream.target_latency = 5
10 | *.SingleStream.target_latency_percentile = 90
11 | *.SingleStream.min_duration = 60000
12 | *.SingleStream.min_query_count = 1024
13 | mobilenet.SingleStream.performance_sample_count_override = 1024
14 | resnet50.SingleStream.performance_sample_count_override = 1024
15 | ssd-mobilenet.SingleStream.performance_sample_count_override = 256
16 | ssd-resnet34.SingleStream.performance_sample_count_override = 64
17 | gnmt.SingleStream.performance_sample_count_override = 3903900
18 |
19 | *.MultiStream.qsl_rng_seed = 3133965575612453542
20 | *.MultiStream.sample_index_rng_seed = 665484352860916858
21 | *.MultiStream.schedule_rng_seed = 3622009729038561421
22 | *.MultiStream.target_qps = 20
23 | *.MultiStream.target_latency_percentile = 99
24 | *.MultiStream.samples_per_query = 4
25 | *.MultiStream.max_async_queries = 1
26 | *.MultiStream.target_latency = 50
27 | *.MultiStream.min_duration = 60000
28 | *.MultiStream.min_query_count = 270336
29 | ssd-resnet34.MultiStream.target_qps = 15
30 | ssd-resnet34.MultiStream.target_latency = 66
31 | gnmt.MultiStream.min_query_count = 90112
32 | gnmt.MultiStream.target_latency = 100
33 | gnmt.MultiStream.target_qps = 10
34 | gnmt.MultiStream.target_latency_percentile = 97
35 | mobilenet.MultiStream.performance_sample_count_override = 1024
36 | resnet50.MultiStream.performance_sample_count_override = 1024
37 | ssd-mobilenet.MultiStream.performance_sample_count_override = 256
38 | ssd-resnet34.MultiStream.performance_sample_count_override = 64
39 | gnmt.MultiStream.performance_sample_count_override = 3903900
40 |
41 | *.Server.qsl_rng_seed = 3133965575612453542
42 | *.Server.sample_index_rng_seed = 665484352860916858
43 | *.Server.schedule_rng_seed = 3622009729038561421
44 | *.Server.target_qps = 1.0
45 | *.Server.target_latency = 10
46 | *.Server.target_latency_percentile = 99
47 | *.Server.target_duration = 0
48 | *.Server.min_duration = 60000
49 | *.Server.min_query_count = 270336
50 | resnet50.Server.target_latency = 15
51 | ssd-resnet34.Server.target_latency = 100
52 | gnmt.Server.min_query_count = 90112
53 | gnmt.Server.target_latency = 250
54 | gnmt.Server.target_latency_percentile = 97
55 | mobilenet.Server.performance_sample_count_override = 1024
56 | resnet50.Server.performance_sample_count_override = 1024
57 | ssd-mobilenet.Server.performance_sample_count_override = 256
58 | ssd-resnet34.Server.performance_sample_count_override = 64
59 | gnmt.Server.performance_sample_count_override = 3903900
60 |
61 | *.Offline.qsl_rng_seed = 3133965575612453542
62 | *.Offline.sample_index_rng_seed = 665484352860916858
63 | *.Offline.schedule_rng_seed = 3622009729038561421
64 | *.Offline.target_qps = 70
65 | *.Offline.target_latency_percentile = 90
66 | *.Offline.min_duration = 60000
67 | *.Offline.min_query_count = 1
68 | mobilenet.Offline.performance_sample_count_override = 1024
69 | resnet50.Offline.performance_sample_count_override = 1024
70 | ssd-mobilenet.Offline.performance_sample_count_override = 256
71 | ssd-resnet34.Offline.performance_sample_count_override = 64
72 | gnmt.Offline.performance_sample_count_override = 3903900
73 |
--------------------------------------------------------------------------------
/script/setup/setup.resnet.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 |
4 | function exit_if_error() {
5 |   if [ "${?}" != "0" ]; then exit 1; fi
6 | }
7 |
8 |
9 | # Refresh CK-MLPerf and its dependencies.
10 | echo "Refreshing CK-MLPerf ..."
11 | ck pull repo:ck-mlperf
12 | exit_if_error
13 |
14 |
15 | echo
16 |
17 |
18 | echo "Setting up ResNet and ImageNet ..."
19 |
20 | # Skip ResNet setup: can be YES or NO for hub; should be NO for worker.
21 | skip_resnet_setup=${CK_SKIP_RESNET_SETUP:-"NO"}
22 | echo "- skip ResNet setup: ${skip_resnet_setup}"
23 |
24 | # Fake ResNet detection: can be NO or YES for hub; should be NO for worker.
25 | fake_resnet_detection=${CK_FAKE_RESNET_DETECTION:-"NO"}
26 | ck_tools=${CK_TOOLS:-"$HOME/CK-TOOLS"}
27 | echo "- fake ResNet detection: ${fake_resnet_detection} (CK_TOOLS=${ck_tools})"
28 |
29 | if [ "${skip_resnet_setup}" == "NO" ] && [ "${fake_resnet_detection}" != "NO" ]; then
30 |   echo "ERROR: You cannot set up ResNet and fake ResNet detection at the same time!"
31 |   exit 1
32 | fi
33 |
34 | if [ "${skip_resnet_setup}" != "NO" ] && [ "${fake_resnet_detection}" == "NO" ]; then
35 |   echo "ERROR: If you skip ResNet setup, you must fake ResNet detection!"
36 |   exit 1
37 | fi
38 |
39 | # Skip ImageNet detection: should be NO for hub; should be YES for worker.
40 | skip_imagenet_detection=${CK_SKIP_IMAGENET_DETECTION:-"NO"}
41 | echo "- skip ImageNet detection: ${skip_imagenet_detection}"
42 |
43 |
44 | echo
45 |
46 |
47 | if [ "${skip_resnet_setup}" == "NO" ]; then
48 |   # Install the official MLPerf ONNX model and convert it to TensorRT with predefined options.
49 |   ck install package --tags=model,onnx,resnet,downloaded
50 |   ck install package --tags=model,tensorrt,resnet,converted-from-onnx,maxbatch.20,fp16
51 |   # Install a quantized model converted for Xavier from NVIDIA's v0.5 submission.
52 |   ck install package --tags=model,tensorrt,resnet,downloaded,int8,linear,for.xavier
53 |   exit_if_error
54 | fi
55 |
56 |
57 | if [ "${fake_resnet_detection}" != "NO" ]; then
58 |   # 'Detect' a fake ResNet model (an empty file registered with CK).
59 |   model_dir=${ck_tools}/model-tensorrt-converted-from-onnx-fp16-maxbatch.20-resnet
60 |   model_file=${model_dir}/converted_model.trt
61 |   mkdir -p ${model_dir}
62 |   touch ${model_file}
63 |   ck detect soft:model.tensorrt --cus.version=resnet50-fp16 \
64 |     --full_path=${model_file} \
65 |     --extra_tags=converted,converted-from-onnx,fp16,image-classification,maxbatch.20,model,resnet,tensorrt,trt \
66 |     --ienv.ML_MODEL_MAX_BATCH_SIZE=20 \
67 |     --ienv.ML_MODEL_DATA_TYPE=float16 \
68 |     --ienv.ML_MODEL_DATA_LAYOUT=NCHW \
69 |     --ienv.ML_MODEL_NORMALIZE_DATA=NO \
70 |     --ienv.ML_MODEL_SUBTRACT_MEAN=YES \
71 |     --ienv.ML_MODEL_GIVEN_CHANNEL_MEANS='123.68 116.78 103.94' \
72 |     --ienv.ML_MODEL_IMAGE_HEIGHT=224 \
73 |     --ienv.ML_MODEL_IMAGE_WIDTH=224
74 |   exit_if_error
75 | fi
76 |
77 |
78 | if [ "${skip_imagenet_detection}" == "NO" ]; then
79 |   # Detect a preprocessed ImageNet validation dataset (50,000 images).
80 |   echo "Detecting a preprocessed ImageNet validation set ..."
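  # (The CK_ENV_DATASET_IMAGENET_PREPROCESSED_* defaults below are illustrative;
  # they can be overridden to point at a locally preprocessed copy of ImageNet.)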
81 | imagenet_dir=${CK_ENV_DATASET_IMAGENET_PREPROCESSED_DIR:-"/datasets/dataset-imagenet-preprocessed-using-opencv-crop.875-full-inter.linear-side.224/ILSVRC2012_val_00000001.rgb8"} 82 | imagenet_tags=${CK_ENV_DATASET_IMAGENET_PREPROCESSED_TAGS:-"preprocessed,using-opencv,universal,crop.875,full,inter.linear,side.224"} 83 | imagenet_version=${CK_ENV_DATASET_IMAGENET_PREPROCESSED_VERSION:-"using-opencv"} 84 | ck detect soft:dataset.imagenet.preprocessed --full_path=${imagenet_dir} --extra_tags=${imagenet_tags} --cus.version=${imagenet_version} 85 | 86 | # Install ImageNet labels. 87 | ck install package --tags=dataset,imagenet,aux 88 | exit_if_error 89 | fi 90 | 91 | 92 | echo 93 | 94 | 95 | echo "Done." 96 | -------------------------------------------------------------------------------- /program/image-classification-zpp-hub-loadgen-py/.cm/meta.json: -------------------------------------------------------------------------------- 1 | { 2 | "backup_data_uid": "c4a80957e3ae1c8f", 3 | "pass_env_to_resolve": "yes", 4 | "build_compiler_vars": {}, 5 | "data_name": "image-classification-zpp-hub-loadgen-py", 6 | "main_language": "python", 7 | "no_compile": "yes", 8 | "no_target_file": "yes", 9 | "process_in_tmp": "yes", 10 | "program": "yes", 11 | "run_cmds": { 12 | "default": { 13 | "ignore_return_code": "no", 14 | "run_time": { 15 | "fine_grain_timer_file": "tmp-ck-timer.json", 16 | "post_process_via_ck": { 17 | "data_uoa": "b98ee24399ef4c3a", 18 | "module_uoa": "script", 19 | "script_name": "loadgen_postprocess" 20 | }, 21 | "run_cmd_main": "$<>$ ../zpp_hub_classify_loadgen.py" 22 | } 23 | } 24 | }, 25 | "run_deps": { 26 | "imagenet-aux": { 27 | "force_target_as_host": "yes", 28 | "local": "yes", 29 | "name": "ImageNet dataset (aux)", 30 | "sort": 10, 31 | "tags": "dataset,imagenet,aux" 32 | }, 33 | "images": { 34 | "force_target_as_host": "yes", 35 | "local": "yes", 36 | "name": "ImageNet dataset (preprocessed subset)", 37 | "sort": 20, 38 | "tags": "dataset,imagenet,preprocessed" 39 | }, 40 | "loadgen-config-file": { 41 | "enable_if_env": [ 42 | { "CK_LOADGEN_USE_CONFIG_ENV": ["yes", "Yes", "YES", "on", "On", "ON", "true", "True", "TRUE", "1"] } 43 | ], 44 | "add_to_path": "no", 45 | "local": "yes", 46 | "name": "LoadGen Config file", 47 | "sort": 35, 48 | "tags": "loadgen,config" 49 | }, 50 | "lib-python-loadgen": { 51 | "local": "yes", 52 | "name": "Python LoadGen library", 53 | "sort": 40, 54 | "tags": "lib,python-package,loadgen" 55 | }, 56 | "lib-python-numpy": { 57 | "local": "yes", 58 | "name": "Python NumPy library", 59 | "sort": 50, 60 | "tags": "lib,python-package,numpy" 61 | }, 62 | "lib-python-zeromq": { 63 | "local": "yes", 64 | "name": "Python ZeroMQ library", 65 | "sort": 60, 66 | "tags": "lib,python-package,zmq" 67 | }, 68 | "mlperf-inference-src": { 69 | "add_to_path": "no", 70 | "force_target_as_host": "yes", 71 | "local": "yes", 72 | "name": "MLPerf Inference source", 73 | "sort": 110, 74 | "tags": "mlperf,inference,source" 75 | }, 76 | "python": { 77 | "force_target_as_host": "yes", 78 | "local": "yes", 79 | "name": "Python interpreter", 80 | "sort": 100, 81 | "tags": "compiler,lang-python" 82 | }, 83 | "weights": { 84 | "local": "yes", 85 | "name": "TensorRT model", 86 | "sort": 30, 87 | "tags": "tensorrt,model,image-classification" 88 | } 89 | }, 90 | "run_vars": { 91 | "CK_BATCH_SIZE": 1, 92 | "CK_LOADGEN_BUFFER_SIZE": 8, 93 | "CK_LOADGEN_COUNT_OVERRIDE": "", 94 | "CK_LOADGEN_DATASET_SIZE": 20, 95 | "CK_LOADGEN_MAX_DURATION_S": "", 96 | "CK_LOADGEN_MODE": 
"AccuracyOnly", 97 | "CK_LOADGEN_MULTISTREAMNESS": "", 98 | "CK_LOADGEN_SCENARIO": "SingleStream", 99 | "CK_LOADGEN_SIDELOAD_JSON": "non-mlperf_sideload.json", 100 | "CK_LOADGEN_TARGET_QPS": "", 101 | "CK_LOADGEN_USE_CONFIG_ENV": "no", 102 | "CK_LOADGEN_WARMUP_SAMPLES": 0, 103 | "CK_ZMQ_FAN_PORT": 5557, 104 | "CK_ZMQ_FUNNEL_PORT": 5558, 105 | "CK_TRANSFER_MODE": "numpy", 106 | "CK_TRANSFER_FLOAT": "YES", 107 | "CK_PREPROCESS_ON_GPU": "NO" 108 | }, 109 | "tags": [ 110 | "image-classification", 111 | "zeromq", 112 | "zmq", 113 | "hub", 114 | "loadgen", 115 | "lang-python" 116 | ] 117 | } 118 | -------------------------------------------------------------------------------- /program/object-detection-zpp-hub-loadgen-py/.cm/meta.json: -------------------------------------------------------------------------------- 1 | { 2 | "backup_data_uid": "f497e983b6b2eaaf", 3 | "build_compiler_vars": {}, 4 | "data_name": "object-detection-zpp-hub-loadgen-py", 5 | "main_language": "python", 6 | "no_compile": "yes", 7 | "no_target_file": "yes", 8 | "pass_env_to_resolve": "yes", 9 | "process_in_tmp": "yes", 10 | "program": "yes", 11 | "run_cmds": { 12 | "default": { 13 | "ignore_return_code": "no", 14 | "run_time": { 15 | "fine_grain_timer_file": "tmp-ck-timer.json", 16 | "post_process_via_ck": { 17 | "data_uoa": "24c98b0cee248d93", 18 | "module_uoa": "script", 19 | "script_name": "loadgen_postprocess" 20 | }, 21 | "run_cmd_main": "$<>$ ../zpp_hub_detect_loadgen.py" 22 | } 23 | } 24 | }, 25 | "run_deps": { 26 | "python": { 27 | "force_target_as_host": "yes", 28 | "local": "yes", 29 | "name": "Python interpreter", 30 | "sort": 10, 31 | "tags": "compiler,lang-python" 32 | }, 33 | "lib-python-zeromq": { 34 | "local": "yes", 35 | "name": "Python ZeroMQ library", 36 | "sort": 20, 37 | "tags": "lib,python-package,zmq" 38 | }, 39 | 40 | "mlperf-inference-src": { 41 | "add_to_path": "no", 42 | "force_target_as_host": "yes", 43 | "local": "yes", 44 | "name": "MLPerf Inference source", 45 | "sort": 110, 46 | "tags": "mlperf,inference,source" 47 | }, 48 | "lib-python-loadgen": { 49 | "local": "yes", 50 | "name": "Python LoadGen library", 51 | "sort": 120, 52 | "tags": "lib,python-package,loadgen" 53 | }, 54 | "loadgen-config-file": { 55 | "add_to_path": "no", 56 | "enable_if_env": [ { "CK_LOADGEN_USE_CONFIG_ENV": [ "yes", "Yes", "YES", "on", "On", "ON", "true", "True", "TRUE", "1" ] } ], 57 | "local": "yes", 58 | "name": "LoadGen Config file", 59 | "sort": 130, 60 | "tags": "loadgen,config" 61 | }, 62 | "dataset": { 63 | "force_target_as_host": "yes", 64 | "local": "yes", 65 | "name": "Preprocessed subset of COCO dataset", 66 | "sort": 210, 67 | "tags": "dataset,preprocessed,object-detection,coco" 68 | }, 69 | "weights": { 70 | "local": "yes", 71 | "name": "TensorRT object detection model", 72 | "sort": 220, 73 | "tags": "tensorrt,model,object-detection" 74 | }, 75 | "lib-python-numpy": { 76 | "local": "yes", 77 | "name": "Python NumPy library", 78 | "sort": 230, 79 | "tags": "lib,python-package,numpy" 80 | }, 81 | "lib-python-matplotlib": { 82 | "local": "yes", 83 | "name": "Python Matplotlib library", 84 | "sort": 240, 85 | "tags": "lib,python-package,matplotlib" 86 | }, 87 | "tool-coco": { 88 | "local": "yes", 89 | "name": "Python API for COCO", 90 | "sort": 250, 91 | "tags": "tool,coco" 92 | } 93 | }, 94 | "run_vars": { 95 | "CK_BATCH_SIZE": 1, 96 | "CK_LOADGEN_BUFFER_SIZE": 8, 97 | "CK_LOADGEN_COUNT_OVERRIDE": "", 98 | "CK_LOADGEN_DATASET_SIZE": 20, 99 | "CK_LOADGEN_MAX_DURATION_S": "", 100 | "CK_LOADGEN_MODE": 
"AccuracyOnly", 101 | "CK_LOADGEN_MULTISTREAMNESS": "", 102 | "CK_LOADGEN_SCENARIO": "SingleStream", 103 | "CK_LOADGEN_SIDELOAD_JSON": "non-mlperf_sideload.json", 104 | "CK_LOADGEN_TARGET_QPS": "", 105 | "CK_LOADGEN_USE_CONFIG_ENV": "no", 106 | "CK_LOADGEN_WARMUP_SAMPLES": 0, 107 | "CK_PREPROCESS_ON_GPU": "NO", 108 | "CK_TRANSFER_FLOAT": "YES", 109 | "CK_TRANSFER_MODE": "numpy", 110 | "CK_ZMQ_FAN_PORT": 5557, 111 | "CK_ZMQ_FUNNEL_PORT": 5558 112 | }, 113 | "tags": [ 114 | "object-detection", 115 | "zeromq", 116 | "zmq", 117 | "hub", 118 | "loadgen", 119 | "lang-python" 120 | ] 121 | } 122 | -------------------------------------------------------------------------------- /script/explore-params/explore.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "ZeroMQ Push-Pull exploration!" 4 | 5 | # Dry run - print commands but do not execute them. 6 | dry_run=${CK_DRY_RUN:-""} 7 | echo "- dry run: ${dry_run}" 8 | 9 | # Hub IP. 10 | hub_ip=${CK_HUB_IP:-"localhost"} 11 | echo "- hub IP: ${hub_ip}" 12 | 13 | # Workers can be defined in two ways: 14 | # (1) As a list of N IPs. Worker IDs get derived as a sequence from 1 to N. 15 | # (2) As a list of N IDs. Worker IPs get derived as a sequence of 192.168.1.. 16 | ips=( ${CK_WORKER_IPS:-} ) # use parentheses to interpret the string as an array 17 | ids=( ${CK_WORKER_IDS:-} ) # use parentheses to interpret the string as an array 18 | if [[ -z "${ips}" ]] && [[ -z ${ids} ]] 19 | then 20 | # If neither is defined, send to itself. 21 | ips=( "${hub_ip}" ) 22 | fi 23 | if [[ "${ips}" ]] # (1) 24 | then 25 | num_ips=${#ips[@]} 26 | ids=( $(seq 1 ${num_ips}) ) 27 | num_ids=${#ids[@]} 28 | else # (2) 29 | ids=( ${CK_WORKER_IDS:-1} ) 30 | num_ids=${#ids[@]} 31 | ips=( ) 32 | for id in ${ids[@]}; do 33 | id_plus_1=$((id+1)) 34 | ips+=( "192.168.1.10${id_plus_1}" ) 35 | done 36 | num_ips=${#ips[@]} 37 | fi 38 | echo "- ${num_ips} worker IP(s): ${ips[@]}" 39 | echo "- ${num_ids} worker ID(s): ${ids[@]}" 40 | if [[ ${num_ips} != ${num_ids} ]]; then 41 | echo "ERROR: ${num_ips} not equal to ${num_ids}!" 42 | exit 1 43 | fi 44 | 45 | # Worker ssh ports (22 by default). 46 | ports=( ${CK_WORKER_PORTS:-} ) # use parentheses to interpret the string as an array 47 | if [[ -z "${ports}" ]]; then 48 | for id in ${ips[@]}; do 49 | ports+=( "22" ) 50 | done 51 | fi 52 | num_ports=${#ports[@]} 53 | echo "- ${num_ports} worker port(s): ${ports[@]}" 54 | if [[ ${num_ports} != ${num_ips} ]]; then 55 | echo "ERROR: ${num_ports} not equal to ${num_ips}!" 56 | exit 1 57 | fi 58 | 59 | # Time each worker should wait after last received work-item before exiting. 60 | postwork_timeout_s=${CK_WORKER_POSTWORK_TIMEOUT_S:-10} 61 | echo "- postwork timeout: ${postwork_timeout_s} s" 62 | 63 | # Directory where run.sh is (may not be the current one in the future). 64 | script_dir=`ck find ck-zeromq:script:explore-params` 65 | 66 | # LoadGen mode: PerformanceOnly, AccuracyOnly. 67 | mode=${CK_LOADGEN_MODE:-PerformanceOnly} 68 | if [ "${mode}" = "PerformanceOnly" ]; then 69 | mode_tag="performance" 70 | dataset_size=${CK_LOADGEN_DATASET_SIZE:-1024} 71 | buffer_size=${CK_LOADGEN_BUFFER_SIZE:-1024} 72 | elif [ "${mode}" = "AccuracyOnly" ]; then 73 | mode_tag="accuracy" 74 | imagenet_size=50000 75 | dataset_size=${CK_LOADGEN_DATASET_SIZE:-${imagenet_size}} 76 | buffer_size=${CK_LOADGEN_BUFFER_SIZE:-500} 77 | else 78 | echo "ERROR: Unsupported LoadGen mode '${mode}'!" 
79 | exit 1 80 | fi 81 | echo "- mode: ${mode} (${mode_tag})" 82 | echo "- dataset size: ${dataset_size}" 83 | echo "- buffer size: ${buffer_size}" 84 | 85 | # Define the exploration space. 86 | if [ "${mode_tag}" = "accuracy" ]; then 87 | batch_sizes=(1) 88 | transfer_modes=("raw") 89 | transfer_floats=("YES" "NO") 90 | else 91 | batch_sizes=($(seq 1 4)) 92 | transfer_modes=("raw" "pickle" "numpy" "json") 93 | transfer_floats=("YES" "NO") 94 | fi 95 | echo "- batch sizes: [ ${batch_sizes[@]} ]" 96 | echo "- transfer modes: [ ${transfer_modes[@]} ]" 97 | echo "- transfer floats: [ ${transfer_floats[@]} ]" 98 | 99 | # Blank line. 100 | echo 101 | 102 | # Run once for each point. 103 | experiment_id=1 104 | for batch_size in "${batch_sizes[@]}"; do 105 | for transfer_mode in "${transfer_modes[@]}"; do 106 | for transfer_float in "${transfer_floats[@]}"; do 107 | if [ "${transfer_float}" = "YES" ] || [ "${transfer_mode}" = "json" ] ; then 108 | preprocess_on_gpu_list=("NO") 109 | else 110 | preprocess_on_gpu_list=("NO" "YES") 111 | fi 112 | for preprocess_on_gpu in "${preprocess_on_gpu_list[@]}"; do 113 | echo "[`date`] Experiment #${experiment_id}: ..." 114 | experiment_id=$(( ${experiment_id}+1 )) 115 | read -d '' CMD < job_id={} {}".format(job_id, batch_ids)) 139 | 140 | time.sleep(SLEEP_AFTER_SEND_MS/1000) # do not overflow the ZeroMQ 141 | 142 | fan_time_s = time.time()-fan_start-SLEEP_AFTER_SEND_MS/1000 143 | print("[fan] Done submitting batches. Submission took {} s".format(fan_time_s)) 144 | 145 | output_dict['fan_time_s'] = fan_time_s 146 | output_dict['avg_send_batch_time_ms'] = fan_time_s*1000/BATCH_COUNT 147 | 148 | 149 | def funnel_code(): 150 | 151 | # Cleanup results directory 152 | if os.path.isdir(DETECTIONS_OUT_DIR): 153 | shutil.rmtree(DETECTIONS_OUT_DIR) 154 | os.mkdir(DETECTIONS_OUT_DIR) 155 | 156 | bg_class_offset = 1 157 | 158 | ## Workaround for SSD-Resnet34 model incorrectly trained on filtered labels 159 | class_map = None 160 | if (SKIPPED_CLASSES): 161 | class_map = [] 162 | for i in range(len(class_labels) + bg_class_offset): 163 | if i not in SKIPPED_CLASSES: 164 | class_map.append(i) 165 | 166 | funnel_start = time.time() 167 | inference_times_ms_by_worker_id = {} 168 | 169 | for _ in range(BATCH_COUNT): 170 | done_job = from_workers.recv_json() 171 | 172 | job_id = done_job['job_id'] 173 | local_metadata = in_progress.pop(job_id) 174 | roundtrip_time_ms = (time.time()-local_metadata['submission_time'])*1000 175 | worker_id = done_job['worker_id'] 176 | inference_time_ms = done_job['inference_time_ms'] 177 | floatize_time_ms = done_job['floatize_time_ms'] 178 | 179 | print("[funnel] <- [worker {}] job_id={}, worker_type_conversion={:.2f} ms, inference={:.2f} ms, roundtrip={:.2f} ms".format( 180 | worker_id, job_id, floatize_time_ms, inference_time_ms, roundtrip_time_ms)) 181 | 182 | batch_ids = local_metadata['batch_ids'] 183 | batch_size = len(batch_ids) 184 | apparent_batch_size = MODEL_MAX_BATCH_SIZE if MODEL_USE_DLA else batch_size 185 | raw_batch_results = np.array(done_job['raw_batch_results'], dtype=np.float32) 186 | batch_results = np.split(raw_batch_results, apparent_batch_size)[:batch_size] 187 | 188 | if worker_id not in inference_times_ms_by_worker_id: 189 | inference_times_ms_by_worker_id[worker_id] = [] 190 | inference_times_ms_by_worker_id[worker_id].append( inference_time_ms ) 191 | 192 | for global_image_index, single_image_predictions in zip(batch_ids, batch_results): 193 | num_boxes = 
single_image_predictions[MODEL_MAX_PREDICTIONS*7].view('int32')
194 |             width_orig, height_orig = original_w_h[global_image_index]
195 |
196 |             filename_orig = image_filenames[global_image_index]
197 |             detections_filename = os.path.splitext(filename_orig)[0] + '.txt'
198 |             detections_filepath = os.path.join(DETECTIONS_OUT_DIR, detections_filename)
199 |
200 |             with open(detections_filepath, 'w') as det_file:
201 |                 det_file.write('{:d} {:d}\n'.format(width_orig, height_orig))
202 |
203 |                 for row in range(num_boxes):
204 |                     (image_id, ymin, xmin, ymax, xmax, confidence, class_number) = single_image_predictions[row*7:(row+1)*7]
205 |
206 |                     if confidence >= SCORE_THRESHOLD:
207 |                         class_number = int(class_number)
208 |
209 |                         if class_map:
210 |                             class_number = class_map[class_number]
211 |
212 |                         image_id = int(image_id)
213 |                         x1 = xmin * width_orig
214 |                         y1 = ymin * height_orig
215 |                         x2 = xmax * width_orig
216 |                         y2 = ymax * height_orig
217 |                         class_label = class_labels[class_number - bg_class_offset]
218 |                         det_file.write('{:.2f} {:.2f} {:.2f} {:.2f} {:.3f} {} {}\n'.format(
219 |                             x1, y1, x2, y2, confidence, class_number, class_label))
220 |
221 |
222 |     funnel_time_s = time.time()-funnel_start
223 |     print("[funnel] Done receiving batches. Receiving took {} s".format(funnel_time_s))
224 |
225 |     for worker_id in inference_times_ms_by_worker_id:
226 |         offset = 1 if len(inference_times_ms_by_worker_id[worker_id]) > 1 else 0  # skip the potential cold startup in case there is more data
227 |         avg_inference_time_ms_by_worker_id = np.mean(inference_times_ms_by_worker_id[worker_id][offset:])
228 |         output_dict['avg_inference_time_ms_by_worker_id'][worker_id] = avg_inference_time_ms_by_worker_id
229 |         print("[funnel] Average batch inference time on [worker {}] is {}".format(worker_id, avg_inference_time_ms_by_worker_id))
230 |
231 |     output_dict['funnel_time_s'] = funnel_time_s
232 |     output_dict['avg_roundtrip_time_ms'] = funnel_time_s*1000/BATCH_COUNT
233 |
234 |
235 | ## We need one thread to feed the ZeroMQ, another (the main one) to read back from it:
236 | #
237 | fan_thread = threading.Thread(target=fan_code, args=())
238 | fan_thread.start()
239 |
240 | funnel_code()
241 |
242 | fan_thread.join()
243 |
244 |
245 | ## Store benchmarking results:
246 | #
247 | with open('tmp-ck-timer.json', 'w') as out_file:
248 |     json.dump(output_dict, out_file, indent=4, sort_keys=True)
249 |
250 |
--------------------------------------------------------------------------------
/program/image-classification-zpp-hub-py/zpp_hub_classify.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import json
4 | import os
5 | import shutil
6 | import struct
7 | import threading
8 | import time
9 |
10 | from imagenet_helper import (load_preprocessed_batch, image_list, class_labels,
11 |     MODEL_DATA_LAYOUT, MODEL_COLOURS_BGR, MODEL_INPUT_DATA_TYPE, MODEL_DATA_TYPE, MODEL_USE_DLA, MODEL_MAX_BATCH_SIZE,
12 |     IMAGE_DIR, IMAGE_LIST_FILE, MODEL_NORMALIZE_DATA, SUBTRACT_MEAN, GIVEN_CHANNEL_MEANS, BATCH_SIZE)
13 |
14 | import numpy as np
15 | import zmq
16 |
17 | try:
18 |     raw_input
19 | except NameError:
20 |     # Python 3
21 |     raw_input = input
22 |
23 |
24 | import sys
25 | try:
26 |     sys.getwindowsversion()
27 | except AttributeError:
28 |     win = False
29 | else:
30 |     win = True
31 |
32 | if win:
33 |     import win32api,win32process,win32con
34 |     pid = win32api.GetCurrentProcessId()
35 |     handle = win32api.OpenProcess(win32con.PROCESS_ALL_ACCESS, True, pid)
36 |     #
https://docs.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-setpriorityclass 37 | print("Setting REALTIME_PRIORITY_CLASS on Windows ...") 38 | win32process.SetPriorityClass(handle, win32process.REALTIME_PRIORITY_CLASS) 39 | 40 | 41 | ## Model properties: 42 | # 43 | MODEL_PATH = os.environ['CK_ENV_TENSORRT_MODEL_FILENAME'] 44 | 45 | 46 | ## Transfer mode (numpy floats by default): 47 | # 48 | TRANSFER_MODE = os.getenv('CK_TRANSFER_MODE', 'numpy') 49 | TRANSFER_FLOAT = (os.getenv('CK_TRANSFER_FLOAT', 'YES') in ('YES', 'yes', 'ON', 'on', '1')) and (MODEL_INPUT_DATA_TYPE == 'float32') 50 | TRANSFER_TYPE_NP, TRANSFER_TYPE_SYMBOL = (np.float32, 'f') if TRANSFER_FLOAT else (np.int8, 'b') 51 | 52 | SLEEP_AFTER_SEND_MS = int(os.getenv('CK_SLEEP_AFTER_SEND_MS', 0)) 53 | 54 | ## ZMQ ports: 55 | # 56 | ZMQ_FAN_PORT = os.getenv('CK_ZMQ_FAN_PORT', 5557) 57 | ZMQ_FUNNEL_PORT = os.getenv('CK_ZMQ_FUNNEL_PORT', 5558) 58 | 59 | 60 | ## Writing the results out: 61 | # 62 | RESULTS_DIR = os.getenv('CK_RESULTS_DIR', './results') 63 | FULL_REPORT = os.getenv('CK_SILENT_MODE', '0') in ('NO', 'no', 'OFF', 'off', '0') 64 | 65 | 66 | ## Processing in batches: 67 | # 68 | BATCH_COUNT = int(os.getenv('CK_BATCH_COUNT', 1)) 69 | 70 | 71 | ## ZeroMQ communication setup: 72 | # 73 | zmq_context = zmq.Context() 74 | 75 | to_workers = zmq_context.socket(zmq.PUSH) 76 | to_workers.bind("tcp://*:{}".format(ZMQ_FAN_PORT)) 77 | 78 | from_workers = zmq_context.socket(zmq.PULL) 79 | from_workers.bind("tcp://*:{}".format(ZMQ_FUNNEL_PORT)) 80 | 81 | 82 | ## (Shared) placeholders: 83 | # 84 | in_progress = {} # to be written to by one thread and read by another 85 | output_dict = { # to be topped up by both threads 86 | 'batch_size': BATCH_SIZE, 87 | 'batch_count': BATCH_COUNT, 88 | 'avg_inference_time_ms_by_worker_id': {}, 89 | 'avg_roundtrip_time_ms_by_worker_id': {}, 90 | 'min_roundtrip_time_ms_by_worker_id': {}, 91 | 'pc50_roundtrip_time_ms_by_worker_id': {}, 92 | 'pc90_roundtrip_time_ms_by_worker_id': {}, 93 | 'pc99_roundtrip_time_ms_by_worker_id': {}, 94 | 'max_roundtrip_time_ms_by_worker_id': {}, 95 | } 96 | 97 | 98 | def fan_code(): 99 | 100 | print("Press Enter when the workers are ready: ") 101 | _ = raw_input() 102 | print("[fan] Submitting jobs...") 103 | 104 | fan_start = time.time() 105 | 106 | image_index = 0 107 | for batch_index in range(BATCH_COUNT): 108 | 109 | batch_first_index = image_index 110 | batch_data, image_index = load_preprocessed_batch(image_list, image_index) 111 | 112 | batch_vector_numpy = batch_data.ravel() 113 | 114 | batch_ids = list(range(batch_first_index, image_index)) 115 | job_id = batch_index+1 116 | 117 | in_progress[job_id] = { 118 | 'submission_time': time.time(), 119 | 'batch_ids': batch_ids, 120 | } 121 | 122 | if TRANSFER_MODE == 'dummy': 123 | job_data_raw = struct.pack(' job_id={} {}".format(job_id, batch_ids)) 150 | 151 | time.sleep(SLEEP_AFTER_SEND_MS/1000) # do not overflow the ZeroMQ 152 | 153 | fan_time_s = time.time()-fan_start-SLEEP_AFTER_SEND_MS/1000 154 | print("[fan] Done submitting batches. 
Submission took {:.2f} s".format(fan_time_s)) 155 | 156 | output_dict['fan_time_s'] = fan_time_s 157 | output_dict['avg_send_batch_time_ms'] = fan_time_s*1000/BATCH_COUNT 158 | 159 | 160 | def funnel_code(): 161 | 162 | # Cleanup results directory 163 | if os.path.isdir(RESULTS_DIR): 164 | shutil.rmtree(RESULTS_DIR) 165 | os.mkdir(RESULTS_DIR) 166 | 167 | funnel_start = time.time() 168 | inference_times_ms_by_worker_id = {} 169 | roundtrip_times_ms_by_worker_id = {} 170 | 171 | for _ in range(BATCH_COUNT): 172 | done_job = from_workers.recv_json() 173 | 174 | job_id = done_job['job_id'] 175 | local_metadata = in_progress.pop(job_id) 176 | roundtrip_time_ms = (time.time()-local_metadata['submission_time'])*1000 177 | worker_id = done_job['worker_id'] 178 | inference_time_ms = done_job['inference_time_ms'] 179 | floatize_time_ms = done_job['floatize_time_ms'] 180 | 181 | print("[funnel] <- [worker {}] job_id={}, worker_type_conversion={:.2f} ms, inference={:.2f} ms, roundtrip={:.2f} ms".format( 182 | worker_id, job_id, floatize_time_ms, inference_time_ms, roundtrip_time_ms)) 183 | 184 | batch_ids = local_metadata['batch_ids'] 185 | batch_size = len(batch_ids) 186 | apparent_batch_size = MODEL_MAX_BATCH_SIZE if MODEL_USE_DLA else batch_size 187 | raw_batch_results = np.array(done_job['raw_batch_results']) 188 | batch_results = np.split(raw_batch_results, apparent_batch_size)[:batch_size] 189 | 190 | if worker_id not in inference_times_ms_by_worker_id: 191 | inference_times_ms_by_worker_id[worker_id] = [] 192 | inference_times_ms_by_worker_id[worker_id].append( inference_time_ms ) 193 | 194 | if worker_id not in roundtrip_times_ms_by_worker_id: 195 | roundtrip_times_ms_by_worker_id[worker_id] = [] 196 | roundtrip_times_ms_by_worker_id[worker_id].append( roundtrip_time_ms ) 197 | 198 | for sample_id, prediction_for_one_sample in zip(batch_ids, batch_results): 199 | if len(prediction_for_one_sample)==1: 200 | predicted_label = int(prediction_for_one_sample[0]) 201 | trimmed_softmax_vector = [0]*predicted_label + [1] + [0]*(1000-predicted_label-1) 202 | else: 203 | trimmed_softmax_vector = prediction_for_one_sample[-1000:] # skipping the background class on the left (if present) 204 | 205 | res_file = os.path.join(RESULTS_DIR, image_list[int(sample_id)]) 206 | with open(res_file + '.txt', 'w') as f: 207 | for prob in trimmed_softmax_vector: 208 | f.write('{}\n'.format(prob)) 209 | 210 | funnel_time_s = time.time()-funnel_start 211 | print("[funnel] Done receiving batches. 
Receiving took {:.2f} s".format(funnel_time_s)) 212 | 213 | print("") 214 | 215 | print("[funnel] Batch inference time (ms):") 216 | for worker_id in inference_times_ms_by_worker_id: 217 | offset = 1 if len(inference_times_ms_by_worker_id[worker_id]) > 1 else 0 # skip the potential cold-start sample when there is more data 218 | avg_inference_time_ms_by_worker_id = np.mean(inference_times_ms_by_worker_id[worker_id][offset:]) 219 | output_dict['avg_inference_time_ms_by_worker_id'][worker_id] = avg_inference_time_ms_by_worker_id 220 | print("- [worker {}] average: {:.2f}".format(worker_id, avg_inference_time_ms_by_worker_id)) 221 | 222 | print("") 223 | 224 | print("[funnel] Batch roundtrip time (ms):") 225 | for worker_id in roundtrip_times_ms_by_worker_id: 226 | offset = 1 if len(roundtrip_times_ms_by_worker_id[worker_id]) > 1 else 0 # skip the potential cold-start sample when there is more data 227 | 228 | avg_roundtrip_time_ms_by_worker_id = np.mean(roundtrip_times_ms_by_worker_id[worker_id][offset:]) 229 | output_dict['avg_roundtrip_time_ms_by_worker_id'][worker_id] = avg_roundtrip_time_ms_by_worker_id 230 | print("- [worker {}] average: {:.2f}".format(worker_id, avg_roundtrip_time_ms_by_worker_id)) 231 | 232 | min_roundtrip_time_ms_by_worker_id = np.min(roundtrip_times_ms_by_worker_id[worker_id][offset:]) 233 | output_dict['min_roundtrip_time_ms_by_worker_id'][worker_id] = min_roundtrip_time_ms_by_worker_id 234 | print("- [worker {}] minimum: {:.2f}".format(worker_id, min_roundtrip_time_ms_by_worker_id)) 235 | 236 | pc50_roundtrip_time_ms_by_worker_id = np.percentile(roundtrip_times_ms_by_worker_id[worker_id][offset:], 50) 237 | output_dict['pc50_roundtrip_time_ms_by_worker_id'][worker_id] = pc50_roundtrip_time_ms_by_worker_id 238 | print("- [worker {}] 50th percentile: {:.2f}".format(worker_id, pc50_roundtrip_time_ms_by_worker_id)) 239 | 240 | pc90_roundtrip_time_ms_by_worker_id = np.percentile(roundtrip_times_ms_by_worker_id[worker_id][offset:], 90) 241 | output_dict['pc90_roundtrip_time_ms_by_worker_id'][worker_id] = pc90_roundtrip_time_ms_by_worker_id 242 | print("- [worker {}] 90th percentile: {:.2f}".format(worker_id, pc90_roundtrip_time_ms_by_worker_id)) 243 | 244 | pc99_roundtrip_time_ms_by_worker_id = np.percentile(roundtrip_times_ms_by_worker_id[worker_id][offset:], 99) 245 | output_dict['pc99_roundtrip_time_ms_by_worker_id'][worker_id] = pc99_roundtrip_time_ms_by_worker_id 246 | print("- [worker {}] 99th percentile: {:.2f}".format(worker_id, pc99_roundtrip_time_ms_by_worker_id)) 247 | 248 | max_roundtrip_time_ms_by_worker_id = np.max(roundtrip_times_ms_by_worker_id[worker_id][offset:]) 249 | output_dict['max_roundtrip_time_ms_by_worker_id'][worker_id] = max_roundtrip_time_ms_by_worker_id 250 | print("- [worker {}] maximum: {:.2f}".format(worker_id, max_roundtrip_time_ms_by_worker_id)) 251 | 252 | print("") 253 | 254 | output_dict['funnel_time_s'] = funnel_time_s 255 | output_dict['avg_roundtrip_time_ms'] = funnel_time_s*1000/BATCH_COUNT 256 | 257 | 258 | ## We need one thread to feed the ZeroMQ, another (the main one) to read back from it: 259 | # 260 | fan_thread = threading.Thread(target=fan_code, args=()) 261 | fan_thread.start() 262 | 263 | funnel_code() 264 | 265 | fan_thread.join() 266 | 267 | 268 | ## Store benchmarking results: 269 | # 270 | with open('tmp-ck-timer.json', 'w') as out_file: 271 | json.dump(output_dict, out_file, indent=4, sort_keys=True) 272 | 273 | -------------------------------------------------------------------------------- /script/explore-params/run.sh:
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "ZeroMQ Push-Pull experiment:" 4 | 5 | # Task: image-classification or object-detection. 6 | task=${CK_TASK:-"image-classification"} 7 | echo "- task: ${task}" 8 | 9 | # Platform: tx2, tx1, velociti, xavier, ... 10 | platform=${CK_PLATFORM:-"tx2"} 11 | echo "- platform: ${platform}" 12 | 13 | # Model tags. 14 | model_tags=${CK_MODEL_TAGS:-"converted-from-onnx"} 15 | echo "- model tags: ${model_tags}" 16 | 17 | # Launch hub-side program: NO for debugging only! 18 | launch_hub=${CK_LAUNCH_HUB:-YES} 19 | echo "- launch hub: ${launch_hub}" 20 | 21 | # Use hub-side program with LoadGen: YES/NO. 22 | loadgen=${CK_LOADGEN:-YES} 23 | echo "- use LoadGen: ${loadgen}" 24 | 25 | # Hub-side program. 26 | if [ "${loadgen}" = "YES" ]; then 27 | program="${task}-zpp-hub-loadgen-py" 28 | else 29 | program="${task}-zpp-hub-py" 30 | fi 31 | program_dir=`ck find ck-zeromq:program:${program}` 32 | echo "- program: ${program}" 33 | echo "- program directory: ${program_dir}" 34 | 35 | # LoadGen config file. 36 | config_file=${CK_ENV_LOADGEN_CONFIG_FILE:-${program_dir}/user.conf} 37 | echo "- config file: ${config_file}" 38 | 39 | # Model name for LoadGen config: resnet50, ssd-resnet34, mobilenet, ssd-mobilenet, gnmt. 40 | model_name=${CK_LOADGEN_MODEL_NAME:-"unknown"} 41 | echo "- model name: ${model_name}" 42 | 43 | # Dry run - print commands but do not execute them. 44 | dry_run=${CK_DRY_RUN:-""} 45 | echo "- dry run: ${dry_run}" 46 | 47 | # Skip existing experiments. 48 | skip_existing=${CK_SKIP_EXISTING:-""} 49 | echo "- skip existing: ${skip_existing}" 50 | 51 | # Timestamp. 52 | timestamp=$(date +%Y%m%d-%H%M%S) 53 | echo "- timestamp: ${timestamp}" 54 | 55 | # Hub IP. 56 | hub_ip=${CK_HUB_IP:-"localhost"} 57 | echo "- hub IP: ${hub_ip}" 58 | 59 | # Workers can be defined in two ways: 60 | # (1) As a list of N IPs. Worker IDs get derived as a sequence from 1 to N. 61 | # (2) As a list of N IDs. Worker IPs get derived as a sequence of 192.168.1.. 62 | ips=( ${CK_WORKER_IPS:-} ) # use parentheses to interpret the string as an array 63 | ids=( ${CK_WORKER_IDS:-} ) # use parentheses to interpret the string as an array 64 | if [[ -z "${ips}" ]] && [[ -z ${ids} ]] 65 | then 66 | # If neither is defined, send to itself. 67 | ips=( "${hub_ip}" ) 68 | fi 69 | if [[ "${ips}" ]] # (1) 70 | then 71 | num_ips=${#ips[@]} 72 | ids=( $(seq 1 ${num_ips}) ) 73 | num_ids=${#ids[@]} 74 | else # (2) 75 | ids=( ${CK_WORKER_IDS:-1} ) 76 | num_ids=${#ids[@]} 77 | ips=( ) 78 | for id in ${ids[@]}; do 79 | id_plus_1=$((id+1)) 80 | ips+=( "192.168.1.10${id_plus_1}" ) 81 | done 82 | num_ips=${#ips[@]} 83 | fi 84 | echo "- ${num_ips} worker IP(s): ${ips[@]}" 85 | echo "- ${num_ids} worker ID(s): ${ids[@]}" 86 | if [[ ${num_ips} != ${num_ids} ]]; then 87 | echo "ERROR: ${num_ips} not equal to ${num_ids}!" 88 | exit 1 89 | fi 90 | 91 | # Worker ssh ports (22 by default). 92 | ports=( ${CK_WORKER_PORTS:-} ) # use parentheses to interpret the string as an array 93 | if [[ -z "${ports}" ]]; then 94 | for id in ${ips[@]}; do 95 | ports+=( "22" ) 96 | done 97 | fi 98 | num_ports=${#ports[@]} 99 | echo "- ${num_ports} worker port(s): ${ports[@]}" 100 | if [[ ${num_ports} != ${num_ips} ]]; then 101 | echo "ERROR: ${num_ports} not equal to ${num_ips}!" 102 | exit 1 103 | fi 104 | 105 | # ZMQ ports: fan (out), funnel (in). 
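# (For orientation, as seen in the hub and worker sources below: the hub binds a PUSH
# socket on the fan port to scatter batches and a PULL socket on the funnel port to
# gather results, while each worker connects the mirror-image PULL/PUSH pair to the hub's IP.)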
106 | fan_port=${CK_ZMQ_FAN_PORT:-15051} 107 | funnel_port=${CK_ZMQ_FUNNEL_PORT:-15052} 108 | echo "- fan port: ${fan_port}" 109 | echo "- funnel port: ${funnel_port}" 110 | 111 | # Time each worker should wait after last received work-item before exiting. 112 | postwork_timeout_s=${CK_WORKER_POSTWORK_TIMEOUT_S:-10} 113 | echo "- postwork timeout: ${postwork_timeout_s} s" 114 | 115 | # Worker response format: 116 | # - argmax returns a class id; 117 | # - softmax returns a 1000 or 1001-element vector of class probabilities; 118 | # - direct_return returns a (7*N+1)-element vector where N is the maximum 119 | # number of predictions (100 for SSD-MobileNet, 200 for SSD-ResNet). 120 | worker_output=${CK_WORKER_OUTPUT_FORMAT:-argmax} 121 | echo "- worker output: ${worker_output}" 122 | 123 | # Transfer mode: raw, json, pickle, numpy. 124 | transfer_mode=${CK_TRANSFER_MODE:-numpy} 125 | echo "- transfer mode: ${transfer_mode}" 126 | 127 | # Transfer as floats or as 8-bit integers: YES/NO. 128 | transfer_float=${CK_TRANSFER_FLOAT:-YES} 129 | echo "- transfer float: ${transfer_float}" 130 | 131 | # Preprocess on GPU: YES/NO. 132 | preprocess_on_gpu=${CK_PREPROCESS_ON_GPU:-NO} 133 | if [ "${preprocess_on_gpu}" = "YES" ] && [ "${transfer_float}" = "YES" ]; then 134 | echo "WARNING: Forcing not to preprocess on GPU since transferring float!" 135 | preprocess_on_gpu="NO" 136 | fi 137 | if [ "${preprocess_on_gpu}" = "YES" ] && [ "${transfer_mode}" = "json" ]; then 138 | echo "WARNING: Forcing not to preprocess on GPU since transferring json!" 139 | preprocess_on_gpu="NO" 140 | fi 141 | echo "- preprocess on GPU: ${preprocess_on_gpu}" 142 | 143 | # LoadGen scenario: MultiStream, SingleStream, Offline. 144 | scenario=${CK_LOADGEN_SCENARIO:-MultiStream} 145 | if [ "${scenario}" = "MultiStream" ]; then 146 | scenario_tag="multistream" 147 | elif [ "${scenario}" = "SingleStream" ]; then 148 | scenario_tag="singlestream" 149 | elif [ "${scenario}" = "Offline" ]; then 150 | scenario_tag="offline" 151 | else 152 | echo "ERROR: Unsupported LoadGen scenario '${scenario}'!" 153 | exit 1 154 | fi 155 | echo "- scenario: ${scenario} (${scenario_tag})" 156 | 157 | # LoadGen mode: PerformanceOnly, AccuracyOnly. 158 | mode=${CK_LOADGEN_MODE:-PerformanceOnly} 159 | if [ "${mode}" = "PerformanceOnly" ]; then 160 | mode_tag="performance" 161 | elif [ "${mode}" = "AccuracyOnly" ]; then 162 | mode_tag="accuracy" 163 | else 164 | echo "ERROR: Unsupported LoadGen mode '${mode}'!" 
165 | exit 1 166 | fi 167 | echo "- mode: ${mode} (${mode_tag})" 168 | 169 | if [ "${task}" = "image-classification" ]; then 170 | imagenet_size=50000 171 | if [ "${mode}" = "AccuracyOnly" ]; then 172 | dataset_size=${CK_LOADGEN_DATASET_SIZE:-${imagenet_size}} 173 | buffer_size=${CK_LOADGEN_BUFFER_SIZE:-500} 174 | else 175 | dataset_size=${CK_LOADGEN_DATASET_SIZE:-1024} 176 | buffer_size=${CK_LOADGEN_BUFFER_SIZE:-1024} 177 | fi 178 | elif [ "${task}" = "object-detection" ]; then 179 | coco_size=5000 180 | if [ "${mode}" = "AccuracyOnly" ]; then 181 | dataset_size=${CK_LOADGEN_DATASET_SIZE:-${coco_size}} 182 | buffer_size=${CK_LOADGEN_BUFFER_SIZE:-50} 183 | else 184 | if [ "${model_name}" = "ssd-mobilenet" ]; then 185 | dataset_size=${CK_LOADGEN_DATASET_SIZE:-256} 186 | buffer_size=${CK_LOADGEN_BUFFER_SIZE:-256} 187 | elif [ "${model_name}" = "ssd-resnet34" ]; then 188 | dataset_size=${CK_LOADGEN_DATASET_SIZE:-64} 189 | buffer_size=${CK_LOADGEN_BUFFER_SIZE:-64} 190 | else 191 | dataset_size=${CK_LOADGEN_DATASET_SIZE:-1024} 192 | buffer_size=${CK_LOADGEN_BUFFER_SIZE:-1024} 193 | fi # model name 194 | fi # mode 195 | else 196 | echo "ERROR: Unsupported task '${task}'!" 197 | exit 1 198 | fi # task 199 | 200 | 201 | echo "- dataset size: ${dataset_size}" 202 | echo "- buffer size: ${buffer_size}" 203 | 204 | # In the PerformanceOnly mode, affects the number of samples per query that LoadGen issues 205 | # (aiming to meet the minimum duration of 60 seconds and, in the Offline mode, the minimum 206 | # number of samples of 24,576). 207 | target_qps=${CK_LOADGEN_TARGET_QPS:-70} 208 | if [ "${mode}" = "PerformanceOnly" ]; then 209 | if [ "${scenario}" == "SingleStream" ] || [ "${scenario}" == "Offline" ]; then 210 | TARGET_QPS="--env.CK_LOADGEN_TARGET_QPS=${target_qps}" 211 | fi 212 | fi 213 | echo "- target QPS (queries per second): ${target_qps} ('${TARGET_QPS}')" 214 | 215 | # Allow overriding the number of queries in the PerformanceOnly mode. 216 | # By default, use 1440=720*2: 217 | # - 720==6! (6 factorial) is evenly divisible by any number of co-processors 1-6. 218 | # - 1200==60*20 is the minimum number of 50 ms queries to meet the minimum duration of 60 seconds. 219 | # - 1440 > 1200. 220 | count_override=${CK_LOADGEN_COUNT_OVERRIDE:-1440} 221 | if [ "${mode}" = "PerformanceOnly" ]; then 222 | COUNT_OVERRIDE="--env.CK_LOADGEN_COUNT_OVERRIDE=${count_override}" 223 | fi 224 | echo "- count override: ${count_override} ('${COUNT_OVERRIDE}')" 225 | 226 | # Batch size. 227 | batch_size=${CK_BATCH_SIZE:-1} 228 | echo "- batch size: ${batch_size}" 229 | 230 | # Batch count. 231 | batch_count=${CK_BATCH_COUNT:-1} 232 | echo "- batch count: ${batch_count}" 233 | 234 | # In the MultiStream scenario, affects the number of streams that LoadGen issues 235 | # (aiming to meet the target latency of 50 ms). 236 | # By default, set to the product of the number of workers and the batch size. 237 | multistreamness=${CK_LOADGEN_MULTISTREAMNESS:-$((${num_ids} * ${batch_size}))} 238 | if [ "${scenario}" = "MultiStream" ]; then 239 | MULTISTREAMNESS="--env.CK_LOADGEN_MULTISTREAMNESS=${multistreamness}" 240 | fi 241 | echo "- multistreamness: ${multistreamness} ('${MULTISTREAMNESS}')" 242 | 243 | # Number of warm-up samples to be discarded. 244 | # By default, set to the product of the number of workers and the batch size.
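# (A purely illustrative example: with 2 worker IDs and CK_BATCH_SIZE=4, the default
# below works out to 8 warm-up samples, i.e. one full batch per worker.)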
245 | warmup_samples=${CK_LOADGEN_WARMUP_SAMPLES:-$((${num_ids} * ${batch_size}))} 246 | echo "- warm-up samples: ${warmup_samples}" 247 | 248 | # Maximum batch size that the TensorRT model supports. 249 | maxbatch=${CK_WEIGHTS_MAXBATCH:-20} 250 | echo "- weights maxbatch: ${maxbatch}" 251 | 252 | # Numerical precision of the TensorRT model. 253 | precision=${CK_WEIGHTS_PRECISION:-fp16} 254 | echo "- weights precision: ${precision}" 255 | 256 | # Input preprocessing. 257 | preprocessing_tags=${CK_PREPROCESSING_TAGS:-"rgb8,full,side.224,preprocessed,using-opencv"} 258 | echo "- preprocessing tags: ${preprocessing_tags}" 259 | 260 | # Prepare record UOA and tags. 261 | mlperf="mlperf" 262 | division="closed" 263 | library="zpp" # ZeroMQ Push-Pull. 264 | benchmark=${model_name} 265 | record_uoa="${mlperf}.${division}.${task}.${platform}.${library}.${benchmark}.${scenario_tag}.${mode_tag}" 266 | record_tags="${mlperf},${division},${task},${platform},${library},${benchmark},${scenario_tag},${mode_tag}" 267 | if [ "${mode_tag}" = "accuracy" ]; then 268 | # Get substring after "preprocessed," to end, i.e. "using-opencv" here. 269 | preprocessing="${preprocessing_tags##*preprocessed,}" 270 | record_uoa+=".${preprocessing}" 271 | record_tags+=",${preprocessing}" 272 | fi 273 | if [ "${mode_tag}" = "accuracy" ]; then 274 | if [ "${task}" = "image-classification" ] && [ "${dataset_size}" != "${imagenet_size}" ]; then 275 | record_uoa+=".${dataset_size}" 276 | record_tags+=",${dataset_size}" 277 | fi 278 | if [ "${task}" = "object-detection" ] && [ "${dataset_size}" != "${coco_size}" ]; then 279 | record_uoa+=".${dataset_size}" 280 | record_tags+=",${dataset_size}" 281 | fi 282 | fi 283 | echo "- record UOA: ${record_uoa}" 284 | echo "- record tags: ${record_tags}" 285 | 286 | # Blank line before printing commands. 287 | echo 288 | 289 | # Skip existing experiments if requested. 290 | if (ck find experiment:${record_uoa} >/dev/null) && [[ "${skip_existing}" ]]; then 291 | echo "Experiment '${record_uoa}' already exists, skipping ..." 292 | exit 0 293 | fi 294 | 295 | # Launch the worker programs. 296 | for i in $(seq 1 ${#ips[@]}); do 297 | ip=${ips[${i}-1]} 298 | id=${ids[${i}-1]} 299 | port=${ports[${i}-1]} 300 | worker_id="worker-${id}" 301 | read -d '' CMD < /home/$USER/nohup.log 2>&1 &'" 321 | END_OF_CMD 322 | echo ${CMD} 323 | if [ -z "${dry_run}" ]; then 324 | eval ${CMD} 325 | fi 326 | echo 327 | done 328 | 329 | # Wait a bit. 330 | sleep 1s 331 | 332 | # Launch the hub program. 333 | read -d '' CMD </dev/null | grep python | grep 5557 | awk '{print $7}' | sed 's/\/.*//'` 20 | ## to stop the socket-hogging process. 
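## (Alternatively, exporting different CK_ZMQ_FAN_PORT/CK_ZMQ_FUNNEL_PORT values
## sidesteps the stale socket without killing the offending process.)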
21 | ########################################################################################################### 22 | 23 | 24 | ## ZMQ ports: 25 | # 26 | ZMQ_FAN_PORT = os.getenv('CK_ZMQ_FAN_PORT', 5557) 27 | ZMQ_FUNNEL_PORT = os.getenv('CK_ZMQ_FUNNEL_PORT', 5558) 28 | 29 | ## LoadGen test properties: 30 | # 31 | LOADGEN_SCENARIO = os.getenv('CK_LOADGEN_SCENARIO', 'SingleStream') 32 | LOADGEN_MODE = os.getenv('CK_LOADGEN_MODE', 'AccuracyOnly') 33 | LOADGEN_BUFFER_SIZE = int(os.getenv('CK_LOADGEN_BUFFER_SIZE')) # set to how many samples are you prepared to keep in memory at once 34 | LOADGEN_DATASET_SIZE = int(os.getenv('CK_LOADGEN_DATASET_SIZE')) # set to how many total samples to choose from (0 = full set) 35 | LOADGEN_CONFIG_FILE = os.getenv('CK_ENV_LOADGEN_CONFIG_FILE', '') # Very Important: make sure 'pass_env_to_resolve' is on 36 | LOADGEN_MODEL_NAME = os.getenv('CK_LOADGEN_MODEL_NAME', 'random_model_name') 37 | LOADGEN_MULTISTREAMNESS = os.getenv('CK_LOADGEN_MULTISTREAMNESS', '') # if not set, use value from LoadGen's config file, or LoadGen code 38 | LOADGEN_MAX_DURATION_S = os.getenv('CK_LOADGEN_MAX_DURATION_S', '') # if not set, use value from LoadGen's config file, or LoadGen code 39 | LOADGEN_COUNT_OVERRIDE = os.getenv('CK_LOADGEN_COUNT_OVERRIDE', '') 40 | LOADGEN_TARGET_QPS = os.getenv('CK_LOADGEN_TARGET_QPS', '') # Maps to differently named internal config options, depending on scenario - see below. 41 | LOADGEN_WARMUP_SAMPLES = int(os.getenv('CK_LOADGEN_WARMUP_SAMPLES', '0')) 42 | BATCH_SIZE = int(os.getenv('CK_BATCH_SIZE', '1')) 43 | SIDELOAD_JSON = os.getenv('CK_LOADGEN_SIDELOAD_JSON','') 44 | 45 | ## Model properties: 46 | # 47 | MODEL_PATH = os.environ['CK_ENV_TENSORRT_MODEL_FILENAME'] 48 | MODEL_DATA_LAYOUT = os.getenv('ML_MODEL_DATA_LAYOUT', 'NCHW') 49 | LABELS_PATH = os.environ['CK_CAFFE_IMAGENET_SYNSET_WORDS_TXT'] 50 | MODEL_COLOURS_BGR = os.getenv('ML_MODEL_COLOUR_CHANNELS_BGR', 'NO') in ('YES', 'yes', 'ON', 'on', '1') 51 | MODEL_INPUT_DATA_TYPE = os.getenv('ML_MODEL_INPUT_DATA_TYPE', 'float32') 52 | MODEL_DATA_TYPE = os.getenv('ML_MODEL_DATA_TYPE', '(unknown)') 53 | MODEL_IMAGE_HEIGHT = int(os.getenv('ML_MODEL_IMAGE_HEIGHT', 54 | os.getenv('CK_ENV_ONNX_MODEL_IMAGE_HEIGHT', 55 | os.getenv('CK_ENV_TENSORFLOW_MODEL_IMAGE_HEIGHT', 56 | '')))) 57 | MODEL_IMAGE_WIDTH = int(os.getenv('ML_MODEL_IMAGE_WIDTH', 58 | os.getenv('CK_ENV_ONNX_MODEL_IMAGE_WIDTH', 59 | os.getenv('CK_ENV_TENSORFLOW_MODEL_IMAGE_WIDTH', 60 | '')))) 61 | MODEL_IMAGE_CHANNELS = 3 62 | MODEL_SOFTMAX_LAYER = os.getenv('CK_ENV_ONNX_MODEL_OUTPUT_LAYER_NAME', os.getenv('CK_ENV_TENSORFLOW_MODEL_OUTPUT_LAYER_NAME', '')) 63 | 64 | 65 | ## Data transfer (numpy floats by default): 66 | # 67 | TRANSFER_MODE = os.getenv('CK_TRANSFER_MODE', 'numpy') 68 | TRANSFER_FLOAT = (os.getenv('CK_TRANSFER_FLOAT', 'YES') in ('YES', 'yes', 'ON', 'on', '1')) and (MODEL_INPUT_DATA_TYPE == 'float32') 69 | TRANSFER_TYPE_NP, TRANSFER_TYPE_SYMBOL = (np.float32, 'f') if TRANSFER_FLOAT else (np.int8, 'b') 70 | 71 | ## Image normalization: 72 | # 73 | PREPROCESS_ON_GPU = not TRANSFER_FLOAT and os.getenv('CK_PREPROCESS_ON_GPU', 'NO') in ('YES', 'yes', 'ON', 'on', '1') 74 | MODEL_NORMALIZE_DATA = os.getenv('ML_MODEL_NORMALIZE_DATA') in ('YES', 'yes', 'ON', 'on', '1') 75 | SUBTRACT_MEAN = os.getenv('ML_MODEL_SUBTRACT_MEAN', 'YES') in ('YES', 'yes', 'ON', 'on', '1') 76 | GIVEN_CHANNEL_MEANS = os.getenv('ML_MODEL_GIVEN_CHANNEL_MEANS', '') 77 | if GIVEN_CHANNEL_MEANS: 78 | GIVEN_CHANNEL_MEANS = np.fromstring(GIVEN_CHANNEL_MEANS, 
dtype=np.float32, sep=' ').astype(TRANSFER_TYPE_NP) 79 | if MODEL_COLOURS_BGR: 80 | GIVEN_CHANNEL_MEANS = GIVEN_CHANNEL_MEANS[::-1] # swapping Red and Blue colour channels 81 | 82 | ## Input image properties: 83 | # 84 | IMAGE_DIR = os.getenv('CK_ENV_DATASET_IMAGENET_PREPROCESSED_DIR') 85 | IMAGE_LIST_FILE = os.path.join(IMAGE_DIR, os.getenv('CK_ENV_DATASET_IMAGENET_PREPROCESSED_SUBSET_FOF')) 86 | IMAGE_DATA_TYPE = np.dtype( os.getenv('CK_ENV_DATASET_IMAGENET_PREPROCESSED_DATA_TYPE', 'uint8') ) 87 | 88 | ## Misc 89 | # 90 | VERBOSITY_LEVEL = int(os.getenv('CK_VERBOSE', '0')) 91 | 92 | 93 | ## ZeroMQ communication setup: 94 | # 95 | zmq_context = zmq.Context() 96 | 97 | to_workers = zmq_context.socket(zmq.PUSH) 98 | to_workers.bind("tcp://*:{}".format(ZMQ_FAN_PORT)) 99 | 100 | from_workers = zmq_context.socket(zmq.PULL) 101 | from_workers.bind("tcp://*:{}".format(ZMQ_FUNNEL_PORT)) 102 | from_workers.RCVTIMEO = 2000 103 | 104 | 105 | # Load preprocessed image filepaths: 106 | with open(IMAGE_LIST_FILE, 'r') as f: 107 | image_path_list = [ os.path.join(IMAGE_DIR, s.strip()) for s in f ] 108 | LOADGEN_DATASET_SIZE = LOADGEN_DATASET_SIZE or len(image_path_list) 109 | 110 | 111 | def load_labels(labels_filepath): 112 | my_labels = [] 113 | input_file = open(labels_filepath, 'r') 114 | for l in input_file: 115 | my_labels.append(l.strip()) 116 | return my_labels 117 | 118 | 119 | def tick(letter, quantity=1): 120 | print(letter + (str(quantity) if quantity>1 else ''), end='') 121 | 122 | 123 | # Currently loaded preprocessed images are stored in a dictionary: 124 | preprocessed_image_buffer = {} 125 | 126 | 127 | def load_query_samples(sample_indices): # 0-based indices in our whole dataset 128 | global MODEL_IMAGE_HEIGHT, MODEL_IMAGE_WIDTH, MODEL_IMAGE_CHANNELS 129 | global preprocessed_image_buffer 130 | 131 | print("load_query_samples({})".format(sample_indices)) 132 | 133 | tick('B', len(sample_indices)) 134 | 135 | for sample_index in sample_indices: 136 | img_filename = image_path_list[sample_index] 137 | img = np.fromfile(img_filename, IMAGE_DATA_TYPE) 138 | img = img.reshape((MODEL_IMAGE_HEIGHT, MODEL_IMAGE_WIDTH, MODEL_IMAGE_CHANNELS)) 139 | 140 | if PREPROCESS_ON_GPU: 141 | nhwc_img = img if MODEL_DATA_LAYOUT == 'NHWC' else img.transpose(2,0,1) 142 | preprocessed_image_buffer[sample_index] = np.array(nhwc_img).ravel() # transfer bytes and unsigned 143 | else: 144 | if MODEL_COLOURS_BGR: 145 | img = img[...,::-1] # swapping Red and Blue colour channels 146 | 147 | if IMAGE_DATA_TYPE != 'float32': 148 | img = img.astype(np.float32) 149 | 150 | # Normalize 151 | if MODEL_NORMALIZE_DATA: 152 | img = img/127.5 - 1.0 153 | 154 | # Subtract mean value 155 | if SUBTRACT_MEAN: 156 | if len(GIVEN_CHANNEL_MEANS): 157 | img -= GIVEN_CHANNEL_MEANS 158 | else: 159 | img -= np.mean(img, axis=(0,1), keepdims=True) 160 | 161 | if MODEL_INPUT_DATA_TYPE == 'int8' or TRANSFER_TYPE_NP == np.int8: 162 | img = np.clip(img, -128, 127) 163 | 164 | nhwc_img = img if MODEL_DATA_LAYOUT == 'NHWC' else img.transpose(2,0,1) 165 | preprocessed_image_buffer[sample_index] = np.array(nhwc_img).ravel().astype(TRANSFER_TYPE_NP) # transfer bytes as signed 166 | 167 | tick('l') 168 | print('') 169 | 170 | 171 | def unload_query_samples(sample_indices): 172 | #print("unload_query_samples({})".format(sample_indices)) 173 | global preprocessed_image_buffer 174 | 175 | preprocessed_image_buffer = {} 176 | tick('U') 177 | print('') 178 | 179 | 180 | openme_data = {} # side-loaded stats 181 | in_progress = {} # local store of 
metadata about batches between issue_queries and send_responses 182 | funnel_should_be_running = True # a way for the fan to signal to the funnel_thread to end 183 | warmup_mode = False # while on, QuerySampleResponses will not be sent to LoadGen 184 | 185 | def issue_queries(query_samples): 186 | 187 | global BATCH_SIZE 188 | 189 | if VERBOSITY_LEVEL: 190 | printable_query = [(qs.index, qs.id) for qs in query_samples] 191 | print("issue_queries( {} )".format(printable_query)) 192 | tick('Q', len(query_samples)) 193 | 194 | for j in range(0, len(query_samples), BATCH_SIZE): 195 | batch = query_samples[j:j+BATCH_SIZE] # NB: the last one may be shorter than BATCH_SIZE in length 196 | batch_vector_numpy = np.ravel([ preprocessed_image_buffer[qs.index] for qs in batch ]) 197 | 198 | job_id = batch[0].id # assume it is both sufficiently unique and sufficiently small to fit our needs 199 | 200 | in_progress[job_id] = { 201 | 'submission_time': time.time(), 202 | 'batch': batch, 203 | } 204 | 205 | if TRANSFER_MODE == 'numpy': 206 | job_data_struct = { 207 | 'job_id': job_id, 208 | 'batch_data': batch_vector_numpy, 209 | } 210 | to_workers.send_pyobj(job_data_struct) 211 | elif TRANSFER_MODE == 'pickle': 212 | job_data_struct = { 213 | 'job_id': job_id, 214 | 'batch_data': np.asarray(batch_vector_numpy), 215 | } 216 | to_workers.send_pyobj(job_data_struct) 217 | elif TRANSFER_MODE == 'raw': 218 | ## Slower, but insensitive to endianness: 219 | # batch_vector_list = batch_vector_numpy.tolist() 220 | # job_data_raw = struct.pack(' job_id={} {}".format(job_id, [qs.index for qs in batch])) 235 | 236 | 237 | def send_responses(): 238 | 239 | global funnel_should_be_running, warmup_mode, openme_data 240 | 241 | funnel_start = time.time() 242 | 243 | received_job_timings = openme_data['received_job_timings'] = [] 244 | inference_times_ms_by_worker_id = {} 245 | 246 | while funnel_should_be_running: 247 | 248 | try: 249 | done_job = from_workers.recv_json() 250 | except Exception as e: 251 | continue # go back and check if the funnel_should_be_running condition has been turned off by the main thread 252 | 253 | job_id = done_job['job_id'] 254 | local_metadata = in_progress.pop(job_id) 255 | received_timestamp = time.time() 256 | roundtrip_time_ms = (received_timestamp-local_metadata['submission_time'])*1000 257 | worker_id = done_job['worker_id'] 258 | inference_time_ms = done_job['inference_time_ms'] 259 | floatize_time_ms = done_job['floatize_time_ms'] 260 | 261 | print("[funnel] <- [worker {}] job_id={}, worker_type_conversion={:.2f} ms, inference={:.2f} ms, roundtrip={:.2f} ms".format( 262 | worker_id, job_id, floatize_time_ms, inference_time_ms, roundtrip_time_ms)) 263 | 264 | received_job_timings.append({ 265 | 'job_id': job_id, 266 | 'worker_id': worker_id, 267 | 'received_timestamp': received_timestamp, 268 | 'worker_floatize_time_ms': floatize_time_ms, 269 | 'inference_time_ms': inference_time_ms, 270 | 'roundtrip_time_ms': roundtrip_time_ms, 271 | }) 272 | 273 | if warmup_mode: 274 | continue 275 | 276 | if worker_id not in inference_times_ms_by_worker_id: 277 | inference_times_ms_by_worker_id[worker_id] = [] 278 | inference_times_ms_by_worker_id[worker_id].append( inference_time_ms ) 279 | 280 | batch = local_metadata['batch'] 281 | batch_size = len(batch) 282 | raw_batch_results = np.array(done_job['raw_batch_results']) 283 | batch_results = np.split(raw_batch_results, batch_size) 284 | 285 | response = [] 286 | response_array_refs = [] # This is needed to guarantee that the individual 
buffers to which we keep extra-Pythonian references do not get garbage-collected. 287 | for qs, prediction_for_one_sample in zip(batch, batch_results): 288 | if len(prediction_for_one_sample)==1: 289 | predicted_label = prediction_for_one_sample[0] 290 | else: 291 | predicted_label = np.argmax( prediction_for_one_sample[-1000:] ) 292 | 293 | response_array = array.array("B", np.array(predicted_label, np.float32).tobytes()) 294 | response_array_refs.append(response_array) 295 | bi = response_array.buffer_info() 296 | response.append(lg.QuerySampleResponse(qs.id, bi[0], bi[1])) 297 | lg.QuerySamplesComplete(response) 298 | tick('R', len(response)) 299 | sys.stdout.flush() 300 | print("[funnel] quitting") 301 | 302 | 303 | def flush_queries(): 304 | pass 305 | 306 | 307 | def process_latencies(latencies_ns): 308 | 309 | global openme_data 310 | 311 | latencies_ms = openme_data['loadgen_measured_latencies_ms'] = [ns/1.0e6 for ns in latencies_ns] 312 | print("LG called process_latencies({})".format(latencies_ms)) 313 | 314 | latencies_size = len(latencies_ms) 315 | latencies_avg = sum(latencies_ms)/latencies_size 316 | latencies_sorted = sorted(latencies_ms) 317 | latencies_p50 = int(latencies_size * 0.5) 318 | latencies_p90 = int(latencies_size * 0.9) 319 | latencies_p99 = int(latencies_size * 0.99) 320 | 321 | print("--------------------------------------------------------------------") 322 | print("| LATENCIES (in milliseconds and fps) |") 323 | print("--------------------------------------------------------------------") 324 | print("Number of samples run: {:9d}".format(latencies_size)) 325 | print("Min latency: {:9.2f} ms ({:.3f} fps)".format(latencies_sorted[0], 1e3/latencies_sorted[0])) 326 | print("Median latency: {:9.2f} ms ({:.3f} fps)".format(latencies_sorted[latencies_p50], 1e3/latencies_sorted[latencies_p50])) 327 | print("Average latency: {:9.2f} ms ({:.3f} fps)".format(latencies_avg, 1e3/latencies_avg)) 328 | print("90th percentile latency: {:9.2f} ms ({:.3f} fps)".format(latencies_sorted[latencies_p90], 1e3/latencies_sorted[latencies_p90])) 329 | print("99th percentile latency: {:9.2f} ms ({:.3f} fps)".format(latencies_sorted[latencies_p99], 1e3/latencies_sorted[latencies_p99])) 330 | print("Max latency: {:9.2f} ms ({:.3f} fps)".format(latencies_sorted[-1], 1e3/latencies_sorted[-1])) 331 | print("--------------------------------------------------------------------") 332 | 333 | 334 | def benchmark_using_loadgen(): 335 | "Perform the benchmark using the Python API of the LoadGen library" 336 | 337 | global funnel_should_be_running, warmup_mode, openme_data 338 | 339 | scenario = { 340 | 'SingleStream': lg.TestScenario.SingleStream, 341 | 'MultiStream': lg.TestScenario.MultiStream, 342 | 'Server': lg.TestScenario.Server, 343 | 'Offline': lg.TestScenario.Offline, 344 | }[LOADGEN_SCENARIO] 345 | 346 | mode = { 347 | 'AccuracyOnly': lg.TestMode.AccuracyOnly, 348 | 'PerformanceOnly': lg.TestMode.PerformanceOnly, 349 | 'SubmissionRun': lg.TestMode.SubmissionRun, 350 | }[LOADGEN_MODE] 351 | 352 | ts = lg.TestSettings() 353 | if LOADGEN_CONFIG_FILE: 354 | ts.FromConfig(LOADGEN_CONFIG_FILE, LOADGEN_MODEL_NAME, LOADGEN_SCENARIO) 355 | ts.scenario = scenario 356 | ts.mode = mode 357 | 358 | if LOADGEN_MULTISTREAMNESS: 359 | ts.multi_stream_samples_per_query = int(LOADGEN_MULTISTREAMNESS) 360 | 361 | if LOADGEN_MAX_DURATION_S: 362 | ts.max_duration_ms = int(LOADGEN_MAX_DURATION_S)*1000 363 | 364 | if LOADGEN_COUNT_OVERRIDE: 365 | ts.min_query_count = int(LOADGEN_COUNT_OVERRIDE) 366 |
ts.max_query_count = int(LOADGEN_COUNT_OVERRIDE) 367 | 368 | if LOADGEN_TARGET_QPS: 369 | target_qps = float(LOADGEN_TARGET_QPS) 370 | ts.multi_stream_target_qps = target_qps 371 | ts.server_target_qps = target_qps 372 | ts.offline_expected_qps = target_qps 373 | 374 | sut = lg.ConstructSUT(issue_queries, flush_queries, process_latencies) 375 | qsl = lg.ConstructQSL(LOADGEN_DATASET_SIZE, LOADGEN_BUFFER_SIZE, load_query_samples, unload_query_samples) 376 | 377 | log_settings = lg.LogSettings() 378 | log_settings.enable_trace = False 379 | 380 | funnel_thread = threading.Thread(target=send_responses, args=()) 381 | funnel_should_be_running = True 382 | funnel_thread.start() 383 | 384 | if LOADGEN_WARMUP_SAMPLES: 385 | warmup_id_range = list(range(LOADGEN_WARMUP_SAMPLES)) 386 | load_query_samples(warmup_id_range) 387 | 388 | warmup_mode = True 389 | print("Sending out the warm-up samples, waiting for responses...") 390 | issue_queries([lg.QuerySample(id,id) for id in warmup_id_range]) 391 | 392 | while len(in_progress)>0: # waiting for the in_progress queue to clear up 393 | time.sleep(1) 394 | print(" Done!") 395 | 396 | warmup_mode = False 397 | 398 | lg.StartTestWithLogSettings(sut, qsl, ts, log_settings) 399 | 400 | funnel_should_be_running = False # politely ask the funnel_thread to end 401 | funnel_thread.join() # wait for it to actually end 402 | 403 | from_workers.close() 404 | to_workers.close() 405 | 406 | lg.DestroyQSL(qsl) 407 | lg.DestroySUT(sut) 408 | 409 | if SIDELOAD_JSON: 410 | with open(SIDELOAD_JSON, 'w') as sideload_fd: 411 | json.dump(openme_data, sideload_fd, indent=4, sort_keys=True) 412 | 413 | 414 | benchmark_using_loadgen() 415 | -------------------------------------------------------------------------------- /program/zpp-worker-tensorrt-py/zpp_worker_trt.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import numpy as np 4 | import os 5 | import struct 6 | import time 7 | import zmq 8 | 9 | import tensorrt as trt 10 | import pycuda.driver as cuda 11 | import pycuda.autoinit 12 | import pycuda.tools 13 | 14 | 15 | ## ZMQ ports: 16 | # 17 | ZMQ_FAN_PORT = os.getenv('CK_ZMQ_FAN_PORT', 5557) 18 | ZMQ_FUNNEL_PORT = os.getenv('CK_ZMQ_FUNNEL_PORT', 5558) 19 | 20 | ## Worker properties: 21 | # 22 | HUB_IP = os.getenv('CK_HUB_IP', 'localhost') 23 | JOBS_LIMIT = int(os.getenv('CK_WORKER_JOB_LIMIT', 0)) 24 | WORKER_ID = os.getenv('CK_WORKER_ID') or os.getpid() 25 | WORKER_OUTPUT_FORMAT = os.getenv('CK_WORKER_OUTPUT_FORMAT', 'softmax') 26 | WORKER_POSTWORK_TIMEOUT_S = os.getenv('CK_WORKER_POSTWORK_TIMEOUT_S', '') # empty string means no timeout 27 | 28 | ## Model properties: 29 | # 30 | MODEL_PATH = os.environ['CK_ENV_TENSORRT_MODEL_FILENAME'] 31 | MODEL_PLUGIN_PATH = os.getenv('CK_ENV_TENSORRT_PLUGIN_PATH', os.getenv('ML_MODEL_TENSORRT_PLUGIN','')) 32 | 33 | if MODEL_PLUGIN_PATH: 34 | import ctypes 35 | if not os.path.isfile(MODEL_PLUGIN_PATH): 36 | raise IOError("{}\n{}\n".format( 37 | "Failed to load library ({}).".format(MODEL_PLUGIN_PATH), 38 | "Please build the plugin." 
39 | )) 40 | ctypes.CDLL(MODEL_PLUGIN_PATH) 41 | 42 | MODEL_DATA_LAYOUT = os.getenv('ML_MODEL_DATA_LAYOUT', 'NCHW') 43 | MODEL_COLOURS_BGR = os.getenv('ML_MODEL_COLOUR_CHANNELS_BGR', 'NO') in ('YES', 'yes', 'ON', 'on', '1') 44 | MODEL_INPUT_DATA_TYPE = os.getenv('ML_MODEL_INPUT_DATA_TYPE', 'float32') 45 | MODEL_DATA_TYPE = os.getenv('ML_MODEL_DATA_TYPE', '(unknown)') 46 | MODEL_SOFTMAX_LAYER = os.getenv('CK_ENV_ONNX_MODEL_OUTPUT_LAYER_NAME', os.getenv('CK_ENV_TENSORFLOW_MODEL_OUTPUT_LAYER_NAME', '')) 47 | MODEL_SUBTRACT_MEAN = os.getenv('ML_MODEL_SUBTRACT_MEAN', 'NO') in ('YES', 'yes', 'ON', 'on', '1') 48 | if MODEL_SUBTRACT_MEAN: 49 | MODEL_GIVEN_CHANNEL_MEANS = os.getenv('ML_MODEL_GIVEN_CHANNEL_MEANS', '0.0 0.0 0.0') 50 | channel_means = np.fromstring(MODEL_GIVEN_CHANNEL_MEANS, dtype=np.float32, sep=' ') 51 | if MODEL_COLOURS_BGR: 52 | channel_means = channel_means[::-1] # swapping Red and Blue colour channels 53 | 54 | ## Transfer mode (numpy floats by default): 55 | # 56 | TRANSFER_MODE = os.getenv('CK_TRANSFER_MODE', 'numpy') 57 | TRANSFER_FLOAT = os.getenv('CK_TRANSFER_FLOAT', 'YES') in ('YES', 'yes', 'ON', 'on', '1') 58 | PREPROCESS_ON_GPU = (TRANSFER_FLOAT == False) and (TRANSFER_MODE != 'json') and os.getenv('CK_PREPROCESS_ON_GPU', 'NO') in ('YES', 'yes', 'ON', 'on', '1') 59 | CONVERSION_NEEDED = (TRANSFER_FLOAT == False) and (MODEL_INPUT_DATA_TYPE == 'float32') 60 | CONVERSION_TYPE_SYMBOL = 'f' if (MODEL_INPUT_DATA_TYPE == 'float32') else 'b' 61 | ID_SIZE_IN_BYTES = 4 # assuming uint32 62 | 63 | ## ZeroMQ communication setup: 64 | # 65 | zmq_context = zmq.Context() 66 | 67 | from_factory = zmq_context.socket(zmq.PULL) 68 | from_factory.connect('tcp://{}:{}'.format(HUB_IP, ZMQ_FAN_PORT)) 69 | if WORKER_POSTWORK_TIMEOUT_S != '': 70 | from_factory.RCVTIMEO = int(WORKER_POSTWORK_TIMEOUT_S)*1000 # expects milliseconds 71 | 72 | to_funnel = zmq_context.socket(zmq.PUSH) 73 | to_funnel.connect('tcp://{}:{}'.format(HUB_IP, ZMQ_FUNNEL_PORT)) 74 | 75 | 76 | ## CUDA/TensorRT model setup: 77 | # 78 | pycuda_context = pycuda.tools.make_default_context() 79 | 80 | TRT_LOGGER = trt.Logger(trt.Logger.WARNING) 81 | try: 82 | trt.init_libnvinfer_plugins(TRT_LOGGER, "") 83 | with open(MODEL_PATH, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime: 84 | serialized_engine = f.read() 85 | trt_engine = runtime.deserialize_cuda_engine(serialized_engine) 86 | trt_version = [ int(v) for v in trt.__version__.split('.') ] 87 | print('[TensorRT v{}.{}] successfully loaded'.format(trt_version[0], trt_version[1])) 88 | except: 89 | pycuda_context.pop() 90 | raise RuntimeError('TensorRT model file {} is not found or corrupted'.format(MODEL_PATH)) 91 | 92 | max_batch_size = trt_engine.max_batch_size 93 | 94 | d_inputs, h_d_outputs, model_bindings = [], [], [] 95 | for interface_layer in trt_engine: 96 | dtype = trt_engine.get_binding_dtype(interface_layer) 97 | shape = trt_engine.get_binding_shape(interface_layer) 98 | fmt = trt_engine.get_binding_format(trt_engine.get_binding_index(interface_layer)) if trt_version[0] >= 6 else None 99 | 100 | if fmt and fmt == trt.TensorFormat.CHW4 and trt_engine.binding_is_input(interface_layer): 101 | shape[-3] = ((shape[-3] - 1) // 4 + 1) * 4 102 | size = trt.volume(shape) * max_batch_size 103 | 104 | dev_mem = cuda.mem_alloc(size * dtype.itemsize) 105 | model_bindings.append( int(dev_mem) ) 106 | 107 | if trt_engine.binding_is_input(interface_layer): 108 | interface_type = 'Input' 109 | d_inputs.append(dev_mem) 110 | model_input_shape = shape 111 | model_input_type_size = 
dtype.itemsize 112 | if CONVERSION_NEEDED: 113 | d_preconverted_input = cuda.mem_alloc(size * 1) 114 | else: 115 | interface_type = 'Output' 116 | host_mem = cuda.pagelocked_empty(size, trt.nptype(dtype)) 117 | h_d_outputs.append({ 'host_mem': host_mem, 'dev_mem': dev_mem }) 118 | if MODEL_SOFTMAX_LAYER=='' or interface_layer == MODEL_SOFTMAX_LAYER: 119 | model_output_shape = shape 120 | h_output = host_mem 121 | 122 | print("{} layer {}: dtype={}, shape={}, elements_per_max_batch={}".format(interface_type, interface_layer, dtype, shape, size)) 123 | 124 | cuda_stream = cuda.Stream() 125 | input_volume = trt.volume(model_input_shape) # total number of monochromatic subpixels (before batching) 126 | output_volume = trt.volume(model_output_shape) # total number of elements in one image prediction (before batching) 127 | 128 | if MODEL_DATA_LAYOUT == 'NHWC': 129 | (MODEL_IMAGE_HEIGHT, MODEL_IMAGE_WIDTH, MODEL_IMAGE_CHANNELS) = model_input_shape 130 | else: 131 | (MODEL_IMAGE_CHANNELS, MODEL_IMAGE_HEIGHT, MODEL_IMAGE_WIDTH) = model_input_shape 132 | 133 | print("Data layout: {}".format(MODEL_DATA_LAYOUT) ) 134 | print('Model image height: {}'.format(MODEL_IMAGE_HEIGHT)) 135 | print('Model image width: {}'.format(MODEL_IMAGE_WIDTH)) 136 | print('Model image channels: {}'.format(MODEL_IMAGE_CHANNELS)) 137 | print('Model input data type: {}'.format(MODEL_INPUT_DATA_TYPE)) 138 | print('Model (internal) data type: {}'.format(MODEL_DATA_TYPE)) 139 | print('Model BGR colours: {}'.format(MODEL_COLOURS_BGR)) 140 | print('Model max_batch_size: {}'.format(max_batch_size)) 141 | print('Model input_volume: {}'.format(input_volume)) 142 | print('Model output_volume: {}'.format(output_volume)) 143 | print('Image transfer mode: {}'.format(TRANSFER_MODE)) 144 | print('Transferred images need to be converted to the input data type of the model: {}'.format(CONVERSION_NEEDED)) 145 | print('Transferred images need to be preprocessed (e.g. by subtracting means): {}'.format(PREPROCESS_ON_GPU)) 146 | print('Worker output format: {}'.format(WORKER_OUTPUT_FORMAT)) 147 | 148 | if CONVERSION_NEEDED: 149 | compilation_start = time.time() 150 | 151 | from pycuda.compiler import SourceModule 152 | # Define type conversion kernels and more. NB: Must be done after initializing CUDA context. 153 | source_module = SourceModule(source=""" 154 | // See all type conversion (cast) built-in functions here: 155 | // https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH__INTRINSIC__CAST.html 156 | // Convert signed 8-bit integer to 32-bit floating-point using round-to-nearest-even mode. 157 | __global__ void convert_int8_to_fp32( 158 | float * __restrict__ out, const signed char * __restrict__ in, long num_elems 159 | ) 160 | { 161 | long idx = threadIdx.x + blockIdx.x * blockDim.x; 162 | if (idx >= num_elems) 163 | return; 164 | 165 | out[idx] = __int2float_rn( (signed int) in[idx] ); 166 | } 167 | 168 | // Convert unsigned 8-bit integer to 32-bit floating-point using round-to-nearest-even mode. 169 | __global__ void convert_uint8_to_fp32( 170 | float * __restrict__ out, const unsigned char * __restrict__ in, long num_elems 171 | ) 172 | { 173 | long idx = threadIdx.x + blockIdx.x * blockDim.x; 174 | if (idx >= num_elems) 175 | return; 176 | 177 | out[idx] = __int2float_rn( (unsigned int) in[idx] ); 178 | } 179 | 180 | // Subtract channel means assuming NCHW layout.
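// (Indexing note for the kernels below: in NCHW layout, element idx belongs to
// channel (idx / HW) % C; with C == 3 this selects the R, G or B mean to subtract.)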
181 | __global__ void subtract_means(float * data, 182 | float R_mean, float G_mean, float B_mean, 183 | long HW, // H*W 184 | long num_elems // N*C*H*W 185 | ) 186 | { 187 | long idx = threadIdx.x + blockIdx.x * blockDim.x; 188 | if (idx >= num_elems) 189 | return; 190 | 191 | switch ( (idx / HW) % 3 ) 192 | { 193 | case 0: 194 | data[idx] -= R_mean; 195 | break; 196 | case 1: 197 | data[idx] -= G_mean; 198 | break; 199 | case 2: 200 | data[idx] -= B_mean; 201 | break; 202 | } 203 | } 204 | 205 | // Convert unsigned 8-bit integer to 32-bit floating-point using round-to-nearest-even mode, 206 | // and then subtract RGB channel means assuming NCHW layout. 207 | __global__ void convert_uint8_to_fp32_and_subtract_means( 208 | float * __restrict__ out, const unsigned char * __restrict__ in, 209 | float R_mean, float G_mean, float B_mean, 210 | long HW, // H*W 211 | long num_elems // N*C*H*W 212 | ) 213 | { 214 | long idx = threadIdx.x + blockIdx.x * blockDim.x; 215 | if (idx >= num_elems) 216 | return; 217 | 218 | // Convert. 219 | out[idx] = __int2float_rn( (unsigned int) in[idx] ); 220 | 221 | // Subtract means. 222 | switch ( (idx / HW) % 3 ) 223 | { 224 | case 0: 225 | out[idx] -= R_mean; 226 | break; 227 | case 1: 228 | out[idx] -= G_mean; 229 | break; 230 | case 2: 231 | out[idx] -= B_mean; 232 | break; 233 | } 234 | } 235 | """, cache_dir=False) 236 | 237 | if PREPROCESS_ON_GPU and MODEL_SUBTRACT_MEAN: 238 | conversion_kernel_name = 'convert_uint8_to_fp32_and_subtract_means' if CONVERSION_TYPE_SYMBOL == 'f' else None 239 | conversion_kernel = source_module.get_function(conversion_kernel_name) 240 | # subtract_means_kernel = source_module.get_function('subtract_means') 241 | # conversion_kernel_name = 'convert_uint8_to_fp32' if CONVERSION_TYPE_SYMBOL == 'f' else None 242 | # conversion_kernel = source_module.get_function(conversion_kernel_name) 243 | elif not PREPROCESS_ON_GPU: 244 | conversion_kernel_name = 'convert_int8_to_fp32' if CONVERSION_TYPE_SYMBOL == 'f' else None 245 | conversion_kernel = source_module.get_function(conversion_kernel_name) 246 | 247 | compilation_time_ms = (time.time() - compilation_start)*1000 248 | print("Compilation time of GPU kernel(s): {:.2f} ms".format(compilation_time_ms)) 249 | 250 | print("") 251 | print("[worker {}] Ready to run inference on batches up to {} samples".format(WORKER_ID, max_batch_size)) 252 | 253 | 254 | ## Main inference loop: 255 | # 256 | with trt_engine.create_execution_context() as trt_context: 257 | done_count = 0 258 | total_inference_time = 0 259 | while JOBS_LIMIT<1 or done_count < JOBS_LIMIT: 260 | 261 | wait_and_receive_start = time.time() 262 | 263 | try: 264 | if TRANSFER_MODE == 'dummy': 265 | job_data_raw = from_factory.recv() 266 | elif TRANSFER_MODE == 'raw': 267 | job_data_raw = memoryview( from_factory.recv() ) 268 | elif TRANSFER_MODE == 'json': 269 | job_data_struct = from_factory.recv_json() 270 | elif TRANSFER_MODE in ('pickle', 'numpy'): 271 | job_data_struct = from_factory.recv_pyobj() 272 | except zmq.error.Again as e: # ZeroMQ's timeout exception 273 | if done_count==0: 274 | print('.', end='', flush=True) 275 | continue 276 | else: 277 | print("Having done {} inference cycles, leaving after a timeout of {} seconds".format( 278 | done_count, WORKER_POSTWORK_TIMEOUT_S)) 279 | break 280 | 281 | # FIXME: floatize -> conversion? 
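# The receive-and-convert step below: 'dummy' mode merely unpacks (job_id, batch_size) and
# fakes the batch; 'raw' mode splits the ID_SIZE_IN_BYTES-byte job_id prefix from the payload.
# When CONVERSION_NEEDED, the 8-bit payload is copied into d_preconverted_input on the GPU and
# conversion_kernel expands it to float32 (and, when preprocessing on the GPU, subtracts channel means).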
282 | floatize_start = time.time() 283 | 284 | if TRANSFER_MODE == 'dummy': 285 | job_id, batch_size = struct.unpack('ii', job_data_raw) 286 | converted_batch = None 287 | else: 288 | if TRANSFER_MODE == 'raw': 289 | job_id = struct.unpack('= max_x: 322 | print("Error: Number of elements exceeds max dimension X: {} >= {}".format(num_elems, max_x)) 323 | pass 324 | # Copy input to the GPU. 325 | memcpy_htod_start = time.time() 326 | cuda.memcpy_htod_async(d_preconverted_input, batch_data, cuda_stream) 327 | memcpy_htod_time_ms = (time.time() - memcpy_htod_start )*1000 328 | # One thread per element. TODO: Number of threads can be tuned down e.g. halved. 329 | block_dim_x = int( max_block_dim_x / 1 ) 330 | grid_dim_x = int( (num_elems + block_dim_x - 1) / block_dim_x ) 331 | if PREPROCESS_ON_GPU: 332 | if MODEL_SUBTRACT_MEAN: 333 | (R_mean, G_mean, B_mean) = channel_means 334 | conversion_kernel(d_inputs[0], d_preconverted_input, 335 | R_mean, G_mean, B_mean, np.int64(MODEL_IMAGE_HEIGHT*MODEL_IMAGE_WIDTH), np.int64(num_elems), grid=(grid_dim_x,1,1), block=(block_dim_x,1,1)) 336 | # TODO: Implement other transforms e.g. normalization. 337 | else: 338 | conversion_kernel(d_inputs[0], d_preconverted_input, np.int64(num_elems), grid=(grid_dim_x,1,1), block=(block_dim_x,1,1)) 339 | 340 | 341 | if batch_size > max_batch_size: # basic protection. FIXME: could report to hub, could split and still do inference... 342 | print("[worker {}] unable to perform inference on {}-sample batch. Skipping it.".format(WORKER_ID, batch_size)) 343 | continue 344 | 345 | inference_start = time.time() 346 | 347 | if TRANSFER_MODE != 'dummy': 348 | trt_context.execute_async(bindings=model_bindings, batch_size=batch_size, stream_handle=cuda_stream.handle) 349 | for output in h_d_outputs: 350 | cuda.memcpy_dtoh_async(output['host_mem'], output['dev_mem'], cuda_stream) 351 | cuda_stream.synchronize() 352 | 353 | inference_time_ms = (time.time() - inference_start)*1000 + memcpy_htod_time_ms 354 | floatize_time_ms = (inference_start-floatize_start)*1000 - memcpy_htod_time_ms 355 | wait_and_receive_time_ms = (floatize_start-wait_and_receive_start)*1000 356 | 357 | if TRANSFER_MODE == 'dummy': # no inference - fake a batch 358 | merged_batch_predictions = [ 0 ] * output_volume * batch_size 359 | else: 360 | batch_results = h_output[:output_volume * batch_size].tolist() 361 | 362 | if WORKER_OUTPUT_FORMAT == 'direct_return': 363 | merged_batch_predictions = batch_results 364 | 365 | elif WORKER_OUTPUT_FORMAT == 'softmax': 366 | if output_volume == 1: # model returns argmax - fake the softmax by padding with 1000 zeros (1001 overall) 367 | merged_batch_predictions = [] 368 | for arg_max in batch_results: 369 | merged_batch_predictions.extend( [0]*(arg_max +1) + [1] + [0]*(1000-arg_max-1) ) 370 | else: # model returns softmax - just pass it on 371 | merged_batch_predictions = batch_results 372 | 373 | elif WORKER_OUTPUT_FORMAT == 'argmax': 374 | if output_volume == 1: # model returns argmax - just pass it on 375 | merged_batch_predictions = batch_results 376 | else: # model returns softmax - filter it to return argmax 377 | merged_batch_predictions = [] 378 | 379 | for j in range(batch_size): # walk through the batch and append individual argmaxen 380 | one_argmax = max(zip(batch_results[j*1001:(j+1)*1001], range(1001)))[1]-1 381 | merged_batch_predictions.append( one_argmax ) 382 | 383 | response = { 384 | 'job_id': job_id, 385 | 'worker_id': WORKER_ID, 386 | 'wait_and_receive_time_ms': wait_and_receive_time_ms, 387 | 
'floatize_time_ms': floatize_time_ms, 388 | 'inference_time_ms': inference_time_ms, 389 | 'raw_batch_results': merged_batch_predictions, 390 | } 391 | 392 | to_funnel.send_json(response) 393 | 394 | print("[worker {}] classified job_id={} [{}] in {:.2f} ms (after spending {:.2f} ms on waiting+receiving AND {:.2f} ms on type conversion)".format(WORKER_ID, job_id, batch_size, inference_time_ms, wait_and_receive_time_ms, floatize_time_ms)) 395 | total_inference_time += inference_time_ms 396 | 397 | done_count += 1 398 | 399 | print("[worker {}] Total inference time: {}s".format(WORKER_ID, total_inference_time)) 400 | 401 | pycuda_context.pop() 402 | -------------------------------------------------------------------------------- /program/object-detection-zpp-hub-loadgen-py/zpp_hub_detect_loadgen.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import array 4 | import json 5 | import os 6 | import struct 7 | import sys 8 | import threading 9 | import time 10 | 11 | import numpy as np 12 | import zmq 13 | import mlperf_loadgen as lg 14 | 15 | 16 | ########################################################################################################### 17 | ## NB: if you run into "zmq.error.ZMQError: Address already in use" after a crash, 18 | ## run 19 | ## kill `netstat -ltnp 2>/dev/null | grep python | grep 5557 | awk '{print $7}' | sed 's/\/.*//'` 20 | ## to stop the socket-hogging process. 21 | ########################################################################################################### 22 | 23 | 24 | ## ZMQ ports: 25 | # 26 | ZMQ_FAN_PORT = os.getenv('CK_ZMQ_FAN_PORT', 5557) 27 | ZMQ_FUNNEL_PORT = os.getenv('CK_ZMQ_FUNNEL_PORT', 5558) 28 | 29 | ## LoadGen test properties: 30 | # 31 | LOADGEN_SCENARIO = os.getenv('CK_LOADGEN_SCENARIO', 'SingleStream') 32 | LOADGEN_MODE = os.getenv('CK_LOADGEN_MODE', 'AccuracyOnly') 33 | LOADGEN_BUFFER_SIZE = int(os.getenv('CK_LOADGEN_BUFFER_SIZE')) # set to how many samples are you prepared to keep in memory at once 34 | LOADGEN_DATASET_SIZE = int(os.getenv('CK_LOADGEN_DATASET_SIZE')) # set to how many total samples to choose from (0 = full set) 35 | LOADGEN_CONFIG_FILE = os.getenv('CK_ENV_LOADGEN_CONFIG_FILE', '') # Very Important: make sure 'pass_env_to_resolve' is on 36 | LOADGEN_MULTISTREAMNESS = os.getenv('CK_LOADGEN_MULTISTREAMNESS', '') # if not set, use value from LoadGen's config file, or LoadGen code 37 | LOADGEN_MAX_DURATION_S = os.getenv('CK_LOADGEN_MAX_DURATION_S', '') # if not set, use value from LoadGen's config file, or LoadGen code 38 | LOADGEN_COUNT_OVERRIDE = os.getenv('CK_LOADGEN_COUNT_OVERRIDE', '') 39 | LOADGEN_TARGET_QPS = os.getenv('CK_LOADGEN_TARGET_QPS', '') # Maps to differently named internal config options, depending on scenario - see below. 
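# (Concretely, as in the classification hub above: the value feeds ts.multi_stream_target_qps,
# ts.server_target_qps and ts.offline_expected_qps inside benchmark_using_loadgen.)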
40 | LOADGEN_WARMUP_SAMPLES = int(os.getenv('CK_LOADGEN_WARMUP_SAMPLES', '0')) 41 | BATCH_SIZE = int(os.getenv('CK_BATCH_SIZE', '1')) 42 | SIDELOAD_JSON = os.getenv('CK_LOADGEN_SIDELOAD_JSON','') 43 | 44 | ## Model properties: 45 | # 46 | MODEL_PATH = os.environ['CK_ENV_TENSORRT_MODEL_FILENAME'] 47 | MODEL_DATA_LAYOUT = os.getenv('ML_MODEL_DATA_LAYOUT', 'NCHW') 48 | LABELS_PATH = os.getenv('CK_ENV_TENSORRT_MODEL_FLATLABELS_FILE') or os.environ['ML_MODEL_CLASS_LABELS'] 49 | MODEL_COLOURS_BGR = os.getenv('ML_MODEL_COLOUR_CHANNELS_BGR', 'NO') in ('YES', 'yes', 'ON', 'on', '1') 50 | MODEL_INPUT_DATA_TYPE = os.getenv('ML_MODEL_INPUT_DATA_TYPE', 'float32') 51 | MODEL_DATA_TYPE = os.getenv('ML_MODEL_DATA_TYPE', '(unknown)') 52 | MODEL_MAX_PREDICTIONS = int(os.getenv('ML_MODEL_MAX_PREDICTIONS', 100)) 53 | MODEL_IMAGE_HEIGHT = int(os.getenv('ML_MODEL_IMAGE_HEIGHT', 54 | os.getenv('CK_ENV_ONNX_MODEL_IMAGE_HEIGHT', 55 | os.getenv('CK_ENV_TENSORFLOW_MODEL_IMAGE_HEIGHT', 56 | '')))) 57 | MODEL_IMAGE_WIDTH = int(os.getenv('ML_MODEL_IMAGE_WIDTH', 58 | os.getenv('CK_ENV_ONNX_MODEL_IMAGE_WIDTH', 59 | os.getenv('CK_ENV_TENSORFLOW_MODEL_IMAGE_WIDTH', 60 | '')))) 61 | MODEL_IMAGE_CHANNELS = 3 62 | 63 | MODEL_SKIPPED_CLASSES = os.getenv("ML_MODEL_SKIPS_ORIGINAL_DATASET_CLASSES", None) 64 | if (MODEL_SKIPPED_CLASSES): 65 | SKIPPED_CLASSES = [int(x) for x in MODEL_SKIPPED_CLASSES.split(",")] 66 | else: 67 | SKIPPED_CLASSES = None 68 | 69 | 70 | ## Data transfer (numpy floats by default): 71 | # 72 | TRANSFER_MODE = os.getenv('CK_TRANSFER_MODE', 'numpy') 73 | TRANSFER_FLOAT = (os.getenv('CK_TRANSFER_FLOAT', 'YES') in ('YES', 'yes', 'ON', 'on', '1')) and (MODEL_INPUT_DATA_TYPE == 'float32') 74 | TRANSFER_TYPE_NP, TRANSFER_TYPE_SYMBOL = (np.float32, 'f') if TRANSFER_FLOAT else (np.int8, 'b') 75 | 76 | ## Image normalization: 77 | # 78 | PREPROCESS_ON_GPU = not TRANSFER_FLOAT and os.getenv('CK_PREPROCESS_ON_GPU', 'NO') in ('YES', 'yes', 'ON', 'on', '1') 79 | 80 | MODEL_NORMALIZE_DATA = os.getenv('ML_MODEL_NORMALIZE_DATA') in ('YES', 'yes', 'ON', 'on', '1') 81 | MODEL_NORMALIZE_LOWER = float(os.getenv('ML_MODEL_NORMALIZE_LOWER', -1.0)) 82 | MODEL_NORMALIZE_UPPER = float(os.getenv('ML_MODEL_NORMALIZE_UPPER', 1.0)) 83 | 84 | SUBTRACT_MEAN = os.getenv('ML_MODEL_SUBTRACT_MEAN', 'YES') in ('YES', 'yes', 'ON', 'on', '1') 85 | GIVEN_CHANNEL_MEANS = os.getenv('ML_MODEL_GIVEN_CHANNEL_MEANS', '') 86 | if GIVEN_CHANNEL_MEANS: 87 | GIVEN_CHANNEL_MEANS = np.fromstring(GIVEN_CHANNEL_MEANS, dtype=np.float32, sep=' ') 88 | if MODEL_COLOURS_BGR: 89 | GIVEN_CHANNEL_MEANS = GIVEN_CHANNEL_MEANS[::-1] # swapping Red and Blue colour channels 90 | 91 | GIVEN_CHANNEL_STDS = os.getenv('ML_MODEL_GIVEN_CHANNEL_STDS', '') 92 | if GIVEN_CHANNEL_STDS: 93 | GIVEN_CHANNEL_STDS = np.fromstring(GIVEN_CHANNEL_STDS, dtype=np.float32, sep=' ') 94 | if MODEL_COLOURS_BGR: 95 | GIVEN_CHANNEL_STDS = GIVEN_CHANNEL_STDS[::-1] # swapping Red and Blue colour channels 96 | 97 | 98 | ## Input image properties: 99 | # 100 | IMAGE_DIR = os.getenv('CK_ENV_DATASET_OBJ_DETECTION_PREPROCESSED_DIR') 101 | IMAGE_LIST_FILE_NAME = os.getenv('CK_ENV_DATASET_OBJ_DETECTION_PREPROCESSED_SUBSET_FOF') 102 | IMAGE_LIST_FILE = os.path.join(IMAGE_DIR, IMAGE_LIST_FILE_NAME) 103 | IMAGE_DATA_TYPE = os.getenv('CK_ENV_DATASET_OBJ_DETECTION_PREPROCESSED_DATA_TYPE', 'uint8') 104 | 105 | ## Misc 106 | # 107 | VERBOSITY_LEVEL = int(os.getenv('CK_VERBOSE', '0')) 108 | 109 | 110 | ## ZeroMQ communication setup: 111 | # 112 | zmq_context = zmq.Context() 113 | 114 | to_workers = 
115 | to_workers.bind("tcp://*:{}".format(ZMQ_FAN_PORT))
116 | 
117 | from_workers = zmq_context.socket(zmq.PULL)
118 | from_workers.bind("tcp://*:{}".format(ZMQ_FUNNEL_PORT))
119 | from_workers.RCVTIMEO = 2000
120 | 
121 | 
122 | # Load preprocessed image filepaths:
123 | image_path_list = []
124 | original_w_h = []
125 | with open(IMAGE_LIST_FILE, 'r') as f:
126 |     for line in f:
127 |         file_name, width, height = line.strip().split(";")
128 |         image_path_list.append( os.path.join(IMAGE_DIR, file_name) )
129 |         original_w_h.append( (int(width), int(height)) )
130 | 
131 | LOADGEN_DATASET_SIZE = LOADGEN_DATASET_SIZE or len(image_path_list)
132 | 
133 | 
134 | def load_labels(labels_filepath):
135 |     my_labels = []
136 |     with open(labels_filepath, 'r') as input_file:
137 |         for l in input_file:
138 |             my_labels.append(l.strip())
139 |     return my_labels
140 | 
141 | 
142 | def tick(letter, quantity=1):
143 |     print(letter + (str(quantity) if quantity>1 else ''), end='')
144 | 
145 | 
146 | # Currently loaded preprocessed images are stored in a dictionary:
147 | preprocessed_image_buffer = {}
148 | 
149 | labels = load_labels(LABELS_PATH)
150 | bg_class_offset = 1
151 | class_map = None
152 | if SKIPPED_CLASSES:
153 |     class_map = []
154 |     for i in range(len(labels) + bg_class_offset):
155 |         if i not in SKIPPED_CLASSES:
156 |             class_map.append(i)
157 | 
158 | 
159 | def load_query_samples(sample_indices):     # 0-based indices in our whole dataset
160 |     global MODEL_IMAGE_HEIGHT, MODEL_IMAGE_WIDTH, MODEL_IMAGE_CHANNELS
161 |     global preprocessed_image_buffer
162 | 
163 |     print("load_query_samples({})".format(sample_indices))
164 | 
165 |     tick('B', len(sample_indices))
166 | 
167 |     for sample_index in sample_indices:
168 |         img_filepath = image_path_list[sample_index]
169 |         img = np.fromfile(img_filepath, np.dtype(IMAGE_DATA_TYPE))
170 |         img = img.reshape((MODEL_IMAGE_HEIGHT, MODEL_IMAGE_WIDTH, MODEL_IMAGE_CHANNELS))
171 | 
172 |         if PREPROCESS_ON_GPU:
173 |             nhwc_img = img if MODEL_DATA_LAYOUT == 'NHWC' else img.transpose(2,0,1)
174 |             preprocessed_image_buffer[sample_index] = np.array(nhwc_img).ravel()   # transfer bytes as unsigned
175 |         else:
176 |             if MODEL_COLOURS_BGR:
177 |                 img = img[...,::-1]     # swapping Red and Blue colour channels
178 | 
179 |             if IMAGE_DATA_TYPE != 'float32':
180 |                 img = img.astype(np.float32)
181 | 
182 |                 # Normalize
183 |                 if MODEL_NORMALIZE_DATA:
184 |                     img = img*(MODEL_NORMALIZE_UPPER-MODEL_NORMALIZE_LOWER)/255.0+MODEL_NORMALIZE_LOWER
185 | 
186 |                 # Subtract mean value
187 |                 if SUBTRACT_MEAN:
188 |                     if len(GIVEN_CHANNEL_MEANS):
189 |                         img -= GIVEN_CHANNEL_MEANS
190 |                     else:
191 |                         img -= np.mean(img, axis=(0,1), keepdims=True)
192 | 
193 |                 if len(GIVEN_CHANNEL_STDS):
194 |                     img /= GIVEN_CHANNEL_STDS
195 | 
196 |             if MODEL_INPUT_DATA_TYPE == 'int8' or TRANSFER_TYPE_NP == np.int8:
197 |                 img = np.clip(img, -128, 127)
198 | 
199 |             nhwc_img = img if MODEL_DATA_LAYOUT == 'NHWC' else img.transpose(2,0,1)
200 |             preprocessed_image_buffer[sample_index] = np.array(nhwc_img).ravel().astype(TRANSFER_TYPE_NP)   # transfer bytes as signed
201 | 
202 |     tick('l')
203 |     print('')
204 | 
205 | 
206 | def unload_query_samples(sample_indices):
207 |     #print("unload_query_samples({})".format(sample_indices))
208 |     global preprocessed_image_buffer
209 | 
210 |     preprocessed_image_buffer = {}
211 |     tick('U')
212 |     print('')
213 | 
214 | 
215 | openme_data = {}    # side-loaded stats
216 | in_progress = {}    # local store of metadata about batches between issue_queries and send_responses
217 | funnel_should_be_running = True     # a way for the fan to signal to the funnel_thread to end
218 | warmup_mode = False     # while on, QuerySampleResponses will not be sent to LoadGen
219 | 
220 | def issue_queries(query_samples):
221 | 
222 |     global BATCH_SIZE
223 | 
224 |     if VERBOSITY_LEVEL:
225 |         printable_query = [(qs.index, qs.id) for qs in query_samples]
226 |         print("issue_queries( {} )".format(printable_query))
227 |     tick('Q', len(query_samples))
228 | 
229 |     for j in range(0, len(query_samples), BATCH_SIZE):
230 |         batch = query_samples[j:j+BATCH_SIZE]   # NB: the last one may be shorter than BATCH_SIZE in length
231 |         batch_vector_numpy = np.ravel([ preprocessed_image_buffer[qs.index] for qs in batch ])
232 | 
233 |         job_id = batch[0].id    # assume it is both sufficiently unique and sufficiently small to fit our needs
234 | 
235 |         in_progress[job_id] = {
236 |             'submission_time': time.time(),
237 |             'batch': batch,
238 |         }
239 | 
240 |         if TRANSFER_MODE == 'numpy':
241 |             job_data_struct = {
242 |                 'job_id': job_id,
243 |                 'batch_data': batch_vector_numpy,
244 |             }
245 |             to_workers.send_pyobj(job_data_struct)
246 |         elif TRANSFER_MODE == 'pickle':
247 |             job_data_struct = {
248 |                 'job_id': job_id,
249 |                 'batch_data': np.asarray(batch_vector_numpy),
250 |             }
251 |             to_workers.send_pyobj(job_data_struct)
252 |         elif TRANSFER_MODE == 'raw':
253 |             ## Slower, but insensitive to endianness:
254 |             # batch_vector_list = batch_vector_numpy.tolist()
255 |             # job_data_raw = struct.pack('<{}{}'.format(len(batch_vector_list), TRANSFER_TYPE_SYMBOL), *batch_vector_list)
256 |             ##
257 |             ## NB: the original body of this branch was swallowed by text extraction
258 |             ##     (everything between the '<' above and the '->' below was eaten as an HTML tag);
259 |             ##     the lines that follow are a plausible reconstruction, not the verbatim original.
260 |             ##
261 |             ## Faster, but sensitive to endianness:
262 |             job_data_raw = batch_vector_numpy.astype(TRANSFER_TYPE_NP).tobytes()
263 | 
264 |             ## Send the job_id as a separate header frame, followed by the raw payload:
265 |             to_workers.send_string(str(job_id), zmq.SNDMORE)
266 |             to_workers.send(job_data_raw)
267 | 
268 |         if VERBOSITY_LEVEL:
269 |             print("[fan] -> job_id={} {}".format(job_id, [qs.index for qs in batch]))
270 | 
271 | 
272 | def send_responses():
273 | 
274 |     global funnel_should_be_running, warmup_mode, openme_data
275 | 
276 |     funnel_start = time.time()
277 | 
278 |     received_job_timings = openme_data['received_job_timings'] = []
279 |     inference_times_ms_by_worker_id = {}
280 | 
281 |     while funnel_should_be_running:
282 | 
283 |         try:
284 |             done_job = from_workers.recv_json()
285 |         except Exception as e:
286 |             continue    # go back and check if the funnel_should_be_running condition has been turned off by the main thread
287 | 
288 |         job_id = done_job['job_id']
289 |         local_metadata = in_progress.pop(job_id)
290 |         received_timestamp = time.time()
291 |         roundtrip_time_ms = (received_timestamp-local_metadata['submission_time'])*1000
292 |         worker_id = done_job['worker_id']
293 |         inference_time_ms = done_job['inference_time_ms']
294 |         floatize_time_ms = done_job['floatize_time_ms']
295 | 
296 |         print("[funnel] <- [worker {}] job_id={}, worker_type_conversion={:.2f} ms, inference={:.2f} ms, roundtrip={:.2f} ms".format(
297 |             worker_id, job_id, floatize_time_ms, inference_time_ms, roundtrip_time_ms))
298 | 
299 |         received_job_timings.append({
300 |             'job_id': job_id,
301 |             'worker_id': worker_id,
302 |             'received_timestamp': received_timestamp,
303 |             'worker_floatize_time_ms': floatize_time_ms,
304 |             'inference_time_ms': inference_time_ms,
305 |             'roundtrip_time_ms': roundtrip_time_ms,
306 |         })
307 | 
308 |         if warmup_mode:
309 |             continue
310 | 
311 |         if worker_id not in inference_times_ms_by_worker_id:
312 |             inference_times_ms_by_worker_id[worker_id] = []
313 |         inference_times_ms_by_worker_id[worker_id].append( inference_time_ms )
314 | 
315 |         batch = local_metadata['batch']
316 |         batch_size = len(batch)
317 |         raw_batch_results = np.array(done_job['raw_batch_results'], dtype=np.float32)
318 |         batch_results = np.split(raw_batch_results, batch_size)
319 | 
320 |         response = []
321 |         response_array_refs = []    # This is needed to guarantee that the individual buffers to which we keep extra-Pythonian references do not get garbage-collected.
322 |         for qs, all_boxes_for_this_sample in zip(batch, batch_results):
323 | 
324 |             num_active_boxes_for_this_sample = all_boxes_for_this_sample[MODEL_MAX_PREDICTIONS*7].view('int32')
325 |             global_image_index = qs.index
326 |             width_orig, height_orig = original_w_h[global_image_index]
327 |             reformed_active_boxes_for_this_sample = []
328 |             for i in range(num_active_boxes_for_this_sample):
329 |                 (image_id, ymin, xmin, ymax, xmax, confidence_score, class_number) = all_boxes_for_this_sample[i*7:(i+1)*7]
330 | 
331 |                 if class_map:
332 |                     class_number = float(class_map[int(class_number)])
333 | 
334 |                 reformed_active_boxes_for_this_sample += [
335 |                     float(global_image_index), ymin, xmin, ymax, xmax, confidence_score, class_number ]
336 | 
337 |             response_array = array.array("B", np.array(reformed_active_boxes_for_this_sample, np.float32).tobytes())
338 |             response_array_refs.append(response_array)
339 |             bi = response_array.buffer_info()
340 |             response.append(lg.QuerySampleResponse(qs.id, bi[0], bi[1]))
341 |         lg.QuerySamplesComplete(response)
342 |         tick('R', len(response))
343 |         sys.stdout.flush()
344 |     print("[funnel] quitting")
345 | 
346 | 
347 | def flush_queries():
348 |     pass
349 | 
350 | 
351 | def process_latencies(latencies_ns):
352 | 
353 |     global openme_data
354 | 
355 |     latencies_ms = openme_data['loadgen_measured_latencies_ms'] = [ns/1.0e6 for ns in latencies_ns]
356 |     print("LG called process_latencies({})".format(latencies_ms))
357 | 
358 |     latencies_size = len(latencies_ms)
359 |     latencies_avg = sum(latencies_ms)/latencies_size
360 |     latencies_sorted = sorted(latencies_ms)
361 |     latencies_p50 = int(latencies_size * 0.5)
362 |     latencies_p90 = int(latencies_size * 0.9)
363 |     latencies_p99 = int(latencies_size * 0.99)
364 | 
365 |     print("--------------------------------------------------------------------")
366 |     print("|                LATENCIES (in milliseconds and fps)               |")
367 |     print("--------------------------------------------------------------------")
368 |     print("Number of samples run:       {:9d}".format(latencies_size))
369 |     print("Min latency:                 {:9.2f} ms   ({:.3f} fps)".format(latencies_sorted[0], 1e3/latencies_sorted[0]))
370 |     print("Median latency:              {:9.2f} ms   ({:.3f} fps)".format(latencies_sorted[latencies_p50], 1e3/latencies_sorted[latencies_p50]))
371 |     print("Average latency:             {:9.2f} ms   ({:.3f} fps)".format(latencies_avg, 1e3/latencies_avg))
372 |     print("90 percentile latency:       {:9.2f} ms   ({:.3f} fps)".format(latencies_sorted[latencies_p90], 1e3/latencies_sorted[latencies_p90]))
373 |     print("99 percentile latency:       {:9.2f} ms   ({:.3f} fps)".format(latencies_sorted[latencies_p99], 1e3/latencies_sorted[latencies_p99]))
374 |     print("Max latency:                 {:9.2f} ms   ({:.3f} fps)".format(latencies_sorted[-1], 1e3/latencies_sorted[-1]))
375 |     print("--------------------------------------------------------------------")
376 | 
377 | 
378 | def benchmark_using_loadgen():
379 |     "Perform the benchmark using the Python API for the LoadGen library"
380 | 
381 |     global funnel_should_be_running, warmup_mode, openme_data
382 | 
383 |     scenario = {
384 |         'SingleStream':     lg.TestScenario.SingleStream,
385 |         'MultiStream':      lg.TestScenario.MultiStream,
386 |         'Server':           lg.TestScenario.Server,
387 |         'Offline':          lg.TestScenario.Offline,
388 |     }[LOADGEN_SCENARIO]
389 | 
390 |     mode = {
391 |         'AccuracyOnly':     lg.TestMode.AccuracyOnly,
392 |         'PerformanceOnly':  lg.TestMode.PerformanceOnly,
393 |         'SubmissionRun':    lg.TestMode.SubmissionRun,
394 |     }[LOADGEN_MODE]
395 | 
396 |     ts = lg.TestSettings()
397 |     if LOADGEN_CONFIG_FILE:
398 |         ts.FromConfig(LOADGEN_CONFIG_FILE, 'random_model_name', LOADGEN_SCENARIO)
399 |     ts.scenario = scenario
400 |     ts.mode = mode
401 | 
402 |     if LOADGEN_MULTISTREAMNESS:
403 |         ts.multi_stream_samples_per_query = int(LOADGEN_MULTISTREAMNESS)
404 | 
405 |     if LOADGEN_MAX_DURATION_S:
406 |         ts.max_duration_ms = int(LOADGEN_MAX_DURATION_S)*1000
407 | 
408 |     if LOADGEN_COUNT_OVERRIDE:
409 |         ts.min_query_count = int(LOADGEN_COUNT_OVERRIDE)
410 |         ts.max_query_count = int(LOADGEN_COUNT_OVERRIDE)
411 | 
412 |     if LOADGEN_TARGET_QPS:
413 |         target_qps = float(LOADGEN_TARGET_QPS)
414 |         ts.multi_stream_target_qps = target_qps
415 |         ts.server_target_qps = target_qps
416 |         ts.offline_expected_qps = target_qps
417 | 
418 |     sut = lg.ConstructSUT(issue_queries, flush_queries, process_latencies)
419 |     qsl = lg.ConstructQSL(LOADGEN_DATASET_SIZE, LOADGEN_BUFFER_SIZE, load_query_samples, unload_query_samples)
420 | 
421 |     log_settings = lg.LogSettings()
422 |     log_settings.enable_trace = False
423 | 
424 |     funnel_thread = threading.Thread(target=send_responses, args=())
425 |     funnel_should_be_running = True
426 |     funnel_thread.start()
427 | 
428 |     if LOADGEN_WARMUP_SAMPLES:
429 |         warmup_id_range = list(range(LOADGEN_WARMUP_SAMPLES))
430 |         load_query_samples(warmup_id_range)
431 | 
432 |         warmup_mode = True
433 |         print("Sending out the warm-up samples, waiting for responses...")
434 |         issue_queries([lg.QuerySample(id, id) for id in warmup_id_range])
435 | 
436 |         while len(in_progress) > 0:     # waiting for the in_progress queue to clear up
437 |             time.sleep(1)
438 |         print(" Done!")
439 | 
440 |         warmup_mode = False
441 | 
442 |     lg.StartTestWithLogSettings(sut, qsl, ts, log_settings)
443 | 
444 |     funnel_should_be_running = False    # politely ask the funnel_thread to end
445 |     funnel_thread.join()                # wait for it to actually end
446 | 
447 |     from_workers.close()
448 |     to_workers.close()
449 | 
450 |     lg.DestroyQSL(qsl)
451 |     lg.DestroySUT(sut)
452 | 
453 |     if SIDELOAD_JSON:
454 |         with open(SIDELOAD_JSON, 'w') as sideload_fd:
455 |             json.dump(openme_data, sideload_fd, indent=4, sort_keys=True)
456 | 
457 | 
458 | benchmark_using_loadgen()
459 | 
--------------------------------------------------------------------------------
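The hub above binds a PUSH socket (the fan) and a PULL socket (the funnel); workers connect the mirror-image pair and answer with a JSON object carrying exactly the fields that send_responses() reads: 'job_id', 'worker_id', 'floatize_time_ms', 'inference_time_ms' and 'raw_batch_results' (the same fields the worker tail at the top of this listing sends). The following is a minimal sketch of a compatible stand-in worker, for protocol illustration only: it assumes the default TRANSFER_MODE='numpy', fabricates all-zero "detections" instead of running TensorRT, and the CK_ZMQ_HUB_IP variable and the 300x300 input default are invented for this sketch. The real worker lives in program/zpp-worker-tensorrt-py/zpp_worker_trt.py.

#!/usr/bin/env python3
# A stand-in ZPP worker sketch: speaks the hub's fan/funnel protocol but fakes inference.

import os
import time

import numpy as np
import zmq

WORKER_ID = os.getpid()
HUB_IP = os.getenv('CK_ZMQ_HUB_IP', 'localhost')    # hypothetical variable, for this sketch only
ZMQ_FAN_PORT = os.getenv('CK_ZMQ_FAN_PORT', 5557)
ZMQ_FUNNEL_PORT = os.getenv('CK_ZMQ_FUNNEL_PORT', 5558)
MODEL_MAX_PREDICTIONS = int(os.getenv('ML_MODEL_MAX_PREDICTIONS', 100))
MODEL_IMAGE_HEIGHT = int(os.getenv('ML_MODEL_IMAGE_HEIGHT', 300))   # assumed default, for this sketch only
MODEL_IMAGE_WIDTH = int(os.getenv('ML_MODEL_IMAGE_WIDTH', 300))     # assumed default, for this sketch only

zmq_context = zmq.Context()

from_fan = zmq_context.socket(zmq.PULL)     # the hub's PUSH/bind pairs with the worker's PULL/connect
from_fan.connect("tcp://{}:{}".format(HUB_IP, ZMQ_FAN_PORT))

to_funnel = zmq_context.socket(zmq.PUSH)
to_funnel.connect("tcp://{}:{}".format(HUB_IP, ZMQ_FUNNEL_PORT))

while True:
    job = from_fan.recv_pyobj()             # matches to_workers.send_pyobj() on the hub side

    ts = time.time()
    batch_data = np.asarray(job['batch_data'], dtype=np.float32)
    floatize_time_ms = (time.time() - ts) * 1000

    ts = time.time()
    # A real worker would run TensorRT inference here. We fabricate results in the
    # layout send_responses() slices up: per sample, MODEL_MAX_PREDICTIONS*7 box floats
    # followed by one float whose int32 bit pattern is the number of active boxes (0 here).
    batch_size = batch_data.size // (MODEL_IMAGE_HEIGHT * MODEL_IMAGE_WIDTH * 3)
    per_sample_results = np.zeros(MODEL_MAX_PREDICTIONS*7 + 1, dtype=np.float32)
    merged_batch_predictions = np.tile(per_sample_results, batch_size)
    inference_time_ms = (time.time() - ts) * 1000

    to_funnel.send_json({                   # exactly the fields the funnel thread reads
        'job_id': job['job_id'],
        'worker_id': WORKER_ID,
        'floatize_time_ms': floatize_time_ms,
        'inference_time_ms': inference_time_ms,
        'raw_batch_results': merged_batch_predictions.tolist(),
    })

Note the direction of bind/connect: the hub binds both ports, so any number of such workers can connect, and ZeroMQ's PUSH socket will round-robin batches among them; that load-balancing is what lets the fan/funnel arrangement scale across multiple GPUs.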