├── CMakeLists.txt ├── Dockerfile ├── LICENSE ├── README.md ├── README_cn.md ├── README_en.md ├── docs ├── model_configuration.md └── zh_CN │ ├── model_configuration.md │ └── model_repository.md ├── examples ├── fetch_models.sh ├── infer_ernie.py ├── infer_resnet50_v1.5.py ├── infer_resnet50_v1.5.sh ├── models │ ├── ERNIE │ │ └── config.pbtxt │ └── ResNet50-v1.5 │ │ └── config.pbtxt ├── perf_ernie.sh └── perf_resnet50_v1.5.sh ├── paddle-lib ├── Dockerfile └── build_paddle.sh ├── scripts ├── build_paddle_backend.sh └── launch_triton_server.sh └── src ├── libtriton_paddle.ldscript ├── paddle.cc ├── paddle_backend_utils.cc └── paddle_backend_utils.h /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | cmake_minimum_required(VERSION 3.17) 28 | 29 | project(trironpaddlebackend LANGUAGES C CXX) 30 | 31 | set(PADDLE_INFERENCE_DIR "" CACHE PATH "Paths to Paddle Inference Directory. Multiple paths may be specified by sparating them with a semicolon.") 32 | set(PADDLE_INCLUDE_PATHS "${PADDLE_INFERENCE_DIR}/include" 33 | CACHE PATH "Paths to Paddle Inference includes. Multiple paths may be specified by sparating them with a semicolon.") 34 | set(PADDLE_LIB_PATHS "${PADDLE_INFERENCE_DIR}/lib" 35 | CACHE PATH "Paths to Paddle Inference libraries. Multiple paths may be specified by sparating them with a semicolon.") 36 | set(PADDLE_LIB_NAME "paddle_inference") 37 | 38 | set(TRITON_COMMON_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/common repo") 39 | set(TRITON_CORE_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/core repo") 40 | set(TRITON_BACKEND_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/backend repo") 41 | 42 | include(FetchContent) 43 | 44 | FetchContent_Declare( 45 | repo-common 46 | GIT_REPOSITORY https://github.com/triton-inference-server/common.git 47 | GIT_TAG ${TRITON_COMMON_REPO_TAG} 48 | GIT_SHALLOW ON 49 | ) 50 | FetchContent_Declare( 51 | repo-core 52 | GIT_REPOSITORY https://github.com/triton-inference-server/core.git 53 | GIT_TAG ${TRITON_CORE_REPO_TAG} 54 | GIT_SHALLOW ON 55 | ) 56 | FetchContent_Declare( 57 | repo-backend 58 | GIT_REPOSITORY https://github.com/triton-inference-server/backend.git 59 | GIT_TAG ${TRITON_BACKEND_REPO_TAG} 60 | GIT_SHALLOW ON 61 | ) 62 | FetchContent_MakeAvailable(repo-common repo-core repo-backend) 63 | 64 | configure_file(src/libtriton_paddle.ldscript libtriton_paddle.ldscript COPYONLY) 65 | 66 | add_library( 67 | triton-paddle-backend SHARED 68 | src/paddle.cc 69 | src/paddle_backend_utils.cc 70 | ) 71 | 72 | target_include_directories( 73 | triton-paddle-backend 74 | PRIVATE 75 | ${CMAKE_CURRENT_SOURCE_DIR}/src 76 | ) 77 | 78 | target_include_directories( 79 | triton-paddle-backend 80 | PRIVATE ${PADDLE_INCLUDE_PATHS} 81 | ) 82 | 83 | target_link_libraries( 84 | triton-paddle-backend 85 | PRIVATE "-L${PADDLE_LIB_PATHS} -l${PADDLE_LIB_NAME}" 86 | ) 87 | 88 | target_compile_features(triton-paddle-backend PRIVATE cxx_std_11) 89 | target_compile_options( 90 | triton-paddle-backend PRIVATE 91 | $<$,$,$>: 92 | -Wall -Wextra -Wno-unused-parameter -Wno-type-limits -Werror> 93 | ) 94 | 95 | set_target_properties( 96 | triton-paddle-backend PROPERTIES 97 | POSITION_INDEPENDENT_CODE ON 98 | OUTPUT_NAME triton_paddle 99 | SKIP_BUILD_RPATH TRUE 100 | LINK_DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/libtriton_paddle.ldscript 101 | LINK_FLAGS "-Wl,--version-script libtriton_paddle.ldscript" 102 | ) 103 | 104 | target_link_libraries( 105 | triton-paddle-backend 106 | PRIVATE 107 | triton-backend-utils # from repo-backend 108 | triton-core-serverstub # from repo-core 109 | ) 110 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | FROM nvcr.io/nvidia/tritonserver:21.10-py3 as full 28 | FROM nvcr.io/nvidia/tritonserver:21.10-py3-min 29 | 30 | ENV DEBIAN_FRONTEND=noninteractive 31 | ENV DCGM_VERSION=2.2.9 32 | RUN apt update && apt install -y --no-install-recommends software-properties-common \ 33 | && wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-ubuntu2004.pin \ 34 | && mkdir -p /etc/apt/preferences.d/cuda-repository-pin-600 \ 35 | && mv cuda-ubuntu2004.pin /etc/apt/preferences.d/cuda-repository-pin-600/ \ 36 | && apt-key del 7fa2af80 \ 37 | && wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.0-1_all.deb \ 38 | && dpkg -i cuda-keyring_1.0-1_all.deb \ 39 | && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub \ 40 | && add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/ /" \ 41 | && apt-get update && apt-get install -y --no-install-recommends datacenter-gpu-manager=1:2.2.9 42 | 43 | RUN apt update \ 44 | && apt install -y --no-install-recommends libre2-5 libb64-0d python3 python3-pip libarchive-dev \ 45 | && python3 -m pip install -U pip \ 46 | && python3 -m pip install paddlepaddle-gpu paddlenlp faster_tokenizer 47 | 48 | COPY --from=full /opt/tritonserver/bin /opt/tritonserver/bin 49 | COPY --from=full /opt/tritonserver/lib /opt/tritonserver/lib 50 | COPY --from=full /opt/tritonserver/include /opt/tritonserver/include 51 | COPY --from=full /opt/tritonserver/backends/python /opt/tritonserver/backends/python 52 | COPY --from=full /opt/tritonserver/backends/onnxruntime /opt/tritonserver/backends/onnxruntime 53 | 54 | COPY paddle-lib/paddle/lib paddle-lib/onnxruntime/lib paddle-lib/paddle2onnx/lib paddle-lib/mkldnn/lib paddle-lib/mklml/lib /opt/paddle/ 55 | COPY build/libtriton_paddle.so /opt/tritonserver/backends/paddle/ 56 | 57 | ENV LD_LIBRARY_PATH="/opt/paddle/:$LD_LIBRARY_PATH" 58 | ENV PATH="/opt/tritonserver/bin:$PATH" 59 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 2 | 3 | Redistribution and use in source and binary forms, with or without 4 | modification, are permitted provided that the following conditions 5 | are met: 6 | * Redistributions of source code must retain the above copyright 7 | notice, this list of conditions and the following disclaimer. 8 | * Redistributions in binary form must reproduce the above copyright 9 | notice, this list of conditions and the following disclaimer in the 10 | documentation and/or other materials provided with the distribution. 11 | * Neither the name of NVIDIA CORPORATION nor the names of its 12 | contributors may be used to endorse or promote products derived 13 | from this software without specific prior written permission. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | README_en.md -------------------------------------------------------------------------------- /README_cn.md: -------------------------------------------------------------------------------- 1 | 28 | 简体中文 | [English](README_en.md) 29 | 30 | # Triton Paddle Backend 31 | 32 | ## Table of Contents 33 | 34 | - [快速开始](#快速开始) 35 | - [拉取镜像](#拉取镜像) 36 | - [创建模型仓库](#创建模型仓库) 37 | - [启动服务](#启动服务) 38 | - [验证Triton服务](#验证Triton服务是否正常) 39 | - [示例](#运行示例) 40 | - [ERNIE Base](#ernie-base) 41 | - [ResNet50 v1.5](#resnet50-v15) 42 | - [文档](#高阶文档) 43 | - [性能指标](#性能指标) 44 | - [ERNIE Base (T4)](#ernie-base-t4) 45 | - [ResNet50 v1.5 (V100-SXM2-16G)](#resnet50-v15-v100-sxm2-16g) 46 | - [ResNet50 v1.5 (T4)](#resnet50-v15-t4) 47 | 48 | ## 快速开始 49 | 50 | ### 拉取镜像 51 | ``` 52 | docker pull paddlepaddle/triton_paddle:21.10 53 | ``` 54 | 注意: 目前只支持Triton Inference Serve 21.10版本镜像,[Triton Inference Serve 镜像介绍](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html).其他版本需要从源码编译 55 | 56 | ### 创建模型仓库 57 | 当Triton Inference Server启动服务时,可以指定一个或多个模型仓库来部署模型,详细描述见文档[模型仓库](docs/zh_CN/model_repository.md)。在[examples](examples)中有模型仓库示例,可以通过以下脚本获取: 58 | ```bash 59 | $ cd examples 60 | $ ./fetch_models.sh 61 | $ cd .. # back to root of paddle_backend 62 | ``` 63 | 64 | ### 启动服务 65 | 1. 启动容器 66 | ``` 67 | docker run --gpus=all --rm -it --name triton_server --net=host -e CUDA_VISIBLE_DEVICES=0 \ 68 | -v `pwd`/examples/models:/workspace/models \ 69 | paddlepaddle/triton_paddle:21.10 /bin/bash 70 | ``` 71 | 2. 进入容器: 72 | ``` 73 | docker exec -it triton_server /bin/bash 74 | ``` 75 | 3. 启动服务 76 | ``` 77 | /opt/tritonserver/bin/tritonserver --model-repository=/workspace/models 78 | ``` 79 | 可以使用`/opt/tritonserver/bin/tritonserver --help`查看启动服务的所有参数介绍 80 | 81 | ### 验证Triton服务是否正常 82 | 在启动服务的机器上使用curl指令,发送HTTP请求可以得到服务的状态 83 | 84 | ``` 85 | $ curl -v localhost:8000/v2/health/ready 86 | ... 87 | < HTTP/1.1 200 OK 88 | < Content-Length: 0 89 | < Content-Type: text/plain 90 | ``` 91 | HTTP请求返回200代表服务正常,否则服务有问题 92 | 93 | ## 运行示例 94 | 95 | 在运行示例之前,需要确保服务已经启动并[正常运行](#验证Triton服务是否正常). 96 | 97 | 进入[examples](examples)目录 98 | ```bash 99 | $ cd examples 100 | ``` 101 | 102 | ### ERNIE Base 103 | 运行Ernie模型benchmark测试脚本: 104 | ```bash 105 | $ bash perf_ernie.sh 106 | ``` 107 | 108 | ### ResNet50 v1.5 109 | 运行ResNet50-v1.5模型benchmark脚本: 110 | ```bash 111 | $ bash perf_resnet50_v1.5.sh 112 | ``` 113 | 114 | ## 高阶文档 115 | - [模型仓库](docs/zh_CN/model_repository.md) 116 | - [模型配置](docs/zh_CN/model_configuration.md) 117 | 118 | ## 性能指标 119 | 120 | ### ERNIE Base (T4) 121 | 122 | | Precision | Backend Accelerator | Client Batch Size | Sequences/second | P90 Latency (ms) | P95 Latency (ms) | P99 Latency (ms) | Avg Latency (ms) | 123 | |:------------|:---------------------|--------------------:|--------------------:|--------------:|--------------:|--------------:|--------------:| 124 | | FP16 | TensorRT | 1 | 270.0 | 3.813 | 3.846 | 4.007 | 3.692 | 125 | | FP16 | TensorRT | 2 | 500.4 | 4.282 | 4.332 | 4.709 | 3.980 | 126 | | FP16 | TensorRT | 4 | 831.2 | 5.141 | 5.242 | 5.569 | 4.797 | 127 | | FP16 | TensorRT | 8 | 1128.0 | 7.788 | 7.949 | 8.255 | 7.089 | 128 | | FP16 | TensorRT | 16 | 1363.2 | 12.702 | 12.993 | 13.507 | 11.738 | 129 | | FP16 | TensorRT | 32 | 1529.6 | 22.495 | 22.817 | 24.634 | 20.901 | 130 | 131 | ### ResNet50 v1.5 (V100-SXM2-16G) 132 | 133 | | Precision | Backend Accelerator | Client Batch Size | Sequences/second | P90 Latency (ms) | P95 Latency (ms) | P99 Latency (ms) | Avg Latency (ms) | 134 | |:------------|:---------------------|--------------------:|--------------------:|--------------:|--------------:|--------------:|--------------:| 135 | | FP16 | TensorRT | 1 | 288.8 | 3.494 | 3.524 | 3.608 | 3.462 | 136 | | FP16 | TensorRT | 2 | 494.0 | 4.083 | 4.110 | 4.208 | 4.047 | 137 | | FP16 | TensorRT | 4 | 758.4 | 5.327 | 5.359 | 5.460 | 5.273 | 138 | | FP16 | TensorRT | 8 | 1044.8 | 7.728 | 7.770 | 7.949 | 7.658 | 139 | | FP16 | TensorRT | 16 | 1267.2 | 12.742 | 12.810 | 13.883 | 12.647 | 140 | | FP16 | TensorRT | 32 | 1113.6 | 28.840 | 29.044 | 30.357 | 28.641 | 141 | | FP16 | TensorRT | 64 | 1100.8 | 58.512 | 58.642 | 59.967 | 58.251 | 142 | | FP16 | TensorRT | 128 | 1049.6 | 121.371 | 121.834 | 123.371 | 119.991 | 143 | 144 | ### ResNet50 v1.5 (T4) 145 | | Precision | Backend Accelerator | Client Batch Size | Sequences/second | P90 Latency (ms) | P95 Latency (ms) | P99 Latency (ms) | Avg Latency (ms) | 146 | |:------------|:---------------------|--------------------:|--------------------:|--------------:|--------------:|--------------:|--------------:| 147 | | FP16 | TensorRT | 1 | 291.8 | 3.471 | 3.489 | 3.531 | 3.427 | 148 | | FP16 | TensorRT | 2 | 466.0 | 4.323 | 4.336 | 4.382 | 4.288 | 149 | | FP16 | TensorRT | 4 | 665.6 | 6.031 | 6.071 | 6.142 | 6.011 | 150 | | FP16 | TensorRT | 8 | 833.6 | 9.662 | 9.684 | 9.767 | 9.609 | 151 | | FP16 | TensorRT | 16 | 899.2 | 18.061 | 18.208 | 18.899 | 17.748 | 152 | | FP16 | TensorRT | 32 | 761.6 | 42.333 | 43.456 | 44.167 | 41.740 | 153 | | FP16 | TensorRT | 64 | 793.6 | 79.860 | 80.410 | 80.807 | 79.680 | 154 | | FP16 | TensorRT | 128 | 793.6 | 158.207 | 158.278 | 158.643 | 157.543 | 155 | -------------------------------------------------------------------------------- /README_en.md: -------------------------------------------------------------------------------- 1 | 28 | English | [简体中文](README_cn.md) 29 | 30 | # Triton Paddle Backend 31 | 32 | ## Table of Contents 33 | 34 | - [Quick Start](#quick-start) 35 | - [Pull Image](#pull-image) 36 | - [Create a Model Repository](#create-a-model-repository) 37 | - [Launch Triton Inference Server](#launch-triton-inference-server) 38 | - [Verify Triton Is Running Correctly](#verify-triton-is-running-correctly) 39 | - [Examples](#examples) 40 | - [ERNIE Base](#ernie-base) 41 | - [ResNet50 v1.5](#resnet50-v15) 42 | - [Performance](#performance) 43 | - [ERNIE Base (T4)](#ernie-base-t4) 44 | - [ResNet50 v1.5 (V100-SXM2-16G)](#resnet50-v15-v100-sxm2-16g) 45 | - [ResNet50 v1.5 (T4)](#resnet50-v15-t4) 46 | 47 | ## Quick Start 48 | 49 | ### Pull Image 50 | 51 | ```bash 52 | docker pull paddlepaddle/triton_paddle:21.10 53 | ``` 54 | 55 | Note: Only Triton Inference Server 21.10 image is supported. 56 | 57 | ### Create A Model Repository 58 | 59 | The model repository is the directory where you 60 | place the models that you want Triton to server. An example model 61 | repository is included in the [examples](examples). Before using the repository, 62 | you must fetch it by the following scripts. 63 | 64 | ```bash 65 | $ cd examples 66 | $ ./fetch_models.sh 67 | $ cd .. # back to root of paddle_backend 68 | ``` 69 | 70 | ### Launch Triton Inference Server 71 | 72 | 1. Launch the image 73 | 74 | ```bash 75 | $ docker run --gpus=all --rm -it --name triton_server --net=host -e CUDA_VISIBLE_DEVICES=0 \ 76 | -v `pwd`/examples/models:/workspace/models \ 77 | paddlepaddle/triton_paddle:21.10 /bin/bash 78 | ``` 79 | 80 | 2. Launch the triton inference server 81 | 82 | ```bash 83 | /opt/tritonserver/bin/tritonserver --model-repository=/workspace/models 84 | ``` 85 | 86 | Note: `/opt/tritonserver/bin/tritonserver --help` for all available parameters 87 | 88 | ### Verify Triton Is Running Correctly 89 | 90 | Use Triton’s *ready* endpoint to verify that the server and the models 91 | are ready for inference. From the host system use curl to access the 92 | HTTP endpoint that indicates server status. 93 | 94 | ``` 95 | $ curl -v localhost:8000/v2/health/ready 96 | ... 97 | < HTTP/1.1 200 OK 98 | < Content-Length: 0 99 | < Content-Type: text/plain 100 | ``` 101 | 102 | The HTTP request returns status 200 if Triton is ready and non-200 if 103 | it is not ready. 104 | 105 | ## Examples 106 | 107 | Before running the examples, please make sure the triton server is running [correctly](#verify-triton-is-running-correctly). 108 | 109 | Change working directory to [examples](examples) 110 | ```bash 111 | $ cd examples 112 | ``` 113 | 114 | ### ERNIE Base 115 | [ERNIE-2.0](https://github.com/PaddlePaddle/ERNIE) is a pre-training framework for language understanding. 116 | 117 | Steps to run the benchmark on ERNIE 118 | ```bash 119 | $ bash perf_ernie.sh 120 | ``` 121 | 122 | ### ResNet50 v1.5 123 | The [ResNet50-v1.5](https://ngc.nvidia.com/catalog/resources/nvidia:resnet_50_v1_5_for_pytorch) is a modified version of the [original ResNet50 v1 model](https://arxiv.org/abs/1512.03385). 124 | 125 | Steps to run the benchmark on ResNet50-v1.5 126 | ```bash 127 | $ bash perf_resnet50_v1.5.sh 128 | ``` 129 | 130 | Steps to run the inference on ResNet50-v1.5. 131 | 132 | 1. Prepare processed images following [DeepLearningExamples](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Classification/ConvNets/triton/resnet50#quick-start-guide) and place ``imagenet`` folder under [examples](examples) directory. 133 | 134 | 2. Run the inference 135 | 136 | ```bash 137 | $ bash infer_resnet_v1.5.sh imagenet/ 138 | ``` 139 | 140 | ## Performance 141 | 142 | ### ERNIE Base (T4) 143 | 144 | | Precision | Backend Accelerator | Client Batch Size | Sequences/second | P90 Latency (ms) | P95 Latency (ms) | P99 Latency (ms) | Avg Latency (ms) | 145 | |:------------|:---------------------|--------------------:|--------------------:|--------------:|--------------:|--------------:|--------------:| 146 | | FP16 | TensorRT | 1 | 270.0 | 3.813 | 3.846 | 4.007 | 3.692 | 147 | | FP16 | TensorRT | 2 | 500.4 | 4.282 | 4.332 | 4.709 | 3.980 | 148 | | FP16 | TensorRT | 4 | 831.2 | 5.141 | 5.242 | 5.569 | 4.797 | 149 | | FP16 | TensorRT | 8 | 1128.0 | 7.788 | 7.949 | 8.255 | 7.089 | 150 | | FP16 | TensorRT | 16 | 1363.2 | 12.702 | 12.993 | 13.507 | 11.738 | 151 | | FP16 | TensorRT | 32 | 1529.6 | 22.495 | 22.817 | 24.634 | 20.901 | 152 | 153 | ### ResNet50 v1.5 (V100-SXM2-16G) 154 | 155 | | Precision | Backend Accelerator | Client Batch Size | Sequences/second | P90 Latency (ms) | P95 Latency (ms) | P99 Latency (ms) | Avg Latency (ms) | 156 | |:------------|:---------------------|--------------------:|--------------------:|--------------:|--------------:|--------------:|--------------:| 157 | | FP16 | TensorRT | 1 | 288.8 | 3.494 | 3.524 | 3.608 | 3.462 | 158 | | FP16 | TensorRT | 2 | 494.0 | 4.083 | 4.110 | 4.208 | 4.047 | 159 | | FP16 | TensorRT | 4 | 758.4 | 5.327 | 5.359 | 5.460 | 5.273 | 160 | | FP16 | TensorRT | 8 | 1044.8 | 7.728 | 7.770 | 7.949 | 7.658 | 161 | | FP16 | TensorRT | 16 | 1267.2 | 12.742 | 12.810 | 13.883 | 12.647 | 162 | | FP16 | TensorRT | 32 | 1113.6 | 28.840 | 29.044 | 30.357 | 28.641 | 163 | | FP16 | TensorRT | 64 | 1100.8 | 58.512 | 58.642 | 59.967 | 58.251 | 164 | | FP16 | TensorRT | 128 | 1049.6 | 121.371 | 121.834 | 123.371 | 119.991 | 165 | 166 | ### ResNet50 v1.5 (T4) 167 | | Precision | Backend Accelerator | Client Batch Size | Sequences/second | P90 Latency (ms) | P95 Latency (ms) | P99 Latency (ms) | Avg Latency (ms) | 168 | |:------------|:---------------------|--------------------:|--------------------:|--------------:|--------------:|--------------:|--------------:| 169 | | FP16 | TensorRT | 1 | 291.8 | 3.471 | 3.489 | 3.531 | 3.427 | 170 | | FP16 | TensorRT | 2 | 466.0 | 4.323 | 4.336 | 4.382 | 4.288 | 171 | | FP16 | TensorRT | 4 | 665.6 | 6.031 | 6.071 | 6.142 | 6.011 | 172 | | FP16 | TensorRT | 8 | 833.6 | 9.662 | 9.684 | 9.767 | 9.609 | 173 | | FP16 | TensorRT | 16 | 899.2 | 18.061 | 18.208 | 18.899 | 17.748 | 174 | | FP16 | TensorRT | 32 | 761.6 | 42.333 | 43.456 | 44.167 | 41.740 | 175 | | FP16 | TensorRT | 64 | 793.6 | 79.860 | 80.410 | 80.807 | 79.680 | 176 | | FP16 | TensorRT | 128 | 793.6 | 158.207 | 158.278 | 158.643 | 157.543 | 177 | -------------------------------------------------------------------------------- /docs/model_configuration.md: -------------------------------------------------------------------------------- 1 | 28 | 29 | # Model Configuration 30 | 31 | ## General Model Configuration 32 | For the general model configuration information, please visit [triton-inference-server/server/docs/model_configuration](https://github.com/triton-inference-server/server/blob/main/docs/model_configuration.md). 33 | 34 | ## Platform and Backend 35 | For using paddle backend, no ``platform`` need to be provided. However, you should set ``backend`` to ``"paddle"`` in the model configuration. 36 | ``` 37 | backend: "paddle" 38 | ``` 39 | 40 | ## Paddle TensorRT Prediction Configuration 41 | 42 | Paddle supports inference with tensorrt engine, which can boost inference throughput and reduce latency. 43 | 44 | Related configuration can be set under ``optimization {execution_accelerators {gpu_execution_accelerator{...}}}`` 45 | 46 | There are four sections can be configured, which are ``config``, ``min_shape``, ``max_shape``, ``opt_shape``. 47 | 48 | ### ``config`` 49 | 50 | In ``config``, you can set the ``precision``, ``min_graph_size``, ``max_batch_size``, ``workspace_size``, ``enable_tensorrt_oss``, ``is_dynamic``. 51 | The meaning of the parameters can refer to [Paddle Inference Docs](https://paddle-inference.readthedocs.io/en/latest/api_reference/cxx_api_doc/Config/GPUConfig.html#tensorrt) 52 | 53 | |Parameters |Available options | 54 | |-------------------|-----------------------------------------------------------| 55 | |precision |``"fluid"``, ``"trt_fp32"``, ``"trt_fp16"``, ``"trt_int8"``| 56 | |min_graph_size |``"1"`` ~ ``"2147483647"`` | 57 | |max_batch_size |``"1"`` ~ ``"2147483647"`` | 58 | |workspace_size |``"1"`` ~ ``"2147483647"`` | 59 | |enable_tensorrt_oss|``"0"``, ``"1"`` | 60 | |is_dynamic |``"0"``, ``"1"`` | 61 | 62 | ### ``min_shape``, ``max_shape``, ``opt_shape`` 63 | These sections are only needed if ``is_dynamic`` is ``"1"``. Multiple ``parameters`` can be existed if there are multiple dynamic shape input. The ``key`` in ``parameters`` are the input tensor name, and the ``value`` is the shape, no ``,`` or ``[]`` is needed. 64 | 65 | ### An Dynamic Shape Example 66 | ``` 67 | optimization { 68 | execution_accelerators { 69 | gpu_execution_accelerator : [ 70 | { 71 | name : "config" 72 | parameters { key: "precision" value: "trt_fp16" } 73 | parameters { key: "min_graph_size" value: "5" } 74 | parameters { key: "workspace_size" value: "1073741824" } 75 | parameters { key: "enable_tensorrt_oss" value: "1" } 76 | parameters { key: "is_dynamic" value: "1" } 77 | }, 78 | { 79 | name : "min_shape" 80 | parameters { key: "eval_placeholder_0" value: "1" } 81 | parameters { key: "eval_placeholder_1" value: "1" } 82 | parameters { key: "eval_placeholder_2" value: "1" } 83 | parameters { key: "eval_placeholder_3" value: "1 1 1" } 84 | }, 85 | { 86 | name : "max_shape" 87 | parameters { key: "eval_placeholder_0" value: "4096" } 88 | parameters { key: "eval_placeholder_1" value: "4096" } 89 | parameters { key: "eval_placeholder_2" value: "129" } 90 | parameters { key: "eval_placeholder_3" value: "1 128 1" } 91 | }, 92 | { 93 | name : "opt_shape" 94 | parameters { key: "eval_placeholder_0" value: "128" } 95 | parameters { key: "eval_placeholder_1" value: "128" } 96 | parameters { key: "eval_placeholder_2" value: "2" } 97 | parameters { key: "eval_placeholder_3" value: "1 128 1" } 98 | } 99 | ] 100 | } 101 | } 102 | ``` 103 | -------------------------------------------------------------------------------- /docs/zh_CN/model_configuration.md: -------------------------------------------------------------------------------- 1 | 28 | 29 | # 模型配置 30 | 模型存储库中的每个模型都必须包含一个模型配置,该配置提供了关于模型的必要和可选信息。这些配置信息一般写在 *config.pbtxt* 文件中,[ModelConfig protobuf](https://github.com/triton-inference-server/common/blob/main/protobuf/model_config.proto)格式。 31 | 32 | ## 模型通用最小配置 33 | 详细的模型通用配置请看官网文档: [model_configuration](https://github.com/triton-inference-server/server/blob/main/docs/model_configuration.md).Triton的最小模型配置必须包括: *platform* 或 *backend* 属性、*max_batch_size* 属性和模型的输入输出. 34 | 35 | 例如一个Paddle模型,有两个输入*input0* 和 *input1*,一个输出*output0*,输入输出都是float32类型的tensor,最大batch为8.则最小的配置如下: 36 | 37 | ``` 38 | backend: "paddle" 39 | max_batch_size: 8 40 | input [ 41 | { 42 | name: "input0" 43 | data_type: TYPE_FP32 44 | dims: [ 16 ] 45 | }, 46 | { 47 | name: "input1" 48 | data_type: TYPE_FP32 49 | dims: [ 16 ] 50 | } 51 | ] 52 | output [ 53 | { 54 | name: "output0" 55 | data_type: TYPE_FP32 56 | dims: [ 16 ] 57 | } 58 | ] 59 | ``` 60 | 61 | ### Name, Platform and Backend 62 | 模型配置中 *name* 属性是可选的。如果模型没有在配置中指定,则使用模型的目录名;如果指定了该属性,它必须要跟模型的目录名一致。 63 | 64 | 使用 *paddle backend*,没有*platform*属性可以配置,必须配置*backend*属性为*paddle*。 65 | 66 | ``` 67 | backend: "paddle" 68 | ``` 69 | 70 | ### Paddle Backend特有配置 71 | 72 | Paddle后端目前支持*cpu*和*gpu*推理,*cpu*上支持开启*oneDNN*和*ORT*加速,*gpu*上支持开启*TensorRT*加速。 73 | 74 | 75 | #### Paddle Native配置 76 | Paddle后端中,使用*Native*推理只需配置 *Instance Groups*,决定模型运行在CPU还是GPU上。 77 | 78 | **Native CPU** 79 | ``` 80 | instance_group [ 81 | { 82 | #创建两个CPU实例 83 | count: 2 84 | kind: KIND_CPU 85 | } 86 | ] 87 | ``` 88 | 89 | **Native GPU** 90 | 在*GPU 0*上部署2个实例,在*GPU1*和*GPU*上分别不是1个实例 91 | 92 | ``` 93 | instance_group [ 94 | { 95 | count: 2 96 | kind: KIND_GPU 97 | gpus: [ 0 ] 98 | }, 99 | { 100 | count: 1 101 | kind: KIND_GPU 102 | gpus: [ 1, 2 ] 103 | } 104 | ] 105 | ``` 106 | 107 | ### Paddle oneDNN配置 108 | oneDNN(原MKL-DNN)是由英特尔开发的开源深度学习软件包,支持神经网络在CPU上的高性能计算,在Paddle后端中通过如下配置打开oneDNN加速: 109 | ``` 110 | instance_group [ { kind: KIND_CPU }] 111 | 112 | optimization { 113 | execution_accelerators { 114 | cpu_execution_accelerator : [ 115 | { 116 | name : "mkldnn" 117 | # 设置op计算的线程数为4 118 | parameters { key: "cpu_threads" value: "4" } 119 | # 缓存OneDNN最新10种输入shape 120 | parameters { key: "capacity" value: "10" } 121 | # 使用int8量化 122 | parameters { key: "use_int8" value: "0" } 123 | } 124 | ] 125 | } 126 | } 127 | ``` 128 | 129 | ### Paddle ORT配置 130 | ONNX Runtime是由微软开源的一款推理引擎,Paddle Inference通过Paddle2ONNX集成ONNX Runtime作为推理的后端之一,在Paddle后端中通过如下配置打开ONNX Runtime加速: 131 | 132 | ``` 133 | instance_group [ { kind: KIND_CPU }] 134 | 135 | optimization { 136 | execution_accelerators { 137 | cpu_execution_accelerator : [ 138 | { 139 | name : "ort" 140 | # 设置op计算的线程数为4 141 | parameters { key: "cpu_threads" value: "4" } 142 | } 143 | ] 144 | } 145 | } 146 | ``` 147 | 148 | ### Paddle TensorRT配置 149 | 150 | TensorRT 是一个针对 NVIDIA GPU 及 Jetson 系列硬件的高性能机器学习推理 SDK,可以使得深度学习模型在这些硬件上的部署获得更好的性能。Paddle Inference 以子图方式集成了 TensorRT,将可用 TensorRT 加速的算子组成子图供给 TensorRT,以获取 TensorRT 加速的同时,保留 PaddlePaddle 即训即推的能力。 151 | 152 | TensorRT的配置选项需要写在这个配置中: ``optimization {execution_accelerators {gpu_execution_accelerator{...}}}`` 153 | 154 | 一共有四个选项:``tensorrt``, ``min_shape``, ``max_shape``, ``opt_shape``. 155 | 156 | ##### tensorrt选项 157 | 158 | 在``tensorrt``中能够设置``precision``, ``min_graph_size``, ``max_batch_size``, ``workspace_size``, ``enable_tensorrt_oss``, ``is_dynamic``. 159 | 详细参数解释请看官网文档[Paddle Inference Docs](https://paddle-inference.readthedocs.io/en/latest/api_reference/cxx_api_doc/Config/GPUConfig.html#tensorrt) 160 | 161 | |Parameters |Available options | 162 | |-------------------|-----------------------------------------------------------| 163 | |precision |``"trt_fp32"``, ``"trt_fp16"``, ``"trt_int8"``| 164 | |min_graph_size |``"1"`` ~ ``"2147483647"`` | 165 | |max_batch_size |``"1"`` ~ ``"2147483647"`` | 166 | |workspace_size |``"1"`` ~ ``"2147483647"`` | 167 | |enable_tensorrt_oss|``"0"``, ``"1"`` | 168 | |is_dynamic |``"0"``, ``"1"`` | 169 | 170 | #### min_shape, max_shape, opt_shape选项 171 | 当且仅当开启动态shape时(*is_dynamic*为*1*),每个输入需要设置最大形状(*max_shape*)、最小形状(*min_shape*)和最常见形状(*opt_shape*)。其中字典*parameters*中*key*为输入的名字,*value*为对应输入的最大、最小、最常见shape。 172 | 173 | #### TensorRT动态shape例子 174 | ``` 175 | optimization { 176 | execution_accelerators { 177 | gpu_execution_accelerator : [ 178 | { 179 | name : "tensorrt" 180 | # 使用TensorRT的FP16推理 181 | parameters { key: "precision" value: "trt_fp16" } 182 | # 设置TensorRT的子图最小op数为3 183 | parameters { key: "min_graph_size" value: "3" } 184 | parameters { key: "workspace_size" value: "1073741824" } 185 | # 不使用变长 186 | parameters { key: "enable_tensorrt_oss" value: "0" } 187 | # 开启动态shape 188 | parameters { key: "is_dynamic" value: "1" } 189 | }, 190 | { 191 | name : "min_shape" 192 | parameters { key: "eval_placeholder_0" value: "1" } 193 | parameters { key: "eval_placeholder_1" value: "1" } 194 | parameters { key: "eval_placeholder_2" value: "1" } 195 | parameters { key: "eval_placeholder_3" value: "1 1 1" } 196 | }, 197 | { 198 | name : "max_shape" 199 | parameters { key: "eval_placeholder_0" value: "4096" } 200 | parameters { key: "eval_placeholder_1" value: "4096" } 201 | parameters { key: "eval_placeholder_2" value: "129" } 202 | parameters { key: "eval_placeholder_3" value: "1 128 1" } 203 | }, 204 | { 205 | name : "opt_shape" 206 | parameters { key: "eval_placeholder_0" value: "128" } 207 | parameters { key: "eval_placeholder_1" value: "128" } 208 | parameters { key: "eval_placeholder_2" value: "2" } 209 | parameters { key: "eval_placeholder_3" value: "1 128 1" } 210 | } 211 | ] 212 | } 213 | } 214 | ``` 215 | -------------------------------------------------------------------------------- /docs/zh_CN/model_repository.md: -------------------------------------------------------------------------------- 1 | 2 | # 模型仓库(Model Repository) 3 | Triton Inference Server启动服务时指定模型仓库中一个或多个模型部署服务。当服务运行时,可以用[Model Management](https://github.com/triton-inference-server/server/blob/main/docs/model_management.md)中描述的方式修改服务中的模型。 4 | 从服务器启动时指定的一个或多个模型存储库中为模型提供服务 5 | 6 | ## 仓库结构 7 | 模型仓库路径通过Triton启动时的*--model-repository*选项指定,可以多次指定*--model-repository*选项来加载多个仓库。例如: 8 | 9 | ``` 10 | $ tritonserver --model-repository= 11 | ``` 12 | 13 | 模型仓库的结构必须按以下的格式创建: 14 | ``` 15 | / 16 | / 17 | [config.pbtxt] 18 | [ ...] 19 | / 20 | 21 | / 22 | 23 | ... 24 | / 25 | [config.pbtxt] 26 | [ ...] 27 | / 28 | 29 | / 30 | 31 | ... 32 | ... 33 | ``` 34 | 在最顶层``模型仓库目录下,必须有0个或多个``模型名字的子目录。每个``模型名字子目录包含部署模型相应的信息,多个表示模型版本的数字子目录和一个描述模型配置的*config.pbtxt*文件。 35 | 36 | Paddle模型存在版本号子目录中,必须为`model.pdmodel`文件和`model.pdiparams`文件。 37 | 38 | ## 模型版本 39 | 每个模型在仓库中可以有一个或多个可用的版本,模型目录中以数字命名的子目录就是对应的版本,数字即版本号。没有以数字命名的子目录,或以*0*开头的子目录都会被忽略。模型配置文件中可以指定[版本策略](https://github.com/triton-inference-server/server/blob/main/docs/model_configuration.md#version-policy),控制Triton启动模型目录中的哪个版本。 40 | 41 | ## 模型仓库示例 42 | Paddle Backend需要的模型必须是2.0版本以上导出的推理模型,模型包含`model.pdmodel`和`model.pdiparams`两个文件放在版本目录中。 43 | 44 | 一个使用Paddle Backend部署的最小模型仓库目录示例: 45 | ``` 46 | / 47 | / 48 | config.pbtxt 49 | 1/ 50 | model.pdmodel 51 | model.pdiparams 52 | 53 | # 真实例子: 54 | models 55 | └── ResNet50 56 | ├── 1 57 | │   ├── model.pdiparams 58 | │   └── model.pdmodel 59 | └── config.pbtxt 60 | ``` -------------------------------------------------------------------------------- /examples/fetch_models.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions 6 | # are met: 7 | # * Redistributions of source code must retain the above copyright 8 | # notice, this list of conditions and the following disclaimer. 9 | # * Redistributions in binary form must reproduce the above copyright 10 | # notice, this list of conditions and the following disclaimer in the 11 | # documentation and/or other materials provided with the distribution. 12 | # * Neither the name of NVIDIA CORPORATION nor the names of its 13 | # contributors may be used to endorse or promote products derived 14 | # from this software without specific prior written permission. 15 | # 16 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 17 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 19 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 20 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 21 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 22 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 23 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 24 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | 28 | set -e 29 | 30 | download () { 31 | FILE_NAME="models.tar.gz" 32 | 33 | if test -f ${FILE_NAME}; 34 | then 35 | echo "${FILE_NAME} exists." 36 | fi 37 | 38 | wget --no-check-certificate https://paddle-inference-dist.bj.bcebos.com/TritonPaddleBackend/models.tar.gz 39 | 40 | echo "Finish downloading ${FILE_NAME}" 41 | } 42 | 43 | download 44 | 45 | echo 'Extracting models.tar.gz' 46 | tar zxvf models.tar.gz 47 | -------------------------------------------------------------------------------- /examples/infer_ernie.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions 6 | # are met: 7 | # * Redistributions of source code must retain the above copyright 8 | # notice, this list of conditions and the following disclaimer. 9 | # * Redistributions in binary form must reproduce the above copyright 10 | # notice, this list of conditions and the following disclaimer in the 11 | # documentation and/or other materials provided with the distribution. 12 | # * Neither the name of NVIDIA CORPORATION nor the names of its 13 | # contributors may be used to endorse or promote products derived 14 | # from this software without specific prior written permission. 15 | # 16 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 17 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 19 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 20 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 21 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 22 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 23 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 24 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | 28 | import sys 29 | import json 30 | import argparse 31 | import numpy as np 32 | 33 | import tritonclient.http as httpclient 34 | from tritonclient.utils import InferenceServerException 35 | 36 | 37 | FLAGS = None 38 | 39 | 40 | def parse_model_http(model_metadata, model_config): 41 | return model_metadata['inputs'], model_metadata['outputs'] 42 | 43 | 44 | def postprocess(results, output_metadata, batch_size): 45 | """ 46 | Post-process results to show classifications. 47 | """ 48 | 49 | output_array = results.as_numpy(output_metadata[0]['name']) 50 | return np.argmax(output_array, axis=1) 51 | 52 | 53 | def read_input(filename): 54 | with open(filename) as file: 55 | data = json.load(file) 56 | return data 57 | 58 | 59 | def requestGenerator(input_metadata, output_metadata, FLAGS, input_data): 60 | 61 | # Set the input data 62 | inputs = list() 63 | 64 | for input_ in input_metadata: 65 | input_name = input_['name'] 66 | runtime_data = input_data[input_name] 67 | data = np.asarray(runtime_data['content'], dtype=np.int32) 68 | data = data.reshape(runtime_data['shape']) 69 | inputs.append( 70 | httpclient.InferInput(input_name, data.shape, input_['datatype'])) 71 | inputs[-1].set_data_from_numpy(data, binary_data=True) 72 | 73 | outputs = list() 74 | for output in output_metadata: 75 | outputs.append( 76 | httpclient.InferRequestedOutput(output['name'], 77 | binary_data=True)) 78 | 79 | yield inputs, outputs, FLAGS.model_name, FLAGS.model_version 80 | 81 | 82 | if __name__ == '__main__': 83 | parser = argparse.ArgumentParser() 84 | parser.add_argument('-v', 85 | '--verbose', 86 | action="store_true", 87 | required=False, 88 | default=False, 89 | help='Enable verbose output') 90 | parser.add_argument('-m', 91 | '--model-name', 92 | type=str, 93 | required=True, 94 | help='Name of model') 95 | parser.add_argument( 96 | '-x', 97 | '--model-version', 98 | type=str, 99 | required=False, 100 | default="", 101 | help='Version of model. Default is to use latest version.') 102 | parser.add_argument('-b', 103 | '--batch-size', 104 | type=int, 105 | required=False, 106 | default=1, 107 | help='Batch size. Default is 1.') 108 | parser.add_argument('-u', 109 | '--url', 110 | type=str, 111 | required=False, 112 | default='localhost:8000', 113 | help='Inference server URL. Default is localhost:8000.') 114 | parser.add_argument('-i', 115 | '--protocol', 116 | type=str, 117 | required=False, 118 | choices=['HTTP'], 119 | default='HTTP', 120 | help='Protocol used to communicate with ' + 121 | 'the inference service. Default is HTTP.') 122 | parser.add_argument('image_filename', 123 | type=str, 124 | nargs='?', 125 | default=None, 126 | help='Input image / Input folder.') 127 | FLAGS = parser.parse_args() 128 | 129 | try: 130 | triton_client = httpclient.InferenceServerClient( 131 | url=FLAGS.url, verbose=FLAGS.verbose, concurrency=1) 132 | except Exception as exception: 133 | print("client creation failed: " + str(exception)) 134 | sys.exit(1) 135 | 136 | # Make sure the model matches our requirements, and get some 137 | # properties of the model that we need for preprocessing 138 | try: 139 | model_metadata = triton_client.get_model_metadata( 140 | model_name=FLAGS.model_name, model_version=FLAGS.model_version) 141 | except InferenceServerException as e: 142 | print("failed to retrieve the metadata: " + str(e)) 143 | sys.exit(1) 144 | 145 | try: 146 | model_config = triton_client.get_model_config( 147 | model_name=FLAGS.model_name, model_version=FLAGS.model_version) 148 | except InferenceServerException as e: 149 | print("failed to retrieve the config: " + str(e)) 150 | sys.exit(1) 151 | 152 | requests = [] 153 | responses = [] 154 | request_ids = [] 155 | 156 | input_metadata, output_metadata = parse_model_http(model_metadata, model_config) 157 | 158 | json_data = read_input(f'data/perf.{FLAGS.batch_size}.json') 159 | input_data = json_data['data'] 160 | if 'ground_truth' in json_data: 161 | ground_truth = json_data['ground_truth'] 162 | 163 | for idx, batch_data in enumerate(input_data): 164 | try: 165 | for inputs, outputs, model_name, model_version in requestGenerator(input_metadata, output_metadata, FLAGS, batch_data): 166 | responses.append( 167 | triton_client.infer(FLAGS.model_name, 168 | inputs, 169 | request_id=str(idx), 170 | model_version=FLAGS.model_version, 171 | outputs=outputs)) 172 | except InferenceServerException as e: 173 | print("inference failed: " + str(e)) 174 | sys.exit(1) 175 | 176 | results = list() 177 | for response in responses: 178 | this_id = response.get_response()["id"] 179 | results.extend(postprocess(response, output_metadata, FLAGS.batch_size)) 180 | 181 | if 'ground_truth' in json_data: 182 | print('Accuracy:', sum(np.asarray(ground_truth) == np.asarray(results))/len(ground_truth)) 183 | -------------------------------------------------------------------------------- /examples/infer_resnet50_v1.5.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions 6 | # are met: 7 | # * Redistributions of source code must retain the above copyright 8 | # notice, this list of conditions and the following disclaimer. 9 | # * Redistributions in binary form must reproduce the above copyright 10 | # notice, this list of conditions and the following disclaimer in the 11 | # documentation and/or other materials provided with the distribution. 12 | # * Neither the name of NVIDIA CORPORATION nor the names of its 13 | # contributors may be used to endorse or promote products derived 14 | # from this software without specific prior written permission. 15 | # 16 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 17 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 19 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 20 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 21 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 22 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 23 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 24 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | 28 | import argparse 29 | from functools import partial 30 | import os 31 | import sys 32 | 33 | from PIL import Image 34 | import numpy as np 35 | from attrdict import AttrDict 36 | 37 | import tritonclient.grpc as grpcclient 38 | import tritonclient.grpc.model_config_pb2 as mc 39 | import tritonclient.http as httpclient 40 | from tritonclient.utils import InferenceServerException 41 | from tritonclient.utils import triton_to_np_dtype 42 | 43 | if sys.version_info >= (3, 0): 44 | import queue 45 | else: 46 | import Queue as queue 47 | 48 | 49 | class UserData: 50 | 51 | def __init__(self): 52 | self._completed_requests = queue.Queue() 53 | 54 | 55 | # Callback function used for async_stream_infer() 56 | def completion_callback(user_data, result, error): 57 | # passing error raise and handling out 58 | user_data._completed_requests.put((result, error)) 59 | 60 | 61 | FLAGS = None 62 | 63 | 64 | def parse_model(model_metadata, model_config): 65 | """ 66 | Check the configuration of a model to make sure it meets the 67 | requirements for an image classification network (as expected by 68 | this client) 69 | """ 70 | if len(model_metadata.inputs) != 1: 71 | raise Exception("expecting 1 input, got {}".format( 72 | len(model_metadata.inputs))) 73 | if len(model_metadata.outputs) != 1: 74 | raise Exception("expecting 1 output, got {}".format( 75 | len(model_metadata.outputs))) 76 | 77 | if len(model_config.input) != 1: 78 | raise Exception( 79 | "expecting 1 input in model configuration, got {}".format( 80 | len(model_config.input))) 81 | 82 | input_metadata = model_metadata.inputs[0] 83 | input_config = model_config.input[0] 84 | output_metadata = model_metadata.outputs[0] 85 | 86 | if output_metadata.datatype != "FP32": 87 | raise Exception("expecting output datatype to be FP32, model '" + 88 | model_metadata.name + "' output type is " + 89 | output_metadata.datatype) 90 | 91 | # Output is expected to be a vector. But allow any number of 92 | # dimensions as long as all but 1 is size 1 (e.g. { 10 }, { 1, 10 93 | # }, { 10, 1, 1 } are all ok). Ignore the batch dimension if there 94 | # is one. 95 | output_batch_dim = (model_config.max_batch_size > 0) 96 | non_one_cnt = 0 97 | for dim in output_metadata.shape: 98 | if output_batch_dim: 99 | output_batch_dim = False 100 | elif dim > 1: 101 | non_one_cnt += 1 102 | if non_one_cnt > 1: 103 | raise Exception("expecting model output to be a vector") 104 | 105 | # Model input must have 3 dims, either CHW or HWC (not counting 106 | # the batch dimension), either CHW or HWC 107 | input_batch_dim = (model_config.max_batch_size > 0) 108 | expected_input_dims = 3 + (1 if input_batch_dim else 0) 109 | if len(input_metadata.shape) != expected_input_dims: 110 | raise Exception( 111 | "expecting input to have {} dimensions, model '{}' input has {}". 112 | format(expected_input_dims, model_metadata.name, 113 | len(input_metadata.shape))) 114 | 115 | if type(input_config.format) == str: 116 | FORMAT_ENUM_TO_INT = dict(mc.ModelInput.Format.items()) 117 | input_config.format = FORMAT_ENUM_TO_INT[input_config.format] 118 | 119 | if ((input_config.format != mc.ModelInput.FORMAT_NCHW) and 120 | (input_config.format != mc.ModelInput.FORMAT_NHWC) and 121 | (input_config.format != mc.ModelInput.FORMAT_NONE)): 122 | raise Exception("unexpected input format " + 123 | mc.ModelInput.Format.Name(input_config.format) + 124 | ", expecting " + 125 | mc.ModelInput.Format.Name(mc.ModelInput.FORMAT_NCHW) + 126 | " or " + 127 | mc.ModelInput.Format.Name(mc.ModelInput.FORMAT_NHWC)) 128 | 129 | if input_config.format == mc.ModelInput.FORMAT_NHWC: 130 | h = input_metadata.shape[1 if input_batch_dim else 0] 131 | w = input_metadata.shape[2 if input_batch_dim else 1] 132 | c = input_metadata.shape[3 if input_batch_dim else 2] 133 | else: 134 | c = input_metadata.shape[1 if input_batch_dim else 0] 135 | h = input_metadata.shape[2 if input_batch_dim else 1] 136 | w = input_metadata.shape[3 if input_batch_dim else 2] 137 | 138 | return (model_config.max_batch_size, input_metadata.name, 139 | output_metadata.name, c, h, w, input_config.format, 140 | input_metadata.datatype) 141 | 142 | 143 | def preprocess(img, format, dtype, c, h, w, scaling, protocol): 144 | """ 145 | Pre-process an image to meet the size, type and format 146 | requirements specified by the parameters. 147 | """ 148 | # np.set_printoptions(threshold='nan') 149 | 150 | if c == 1: 151 | sample_img = img.convert('L') 152 | else: 153 | sample_img = img.convert('RGB') 154 | 155 | resized_img = sample_img.resize((w, h), Image.BILINEAR) 156 | resized = np.array(resized_img) 157 | if resized.ndim == 2: 158 | resized = resized[:, :, np.newaxis] 159 | 160 | npdtype = triton_to_np_dtype(dtype) 161 | typed = resized.astype(npdtype) 162 | 163 | if scaling == 'INCEPTION': 164 | scaled = (typed / 127.5) - 1 165 | elif scaling == 'VGG': 166 | if c == 1: 167 | scaled = typed - np.asarray((128,), dtype=npdtype) 168 | else: 169 | scaled = typed - np.asarray((123, 117, 104), dtype=npdtype) 170 | elif scaling == 'RESNET': 171 | scaled = (typed / 255 - np.array([0.485, 0.456, 0.406], dtype=npdtype))/np.array([0.229, 0.224, 0.225], dtype=npdtype) 172 | else: 173 | scaled = typed 174 | 175 | # Swap to CHW if necessary 176 | if format == mc.ModelInput.FORMAT_NHWC: 177 | ordered = scaled 178 | else: 179 | ordered = np.transpose(scaled, (2, 0, 1)) 180 | 181 | # Channels are in RGB order. Currently model configuration data 182 | # doesn't provide any information as to other channel orderings 183 | # (like BGR) so we just assume RGB. 184 | return ordered 185 | 186 | 187 | def postprocess(results, output_name, batch_size, batching): 188 | """ 189 | Post-process results to show classifications. 190 | """ 191 | 192 | output_array = results.as_numpy(output_name) 193 | if len(output_array) != batch_size: 194 | raise Exception("expected {} results, got {}".format( 195 | batch_size, len(output_array))) 196 | 197 | # Include special handling for non-batching models 198 | for results in output_array: 199 | if not batching: 200 | results = [results] 201 | for result in results: 202 | if output_array.dtype.type == np.object_: 203 | cls = "".join(chr(x) for x in result).split(':') 204 | else: 205 | cls = result.split(':') 206 | print(" {} ({})".format(cls[0], cls[1])) 207 | 208 | 209 | def requestGenerator(batched_image_data, input_name, output_name, dtype, FLAGS): 210 | protocol = FLAGS.protocol.lower() 211 | 212 | if protocol == "grpc": 213 | client = grpcclient 214 | else: 215 | client = httpclient 216 | 217 | # Set the input data 218 | inputs = [client.InferInput(input_name, batched_image_data.shape, dtype)] 219 | inputs[0].set_data_from_numpy(batched_image_data) 220 | 221 | outputs = [ 222 | client.InferRequestedOutput(output_name, class_count=FLAGS.classes) 223 | ] 224 | 225 | yield inputs, outputs, FLAGS.model_name, FLAGS.model_version 226 | 227 | 228 | def convert_http_metadata_config(_metadata, _config): 229 | _model_metadata = AttrDict(_metadata) 230 | _model_config = AttrDict(_config) 231 | 232 | return _model_metadata, _model_config 233 | 234 | 235 | if __name__ == '__main__': 236 | parser = argparse.ArgumentParser() 237 | parser.add_argument('-v', 238 | '--verbose', 239 | action="store_true", 240 | required=False, 241 | default=False, 242 | help='Enable verbose output') 243 | parser.add_argument('-a', 244 | '--async', 245 | dest="async_set", 246 | action="store_true", 247 | required=False, 248 | default=False, 249 | help='Use asynchronous inference API') 250 | parser.add_argument('--streaming', 251 | action="store_true", 252 | required=False, 253 | default=False, 254 | help='Use streaming inference API. ' + 255 | 'The flag is only available with gRPC protocol.') 256 | parser.add_argument('-m', 257 | '--model-name', 258 | type=str, 259 | required=True, 260 | help='Name of model') 261 | parser.add_argument( 262 | '-x', 263 | '--model-version', 264 | type=str, 265 | required=False, 266 | default="", 267 | help='Version of model. Default is to use latest version.') 268 | parser.add_argument('-b', 269 | '--batch-size', 270 | type=int, 271 | required=False, 272 | default=1, 273 | help='Batch size. Default is 1.') 274 | parser.add_argument('-c', 275 | '--classes', 276 | type=int, 277 | required=False, 278 | default=1, 279 | help='Number of class results to report. Default is 1.') 280 | parser.add_argument( 281 | '-s', 282 | '--scaling', 283 | type=str, 284 | choices=['RESNET'], 285 | required=False, 286 | default='RESNET', 287 | help='Type of scaling to apply to image pixels. Default is RESNET.') 288 | parser.add_argument('-u', 289 | '--url', 290 | type=str, 291 | required=False, 292 | default='localhost:8000', 293 | help='Inference server URL. Default is localhost:8000.') 294 | parser.add_argument('-i', 295 | '--protocol', 296 | type=str, 297 | required=False, 298 | default='HTTP', 299 | help='Protocol (HTTP/gRPC) used to communicate with ' + 300 | 'the inference service. Default is HTTP.') 301 | parser.add_argument('image_filename', 302 | type=str, 303 | nargs='?', 304 | default=None, 305 | help='Input image / Input folder.') 306 | FLAGS = parser.parse_args() 307 | 308 | if FLAGS.streaming and FLAGS.protocol.lower() != "grpc": 309 | raise Exception("Streaming is only allowed with gRPC protocol") 310 | 311 | try: 312 | if FLAGS.protocol.lower() == "grpc": 313 | # Create gRPC client for communicating with the server 314 | triton_client = grpcclient.InferenceServerClient( 315 | url=FLAGS.url, verbose=FLAGS.verbose) 316 | else: 317 | # Specify large enough concurrency to handle the 318 | # the number of requests. 319 | concurrency = 20 if FLAGS.async_set else 1 320 | triton_client = httpclient.InferenceServerClient( 321 | url=FLAGS.url, verbose=FLAGS.verbose, concurrency=concurrency) 322 | except Exception as e: 323 | print("client creation failed: " + str(e)) 324 | sys.exit(1) 325 | 326 | # Make sure the model matches our requirements, and get some 327 | # properties of the model that we need for preprocessing 328 | try: 329 | model_metadata = triton_client.get_model_metadata( 330 | model_name=FLAGS.model_name, model_version=FLAGS.model_version) 331 | except InferenceServerException as e: 332 | print("failed to retrieve the metadata: " + str(e)) 333 | sys.exit(1) 334 | 335 | try: 336 | model_config = triton_client.get_model_config( 337 | model_name=FLAGS.model_name, model_version=FLAGS.model_version) 338 | except InferenceServerException as e: 339 | print("failed to retrieve the config: " + str(e)) 340 | sys.exit(1) 341 | 342 | if FLAGS.protocol.lower() == "grpc": 343 | model_config = model_config.config 344 | else: 345 | model_metadata, model_config = convert_http_metadata_config( 346 | model_metadata, model_config) 347 | 348 | max_batch_size, input_name, output_name, c, h, w, format, dtype = parse_model( 349 | model_metadata, model_config) 350 | 351 | filenames = [] 352 | if os.path.isdir(FLAGS.image_filename): 353 | filenames = [ 354 | os.path.join(FLAGS.image_filename, f) 355 | for f in os.listdir(FLAGS.image_filename) 356 | if os.path.isfile(os.path.join(FLAGS.image_filename, f)) 357 | ] 358 | else: 359 | filenames = [ 360 | FLAGS.image_filename, 361 | ] 362 | 363 | filenames.sort() 364 | 365 | # Preprocess the images into input data according to model 366 | # requirements 367 | image_data = [] 368 | for filename in filenames: 369 | img = Image.open(filename) 370 | image_data.append( 371 | preprocess(img, format, dtype, c, h, w, FLAGS.scaling, 372 | FLAGS.protocol.lower())) 373 | 374 | # Send requests of FLAGS.batch_size images. If the number of 375 | # images isn't an exact multiple of FLAGS.batch_size then just 376 | # start over with the first images until the batch is filled. 377 | requests = [] 378 | responses = [] 379 | result_filenames = [] 380 | request_ids = [] 381 | image_idx = 0 382 | last_request = False 383 | user_data = UserData() 384 | 385 | # Holds the handles to the ongoing HTTP async requests. 386 | async_requests = [] 387 | 388 | sent_count = 0 389 | 390 | if FLAGS.streaming: 391 | triton_client.start_stream(partial(completion_callback, user_data)) 392 | 393 | while not last_request: 394 | input_filenames = [] 395 | repeated_image_data = [] 396 | 397 | for idx in range(FLAGS.batch_size): 398 | input_filenames.append(filenames[image_idx]) 399 | repeated_image_data.append(image_data[image_idx]) 400 | image_idx = (image_idx + 1) % len(image_data) 401 | if image_idx == 0: 402 | last_request = True 403 | 404 | if max_batch_size > 0: 405 | batched_image_data = np.stack(repeated_image_data, axis=0) 406 | else: 407 | batched_image_data = repeated_image_data[0] 408 | 409 | # Send request 410 | try: 411 | for inputs, outputs, model_name, model_version in requestGenerator( 412 | batched_image_data, input_name, output_name, dtype, FLAGS): 413 | sent_count += 1 414 | if FLAGS.streaming: 415 | triton_client.async_stream_infer( 416 | FLAGS.model_name, 417 | inputs, 418 | request_id=str(sent_count), 419 | model_version=FLAGS.model_version, 420 | outputs=outputs) 421 | elif FLAGS.async_set: 422 | if FLAGS.protocol.lower() == "grpc": 423 | triton_client.async_infer( 424 | FLAGS.model_name, 425 | inputs, 426 | partial(completion_callback, user_data), 427 | request_id=str(sent_count), 428 | model_version=FLAGS.model_version, 429 | outputs=outputs) 430 | else: 431 | async_requests.append( 432 | triton_client.async_infer( 433 | FLAGS.model_name, 434 | inputs, 435 | request_id=str(sent_count), 436 | model_version=FLAGS.model_version, 437 | outputs=outputs)) 438 | else: 439 | responses.append( 440 | triton_client.infer(FLAGS.model_name, 441 | inputs, 442 | request_id=str(sent_count), 443 | model_version=FLAGS.model_version, 444 | outputs=outputs)) 445 | 446 | except InferenceServerException as e: 447 | print("inference failed: " + str(e)) 448 | if FLAGS.streaming: 449 | triton_client.stop_stream() 450 | sys.exit(1) 451 | 452 | if FLAGS.streaming: 453 | triton_client.stop_stream() 454 | 455 | if FLAGS.protocol.lower() == "grpc": 456 | if FLAGS.streaming or FLAGS.async_set: 457 | processed_count = 0 458 | while processed_count < sent_count: 459 | (results, error) = user_data._completed_requests.get() 460 | processed_count += 1 461 | if error is not None: 462 | print("inference failed: " + str(error)) 463 | sys.exit(1) 464 | responses.append(results) 465 | else: 466 | if FLAGS.async_set: 467 | # Collect results from the ongoing async requests 468 | # for HTTP Async requests. 469 | for async_request in async_requests: 470 | responses.append(async_request.get_result()) 471 | 472 | for response in responses: 473 | if FLAGS.protocol.lower() == "grpc": 474 | this_id = response.get_response().id 475 | else: 476 | this_id = response.get_response()["id"] 477 | print("Request {}, batch size {}".format(this_id, FLAGS.batch_size)) 478 | postprocess(response, output_name, FLAGS.batch_size, max_batch_size > 0) 479 | 480 | print("PASS") 481 | -------------------------------------------------------------------------------- /examples/infer_resnet50_v1.5.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions 6 | # are met: 7 | # * Redistributions of source code must retain the above copyright 8 | # notice, this list of conditions and the following disclaimer. 9 | # * Redistributions in binary form must reproduce the above copyright 10 | # notice, this list of conditions and the following disclaimer in the 11 | # documentation and/or other materials provided with the distribution. 12 | # * Neither the name of NVIDIA CORPORATION nor the names of its 13 | # contributors may be used to endorse or promote products derived 14 | # from this software without specific prior written permission. 15 | # 16 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 17 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 19 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 20 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 21 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 22 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 23 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 24 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | 28 | docker run -it --rm \ 29 | --net=host \ 30 | -v`pwd`:/workspace/examples \ 31 | nvcr.io/nvidia/tritonserver:21.04-py3-sdk \ 32 | /bin/bash -c \ 33 | "python examples/infer_resnet50_v1.5.py -m ResNet50-v1.5 -c 3 examples/$1" 34 | -------------------------------------------------------------------------------- /examples/models/ERNIE/config.pbtxt: -------------------------------------------------------------------------------- 1 | name: "ERNIE" 2 | backend: "paddle" 3 | max_batch_size: 64 4 | input [ 5 | { 6 | name: "input_ids" 7 | data_type: TYPE_INT64 8 | dims: [ -1 ], 9 | }, 10 | { 11 | name: "token_type_ids" 12 | data_type: TYPE_INT64 13 | dims: [ -1 ], 14 | } 15 | ] 16 | output [ 17 | { 18 | name: "linear_113.tmp_1" 19 | data_type: TYPE_FP32 20 | dims: [ 15 ] 21 | } 22 | ] 23 | 24 | instance_group [ 25 | { 26 | count: 1 27 | kind: KIND_GPU 28 | #gpus: [ 0 ] 29 | } 30 | ] 31 | 32 | optimization { 33 | execution_accelerators { 34 | gpu_execution_accelerator : [ 35 | { 36 | name : "tensorrt" 37 | parameters { key: "precision" value: "trt_fp32" } 38 | parameters { key: "min_graph_size" value: "3" } 39 | parameters { key: "max_batch_size" value: "16" } 40 | parameters { key: "workspace_size" value: "2147483647" } 41 | parameters { key: "enable_tensorrt_oss" value: "0" } 42 | parameters { key: "is_dynamic" value: "1" } 43 | }, 44 | { 45 | name : "min_shape" 46 | parameters { key: "input_ids" value: "1 2" } 47 | parameters { key: "token_type_ids" value: "1 2" } 48 | }, 49 | { 50 | name : "max_shape" 51 | parameters { key: "input_ids" value: "16 128" } 52 | parameters { key: "token_type_ids" value: "16 128" } 53 | }, 54 | { 55 | name : "opt_shape" 56 | parameters { key: "input_ids" value: "16 128" } 57 | parameters { key: "token_type_ids" value: "16 128" } 58 | } 59 | ] 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /examples/models/ResNet50-v1.5/config.pbtxt: -------------------------------------------------------------------------------- 1 | name: "ResNet50-v1.5" 2 | backend: "paddle" 3 | max_batch_size: 128 4 | 5 | input [ 6 | { 7 | name: "x0" 8 | data_type: TYPE_FP32 9 | dims: [ 3, 224, 224 ] 10 | } 11 | ] 12 | 13 | output [ 14 | { 15 | name: "save_infer_model/scale_0.tmp_1" 16 | data_type: TYPE_FP32 17 | dims: [ 1000 ] 18 | } 19 | ] 20 | 21 | instance_group [ 22 | { 23 | count: 1 24 | kind: KIND_GPU 25 | gpus: [0] 26 | } 27 | ] 28 | 29 | dynamic_batching { 30 | preferred_batch_size: [ 64, 128 ] 31 | max_queue_delay_microseconds: 0 32 | } 33 | 34 | -------------------------------------------------------------------------------- /examples/perf_ernie.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | docker run -it --rm \ 28 | --net=host \ 29 | -v `pwd`/perf_data:/workspace/data \ 30 | nvcr.io/nvidia/tritonserver:21.10-py3-sdk \ 31 | /bin/bash -c \ 32 | 'for b in 1 2 4 8 16; do perf_analyzer -m ERNIE --shape input_ids:128 --shape token_type_ids:128 --input-data zero -b ${b}; done' 33 | -------------------------------------------------------------------------------- /examples/perf_resnet50_v1.5.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | docker run -it --rm \ 28 | --net=host \ 29 | nvcr.io/nvidia/tritonserver:21.10-py3-sdk \ 30 | /bin/bash -c \ 31 | 'for b in 1 2 4 8 16 32 64 128; do perf_analyzer -m ResNet50-v1.5 -b $b; done' 32 | -------------------------------------------------------------------------------- /paddle-lib/Dockerfile: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | FROM nvcr.io/nvidia/tritonserver:21.10-py3 28 | 29 | ENV DEBIAN_FRONTEND=noninteractive 30 | 31 | RUN apt-key del 7fa2af80 \ 32 | && wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.0-1_all.deb \ 33 | && dpkg -i cuda-keyring_1.0-1_all.deb 34 | 35 | RUN apt-get update \ 36 | && apt-get install -y --no-install-recommends \ 37 | cmake \ 38 | patchelf \ 39 | python3-dev \ 40 | unzip \ 41 | gcc-8 \ 42 | g++-8 \ 43 | libgl1 \ 44 | libssl-dev 45 | 46 | RUN git clone 'https://github.com/PaddlePaddle/Paddle.git' 47 | WORKDIR /opt/tritonserver/Paddle 48 | RUN git pull && git checkout release/2.3 49 | 50 | RUN python3 -m pip install pyyaml && mkdir build-env && \ 51 | cd build-env && \ 52 | cmake .. -DWITH_PYTHON=OFF \ 53 | -DWITH_GPU=ON \ 54 | -DWITH_TESTING=OFF \ 55 | -DWITH_INFERENCE_API_TEST=OFF \ 56 | -DCMAKE_BUILD_TYPE=Release \ 57 | -DCUDA_ARCH_NAME=Auto \ 58 | -DON_INFER=ON \ 59 | -DWITH_MKL=ON \ 60 | -DWITH_TENSORRT=ON \ 61 | -DWITH_ONNXRUNTIME=ON \ 62 | -DCMAKE_C_COMPILER=`which gcc-8` -DCMAKE_CXX_COMPILER=`which g++-8` && \ 63 | make -j`nproc` 64 | -------------------------------------------------------------------------------- /paddle-lib/build_paddle.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions 6 | # are met: 7 | # * Redistributions of source code must retain the above copyright 8 | # notice, this list of conditions and the following disclaimer. 9 | # * Redistributions in binary form must reproduce the above copyright 10 | # notice, this list of conditions and the following disclaimer in the 11 | # documentation and/or other materials provided with the distribution. 12 | # * Neither the name of NVIDIA CORPORATION nor the names of its 13 | # contributors may be used to endorse or promote products derived 14 | # from this software without specific prior written permission. 15 | # 16 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 17 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 19 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 20 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 21 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 22 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 23 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 24 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | 28 | set -xe 29 | 30 | docker build -t paddle-build . 31 | docker create --rm --name triton_paddle_build paddle-build:latest 32 | 33 | docker cp triton_paddle_build:/opt/tritonserver/Paddle/build-env/paddle_inference_install_dir/paddle . 34 | docker cp triton_paddle_build:/opt/tritonserver/Paddle/build-env/paddle_inference_install_dir/third_party/install/paddle2onnx . 35 | docker cp triton_paddle_build:/opt/tritonserver/Paddle/build-env/paddle_inference_install_dir/third_party/install/onnxruntime . 36 | docker cp triton_paddle_build:/opt/tritonserver/Paddle/build-env/paddle_inference_install_dir/third_party/install/mkldnn . 37 | docker cp triton_paddle_build:/opt/tritonserver/Paddle/build-env/paddle_inference_install_dir/third_party/install/mklml . 38 | 39 | rm paddle/lib/libpaddle_inference.a 40 | docker rm triton_paddle_build -------------------------------------------------------------------------------- /scripts/build_paddle_backend.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions 6 | # are met: 7 | # * Redistributions of source code must retain the above copyright 8 | # notice, this list of conditions and the following disclaimer. 9 | # * Redistributions in binary form must reproduce the above copyright 10 | # notice, this list of conditions and the following disclaimer in the 11 | # documentation and/or other materials provided with the distribution. 12 | # * Neither the name of NVIDIA CORPORATION nor the names of its 13 | # contributors may be used to endorse or promote products derived 14 | # from this software without specific prior written permission. 15 | # 16 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 17 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 19 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 20 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 21 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 22 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 23 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 24 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | if [ ! -d "./cmake-3.18.6-Linux-x86_64/" ]; then 28 | wget https://github.com/Kitware/CMake/releases/download/v3.18.6/cmake-3.18.6-Linux-x86_64.tar.gz 29 | tar -zxvf cmake-3.18.6-Linux-x86_64.tar.gz 30 | rm -rf cmake-3.18.6-Linux-x86_64.tar.gz 31 | fi 32 | 33 | docker run -it --rm \ 34 | -v`pwd`:/workspace/paddle_backend \ 35 | nvcr.io/nvidia/tritonserver:21.10-py3 \ 36 | bash -c \ 37 | 'cd /workspace/paddle_backend; rm -rf build; mkdir build; cd build;apt-get update; apt-get install -y --no-install-recommends rapidjson-dev;export PATH=/workspace/paddle_backend/cmake-3.18.6-Linux-x86_64/bin:$PATH;cmake .. -DPADDLE_INFERENCE_DIR=../paddle-lib/paddle -DTRITON_COMMON_REPO_TAG=r21.10 -DTRITON_CORE_REPO_TAG=r21.10 -DTRITON_BACKEND_REPO_TAG=r21.10; make -j`nproc`' 38 | -------------------------------------------------------------------------------- /scripts/launch_triton_server.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions 6 | # are met: 7 | # * Redistributions of source code must retain the above copyright 8 | # notice, this list of conditions and the following disclaimer. 9 | # * Redistributions in binary form must reproduce the above copyright 10 | # notice, this list of conditions and the following disclaimer in the 11 | # documentation and/or other materials provided with the distribution. 12 | # * Neither the name of NVIDIA CORPORATION nor the names of its 13 | # contributors may be used to endorse or promote products derived 14 | # from this software without specific prior written permission. 15 | # 16 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 17 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 19 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 20 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 21 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 22 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 23 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 24 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | 28 | set -xe 29 | 30 | docker build -t tritonserver-paddle . 31 | 32 | docker run --gpus=all --rm -it \ 33 | --net=host \ 34 | -e CUDA_VISIBLE_DEVICES=0 \ 35 | -v `pwd`/examples/models:/workspace/models \ 36 | tritonserver-paddle:latest /bin/bash -c \ 37 | '/opt/tritonserver/bin/tritonserver --model-repository=/workspace/models' 38 | -------------------------------------------------------------------------------- /src/libtriton_paddle.ldscript: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | { 27 | global: 28 | TRITONBACKEND_*; 29 | local: *; 30 | }; 31 | -------------------------------------------------------------------------------- /src/paddle.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 2 | // 3 | // Redistribution and use in source and binary forms, with or without 4 | // modification, are permitted provided that the following conditions 5 | // are met: 6 | // * Redistributions of source code must retain the above copyright 7 | // notice, this list of conditions and the following disclaimer. 8 | // * Redistributions in binary form must reproduce the above copyright 9 | // notice, this list of conditions and the following disclaimer in the 10 | // documentation and/or other materials provided with the distribution. 11 | // * Neither the name of NVIDIA CORPORATION nor the names of its 12 | // contributors may be used to endorse or promote products derived 13 | // from this software without specific prior written permission. 14 | // 15 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | #include 28 | #include 29 | #include 30 | 31 | #include "paddle_backend_utils.h" 32 | #include "paddle_inference_api.h" 33 | #include "triton/backend/backend_common.h" 34 | #include "triton/backend/backend_input_collector.h" 35 | #include "triton/backend/backend_model.h" 36 | #include "triton/backend/backend_model_instance.h" 37 | #include "triton/backend/backend_output_responder.h" 38 | 39 | struct TRITONPADDLE_Tensor; 40 | 41 | // Paddle Predictor Wrapper 42 | struct TRITONPADDLE_Model; 43 | 44 | class ModelImpl { 45 | public: 46 | ModelImpl( 47 | const char* model_path, const char* param_path, 48 | TRITONPADDLE_Config* config, const int32_t device_id, cudaStream_t stream); 49 | ~ModelImpl() = default; 50 | void CollectShapeRun(paddle_infer::Predictor* predictor, 51 | const std::map>& shape); 52 | void CollectTensorRtShapeRange(const char* model_path, const char* param_path, 53 | TRITONPADDLE_Config* config, 54 | const int32_t device_id); 55 | TRITONPADDLE_Error* Run(); 56 | 57 | TRITONPADDLE_Error* GetInputPtr( 58 | const char* name, const TRITONPADDLE_DataType dtype, 59 | const TRITONPADDLE_Shape& shape, char** ptr); 60 | 61 | TRITONPADDLE_Error* GetOutputMetadata( 62 | const char* name, TRITONPADDLE_DataType* dtype, TRITONPADDLE_Shape* shape, 63 | char** ptr); 64 | 65 | TRITONPADDLE_Error* ZeroCopyRun(); 66 | 67 | private: 68 | // TODO(wilber): unique_ptr? 69 | std::unique_ptr analysis_config_; 70 | std::shared_ptr predictor_; 71 | paddle_infer::PlaceType place_type_; 72 | std::string shape_range_info_; 73 | }; 74 | 75 | void ModelImpl::CollectShapeRun(paddle_infer::Predictor* predictor, 76 | const std::map>& shape) { 77 | auto input_names = predictor->GetInputNames(); 78 | auto input_type = predictor->GetInputTypes(); 79 | for(auto name : input_names) { 80 | if(shape.find(name) == shape.end() or 81 | input_type.find(name) == input_type.end()) { 82 | TRITONPADDLE_Error* error = TRITONPADDLE_ErrorNew( 83 | std::string("Paddle Input name [") + std::string(name) + 84 | std::string("] is not one of the trt dynamic_shape")); 85 | THROW_IF_TRITONPADDLE_ERROR(error); 86 | } 87 | 88 | auto tensor = predictor->GetInputHandle(name); 89 | auto shape_value = shape.at(name); 90 | int shape_num = std::accumulate(shape_value.begin(), shape_value.end(), 1, 91 | std::multiplies()); 92 | tensor->Reshape(shape_value); 93 | auto dtype = input_type[name]; 94 | switch (dtype) { 95 | case paddle_infer::DataType::FLOAT32: { 96 | std::vector input_data(shape_num, 1.0); 97 | tensor->CopyFromCpu(input_data.data()); 98 | break; 99 | } 100 | case paddle_infer::DataType::INT32: { 101 | std::vector input_data(shape_num, 1); 102 | tensor->CopyFromCpu(input_data.data()); 103 | break; 104 | } 105 | case paddle_infer::DataType::INT64: { 106 | std::vector input_data(shape_num, 1); 107 | tensor->CopyFromCpu(input_data.data()); 108 | break; 109 | } 110 | case paddle_infer::DataType::FLOAT16: { 111 | std::vector input_data(shape_num, (phi::dtype::float16)1.0); 112 | tensor->CopyFromCpu(input_data.data()); 113 | break; 114 | } 115 | default: { 116 | TRITONPADDLE_Error* error = TRITONPADDLE_ErrorNew(std::string( 117 | "input data Paddle backend only supports FP32/INT32/INT64 currently")); 118 | THROW_IF_TRITONPADDLE_ERROR(error); 119 | break; 120 | } 121 | } 122 | } 123 | predictor->Run(); 124 | } 125 | 126 | void ModelImpl::CollectTensorRtShapeRange(const char* model_path, const char* param_path, 127 | TRITONPADDLE_Config* config, 128 | const int32_t device_id) { 129 | paddle_infer::Config analysis_config; 130 | if (param_path == nullptr) { 131 | analysis_config.SetModel(model_path, ""); 132 | } else { 133 | analysis_config.SetModel(model_path, param_path); 134 | } 135 | // analysis_config.EnableUseGpu(100, device_id); 136 | analysis_config.CollectShapeRangeInfo(shape_range_info_); 137 | auto predictor = paddle_infer::CreatePredictor(analysis_config); 138 | CollectShapeRun(predictor.get(), config->dynamic_min_shape_); 139 | CollectShapeRun(predictor.get(), config->dynamic_max_shape_); 140 | CollectShapeRun(predictor.get(), config->dynamic_opt_shape_); 141 | } 142 | 143 | ModelImpl::ModelImpl( 144 | const char* model_path, const char* param_path, TRITONPADDLE_Config* config, 145 | const int32_t device_id, cudaStream_t stream) 146 | { 147 | analysis_config_.reset(new paddle_infer::Config()); 148 | 149 | if (param_path == nullptr) { 150 | analysis_config_->SetModel(model_path, ""); 151 | } else { 152 | analysis_config_->SetModel(model_path, param_path); 153 | } 154 | 155 | // default settings 156 | analysis_config_->SwitchSpecifyInputNames(true); 157 | analysis_config_->SwitchIrOptim(true); 158 | analysis_config_->EnableMemoryOptim(); 159 | analysis_config_->SwitchUseFeedFetchOps(false); 160 | 161 | if (config->use_cpu_) { 162 | place_type_ = paddle_infer::PlaceType::kCPU; 163 | analysis_config_->SetCpuMathLibraryNumThreads(config->cpu_math_library_num_threads_); 164 | if(config->use_ort_) { 165 | analysis_config_->EnableONNXRuntime(); 166 | analysis_config_->EnableORTOptimization(); 167 | } else if(config->use_mkldnn_) { 168 | analysis_config_->EnableMKLDNN(); 169 | analysis_config_->SetMkldnnCacheCapacity(config->mkldnn_capacity_); 170 | // Release/2.3 don't support mkldnn_int8 171 | // if(config->use_mkldnn_int8_) 172 | // analysis_config_->EnableMkldnnInt8(); 173 | } 174 | } else { 175 | place_type_ = paddle_infer::PlaceType::kGPU; 176 | analysis_config_->EnableUseGpu(100, device_id); 177 | analysis_config_->SetExecStream((void*)stream); 178 | 179 | paddle::AnalysisConfig::Precision compute_precision; 180 | compute_precision = paddle::AnalysisConfig::Precision::kFloat32; 181 | if (config->precision_ == TRITONPADDLE_MODE_FP32) { 182 | compute_precision = paddle::AnalysisConfig::Precision::kFloat32; 183 | } else if (config->precision_ == TRITONPADDLE_MODE_FP16) { 184 | compute_precision = paddle::AnalysisConfig::Precision::kHalf; 185 | } else if (config->precision_ == TRITONPADDLE_MODE_INT8) { 186 | compute_precision = paddle::AnalysisConfig::Precision::kInt8; 187 | } else { 188 | TRITONPADDLE_Error* error = TRITONPADDLE_ErrorNew( 189 | "unknown precision type when setting tensorrt compute precision."); 190 | THROW_IF_TRITONPADDLE_ERROR(error); 191 | } 192 | 193 | if (config->use_trt_) { 194 | analysis_config_->EnableTensorRtEngine( 195 | config->workspace_size_, config->max_batch_size_, 196 | config->min_graph_size_, compute_precision, false, false); 197 | if (config->enable_tensorrt_oss_) { 198 | analysis_config_->EnableVarseqlen(); 199 | } 200 | if (config->is_dynamic_) { 201 | shape_range_info_ = triton::backend::JoinPath({config->model_dir_, "shape_range_info.pbtxt"}); 202 | if (!config->disenable_trt_tune_) { 203 | CollectTensorRtShapeRange(model_path, param_path, config, device_id); 204 | } 205 | analysis_config_->EnableTunedTensorRtDynamicShape(shape_range_info_); 206 | } 207 | } 208 | } 209 | predictor_ = std::move(paddle_infer::CreatePredictor(*analysis_config_.get())); 210 | } 211 | 212 | TRITONPADDLE_Error* 213 | ModelImpl::Run() 214 | { 215 | predictor_->Run(); 216 | 217 | // TODO: paddle predictor stream controll 218 | if(analysis_config_->use_gpu()) 219 | cudaDeviceSynchronize(); 220 | return nullptr; 221 | } 222 | 223 | TRITONPADDLE_Error* 224 | ModelImpl::GetInputPtr( 225 | const char* name, const TRITONPADDLE_DataType dtype, 226 | const TRITONPADDLE_Shape& shape, char** ptr) 227 | { 228 | auto input_names = predictor_->GetInputNames(); 229 | 230 | // check whether the given name is in predictor_ input names 231 | if (std::find(input_names.begin(), input_names.end(), std::string(name)) == 232 | input_names.end()) { 233 | return TRITONPADDLE_ErrorNew( 234 | std::string("Input name [") + std::string(name) + 235 | std::string("] is not one of the Paddle predictor input")); 236 | } 237 | 238 | auto tensor = predictor_->GetInputHandle(name); 239 | tensor->Reshape(shape.CompatibleShape()); 240 | switch (dtype) { 241 | case TRITONPADDLE_TYPE_FP32: 242 | *ptr = reinterpret_cast( 243 | tensor->mutable_data(place_type_)); 244 | break; 245 | case TRITONPADDLE_TYPE_INT32: 246 | *ptr = reinterpret_cast( 247 | tensor->mutable_data(place_type_)); 248 | break; 249 | case TRITONPADDLE_TYPE_INT64: 250 | *ptr = reinterpret_cast( 251 | tensor->mutable_data(place_type_)); 252 | break; 253 | case TRITONPADDLE_TYPE_FP16: 254 | *ptr = reinterpret_cast( 255 | tensor->mutable_data(place_type_)); 256 | break; 257 | default: 258 | return TRITONPADDLE_ErrorNew(std::string( 259 | "Paddle backend only supports FP32/INT32/INT64 currently")); 260 | } 261 | 262 | return nullptr; 263 | } 264 | 265 | TRITONPADDLE_Error* 266 | ModelImpl::GetOutputMetadata( 267 | const char* name, TRITONPADDLE_DataType* dtype, TRITONPADDLE_Shape* shape, 268 | char** ptr) 269 | { 270 | auto output_names = predictor_->GetOutputNames(); 271 | 272 | // check whether the given name is in predictor_ output names 273 | if (std::find(output_names.begin(), output_names.end(), std::string(name)) == 274 | output_names.end()) { 275 | return TRITONPADDLE_ErrorNew( 276 | std::string("Output name [") + std::string(name) + 277 | std::string("] is not one of the Paddle predictor input")); 278 | } 279 | 280 | auto tensor = predictor_->GetOutputHandle(name); 281 | auto tensor_type = tensor->type(); 282 | auto tensor_shape = tensor->shape(); 283 | 284 | *dtype = ConvertDataType(tensor_type); 285 | *shape = TRITONPADDLE_Shape(tensor_shape); 286 | 287 | switch (*dtype) { 288 | case TRITONPADDLE_TYPE_FP32: 289 | *ptr = reinterpret_cast( 290 | tensor->mutable_data(place_type_)); 291 | break; 292 | case TRITONPADDLE_TYPE_INT64: 293 | *ptr = reinterpret_cast( 294 | tensor->mutable_data(place_type_)); 295 | break; 296 | case TRITONPADDLE_TYPE_INT32: 297 | *ptr = reinterpret_cast( 298 | tensor->mutable_data(place_type_)); 299 | break; 300 | case TRITONPADDLE_TYPE_FP16: 301 | *ptr = reinterpret_cast( 302 | tensor->mutable_data(place_type_)); 303 | break; 304 | /* 305 | case TRITONPADDLE_TYPE_INT8: 306 | *ptr = reinterpret_cast( 307 | tensor->mutable_data(place_type_)); 308 | break; 309 | case TRITONPADDLE_TYPE_UINT8: 310 | *ptr = reinterpret_cast( 311 | tensor->mutable_data(place_type_)); 312 | break; 313 | */ 314 | default: 315 | return TRITONPADDLE_ErrorNew(std::string( 316 | "Paddle backend currently only support FP32/INT32/INT64")); 317 | } 318 | 319 | return nullptr; 320 | } 321 | 322 | TRITONSERVER_Error* 323 | TRITONPADDLE_ModelCreate( 324 | TRITONPADDLE_Model** model, const char* model_path, const char* param_path, 325 | TRITONPADDLE_Config* config, const int32_t device_id, cudaStream_t stream) 326 | { 327 | try { 328 | ModelImpl* model_impl = 329 | new ModelImpl(model_path, param_path, config, device_id, stream); 330 | *model = reinterpret_cast(model_impl); 331 | } 332 | catch (const TRITONPADDLE_Exception& ex) { 333 | RETURN_IF_TRITONPADDLE_ERROR(ex.err_); 334 | } 335 | return nullptr; 336 | } 337 | 338 | void 339 | TRITONPADDLE_ModelDelete(TRITONPADDLE_Model* model) 340 | { 341 | if (model != nullptr) { 342 | ModelImpl* mi = reinterpret_cast(model); 343 | delete mi; 344 | } 345 | } 346 | 347 | TRITONPADDLE_Error* 348 | TRITONPADDLE_ModelRun(TRITONPADDLE_Model* model) 349 | { 350 | ModelImpl* m = reinterpret_cast(model); 351 | return m->Run(); 352 | } 353 | 354 | class TensorImpl { 355 | public: 356 | TensorImpl( 357 | const char* name, TRITONPADDLE_DataType dtype, 358 | const TRITONPADDLE_Shape& shape, char* data_ptr); 359 | ~TensorImpl() = default; 360 | 361 | const std::string& Name() const { return name_; } 362 | TRITONPADDLE_DataType DataType() const { return dtype_; } 363 | TRITONPADDLE_Shape Shape() const { return shape_; } 364 | 365 | char* Base() const { return base_; } 366 | size_t ByteSize() const { return byte_size_; } 367 | 368 | private: 369 | const std::string name_; 370 | const TRITONPADDLE_DataType dtype_; 371 | const TRITONPADDLE_Shape shape_; 372 | 373 | char* base_; 374 | size_t byte_size_; 375 | }; 376 | 377 | TensorImpl::TensorImpl( 378 | const char* name, TRITONPADDLE_DataType dtype, 379 | const TRITONPADDLE_Shape& shape, char* data_ptr) 380 | : name_(name), dtype_(dtype), shape_(shape), base_(data_ptr) 381 | { 382 | byte_size_ = shape.NumElements() * TRITONPADDLE_DataTypeByteSize(dtype); 383 | } 384 | 385 | TRITONPADDLE_Tensor* 386 | TRITONPADDLE_TensorNew( 387 | TRITONPADDLE_Model* model, const char* name, TRITONPADDLE_DataType dtype, 388 | const TRITONPADDLE_Shape& shape) 389 | { 390 | char* data_ptr; 391 | ModelImpl* m = reinterpret_cast(model); 392 | auto err = m->GetInputPtr(name, dtype, shape, &data_ptr); 393 | if (err != nullptr) { 394 | return nullptr; 395 | } 396 | 397 | TensorImpl* tensor = new TensorImpl(name, dtype, shape, data_ptr); 398 | return reinterpret_cast(tensor); 399 | } 400 | 401 | TRITONPADDLE_Tensor* 402 | TRITONPADDLE_TensorNew(TRITONPADDLE_Model* model, const char* name) 403 | { 404 | char* data_ptr; 405 | TRITONPADDLE_DataType dtype; 406 | TRITONPADDLE_Shape shape; 407 | 408 | ModelImpl* m = reinterpret_cast(model); 409 | auto err = m->GetOutputMetadata(name, &dtype, &shape, &data_ptr); 410 | if (err != nullptr) { 411 | return nullptr; 412 | } 413 | 414 | TensorImpl* tensor = new TensorImpl(name, dtype, shape, data_ptr); 415 | return reinterpret_cast(tensor); 416 | } 417 | 418 | char* 419 | TRITONPADDLE_TensorData(TRITONPADDLE_Tensor* tensor) 420 | { 421 | TensorImpl* t = reinterpret_cast(tensor); 422 | return t->Base(); 423 | } 424 | 425 | size_t 426 | TRITONPADDLE_TensorDataByteSize(TRITONPADDLE_Tensor* tensor) 427 | { 428 | TensorImpl* t = reinterpret_cast(tensor); 429 | return t->ByteSize(); 430 | } 431 | 432 | TRITONPADDLE_DataType 433 | TRITONPADDLE_TensorDataType(TRITONPADDLE_Tensor* tensor) 434 | { 435 | TensorImpl* t = reinterpret_cast(tensor); 436 | return t->DataType(); 437 | } 438 | 439 | TRITONPADDLE_Shape 440 | TRITONPADDLE_TensorShape(TRITONPADDLE_Tensor* tensor) 441 | { 442 | TensorImpl* t = reinterpret_cast(tensor); 443 | return t->Shape(); 444 | } 445 | 446 | namespace triton { namespace backend { namespace paddle { 447 | 448 | using TRITONPADDLEModelHandle = std::shared_ptr; 449 | 450 | class ModelState : public BackendModel { 451 | public: 452 | static TRITONSERVER_Error* Create( 453 | TRITONBACKEND_Model* triton_model, ModelState** state); 454 | virtual ~ModelState() = default; 455 | TRITONPADDLE_Config* PaddleConfig() { return &config_; } 456 | 457 | private: 458 | ModelState(TRITONBACKEND_Model* triton_model); 459 | 460 | // Auto-complete the model configuration 461 | TRITONSERVER_Error* AutoCompleteConfig(); 462 | 463 | // Validate that model configuration is supported by this backend 464 | TRITONSERVER_Error* ValidateModelConfig(); 465 | 466 | TRITONPADDLE_Config config_; 467 | }; 468 | 469 | TRITONSERVER_Error* 470 | ModelState::Create(TRITONBACKEND_Model* triton_model, ModelState** state) 471 | { 472 | try { 473 | *state = new ModelState(triton_model); 474 | } 475 | catch (const BackendModelException& ex) { 476 | RETURN_ERROR_IF_TRUE( 477 | ex.err_ == nullptr, TRITONSERVER_ERROR_INTERNAL, 478 | std::string("unexpected nullptr in BackendModelException")); 479 | RETURN_IF_ERROR(ex.err_); 480 | } 481 | 482 | // Auto-complete the configuration if requested... 483 | bool auto_complete_config = false; 484 | RETURN_IF_ERROR(TRITONBACKEND_ModelAutoCompleteConfig( 485 | triton_model, &auto_complete_config)); 486 | if (auto_complete_config) { 487 | RETURN_IF_ERROR((*state)->AutoCompleteConfig()); 488 | 489 | triton::common::TritonJson::WriteBuffer json_buffer; 490 | (*state)->ModelConfig().Write(&json_buffer); 491 | 492 | TRITONSERVER_Message* message; 493 | RETURN_IF_ERROR(TRITONSERVER_MessageNewFromSerializedJson( 494 | &message, json_buffer.Base(), json_buffer.Size())); 495 | RETURN_IF_ERROR(TRITONBACKEND_ModelSetConfig( 496 | triton_model, 1 /* config_version */, message)); 497 | } 498 | 499 | RETURN_IF_ERROR((*state)->ValidateModelConfig()); 500 | 501 | return nullptr; // success 502 | } 503 | 504 | ModelState::ModelState(TRITONBACKEND_Model* triton_model) 505 | : BackendModel(triton_model) 506 | { 507 | 508 | triton::common::TritonJson::Value optimization; 509 | if (not ModelConfig().Find("optimization", &optimization)) { 510 | return; 511 | } 512 | 513 | triton::common::TritonJson::Value eas; 514 | if (not optimization.Find("execution_accelerators", &eas)) { 515 | return; 516 | } 517 | 518 | // CPU execution providers 519 | { 520 | triton::common::TritonJson::Value cpu_eas; 521 | if (eas.Find("cpu_execution_accelerator", &cpu_eas)) { 522 | for (size_t idx = 0; idx < cpu_eas.ArraySize(); idx++) { 523 | triton::common::TritonJson::Value ea; 524 | THROW_IF_BACKEND_MODEL_ERROR(cpu_eas.IndexAsObject(idx, &ea)); 525 | std::string name; 526 | THROW_IF_BACKEND_MODEL_ERROR(ea.MemberAsString("name", &name)); 527 | if (name == "mkldnn") { 528 | config_.use_mkldnn_ = true; 529 | } else if (name == "ort") { 530 | config_.use_ort_ = true; 531 | } else if (name != "") { 532 | TRITONSERVER_Error* error = TRITONSERVER_ErrorNew( 533 | TRITONSERVER_ERROR_INVALID_ARG, 534 | std::string( 535 | "unknown cpu_execution_accelerator name '" + name + 536 | "' is provided. Available choices are [mkldnn, ort]") 537 | .c_str()); 538 | THROW_IF_BACKEND_MODEL_ERROR(error); 539 | } 540 | triton::common::TritonJson::Value params; 541 | if (ea.Find("parameters", ¶ms)) { 542 | std::vector param_keys; 543 | THROW_IF_BACKEND_MODEL_ERROR(params.Members(¶m_keys)); 544 | for (const auto& param_key : param_keys) { 545 | std::string value_string; 546 | if (param_key == "cpu_threads") { 547 | THROW_IF_BACKEND_MODEL_ERROR( 548 | params.MemberAsString(param_key.c_str(), &value_string)); 549 | THROW_IF_BACKEND_MODEL_ERROR( 550 | ParseIntValue(value_string, &config_.cpu_math_library_num_threads_)); 551 | } else if (param_key == "capacity") { 552 | THROW_IF_BACKEND_MODEL_ERROR( 553 | params.MemberAsString(param_key.c_str(), &value_string)); 554 | THROW_IF_BACKEND_MODEL_ERROR( 555 | ParseIntValue(value_string, &config_.mkldnn_capacity_)); 556 | } else if (param_key == "use_int8") { 557 | THROW_IF_BACKEND_MODEL_ERROR( 558 | params.MemberAsString(param_key.c_str(), &value_string)); 559 | THROW_IF_BACKEND_MODEL_ERROR( 560 | ParseBoolValue(value_string, &config_.use_mkldnn_int8_)); 561 | } 562 | } 563 | } 564 | } 565 | } 566 | } 567 | 568 | // GPU execution providers 569 | { 570 | triton::common::TritonJson::Value gpu_eas; 571 | if (eas.Find("gpu_execution_accelerator", &gpu_eas)) { 572 | for (size_t idx = 0; idx < gpu_eas.ArraySize(); idx++) { 573 | triton::common::TritonJson::Value ea; 574 | THROW_IF_BACKEND_MODEL_ERROR(gpu_eas.IndexAsObject(idx, &ea)); 575 | std::string name; 576 | THROW_IF_BACKEND_MODEL_ERROR(ea.MemberAsString("name", &name)); 577 | 578 | if (name == "tensorrt") { 579 | config_.use_trt_ = true; 580 | triton::common::TritonJson::Value params; 581 | if (ea.Find("parameters", ¶ms)) { 582 | std::vector param_keys; 583 | THROW_IF_BACKEND_MODEL_ERROR(params.Members(¶m_keys)); 584 | for (const auto& param_key : param_keys) { 585 | std::string value_string; 586 | if (param_key == "precision") { 587 | THROW_IF_BACKEND_MODEL_ERROR( 588 | params.MemberAsString(param_key.c_str(), &value_string)); 589 | std::transform( 590 | value_string.begin(), value_string.end(), value_string.begin(), 591 | ::tolower); 592 | if (value_string == "trt_fp32") { 593 | config_.precision_ = TRITONPADDLE_MODE_FP32; 594 | } else if (value_string == "trt_fp16") { 595 | config_.precision_ = TRITONPADDLE_MODE_FP16; 596 | } else if (value_string == "trt_int8") { 597 | config_.precision_ = TRITONPADDLE_MODE_INT8; 598 | } else { 599 | TRITONSERVER_Error* error = TRITONSERVER_ErrorNew( 600 | TRITONSERVER_ERROR_INVALID_ARG, 601 | std::string( 602 | "unknown precision type '" + value_string + 603 | "' is provided. Available choices are [fluid, trt_fp32, " 604 | "trt_fp16, trt_int8]") 605 | .c_str()); 606 | THROW_IF_BACKEND_MODEL_ERROR(error); 607 | } 608 | } else if (param_key == "min_graph_size") { 609 | THROW_IF_BACKEND_MODEL_ERROR( 610 | params.MemberAsString(param_key.c_str(), &value_string)); 611 | THROW_IF_BACKEND_MODEL_ERROR( 612 | ParseLongLongValue(value_string, &config_.min_graph_size_)); 613 | } else if (param_key == "workspace_size") { 614 | THROW_IF_BACKEND_MODEL_ERROR( 615 | params.MemberAsString(param_key.c_str(), &value_string)); 616 | THROW_IF_BACKEND_MODEL_ERROR( 617 | ParseLongLongValue(value_string, &config_.workspace_size_)); 618 | } else if (param_key == "max_batch_size") { 619 | THROW_IF_BACKEND_MODEL_ERROR( 620 | params.MemberAsString(param_key.c_str(), &value_string)); 621 | THROW_IF_BACKEND_MODEL_ERROR( 622 | ParseLongLongValue(value_string, &config_.max_batch_size_)); 623 | } else if (param_key == "enable_tensorrt_oss") { 624 | THROW_IF_BACKEND_MODEL_ERROR( 625 | params.MemberAsString(param_key.c_str(), &value_string)); 626 | THROW_IF_BACKEND_MODEL_ERROR( 627 | ParseBoolValue(value_string, &config_.enable_tensorrt_oss_)); 628 | } else if (param_key == "is_dynamic") { 629 | THROW_IF_BACKEND_MODEL_ERROR( 630 | params.MemberAsString(param_key.c_str(), &value_string)); 631 | THROW_IF_BACKEND_MODEL_ERROR( 632 | ParseBoolValue(value_string, &config_.is_dynamic_)); 633 | } else if (param_key == "disenable_trt_tune") { 634 | THROW_IF_BACKEND_MODEL_ERROR( 635 | params.MemberAsString(param_key.c_str(), &value_string)); 636 | THROW_IF_BACKEND_MODEL_ERROR( 637 | ParseBoolValue(value_string, &config_.disenable_trt_tune_)); 638 | } else { 639 | TRITONSERVER_Error* error = TRITONSERVER_ErrorNew( 640 | TRITONSERVER_ERROR_INVALID_ARG, 641 | std::string( 642 | "unknown parameter '" + param_key + 643 | "' is provided for GPU execution accelerator " 644 | "config. Available choices are [precision, " 645 | "min_graph_size, workspace_size, max_batch_size, " 646 | "enable_tensorrt_oss, is_dynamic]") 647 | .c_str()); 648 | THROW_IF_BACKEND_MODEL_ERROR(error); 649 | } 650 | } 651 | } 652 | } else if ( 653 | name == "min_shape" or name == "max_shape" or name == "opt_shape") { 654 | triton::common::TritonJson::Value params; 655 | if (ea.Find("parameters", ¶ms)) { 656 | std::vector input_names; 657 | THROW_IF_BACKEND_MODEL_ERROR(params.Members(&input_names)); 658 | for (const auto& input_name : input_names) { 659 | std::string str_shape; 660 | THROW_IF_BACKEND_MODEL_ERROR( 661 | params.MemberAsString(input_name.c_str(), &str_shape)); 662 | if (name == "min_shape") { 663 | config_.dynamic_min_shape_[input_name] = 664 | TRITONPADDLE_Shape(str_shape).CompatibleShape(); 665 | } else if (name == "max_shape") { 666 | config_.dynamic_max_shape_[input_name] = 667 | TRITONPADDLE_Shape(str_shape).CompatibleShape(); 668 | } else { 669 | config_.dynamic_opt_shape_[input_name] = 670 | TRITONPADDLE_Shape(str_shape).CompatibleShape(); 671 | } 672 | } 673 | } 674 | } else { 675 | TRITONSERVER_Error* error = TRITONSERVER_ErrorNew( 676 | TRITONSERVER_ERROR_INVALID_ARG, 677 | std::string( 678 | "unknown name '" + name + 679 | "' is provided for GPU execution accelerator " 680 | "Available choices are [config, min_shape, max_shape, opt_shape]") 681 | .c_str()); 682 | THROW_IF_BACKEND_MODEL_ERROR(error); 683 | } 684 | } 685 | } 686 | } 687 | } 688 | 689 | TRITONSERVER_Error* 690 | ModelState::AutoCompleteConfig() 691 | { 692 | // Auto-complete configuration if requests 693 | LOG_MESSAGE( 694 | TRITONSERVER_LOG_WARN, 695 | (std::string("skipping model configuration auto-complete for '") + 696 | Name() + "': not supported for paddle backend") 697 | .c_str()); 698 | 699 | return nullptr; // success 700 | } 701 | 702 | TRITONSERVER_Error* 703 | ModelState::ValidateModelConfig() 704 | { 705 | triton::common::TritonJson::WriteBuffer buffer; 706 | RETURN_IF_ERROR(ModelConfig().PrettyWrite(&buffer)); 707 | LOG_MESSAGE( 708 | TRITONSERVER_LOG_VERBOSE, 709 | (std::string("model configuration:\n") + buffer.Contents()).c_str()); 710 | 711 | triton::common::TritonJson::Value ios; 712 | RETURN_IF_ERROR(ModelConfig().MemberAsArray("input", &ios)); 713 | for (size_t i = 0; i < ios.ArraySize(); i++) { 714 | triton::common::TritonJson::Value io; 715 | RETURN_IF_ERROR(ios.IndexAsObject(i, &io)); 716 | std::string io_name; 717 | RETURN_IF_ERROR(io.MemberAsString("name", &io_name)); 718 | // Check datatypes 719 | std::string io_dtype; 720 | RETURN_IF_ERROR(io.MemberAsString("data_type", &io_dtype)); 721 | RETURN_ERROR_IF_TRUE( 722 | ConvertDataType(io_dtype) == 723 | TRITONPADDLE_DataType::TRITONPADDLE_TYPE_INVALID, 724 | TRITONSERVER_ERROR_INVALID_ARG, 725 | std::string("unsupported datatype '") + io_dtype + "' for tensor '" + 726 | io_name + "' for model '" + Name() + "'"); 727 | } 728 | RETURN_IF_ERROR(ModelConfig().MemberAsArray("output", &ios)); 729 | for (size_t i = 0; i < ios.ArraySize(); i++) { 730 | triton::common::TritonJson::Value io; 731 | RETURN_IF_ERROR(ios.IndexAsObject(i, &io)); 732 | std::string io_name; 733 | RETURN_IF_ERROR(io.MemberAsString("name", &io_name)); 734 | // Check datatypes 735 | std::string io_dtype; 736 | RETURN_IF_ERROR(io.MemberAsString("data_type", &io_dtype)); 737 | RETURN_ERROR_IF_TRUE( 738 | ConvertDataType(io_dtype) == 739 | TRITONPADDLE_DataType::TRITONPADDLE_TYPE_INVALID, 740 | TRITONSERVER_ERROR_INVALID_ARG, 741 | std::string("unsupported datatype '") + io_dtype + "' for tensor '" + 742 | io_name + "' for model '" + Name() + "'"); 743 | } 744 | 745 | return nullptr; // success 746 | } 747 | 748 | class ModelInstanceState : public BackendModelInstance { 749 | public: 750 | static TRITONSERVER_Error* Create( 751 | ModelState* model_state, 752 | TRITONBACKEND_ModelInstance* triton_model_instance, 753 | ModelInstanceState** state); 754 | virtual ~ModelInstanceState() = default; 755 | 756 | // Get the state of the model that corresponds to this instance. 757 | ModelState* StateForModel() const { return model_state_; } 758 | 759 | void ProcessRequests( 760 | TRITONBACKEND_Request** requests, const uint32_t request_count); 761 | 762 | private: 763 | ModelInstanceState( 764 | ModelState* model_state, 765 | TRITONBACKEND_ModelInstance* triton_model_instance); 766 | 767 | TRITONSERVER_Error* DetermineModelAndParamsPath( 768 | const std::string& model_dir, std::string* model_path, 769 | std::string* param_path); 770 | 771 | void SetInputTensors( 772 | size_t total_batch_size, TRITONBACKEND_Request** requests, 773 | const uint32_t request_count, 774 | std::vector* responses); 775 | 776 | void ReadOutputTensors( 777 | size_t total_batch_size, const std::vector& output_names, 778 | TRITONBACKEND_Request** requests, const uint32_t request_count, 779 | std::vector* responses); 780 | 781 | ModelState* model_state_; 782 | TRITONPADDLEModelHandle triton_paddle_model_; 783 | }; 784 | 785 | TRITONSERVER_Error* 786 | ModelInstanceState::Create( 787 | ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance, 788 | ModelInstanceState** state) 789 | { 790 | try { 791 | *state = new ModelInstanceState(model_state, triton_model_instance); 792 | } 793 | catch (const BackendModelInstanceException& ex) { 794 | RETURN_ERROR_IF_TRUE( 795 | ex.err_ == nullptr, TRITONSERVER_ERROR_INTERNAL, 796 | std::string("unexpected nullptr in BackendModelInstanceException")); 797 | RETURN_IF_ERROR(ex.err_); 798 | } 799 | 800 | return nullptr; // success 801 | } 802 | 803 | TRITONSERVER_Error* 804 | ModelInstanceState::DetermineModelAndParamsPath( 805 | const std::string& model_dir, std::string* model_path, 806 | std::string* param_path) 807 | { 808 | bool exists; 809 | *model_path = JoinPath({model_dir, "model.pdmodel"}); 810 | RETURN_IF_ERROR(FileExists(*model_path, &exists)); 811 | if (not exists) { 812 | return TRITONSERVER_ErrorNew( 813 | TRITONSERVER_ERROR_NOT_FOUND, 814 | std::string( 815 | "Paddle model should be named as 'model.pdmodel'").c_str()); 816 | } 817 | 818 | *param_path = JoinPath({model_dir, "model.pdiparams"}); 819 | RETURN_IF_ERROR(FileExists(*param_path, &exists)); 820 | if (not exists) { 821 | LOG_MESSAGE( 822 | TRITONSERVER_LOG_INFO, 823 | (std::string("Paddle params should be named as 'model.pdiparams' or not provided.").c_str())); 824 | *param_path = ""; 825 | } 826 | 827 | return nullptr; 828 | } 829 | 830 | ModelInstanceState::ModelInstanceState( 831 | ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance) 832 | : BackendModelInstance(model_state, triton_model_instance), 833 | model_state_(model_state) 834 | { 835 | auto config = model_state->PaddleConfig(); 836 | auto model_dir = JoinPath( 837 | {model_state->RepositoryPath(), std::to_string(model_state->Version())}); 838 | config->model_dir_ = model_dir; 839 | 840 | std::string model_path; 841 | std::string param_path; 842 | THROW_IF_BACKEND_INSTANCE_ERROR( 843 | DetermineModelAndParamsPath(model_dir, &model_path, ¶m_path)); 844 | 845 | switch (Kind()) { 846 | case TRITONSERVER_INSTANCEGROUPKIND_CPU: 847 | config->use_cpu_ = true; 848 | break; 849 | case TRITONSERVER_INSTANCEGROUPKIND_GPU: 850 | config->use_cpu_ = false; 851 | break; 852 | default: 853 | throw BackendModelInstanceException(TRITONSERVER_ErrorNew( 854 | TRITONSERVER_ERROR_INTERNAL, 855 | (std::string("unexpected instance kind for ") + name_ + 856 | ", paddle_backend only supports CPU/GPU.") 857 | .c_str())); 858 | } 859 | 860 | TRITONPADDLE_Model* triton_paddle_model = nullptr; 861 | THROW_IF_BACKEND_INSTANCE_ERROR(TRITONPADDLE_ModelCreate( 862 | &triton_paddle_model, model_path.c_str(), 863 | param_path.empty() ? nullptr : param_path.c_str(), 864 | config, DeviceId(), CudaStream())); 865 | triton_paddle_model_.reset(triton_paddle_model, TRITONPADDLE_ModelDelete); 866 | } 867 | 868 | void 869 | ModelInstanceState::SetInputTensors( 870 | size_t total_batch_size, TRITONBACKEND_Request** requests, 871 | const uint32_t request_count, 872 | std::vector* responses) 873 | { 874 | // TRITONSERVER_Error* 875 | // ModelInstanceState::SetInputTensors( 876 | // size_t total_batch_size, TRITONBACKEND_Request** requests, 877 | // const uint32_t request_count, 878 | // std::vector* responses, 879 | // BackendInputCollector* collector, std::vector* input_names, 880 | // bool* cuda_copy) 881 | // { 882 | bool cuda_copy = false; 883 | BackendInputCollector collector( 884 | requests, request_count, responses, 885 | StateForModel()->TritonMemoryManager(), 886 | StateForModel()->EnablePinnedInput(), CudaStream()); 887 | 888 | const int max_batch_size = model_state_->MaxBatchSize(); 889 | 890 | // All requests must have equally-sized input tensors so use any 891 | // request as the representative for the input tensors. 892 | uint32_t input_count; 893 | RESPOND_ALL_AND_RETURN_IF_ERROR( 894 | responses, request_count, 895 | TRITONBACKEND_RequestInputCount(requests[0], &input_count)); 896 | 897 | for (uint32_t input_idx = 0; input_idx < input_count; ++input_idx) { 898 | TRITONBACKEND_Input* input; 899 | RESPOND_ALL_AND_RETURN_IF_ERROR( 900 | responses, request_count, 901 | TRITONBACKEND_RequestInputByIndex(requests[0], input_idx, &input)); 902 | 903 | const char* name; 904 | TRITONSERVER_DataType datatype; 905 | const int64_t* shape; 906 | uint32_t dims_count; 907 | RESPOND_ALL_AND_RETURN_IF_ERROR( 908 | responses, request_count, 909 | TRITONBACKEND_InputProperties( 910 | input, &name, &datatype, &shape, &dims_count, nullptr, nullptr)); 911 | 912 | // The shape for the entire input patch, [total_batch_size, ...] 913 | std::vector batchn_shape(shape, shape + dims_count); 914 | 915 | if (max_batch_size != 0) { 916 | batchn_shape[0] = total_batch_size; 917 | } 918 | 919 | TRITONPADDLE_Tensor* tensor = TRITONPADDLE_TensorNew( 920 | triton_paddle_model_.get(), name, ConvertDataType(datatype), 921 | TRITONPADDLE_Shape(batchn_shape)); 922 | 923 | if (tensor == nullptr) { 924 | auto err = TRITONSERVER_ErrorNew( 925 | TRITONSERVER_ERROR_INTERNAL, 926 | (std::string("Failed to create input tensor '") + name + 927 | "' with shape " + backend::ShapeToString(batchn_shape) + 928 | " and data type " + TRITONSERVER_DataTypeString(datatype) + 929 | " for '" + Name() + "'") 930 | .c_str()); 931 | SendErrorForResponses(responses, request_count, err); 932 | return; 933 | } 934 | 935 | if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) { 936 | collector.ProcessTensor( 937 | name, TRITONPADDLE_TensorData(tensor), 938 | TRITONPADDLE_TensorDataByteSize(tensor), TRITONSERVER_MEMORY_GPU, 939 | DeviceId()); 940 | } 941 | else { 942 | collector.ProcessTensor( 943 | name, TRITONPADDLE_TensorData(tensor), 944 | TRITONPADDLE_TensorDataByteSize(tensor), TRITONSERVER_MEMORY_CPU, 945 | 0); 946 | } 947 | } 948 | 949 | cuda_copy |= collector.Finalize(); 950 | if (cuda_copy) { 951 | cudaStreamSynchronize(CudaStream()); 952 | } 953 | } 954 | 955 | void 956 | ModelInstanceState::ReadOutputTensors( 957 | size_t total_batch_size, const std::vector& output_names, 958 | TRITONBACKEND_Request** requests, const uint32_t request_count, 959 | std::vector* responses) 960 | { 961 | BackendOutputResponder responder( 962 | requests, request_count, responses, StateForModel()->MaxBatchSize(), 963 | StateForModel()->TritonMemoryManager(), 964 | StateForModel()->EnablePinnedOutput(), CudaStream()); 965 | 966 | bool cuda_copy = false; 967 | for (size_t idx = 0; idx < output_names.size(); ++idx) { 968 | const std::string& name = output_names[idx]; 969 | 970 | TRITONPADDLE_Tensor* tensor = 971 | TRITONPADDLE_TensorNew(triton_paddle_model_.get(), name.c_str()); 972 | 973 | if (tensor == nullptr) { 974 | auto err = TRITONSERVER_ErrorNew( 975 | TRITONSERVER_ERROR_INTERNAL, 976 | (std::string("Failed to create output tensor '") + name + " for '" + 977 | Name() + "'") 978 | .c_str()); 979 | SendErrorForResponses(responses, request_count, err); 980 | return; 981 | } 982 | 983 | auto dtype = ConvertDataType(TRITONPADDLE_TensorDataType(tensor)); 984 | auto shape = TRITONPADDLE_TensorShape(tensor).Shape(); 985 | 986 | if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) { 987 | responder.ProcessTensor( 988 | name, dtype, shape, TRITONPADDLE_TensorData(tensor), 989 | TRITONSERVER_MEMORY_GPU, DeviceId()); 990 | } else { 991 | responder.ProcessTensor( 992 | name, dtype, shape, TRITONPADDLE_TensorData(tensor), 993 | TRITONSERVER_MEMORY_CPU, 0); 994 | } 995 | } 996 | 997 | cuda_copy |= responder.Finalize(); 998 | if (cuda_copy) { 999 | cudaStreamSynchronize(CudaStream()); 1000 | } 1001 | 1002 | } 1003 | 1004 | void 1005 | ModelInstanceState::ProcessRequests( 1006 | TRITONBACKEND_Request** requests, const uint32_t request_count) 1007 | { 1008 | LOG_MESSAGE( 1009 | TRITONSERVER_LOG_VERBOSE, 1010 | (std::string("TRITONBACKEND_ModelExecute: Running ") + Name() + " with " + 1011 | std::to_string(request_count) + " requests") 1012 | .c_str()); 1013 | 1014 | uint64_t exec_start_ns = 0; 1015 | SET_TIMESTAMP(exec_start_ns); 1016 | 1017 | const int max_batch_size = model_state_->MaxBatchSize(); 1018 | 1019 | // For each request collect the total batch size for this inference 1020 | // execution. The batch-size, number of inputs, and size of each 1021 | // input has already been checked so don't need to do that here. 1022 | size_t total_batch_size = 0; 1023 | for (size_t i = 0; i < request_count; ++i) { 1024 | // If we get a nullptr request then something is badly wrong. Fail 1025 | // and release all requests. 1026 | if (requests[i] == nullptr) { 1027 | RequestsRespondWithError( 1028 | requests, request_count, 1029 | TRITONSERVER_ErrorNew( 1030 | TRITONSERVER_ERROR_INTERNAL, 1031 | std::string( 1032 | "null request given to Paddle backend for '" + Name() + "'") 1033 | .c_str())); 1034 | return; 1035 | } 1036 | 1037 | if (max_batch_size > 0) { 1038 | // Retrieve the batch size from one of the inputs, if the model 1039 | // supports batching, the first dimension size is batch size 1040 | TRITONBACKEND_Input* input; 1041 | TRITONSERVER_Error* err = 1042 | TRITONBACKEND_RequestInputByIndex(requests[i], 0 /* index */, &input); 1043 | if (err == nullptr) { 1044 | const int64_t* shape; 1045 | err = TRITONBACKEND_InputProperties( 1046 | input, nullptr, nullptr, &shape, nullptr, nullptr, nullptr); 1047 | total_batch_size += shape[0]; 1048 | } 1049 | if (err != nullptr) { 1050 | RequestsRespondWithError(requests, request_count, err); 1051 | return; 1052 | } 1053 | } else { 1054 | total_batch_size += 1; 1055 | } 1056 | } 1057 | 1058 | // If there are no valid requests then no need to run the 1059 | // inference. This should never happen unless called with an empty 1060 | // 'requests' for some reason. 1061 | if (total_batch_size == 0) { 1062 | return; 1063 | } 1064 | 1065 | // Make sure the maximum batch size is not exceeded. The 1066 | // total_batch_size must be 1 for models that don't support batching 1067 | // (i.e. max_batch_size == 0). If max_batch_size is exceeded then 1068 | // scheduler has done something badly wrong so fail and release all 1069 | // requests 1070 | if ((total_batch_size != 1) and 1071 | (total_batch_size > static_cast(max_batch_size))) { 1072 | RequestsRespondWithError( 1073 | requests, request_count, 1074 | TRITONSERVER_ErrorNew( 1075 | TRITONSERVER_ERROR_INTERNAL, 1076 | std::string( 1077 | "batch size " + std::to_string(total_batch_size) + " for '" + 1078 | Name() + "', max allowed is " + std::to_string(max_batch_size)) 1079 | .c_str())); 1080 | return; 1081 | } 1082 | 1083 | // At this point we are committed to running inference with all 1084 | // 'requests'. Create a response for each request. During input 1085 | // processing if there is an error with any request that error will 1086 | // be sent immediately with the corresponding response (and the 1087 | // response pointer will then be nullptr). The request object 1088 | // itself will not be released until after all inferencing is done 1089 | // (below) as we may need to access the request object when 1090 | // determine how to process outputs (for example, even if we don't 1091 | // need the outputs for a request that has an error, we do need to 1092 | // know the size of those outputs associated with the request so we 1093 | // can skip them in the output tensors). 1094 | std::vector responses; 1095 | responses.reserve(request_count); 1096 | 1097 | for (size_t i = 0; i < request_count; ++i) { 1098 | TRITONBACKEND_Response* response; 1099 | auto err = TRITONBACKEND_ResponseNew(&response, requests[i]); 1100 | if (err == nullptr) { 1101 | responses.emplace_back(response); 1102 | } else { 1103 | responses.emplace_back(nullptr); 1104 | LOG_MESSAGE(TRITONSERVER_LOG_ERROR, "Fail to create response"); 1105 | TRITONSERVER_ErrorDelete(err); 1106 | } 1107 | } 1108 | 1109 | SetInputTensors(total_batch_size, requests, request_count, &responses); 1110 | 1111 | // Collect the names of requested outputs. Do not include outputs 1112 | // for requests that have already responded with an error. 1113 | // TODO: understand here 1114 | std::vector required_outputs; 1115 | std::vector> request_required_outputs(request_count); 1116 | for (size_t idx = 0; idx < request_count; ++idx) { 1117 | const auto& request = requests[idx]; 1118 | auto& response = responses[idx]; 1119 | if (response != nullptr) { 1120 | uint32_t output_count; 1121 | RESPOND_AND_SET_NULL_IF_ERROR( 1122 | &response, TRITONBACKEND_RequestOutputCount(request, &output_count)); 1123 | if (response != nullptr) { 1124 | for (uint32_t output_idx = 0; output_idx < output_count; ++output_idx) { 1125 | const char* output_name; 1126 | RESPOND_AND_SET_NULL_IF_ERROR( 1127 | &response, TRITONBACKEND_RequestOutputName( 1128 | request, output_idx, &output_name)); 1129 | 1130 | if (response != nullptr) { 1131 | required_outputs.push_back(output_name); 1132 | request_required_outputs[idx].push_back(output_name); 1133 | } 1134 | } 1135 | } 1136 | } 1137 | } 1138 | 1139 | uint64_t compute_start_ns = 0; 1140 | SET_TIMESTAMP(compute_start_ns); 1141 | 1142 | TRITONPADDLE_ModelRun(triton_paddle_model_.get()); 1143 | 1144 | uint64_t compute_end_ns = 0; 1145 | SET_TIMESTAMP(compute_end_ns); 1146 | 1147 | ReadOutputTensors( 1148 | total_batch_size, required_outputs, requests, request_count, &responses); 1149 | 1150 | uint64_t exec_end_ns = 0; 1151 | SET_TIMESTAMP(exec_end_ns); 1152 | 1153 | // Send all the responses that haven't already been sent because of 1154 | // an earlier error. Note that the responses are not set to nullptr 1155 | // here as we need that indication below to determine if the request 1156 | // we successful or not. 1157 | for (auto& response : responses) { 1158 | if (response != nullptr) { 1159 | LOG_IF_ERROR( 1160 | TRITONBACKEND_ResponseSend( 1161 | response, TRITONSERVER_RESPONSE_COMPLETE_FINAL, nullptr), 1162 | "failed to send Paddle backend response"); 1163 | } 1164 | } 1165 | 1166 | // Report statistics for each request. 1167 | for (uint32_t r = 0; r < request_count; ++r) { 1168 | auto& request = requests[r]; 1169 | LOG_IF_ERROR( 1170 | TRITONBACKEND_ModelInstanceReportStatistics( 1171 | TritonModelInstance(), request, 1172 | (responses[r] != nullptr) /* success */, exec_start_ns, 1173 | compute_start_ns, compute_end_ns, exec_end_ns), 1174 | "failed reporting request statistics"); 1175 | 1176 | LOG_IF_ERROR( 1177 | TRITONBACKEND_RequestRelease(request, TRITONSERVER_REQUEST_RELEASE_ALL), 1178 | "failed releasing request"); 1179 | } 1180 | 1181 | // TODO: Report the entire batch statistics. 1182 | LOG_IF_ERROR( 1183 | TRITONBACKEND_ModelInstanceReportBatchStatistics( 1184 | TritonModelInstance(), total_batch_size, exec_start_ns, 1185 | compute_start_ns, compute_end_ns, exec_end_ns), 1186 | "failed reporting batch request statistics"); 1187 | 1188 | LOG_MESSAGE( 1189 | TRITONSERVER_LOG_VERBOSE, 1190 | (std::string("TRITONBACKEND_ModelExecute: model ") + Name() + 1191 | " released " + std::to_string(request_count) + " requests") 1192 | .c_str()); 1193 | } 1194 | 1195 | extern "C" { 1196 | 1197 | TRITONSERVER_Error* 1198 | TRITONBACKEND_Initialize(TRITONBACKEND_Backend* backend) 1199 | { 1200 | const char* cname; 1201 | RETURN_IF_ERROR(TRITONBACKEND_BackendName(backend, &cname)); 1202 | std::string name(cname); 1203 | 1204 | LOG_MESSAGE( 1205 | TRITONSERVER_LOG_INFO, 1206 | (std::string("TRITONBACKEND_Initialize: ") + name).c_str()); 1207 | 1208 | uint32_t api_version_major, api_version_minor; 1209 | RETURN_IF_ERROR( 1210 | TRITONBACKEND_ApiVersion(&api_version_major, &api_version_minor)); 1211 | 1212 | LOG_MESSAGE( 1213 | TRITONSERVER_LOG_INFO, 1214 | (std::string("Triton TRITONBACKEND API version: ") + 1215 | std::to_string(api_version_major) + "." + 1216 | std::to_string(api_version_minor)) 1217 | .c_str()); 1218 | 1219 | LOG_MESSAGE( 1220 | TRITONSERVER_LOG_INFO, 1221 | (std::string("'") + name + "' TRITONBACKEND API version: " + 1222 | std::to_string(TRITONBACKEND_API_VERSION_MAJOR) + "." + 1223 | std::to_string(TRITONBACKEND_API_VERSION_MINOR)) 1224 | .c_str()); 1225 | 1226 | if ((api_version_major != TRITONBACKEND_API_VERSION_MAJOR) || 1227 | (api_version_minor < TRITONBACKEND_API_VERSION_MINOR)) { 1228 | return TRITONSERVER_ErrorNew( 1229 | TRITONSERVER_ERROR_UNSUPPORTED, 1230 | (std::string("Triton TRITONBACKEND API version: ") + 1231 | std::to_string(api_version_major) + "." + 1232 | std::to_string(api_version_minor) + " does not support '" + name + 1233 | "' TRITONBACKEND API version: " + 1234 | std::to_string(TRITONBACKEND_API_VERSION_MAJOR) + "." + 1235 | std::to_string(TRITONBACKEND_API_VERSION_MINOR)) 1236 | .c_str()); 1237 | } 1238 | 1239 | // The backend configuration may contain information needed by the 1240 | // backend, such a command-line arguments. 1241 | TRITONSERVER_Message* backend_config_message; 1242 | RETURN_IF_ERROR( 1243 | TRITONBACKEND_BackendConfig(backend, &backend_config_message)); 1244 | 1245 | const char* buffer; 1246 | size_t byte_size; 1247 | RETURN_IF_ERROR(TRITONSERVER_MessageSerializeToJson( 1248 | backend_config_message, &buffer, &byte_size)); 1249 | LOG_MESSAGE( 1250 | TRITONSERVER_LOG_INFO, 1251 | (std::string("backend configuration:\n") + buffer).c_str()); 1252 | 1253 | return nullptr; // success 1254 | } 1255 | 1256 | TRITONSERVER_Error* 1257 | TRITONBACKEND_ModelInitialize(TRITONBACKEND_Model* model) 1258 | { 1259 | const char* cname; 1260 | RETURN_IF_ERROR(TRITONBACKEND_ModelName(model, &cname)); 1261 | std::string name(cname); 1262 | 1263 | uint64_t version; 1264 | RETURN_IF_ERROR(TRITONBACKEND_ModelVersion(model, &version)); 1265 | 1266 | LOG_MESSAGE( 1267 | TRITONSERVER_LOG_INFO, 1268 | (std::string("TRITONBACKEND_ModelInitialize: ") + name + " (version " + 1269 | std::to_string(version) + ")") 1270 | .c_str()); 1271 | 1272 | // Create a ModelState object and associate it with the 1273 | // TRITONBACKEND_Model. 1274 | ModelState* model_state = nullptr; 1275 | RETURN_IF_ERROR(ModelState::Create(model, &model_state)); 1276 | RETURN_IF_ERROR( 1277 | TRITONBACKEND_ModelSetState(model, reinterpret_cast(model_state))); 1278 | 1279 | return nullptr; // success 1280 | } 1281 | 1282 | TRITONSERVER_Error* 1283 | TRITONBACKEND_ModelFinalize(TRITONBACKEND_Model* model) 1284 | { 1285 | void* vstate; 1286 | RETURN_IF_ERROR(TRITONBACKEND_ModelState(model, &vstate)); 1287 | ModelState* model_state = reinterpret_cast(vstate); 1288 | 1289 | LOG_MESSAGE( 1290 | TRITONSERVER_LOG_INFO, "TRITONBACKEND_ModelFinalize: delete model state"); 1291 | 1292 | delete model_state; 1293 | 1294 | return nullptr; // success 1295 | } 1296 | 1297 | TRITONSERVER_Error* 1298 | TRITONBACKEND_ModelInstanceInitialize(TRITONBACKEND_ModelInstance* instance) 1299 | { 1300 | const char* cname; 1301 | RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceName(instance, &cname)); 1302 | std::string name(cname); 1303 | 1304 | int32_t device_id; 1305 | RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceDeviceId(instance, &device_id)); 1306 | TRITONSERVER_InstanceGroupKind kind; 1307 | RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceKind(instance, &kind)); 1308 | 1309 | LOG_MESSAGE( 1310 | TRITONSERVER_LOG_INFO, 1311 | (std::string("TRITONBACKEND_ModelInstanceInitialize: ") + name + " (" + 1312 | TRITONSERVER_InstanceGroupKindString(kind) + " device " + 1313 | std::to_string(device_id) + ")") 1314 | .c_str()); 1315 | 1316 | // Get the model state associated with this instance's model 1317 | TRITONBACKEND_Model* model; 1318 | RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceModel(instance, &model)); 1319 | 1320 | void* vmodelstate; 1321 | RETURN_IF_ERROR(TRITONBACKEND_ModelState(model, &vmodelstate)); 1322 | ModelState* model_state = reinterpret_cast(vmodelstate); 1323 | 1324 | // With each instance we create a ModelInstanceState object and 1325 | // associate it with the TRITONBACKEND_ModelInstance. 1326 | ModelInstanceState* instance_state; 1327 | RETURN_IF_ERROR( 1328 | ModelInstanceState::Create(model_state, instance, &instance_state)); 1329 | RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceSetState( 1330 | instance, reinterpret_cast(instance_state))); 1331 | 1332 | return nullptr; 1333 | } 1334 | 1335 | TRITONSERVER_Error* 1336 | TRITONBACKEND_ModelInstanceFinalize(TRITONBACKEND_ModelInstance* instance) 1337 | { 1338 | void* vstate; 1339 | RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceState(instance, &vstate)); 1340 | ModelInstanceState* instance_state = 1341 | reinterpret_cast(vstate); 1342 | 1343 | LOG_MESSAGE( 1344 | TRITONSERVER_LOG_INFO, 1345 | "TRITONBACKEND_ModelInstanceFinalize: delete instance state"); 1346 | 1347 | delete instance_state; 1348 | 1349 | return nullptr; 1350 | } 1351 | 1352 | TRITONSERVER_Error* 1353 | TRITONBACKEND_ModelInstanceExecute( 1354 | TRITONBACKEND_ModelInstance* instance, TRITONBACKEND_Request** requests, 1355 | const uint32_t request_count) 1356 | { 1357 | // Triton will not call this function simultaneously for the same 1358 | // 'instance'. But since this backend could be used by multiple 1359 | // instances from multiple models the implementation needs to handle 1360 | // multiple calls to this function at the same time (with different 1361 | // 'instance' objects). Suggested practice for this is to use only 1362 | // function-local and model-instance-specific state (obtained from 1363 | // 'instance'), which is what we do here. 1364 | ModelInstanceState* instance_state; 1365 | RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceState( 1366 | instance, reinterpret_cast(&instance_state))); 1367 | ModelState* model_state = instance_state->StateForModel(); 1368 | 1369 | LOG_MESSAGE( 1370 | TRITONSERVER_LOG_VERBOSE, 1371 | (std::string("model ") + model_state->Name() + ", instance " + 1372 | instance_state->Name() + ", executing " + std::to_string(request_count) + 1373 | " requests") 1374 | .c_str()); 1375 | 1376 | // At this point we accept ownership of 'requests', which means that 1377 | // even if something goes wrong we must still return success from 1378 | // this function. If something does go wrong in processing a 1379 | // particular request then we send an error response just for the 1380 | // specific request. 1381 | instance_state->ProcessRequests(requests, request_count); 1382 | 1383 | return nullptr; // success 1384 | } 1385 | 1386 | } // extern "C" 1387 | }}} // namespace triton::backend::paddle 1388 | -------------------------------------------------------------------------------- /src/paddle_backend_utils.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 2 | // 3 | // Redistribution and use in source and binary forms, with or without 4 | // modification, are permitted provided that the following conditions 5 | // are met: 6 | // * Redistributions of source code must retain the above copyright 7 | // notice, this list of conditions and the following disclaimer. 8 | // * Redistributions in binary form must reproduce the above copyright 9 | // notice, this list of conditions and the following disclaimer in the 10 | // documentation and/or other materials provided with the distribution. 11 | // * Neither the name of NVIDIA CORPORATION nor the names of its 12 | // contributors may be used to endorse or promote products derived 13 | // from this software without specific prior written permission. 14 | // 15 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | #include "paddle_backend_utils.h" 28 | 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | 35 | // namespace triton { namespace backend { namespace paddle { 36 | 37 | template TRITONPADDLE_Shape::TRITONPADDLE_Shape( 38 | const std::vector& shape); 39 | template TRITONPADDLE_Shape::TRITONPADDLE_Shape( 40 | const std::vector& shape); 41 | 42 | template 43 | TRITONPADDLE_Shape::TRITONPADDLE_Shape(const std::vector& shape) 44 | { 45 | shape_ = std::vector(shape.cbegin(), shape.cend()); 46 | numel_ = std::accumulate( 47 | shape_.cbegin(), shape_.cend(), 1, std::multiplies()); 48 | } 49 | 50 | TRITONPADDLE_Shape::TRITONPADDLE_Shape(const std::string& str) 51 | { 52 | std::vector str_shape; 53 | std::istringstream in(str); 54 | std::copy( 55 | std::istream_iterator(in), 56 | std::istream_iterator(), std::back_inserter(str_shape)); 57 | 58 | std::transform( 59 | str_shape.cbegin(), str_shape.cend(), std::back_inserter(shape_), 60 | [](const std::string& str) -> value_type { 61 | return static_cast(std::stoll(str)); 62 | }); 63 | } 64 | 65 | std::vector 66 | TRITONPADDLE_Shape::CompatibleShape() const 67 | { 68 | return std::vector(shape_.cbegin(), shape_.cend()); 69 | } 70 | 71 | TRITONPADDLE_DataType 72 | ConvertDataType(TRITONSERVER_DataType dtype) 73 | { 74 | switch (dtype) { 75 | case TRITONSERVER_TYPE_INVALID: 76 | return TRITONPADDLE_TYPE_INVALID; 77 | case TRITONSERVER_TYPE_UINT8: 78 | return TRITONPADDLE_TYPE_UINT8; 79 | case TRITONSERVER_TYPE_INT8: 80 | return TRITONPADDLE_TYPE_INT8; 81 | case TRITONSERVER_TYPE_INT32: 82 | return TRITONPADDLE_TYPE_INT32; 83 | case TRITONSERVER_TYPE_INT64: 84 | return TRITONPADDLE_TYPE_INT64; 85 | case TRITONSERVER_TYPE_FP32: 86 | return TRITONPADDLE_TYPE_FP32; 87 | case TRITONSERVER_TYPE_FP16: 88 | return TRITONPADDLE_TYPE_FP16; 89 | default: 90 | break; 91 | } 92 | return TRITONPADDLE_TYPE_INVALID; 93 | } 94 | 95 | TRITONSERVER_DataType 96 | ConvertDataType(TRITONPADDLE_DataType dtype) 97 | { 98 | switch (dtype) { 99 | case TRITONPADDLE_TYPE_INVALID: 100 | return TRITONSERVER_TYPE_INVALID; 101 | case TRITONPADDLE_TYPE_UINT8: 102 | return TRITONSERVER_TYPE_UINT8; 103 | case TRITONPADDLE_TYPE_INT8: 104 | return TRITONSERVER_TYPE_INT8; 105 | case TRITONPADDLE_TYPE_INT32: 106 | return TRITONSERVER_TYPE_INT32; 107 | case TRITONPADDLE_TYPE_INT64: 108 | return TRITONSERVER_TYPE_INT64; 109 | case TRITONPADDLE_TYPE_FP32: 110 | return TRITONSERVER_TYPE_FP32; 111 | case TRITONPADDLE_TYPE_FP16: 112 | return TRITONSERVER_TYPE_FP16; 113 | default: 114 | break; 115 | } 116 | return TRITONSERVER_TYPE_INVALID; 117 | } 118 | 119 | TRITONPADDLE_DataType 120 | ConvertDataType(::paddle_infer::DataType dtype) 121 | { 122 | switch (dtype) { 123 | case ::paddle_infer::DataType::FLOAT32: 124 | return TRITONPADDLE_TYPE_FP32; 125 | case ::paddle_infer::DataType::INT64: 126 | return TRITONPADDLE_TYPE_INT64; 127 | case ::paddle_infer::DataType::INT32: 128 | return TRITONPADDLE_TYPE_INT32; 129 | case ::paddle_infer::DataType::UINT8: 130 | return TRITONPADDLE_TYPE_UINT8; 131 | // case ::paddle_infer::DataType::INT8: 132 | // return TRITONPADDLE_TYPE_INT8; 133 | default: 134 | break; 135 | } 136 | return TRITONPADDLE_TYPE_INVALID; 137 | } 138 | 139 | TRITONPADDLE_DataType 140 | ConvertDataType(const std::string& dtype) 141 | { 142 | if (dtype == "TYPE_INVALID") { 143 | return TRITONPADDLE_DataType::TRITONPADDLE_TYPE_INVALID; 144 | } else if (dtype == "TYPE_FP32") { 145 | return TRITONPADDLE_DataType::TRITONPADDLE_TYPE_FP32; 146 | } else if (dtype == "TYPE_UINT8") { 147 | return TRITONPADDLE_DataType::TRITONPADDLE_TYPE_UINT8; 148 | } else if (dtype == "TYPE_INT8") { 149 | return TRITONPADDLE_DataType::TRITONPADDLE_TYPE_INT8; 150 | } else if (dtype == "TYPE_INT32") { 151 | return TRITONPADDLE_DataType::TRITONPADDLE_TYPE_INT32; 152 | } else if (dtype == "TYPE_INT64") { 153 | return TRITONPADDLE_DataType::TRITONPADDLE_TYPE_INT64; 154 | } else if (dtype == "TYPE_FP16") { 155 | return TRITONPADDLE_DataType::TRITONPADDLE_TYPE_FP16; 156 | } 157 | return TRITONPADDLE_DataType::TRITONPADDLE_TYPE_INVALID; 158 | } 159 | 160 | size_t 161 | TRITONPADDLE_DataTypeByteSize(TRITONPADDLE_DataType dtype) 162 | { 163 | switch (dtype) { 164 | case TRITONPADDLE_DataType::TRITONPADDLE_TYPE_FP32: 165 | return sizeof(float); 166 | case TRITONPADDLE_DataType::TRITONPADDLE_TYPE_INT64: 167 | return sizeof(int64_t); 168 | case TRITONPADDLE_DataType::TRITONPADDLE_TYPE_INT32: 169 | return sizeof(int32_t); 170 | case TRITONPADDLE_DataType::TRITONPADDLE_TYPE_UINT8: 171 | return sizeof(uint8_t); 172 | case TRITONPADDLE_DataType::TRITONPADDLE_TYPE_INT8: 173 | return sizeof(int8_t); 174 | case TRITONPADDLE_DataType::TRITONPADDLE_TYPE_FP16: 175 | return sizeof(phi::dtype::float16); 176 | default: 177 | break; 178 | } 179 | return 0; // Should not happened, TODO: Error handling 180 | } 181 | 182 | /* Error message */ 183 | 184 | TRITONPADDLE_Error* 185 | TRITONPADDLE_ErrorNew(const std::string& str) 186 | { 187 | TRITONPADDLE_Error* error = new TRITONPADDLE_Error(); 188 | error->msg_ = new char[str.size() + 1]; 189 | std::strcpy(error->msg_, str.c_str()); 190 | return error; 191 | } 192 | 193 | void 194 | TRITONPADDLE_ErrorDelete(TRITONPADDLE_Error* error) 195 | { 196 | if (error == nullptr) { 197 | return; 198 | } 199 | 200 | delete[] error->msg_; 201 | delete error; 202 | } 203 | 204 | TRITONPADDLE_Config::TRITONPADDLE_Config() 205 | : use_trt_(false), max_batch_size_(1), workspace_size_(1 << 30), min_graph_size_(5), 206 | precision_(TRITONPADDLE_MODE_FP32), is_dynamic_(false), 207 | enable_tensorrt_oss_(false), disenable_trt_tune_(false), use_cpu_(false), 208 | use_mkldnn_(false), use_ort_(false), use_mkldnn_int8_(false), 209 | cpu_math_library_num_threads_(1), mkldnn_capacity_(10), model_dir_("") 210 | { 211 | } 212 | 213 | // }}} 214 | -------------------------------------------------------------------------------- /src/paddle_backend_utils.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 2 | // 3 | // Redistribution and use in source and binary forms, with or without 4 | // modification, are permitted provided that the following conditions 5 | // are met: 6 | // * Redistributions of source code must retain the above copyright 7 | // notice, this list of conditions and the following disclaimer. 8 | // * Redistributions in binary form must reproduce the above copyright 9 | // notice, this list of conditions and the following disclaimer in the 10 | // documentation and/or other materials provided with the distribution. 11 | // * Neither the name of NVIDIA CORPORATION nor the names of its 12 | // contributors may be used to endorse or promote products derived 13 | // from this software without specific prior written permission. 14 | // 15 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | #pragma once 28 | 29 | #include 30 | #include 31 | #include 32 | #include 33 | 34 | #include "paddle_inference_api.h" 35 | #include "experimental/phi/common/float16.h" 36 | #include "triton/core/tritonserver.h" 37 | 38 | // namespace triton { namespace backend { namespace paddle { 39 | 40 | #define RESPOND_ALL_AND_RETURN_IF_ERROR(RESPONSES, RESPONSES_COUNT, X) \ 41 | do { \ 42 | TRITONSERVER_Error* raarie_err__ = (X); \ 43 | if (raarie_err__ != nullptr) { \ 44 | SendErrorForResponses(RESPONSES, RESPONSES_COUNT, raarie_err__); \ 45 | return; \ 46 | } \ 47 | } while (false) 48 | 49 | #define RETURN_IF_TRITONPADDLE_ERROR(ERR) \ 50 | do { \ 51 | TRITONPADDLE_Error* error__ = (ERR); \ 52 | if (error__ != nullptr) { \ 53 | auto status = \ 54 | TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INTERNAL, error__->msg_); \ 55 | TRITONPADDLE_ErrorDelete(error__); \ 56 | return status; \ 57 | } \ 58 | } while (false) 59 | 60 | #define THROW_IF_TRITONPADDLE_ERROR(X) \ 61 | do { \ 62 | TRITONPADDLE_Error* tie_err__ = (X); \ 63 | if (tie_err__ != nullptr) { \ 64 | throw TRITONPADDLE_Exception(tie_err__); \ 65 | } \ 66 | } while (false) 67 | 68 | typedef struct { 69 | char* msg_; 70 | } TRITONPADDLE_Error; 71 | 72 | struct TRITONPADDLE_Exception { 73 | TRITONPADDLE_Exception(TRITONPADDLE_Error* err) : err_(err) {} 74 | TRITONPADDLE_Error* err_; 75 | }; 76 | 77 | TRITONPADDLE_Error* TRITONPADDLE_ErrorNew(const std::string& str); 78 | 79 | void TRITONPADDLE_ErrorDelete(TRITONPADDLE_Error* error); 80 | 81 | // TRITONPADDLE TYPE 82 | // TODO: Full all possible type? 83 | typedef enum { 84 | TRITONPADDLE_TYPE_FP32, 85 | TRITONPADDLE_TYPE_INT64, 86 | TRITONPADDLE_TYPE_INT32, 87 | TRITONPADDLE_TYPE_UINT8, 88 | TRITONPADDLE_TYPE_INT8, 89 | TRITONPADDLE_TYPE_FP16, 90 | TRITONPADDLE_TYPE_INVALID 91 | } TRITONPADDLE_DataType; 92 | 93 | // TRITONPADDLE SHAPE 94 | class TRITONPADDLE_Shape { 95 | public: 96 | using value_type = int64_t; 97 | 98 | TRITONPADDLE_Shape() = default; 99 | TRITONPADDLE_Shape(const std::string& str); 100 | template 101 | TRITONPADDLE_Shape(const std::vector& shape); 102 | size_t NumElements() const { return numel_; }; 103 | 104 | std::vector CompatibleShape() const; 105 | std::vector Shape() const { return shape_; }; 106 | 107 | private: 108 | std::vector shape_; 109 | size_t numel_; 110 | }; 111 | 112 | TRITONPADDLE_DataType ConvertDataType(TRITONSERVER_DataType dtype); 113 | 114 | TRITONPADDLE_DataType ConvertDataType(::paddle_infer::DataType dtype); 115 | 116 | TRITONPADDLE_DataType ConvertDataType(const std::string& dtype); 117 | 118 | TRITONSERVER_DataType ConvertDataType(TRITONPADDLE_DataType dtype); 119 | 120 | size_t TRITONPADDLE_DataTypeByteSize(TRITONPADDLE_DataType dtype); 121 | 122 | // TRITON PADDLE MODE 123 | typedef enum { 124 | TRITONPADDLE_MODE_FP32, 125 | TRITONPADDLE_MODE_FP16, 126 | TRITONPADDLE_MODE_INT8, 127 | } TRITONPADDLE_Precision; 128 | 129 | // TRITON PADDLE CONFIG 130 | class TRITONPADDLE_Config { 131 | public: 132 | TRITONPADDLE_Config(); 133 | // trt 134 | bool use_trt_; 135 | int64_t max_batch_size_; 136 | int64_t workspace_size_; 137 | int64_t min_graph_size_; 138 | TRITONPADDLE_Precision precision_; 139 | bool is_dynamic_; 140 | bool enable_tensorrt_oss_; 141 | bool disenable_trt_tune_; 142 | // cpu 143 | bool use_cpu_; 144 | bool use_mkldnn_; 145 | bool use_ort_; 146 | bool use_mkldnn_int8_; 147 | int cpu_math_library_num_threads_; 148 | int mkldnn_capacity_; 149 | std::string model_dir_; 150 | 151 | std::map> dynamic_min_shape_; 152 | std::map> dynamic_max_shape_; 153 | std::map> dynamic_opt_shape_; 154 | }; 155 | 156 | // }}} 157 | --------------------------------------------------------------------------------