├── CMakeLists.txt
├── Dockerfile
├── LICENSE
├── README.md
├── README_cn.md
├── README_en.md
├── docs
    ├── model_configuration.md
    └── zh_CN
    │   ├── model_configuration.md
    │   └── model_repository.md
├── examples
    ├── fetch_models.sh
    ├── infer_ernie.py
    ├── infer_resnet50_v1.5.py
    ├── infer_resnet50_v1.5.sh
    ├── models
    │   ├── ERNIE
    │   │   └── config.pbtxt
    │   └── ResNet50-v1.5
    │   │   └── config.pbtxt
    ├── perf_ernie.sh
    └── perf_resnet50_v1.5.sh
├── paddle-lib
    ├── Dockerfile
    └── build_paddle.sh
├── scripts
    ├── build_paddle_backend.sh
    └── launch_triton_server.sh
└── src
    ├── libtriton_paddle.ldscript
    ├── paddle.cc
    ├── paddle_backend_utils.cc
    └── paddle_backend_utils.h


/CMakeLists.txt:
--------------------------------------------------------------------------------
  1 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
  2 | #
  3 | # Redistribution and use in source and binary forms, with or without
  4 | # modification, are permitted provided that the following conditions
  5 | # are met:
  6 | #  * Redistributions of source code must retain the above copyright
  7 | #    notice, this list of conditions and the following disclaimer.
  8 | #  * Redistributions in binary form must reproduce the above copyright
  9 | #    notice, this list of conditions and the following disclaimer in the
 10 | #    documentation and/or other materials provided with the distribution.
 11 | #  * Neither the name of NVIDIA CORPORATION nor the names of its
 12 | #    contributors may be used to endorse or promote products derived
 13 | #    from this software without specific prior written permission.
 14 | #
 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 18 | # PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 26 | 
 27 | cmake_minimum_required(VERSION 3.17)
 28 | 
 29 | project(trironpaddlebackend LANGUAGES C CXX)
 30 | 
 31 | set(PADDLE_INFERENCE_DIR "" CACHE PATH "Paths to Paddle Inference Directory. Multiple paths may be specified by sparating them with a semicolon.")
 32 | set(PADDLE_INCLUDE_PATHS "${PADDLE_INFERENCE_DIR}/include"
 33 |   CACHE PATH "Paths to Paddle Inference includes. Multiple paths may be specified by sparating them with a semicolon.")
 34 | set(PADDLE_LIB_PATHS "${PADDLE_INFERENCE_DIR}/lib"
 35 |   CACHE PATH "Paths to Paddle Inference libraries. Multiple paths may be specified by sparating them with a semicolon.")
 36 | set(PADDLE_LIB_NAME "paddle_inference")
 37 | 
 38 | set(TRITON_COMMON_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/common repo")
 39 | set(TRITON_CORE_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/core repo")
 40 | set(TRITON_BACKEND_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/backend repo")
 41 | 
 42 | include(FetchContent)
 43 | 
 44 | FetchContent_Declare(
 45 |   repo-common
 46 |   GIT_REPOSITORY https://github.com/triton-inference-server/common.git
 47 |   GIT_TAG ${TRITON_COMMON_REPO_TAG}
 48 |   GIT_SHALLOW ON
 49 | )
 50 | FetchContent_Declare(
 51 |   repo-core
 52 |   GIT_REPOSITORY https://github.com/triton-inference-server/core.git
 53 |   GIT_TAG ${TRITON_CORE_REPO_TAG}
 54 |   GIT_SHALLOW ON
 55 | )
 56 | FetchContent_Declare(
 57 |   repo-backend
 58 |   GIT_REPOSITORY https://github.com/triton-inference-server/backend.git
 59 |   GIT_TAG ${TRITON_BACKEND_REPO_TAG}
 60 |   GIT_SHALLOW ON
 61 | )
 62 | FetchContent_MakeAvailable(repo-common repo-core repo-backend)
 63 | 
 64 | configure_file(src/libtriton_paddle.ldscript libtriton_paddle.ldscript COPYONLY)
 65 | 
 66 | add_library(
 67 |   triton-paddle-backend SHARED
 68 |   src/paddle.cc
 69 |   src/paddle_backend_utils.cc
 70 | )
 71 | 
 72 | target_include_directories(
 73 |   triton-paddle-backend
 74 |   PRIVATE
 75 |     ${CMAKE_CURRENT_SOURCE_DIR}/src
 76 | )
 77 | 
 78 | target_include_directories(
 79 |   triton-paddle-backend
 80 |   PRIVATE ${PADDLE_INCLUDE_PATHS}
 81 | )
 82 | 
 83 | target_link_libraries(
 84 |   triton-paddle-backend
 85 |   PRIVATE "-L${PADDLE_LIB_PATHS} -l${PADDLE_LIB_NAME}"
 86 | )
 87 | 
 88 | target_compile_features(triton-paddle-backend PRIVATE cxx_std_11)
 89 | target_compile_options(
 90 |   triton-paddle-backend PRIVATE
 91 |   $<$<OR:$<CXX_COMPILER_ID:Clang>,$<CXX_COMPILER_ID:AppleClang>,$<CXX_COMPILER_ID:GNU>>:
 92 |     -Wall -Wextra -Wno-unused-parameter -Wno-type-limits -Werror>
 93 | )
 94 | 
 95 | set_target_properties(
 96 |   triton-paddle-backend PROPERTIES
 97 |   POSITION_INDEPENDENT_CODE ON
 98 |   OUTPUT_NAME triton_paddle
 99 |   SKIP_BUILD_RPATH TRUE
100 |   LINK_DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/libtriton_paddle.ldscript
101 |   LINK_FLAGS "-Wl,--version-script libtriton_paddle.ldscript"
102 | )
103 | 
104 | target_link_libraries(
105 |   triton-paddle-backend
106 |   PRIVATE
107 |     triton-backend-utils    # from repo-backend
108 |     triton-core-serverstub  # from repo-core
109 | )
110 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 2 | #
 3 | # Redistribution and use in source and binary forms, with or without
 4 | # modification, are permitted provided that the following conditions
 5 | # are met:
 6 | #  * Redistributions of source code must retain the above copyright
 7 | #    notice, this list of conditions and the following disclaimer.
 8 | #  * Redistributions in binary form must reproduce the above copyright
 9 | #    notice, this list of conditions and the following disclaimer in the
10 | #    documentation and/or other materials provided with the distribution.
11 | #  * Neither the name of NVIDIA CORPORATION nor the names of its
12 | #    contributors may be used to endorse or promote products derived
13 | #    from this software without specific prior written permission.
14 | #
15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 | # PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | 
27 | FROM nvcr.io/nvidia/tritonserver:21.10-py3 as full
28 | FROM nvcr.io/nvidia/tritonserver:21.10-py3-min
29 | 
30 | ENV DEBIAN_FRONTEND=noninteractive
31 | ENV DCGM_VERSION=2.2.9
32 | RUN apt update && apt install -y --no-install-recommends software-properties-common \
33 |     && wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-ubuntu2004.pin \
34 |     && mkdir -p /etc/apt/preferences.d/cuda-repository-pin-600 \
35 |     && mv cuda-ubuntu2004.pin /etc/apt/preferences.d/cuda-repository-pin-600/ \
36 |     && apt-key del 7fa2af80 \
37 |     && wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.0-1_all.deb \
38 |     && dpkg -i cuda-keyring_1.0-1_all.deb \
39 |     && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub \
40 |     && add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/ /" \
41 |     && apt-get update && apt-get install -y --no-install-recommends datacenter-gpu-manager=1:2.2.9
42 | 
43 | RUN apt update \
44 |     && apt install -y --no-install-recommends libre2-5 libb64-0d python3 python3-pip libarchive-dev \
45 |     && python3 -m pip install -U pip \
46 |     && python3 -m pip install paddlepaddle-gpu paddlenlp faster_tokenizer
47 | 
48 | COPY --from=full /opt/tritonserver/bin /opt/tritonserver/bin
49 | COPY --from=full /opt/tritonserver/lib /opt/tritonserver/lib
50 | COPY --from=full /opt/tritonserver/include /opt/tritonserver/include
51 | COPY --from=full /opt/tritonserver/backends/python /opt/tritonserver/backends/python
52 | COPY --from=full /opt/tritonserver/backends/onnxruntime /opt/tritonserver/backends/onnxruntime
53 | 
54 | COPY paddle-lib/paddle/lib paddle-lib/onnxruntime/lib paddle-lib/paddle2onnx/lib paddle-lib/mkldnn/lib paddle-lib/mklml/lib /opt/paddle/
55 | COPY build/libtriton_paddle.so /opt/tritonserver/backends/paddle/
56 | 
57 | ENV LD_LIBRARY_PATH="/opt/paddle/:$LD_LIBRARY_PATH"
58 | ENV PATH="/opt/tritonserver/bin:$PATH"
59 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 2 | 
 3 | Redistribution and use in source and binary forms, with or without
 4 | modification, are permitted provided that the following conditions
 5 | are met:
 6 |  * Redistributions of source code must retain the above copyright
 7 |    notice, this list of conditions and the following disclaimer.
 8 |  * Redistributions in binary form must reproduce the above copyright
 9 |    notice, this list of conditions and the following disclaimer in the
10 |    documentation and/or other materials provided with the distribution.
11 |  * Neither the name of NVIDIA CORPORATION nor the names of its
12 |    contributors may be used to endorse or promote products derived
13 |    from this software without specific prior written permission.
14 | 
15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16 | EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 | PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 | OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | README_en.md


--------------------------------------------------------------------------------
/README_cn.md:
--------------------------------------------------------------------------------
  1 | <!--
  2 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
  3 | #
  4 | # Redistribution and use in source and binary forms, with or without
  5 | # modification, are permitted provided that the following conditions
  6 | # are met:
  7 | #  * Redistributions of source code must retain the above copyright
  8 | #    notice, this list of conditions and the following disclaimer.
  9 | #  * Redistributions in binary form must reproduce the above copyright
 10 | #    notice, this list of conditions and the following disclaimer in the
 11 | #    documentation and/or other materials provided with the distribution.
 12 | #  * Neither the name of NVIDIA CORPORATION nor the names of its
 13 | #    contributors may be used to endorse or promote products derived
 14 | #    from this software without specific prior written permission.
 15 | #
 16 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 17 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 18 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 19 | # PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 20 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 21 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 22 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 23 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 24 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 25 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 26 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 27 | -->
 28 | 简体中文 | [English](README_en.md)
 29 | 
 30 | # Triton Paddle Backend
 31 | 
 32 | ## Table of Contents
 33 | 
 34 | - [快速开始](#快速开始)
 35 |     - [拉取镜像](#拉取镜像)
 36 |     - [创建模型仓库](#创建模型仓库)
 37 |     - [启动服务](#启动服务)
 38 |     - [验证Triton服务](#验证Triton服务是否正常)
 39 | - [示例](#运行示例)
 40 |     - [ERNIE Base](#ernie-base)
 41 |     - [ResNet50 v1.5](#resnet50-v15)
 42 | - [文档](#高阶文档)
 43 | - [性能指标](#性能指标)
 44 |     - [ERNIE Base (T4)](#ernie-base-t4)
 45 |     - [ResNet50 v1.5 (V100-SXM2-16G)](#resnet50-v15-v100-sxm2-16g)
 46 |     - [ResNet50 v1.5 (T4)](#resnet50-v15-t4)
 47 | 
 48 | ## 快速开始
 49 | 
 50 | ### 拉取镜像
 51 | ```
 52 | docker pull paddlepaddle/triton_paddle:21.10
 53 | ```
 54 | 注意: 目前只支持Triton Inference Serve 21.10版本镜像，[Triton Inference Serve 镜像介绍](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html).其他版本需要从源码编译
 55 | 
 56 | ### 创建模型仓库
 57 | 当Triton Inference Server启动服务时，可以指定一个或多个模型仓库来部署模型，详细描述见文档[模型仓库](docs/zh_CN/model_repository.md)。在[examples](examples)中有模型仓库示例，可以通过以下脚本获取:
 58 | ```bash
 59 | $ cd examples
 60 | $ ./fetch_models.sh
 61 | $ cd .. # back to root of paddle_backend
 62 | ```
 63 | 
 64 | ### 启动服务
 65 | 1. 启动容器
 66 | ```
 67 | docker run --gpus=all --rm -it --name triton_server --net=host -e CUDA_VISIBLE_DEVICES=0 \
 68 |            -v `pwd`/examples/models:/workspace/models \
 69 |            paddlepaddle/triton_paddle:21.10 /bin/bash
 70 | ```
 71 | 2. 进入容器:
 72 | ```
 73 | docker exec -it triton_server /bin/bash
 74 | ```
 75 | 3. 启动服务
 76 | ```
 77 | /opt/tritonserver/bin/tritonserver --model-repository=/workspace/models
 78 | ```
 79 | 可以使用`/opt/tritonserver/bin/tritonserver --help`查看启动服务的所有参数介绍
 80 | 
 81 | ### 验证Triton服务是否正常
 82 | 在启动服务的机器上使用curl指令，发送HTTP请求可以得到服务的状态
 83 | 
 84 | ```
 85 | $ curl -v localhost:8000/v2/health/ready
 86 | ...
 87 | < HTTP/1.1 200 OK
 88 | < Content-Length: 0
 89 | < Content-Type: text/plain
 90 | ```
 91 | HTTP请求返回200代表服务正常，否则服务有问题
 92 | 
 93 | ## 运行示例
 94 | 
 95 | 在运行示例之前，需要确保服务已经启动并[正常运行](#验证Triton服务是否正常).
 96 | 
 97 | 进入[examples](examples)目录
 98 | ```bash
 99 | $ cd examples
100 | ```
101 | 
102 | ### ERNIE Base
103 | 运行Ernie模型benchmark测试脚本:
104 | ```bash
105 | $ bash perf_ernie.sh
106 | ```
107 | 
108 | ### ResNet50 v1.5
109 | 运行ResNet50-v1.5模型benchmark脚本:
110 | ```bash
111 | $ bash perf_resnet50_v1.5.sh
112 | ```
113 | 
114 | ## 高阶文档
115 | - [模型仓库](docs/zh_CN/model_repository.md)
116 | - [模型配置](docs/zh_CN/model_configuration.md)
117 | 
118 | ## 性能指标
119 | 
120 | ### ERNIE Base (T4)
121 | 
122 | | Precision   | Backend Accelerator  |   Client Batch Size |   Sequences/second |   P90 Latency (ms) |   P95 Latency (ms) |   P99 Latency (ms) |   Avg Latency (ms) |
123 | |:------------|:---------------------|--------------------:|--------------------:|--------------:|--------------:|--------------:|--------------:|
124 | | FP16        | TensorRT             |                   1 |               270.0 |         3.813 |         3.846 |         4.007 |         3.692 |
125 | | FP16        | TensorRT             |                   2 |               500.4 |         4.282 |         4.332 |         4.709 |         3.980 |
126 | | FP16        | TensorRT             |                   4 |               831.2 |         5.141 |         5.242 |         5.569 |         4.797 |
127 | | FP16        | TensorRT             |                   8 |              1128.0 |         7.788 |         7.949 |         8.255 |         7.089 |
128 | | FP16        | TensorRT             |                  16 |              1363.2 |        12.702 |        12.993 |        13.507 |        11.738 |
129 | | FP16        | TensorRT             |                  32 |              1529.6 |        22.495 |        22.817 |        24.634 |        20.901 |
130 | 
131 | ### ResNet50 v1.5 (V100-SXM2-16G)
132 | 
133 | | Precision   | Backend Accelerator  |   Client Batch Size |   Sequences/second |   P90 Latency (ms) |   P95 Latency (ms) |   P99 Latency (ms) |   Avg Latency (ms) |
134 | |:------------|:---------------------|--------------------:|--------------------:|--------------:|--------------:|--------------:|--------------:|
135 | | FP16        | TensorRT             |                   1 |               288.8 |         3.494 |         3.524 |         3.608 |         3.462 |
136 | | FP16        | TensorRT             |                   2 |               494.0 |         4.083 |         4.110 |         4.208 |         4.047 |
137 | | FP16        | TensorRT             |                   4 |               758.4 |         5.327 |         5.359 |         5.460 |         5.273 |
138 | | FP16        | TensorRT             |                   8 |              1044.8 |         7.728 |         7.770 |         7.949 |         7.658 |
139 | | FP16        | TensorRT             |                  16 |              1267.2 |        12.742 |        12.810 |        13.883 |        12.647 |
140 | | FP16        | TensorRT             |                  32 |              1113.6 |        28.840 |        29.044 |        30.357 |        28.641 |
141 | | FP16        | TensorRT             |                  64 |              1100.8 |        58.512 |        58.642 |        59.967 |        58.251 |
142 | | FP16        | TensorRT             |                 128 |              1049.6 |       121.371 |       121.834 |       123.371 |       119.991 |
143 | 
144 | ### ResNet50 v1.5 (T4)
145 | | Precision   | Backend Accelerator  |   Client Batch Size |   Sequences/second |   P90 Latency (ms) |   P95 Latency (ms) |   P99 Latency (ms) |   Avg Latency (ms) |
146 | |:------------|:---------------------|--------------------:|--------------------:|--------------:|--------------:|--------------:|--------------:|
147 | | FP16        | TensorRT             |                   1 |               291.8 |         3.471 |         3.489 |         3.531 |         3.427 |
148 | | FP16        | TensorRT             |                   2 |               466.0 |         4.323 |         4.336 |         4.382 |         4.288 |
149 | | FP16        | TensorRT             |                   4 |               665.6 |         6.031 |         6.071 |         6.142 |         6.011 |
150 | | FP16        | TensorRT             |                   8 |               833.6 |         9.662 |         9.684 |         9.767 |         9.609 |
151 | | FP16        | TensorRT             |                  16 |               899.2 |        18.061 |        18.208 |        18.899 |        17.748 |
152 | | FP16        | TensorRT             |                  32 |               761.6 |        42.333 |        43.456 |        44.167 |        41.740 |
153 | | FP16        | TensorRT             |                  64 |               793.6 |        79.860 |        80.410 |        80.807 |        79.680 |
154 | | FP16        | TensorRT             |                 128 |               793.6 |       158.207 |       158.278 |       158.643 |       157.543 |
155 | 


--------------------------------------------------------------------------------
/README_en.md:
--------------------------------------------------------------------------------
  1 | <!--
  2 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
  3 | #
  4 | # Redistribution and use in source and binary forms, with or without
  5 | # modification, are permitted provided that the following conditions
  6 | # are met:
  7 | #  * Redistributions of source code must retain the above copyright
  8 | #    notice, this list of conditions and the following disclaimer.
  9 | #  * Redistributions in binary form must reproduce the above copyright
 10 | #    notice, this list of conditions and the following disclaimer in the
 11 | #    documentation and/or other materials provided with the distribution.
 12 | #  * Neither the name of NVIDIA CORPORATION nor the names of its
 13 | #    contributors may be used to endorse or promote products derived
 14 | #    from this software without specific prior written permission.
 15 | #
 16 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 17 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 18 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 19 | # PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 20 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 21 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 22 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 23 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 24 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 25 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 26 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 27 | -->
 28 | English | [简体中文](README_cn.md)
 29 | 
 30 | # Triton Paddle Backend
 31 | 
 32 | ## Table of Contents
 33 | 
 34 | - [Quick Start](#quick-start)
 35 |     - [Pull Image](#pull-image)
 36 |     - [Create a Model Repository](#create-a-model-repository)
 37 |     - [Launch Triton Inference Server](#launch-triton-inference-server)
 38 |     - [Verify Triton Is Running Correctly](#verify-triton-is-running-correctly)
 39 | - [Examples](#examples)
 40 |     - [ERNIE Base](#ernie-base)
 41 |     - [ResNet50 v1.5](#resnet50-v15)
 42 | - [Performance](#performance)
 43 |     - [ERNIE Base (T4)](#ernie-base-t4)
 44 |     - [ResNet50 v1.5 (V100-SXM2-16G)](#resnet50-v15-v100-sxm2-16g)
 45 |     - [ResNet50 v1.5 (T4)](#resnet50-v15-t4)
 46 | 
 47 | ## Quick Start
 48 | 
 49 | ### Pull Image
 50 | 
 51 | ```bash
 52 | docker pull paddlepaddle/triton_paddle:21.10
 53 | ```
 54 | 
 55 | Note: Only Triton Inference Server 21.10 image is supported.
 56 | 
 57 | ### Create A Model Repository
 58 | 
 59 | The model repository is the directory where you
 60 | place the models that you want Triton to server. An example model
 61 | repository is included in the [examples](examples). Before using the repository,
 62 | you must fetch it by the following scripts.
 63 | 
 64 | ```bash
 65 | $ cd examples
 66 | $ ./fetch_models.sh
 67 | $ cd .. # back to root of paddle_backend
 68 | ```
 69 | 
 70 | ### Launch Triton Inference Server
 71 | 
 72 | 1. Launch the image
 73 | 
 74 | ```bash
 75 | $ docker run --gpus=all --rm -it --name triton_server --net=host -e CUDA_VISIBLE_DEVICES=0 \
 76 |            -v `pwd`/examples/models:/workspace/models \
 77 |            paddlepaddle/triton_paddle:21.10 /bin/bash
 78 | ```
 79 | 
 80 | 2. Launch the triton inference server
 81 | 
 82 | ```bash
 83 | /opt/tritonserver/bin/tritonserver --model-repository=/workspace/models
 84 | ```
 85 | 
 86 | Note: `/opt/tritonserver/bin/tritonserver --help` for all available parameters
 87 | 
 88 | ### Verify Triton Is Running Correctly
 89 | 
 90 | Use Triton’s *ready* endpoint to verify that the server and the models
 91 | are ready for inference. From the host system use curl to access the
 92 | HTTP endpoint that indicates server status.
 93 | 
 94 | ```
 95 | $ curl -v localhost:8000/v2/health/ready
 96 | ...
 97 | < HTTP/1.1 200 OK
 98 | < Content-Length: 0
 99 | < Content-Type: text/plain
100 | ```
101 | 
102 | The HTTP request returns status 200 if Triton is ready and non-200 if
103 | it is not ready.
104 | 
105 | ## Examples
106 | 
107 | Before running the examples, please make sure the triton server is running [correctly](#verify-triton-is-running-correctly).
108 | 
109 | Change working directory to [examples](examples)
110 | ```bash
111 | $ cd examples
112 | ```
113 | 
114 | ### ERNIE Base
115 | [ERNIE-2.0](https://github.com/PaddlePaddle/ERNIE) is a pre-training framework for language understanding.
116 | 
117 | Steps to run the benchmark on ERNIE
118 | ```bash
119 | $ bash perf_ernie.sh
120 | ```
121 | 
122 | ### ResNet50 v1.5
123 | The [ResNet50-v1.5](https://ngc.nvidia.com/catalog/resources/nvidia:resnet_50_v1_5_for_pytorch) is a modified version of the [original ResNet50 v1 model](https://arxiv.org/abs/1512.03385).
124 | 
125 | Steps to run the benchmark on ResNet50-v1.5
126 | ```bash
127 | $ bash perf_resnet50_v1.5.sh
128 | ```
129 | 
130 | Steps to run the inference on ResNet50-v1.5.
131 | 
132 | 1. Prepare processed images following [DeepLearningExamples](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Classification/ConvNets/triton/resnet50#quick-start-guide) and place ``imagenet`` folder under [examples](examples) directory.
133 | 
134 | 2. Run the inference
135 | 
136 | ```bash
137 | $ bash infer_resnet_v1.5.sh imagenet/<id>
138 | ```
139 | 
140 | ## Performance
141 | 
142 | ### ERNIE Base (T4)
143 | 
144 | | Precision   | Backend Accelerator  |   Client Batch Size |   Sequences/second |   P90 Latency (ms) |   P95 Latency (ms) |   P99 Latency (ms) |   Avg Latency (ms) |
145 | |:------------|:---------------------|--------------------:|--------------------:|--------------:|--------------:|--------------:|--------------:|
146 | | FP16        | TensorRT             |                   1 |               270.0 |         3.813 |         3.846 |         4.007 |         3.692 |
147 | | FP16        | TensorRT             |                   2 |               500.4 |         4.282 |         4.332 |         4.709 |         3.980 |
148 | | FP16        | TensorRT             |                   4 |               831.2 |         5.141 |         5.242 |         5.569 |         4.797 |
149 | | FP16        | TensorRT             |                   8 |              1128.0 |         7.788 |         7.949 |         8.255 |         7.089 |
150 | | FP16        | TensorRT             |                  16 |              1363.2 |        12.702 |        12.993 |        13.507 |        11.738 |
151 | | FP16        | TensorRT             |                  32 |              1529.6 |        22.495 |        22.817 |        24.634 |        20.901 |
152 | 
153 | ### ResNet50 v1.5 (V100-SXM2-16G)
154 | 
155 | | Precision   | Backend Accelerator  |   Client Batch Size |   Sequences/second |   P90 Latency (ms) |   P95 Latency (ms) |   P99 Latency (ms) |   Avg Latency (ms) |
156 | |:------------|:---------------------|--------------------:|--------------------:|--------------:|--------------:|--------------:|--------------:|
157 | | FP16        | TensorRT             |                   1 |               288.8 |         3.494 |         3.524 |         3.608 |         3.462 |
158 | | FP16        | TensorRT             |                   2 |               494.0 |         4.083 |         4.110 |         4.208 |         4.047 |
159 | | FP16        | TensorRT             |                   4 |               758.4 |         5.327 |         5.359 |         5.460 |         5.273 |
160 | | FP16        | TensorRT             |                   8 |              1044.8 |         7.728 |         7.770 |         7.949 |         7.658 |
161 | | FP16        | TensorRT             |                  16 |              1267.2 |        12.742 |        12.810 |        13.883 |        12.647 |
162 | | FP16        | TensorRT             |                  32 |              1113.6 |        28.840 |        29.044 |        30.357 |        28.641 |
163 | | FP16        | TensorRT             |                  64 |              1100.8 |        58.512 |        58.642 |        59.967 |        58.251 |
164 | | FP16        | TensorRT             |                 128 |              1049.6 |       121.371 |       121.834 |       123.371 |       119.991 |
165 | 
166 | ### ResNet50 v1.5 (T4)
167 | | Precision   | Backend Accelerator  |   Client Batch Size |   Sequences/second |   P90 Latency (ms) |   P95 Latency (ms) |   P99 Latency (ms) |   Avg Latency (ms) |
168 | |:------------|:---------------------|--------------------:|--------------------:|--------------:|--------------:|--------------:|--------------:|
169 | | FP16        | TensorRT             |                   1 |               291.8 |         3.471 |         3.489 |         3.531 |         3.427 |
170 | | FP16        | TensorRT             |                   2 |               466.0 |         4.323 |         4.336 |         4.382 |         4.288 |
171 | | FP16        | TensorRT             |                   4 |               665.6 |         6.031 |         6.071 |         6.142 |         6.011 |
172 | | FP16        | TensorRT             |                   8 |               833.6 |         9.662 |         9.684 |         9.767 |         9.609 |
173 | | FP16        | TensorRT             |                  16 |               899.2 |        18.061 |        18.208 |        18.899 |        17.748 |
174 | | FP16        | TensorRT             |                  32 |               761.6 |        42.333 |        43.456 |        44.167 |        41.740 |
175 | | FP16        | TensorRT             |                  64 |               793.6 |        79.860 |        80.410 |        80.807 |        79.680 |
176 | | FP16        | TensorRT             |                 128 |               793.6 |       158.207 |       158.278 |       158.643 |       157.543 |
177 | 


--------------------------------------------------------------------------------
/docs/model_configuration.md:
--------------------------------------------------------------------------------
  1 | <!--
  2 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
  3 | #
  4 | # Redistribution and use in source and binary forms, with or without
  5 | # modification, are permitted provided that the following conditions
  6 | # are met:
  7 | #  * Redistributions of source code must retain the above copyright
  8 | #    notice, this list of conditions and the following disclaimer.
  9 | #  * Redistributions in binary form must reproduce the above copyright
 10 | #    notice, this list of conditions and the following disclaimer in the
 11 | #    documentation and/or other materials provided with the distribution.
 12 | #  * Neither the name of NVIDIA CORPORATION nor the names of its
 13 | #    contributors may be used to endorse or promote products derived
 14 | #    from this software without specific prior written permission.
 15 | #
 16 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 17 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 18 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 19 | # PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 20 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 21 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 22 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 23 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 24 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 25 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 26 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 27 | -->
 28 | 
 29 | # Model Configuration
 30 | 
 31 | ## General Model Configuration
 32 | For the general model configuration information, please visit [triton-inference-server/server/docs/model_configuration](https://github.com/triton-inference-server/server/blob/main/docs/model_configuration.md).
 33 | 
 34 | ## Platform and Backend
 35 | For using paddle backend, no ``platform`` need to be provided. However, you should set ``backend`` to ``"paddle"`` in the model configuration.
 36 | ```
 37 | backend: "paddle"
 38 | ```
 39 | 
 40 | ## Paddle TensorRT Prediction Configuration
 41 | 
 42 | Paddle supports inference with tensorrt engine, which can boost inference throughput and reduce latency. 
 43 | 
 44 | Related configuration can be set under ``optimization {execution_accelerators {gpu_execution_accelerator{...}}}``
 45 | 
 46 | There are four sections can be configured, which are ``config``, ``min_shape``, ``max_shape``, ``opt_shape``.
 47 | 
 48 | ### ``config``
 49 | 
 50 | In ``config``, you can set the ``precision``, ``min_graph_size``, ``max_batch_size``, ``workspace_size``, ``enable_tensorrt_oss``, ``is_dynamic``. 
 51 | The meaning of the parameters can refer to [Paddle Inference Docs](https://paddle-inference.readthedocs.io/en/latest/api_reference/cxx_api_doc/Config/GPUConfig.html#tensorrt)
 52 | 
 53 | |Parameters         |Available options                                          |
 54 | |-------------------|-----------------------------------------------------------|
 55 | |precision          |``"fluid"``, ``"trt_fp32"``, ``"trt_fp16"``, ``"trt_int8"``|
 56 | |min_graph_size     |``"1"`` ~ ``"2147483647"``                                 |
 57 | |max_batch_size     |``"1"`` ~ ``"2147483647"``                                 |
 58 | |workspace_size     |``"1"`` ~ ``"2147483647"``                                 |
 59 | |enable_tensorrt_oss|``"0"``, ``"1"``                                           |
 60 | |is_dynamic         |``"0"``, ``"1"``                                           |
 61 | 
 62 | ### ``min_shape``, ``max_shape``, ``opt_shape``
 63 | These sections are only needed if ``is_dynamic`` is ``"1"``. Multiple ``parameters`` can be existed if there are multiple dynamic shape input. The ``key`` in ``parameters`` are the input tensor name, and the ``value`` is the shape, no ``,`` or ``[]`` is needed.
 64 | 
 65 | ### An Dynamic Shape Example
 66 | ```
 67 | optimization {
 68 |   execution_accelerators {
 69 |     gpu_execution_accelerator : [
 70 |       {
 71 |         name : "config"
 72 |         parameters { key: "precision" value: "trt_fp16" }
 73 |         parameters { key: "min_graph_size" value: "5" }
 74 |         parameters { key: "workspace_size" value: "1073741824" }
 75 |         parameters { key: "enable_tensorrt_oss" value: "1" }
 76 |         parameters { key: "is_dynamic" value: "1" }
 77 |       },
 78 |       {
 79 |         name : "min_shape"
 80 |         parameters { key: "eval_placeholder_0" value: "1" }
 81 |         parameters { key: "eval_placeholder_1" value: "1" }
 82 |         parameters { key: "eval_placeholder_2" value: "1" }
 83 |         parameters { key: "eval_placeholder_3" value: "1 1 1" }
 84 |       },
 85 |       {
 86 |         name : "max_shape"
 87 |         parameters { key: "eval_placeholder_0" value: "4096" }
 88 |         parameters { key: "eval_placeholder_1" value: "4096" }
 89 |         parameters { key: "eval_placeholder_2" value: "129" }
 90 |         parameters { key: "eval_placeholder_3" value: "1 128 1" }
 91 |       },
 92 |       {
 93 |         name : "opt_shape"
 94 |         parameters { key: "eval_placeholder_0" value: "128" }
 95 |         parameters { key: "eval_placeholder_1" value: "128" }
 96 |         parameters { key: "eval_placeholder_2" value: "2" }
 97 |         parameters { key: "eval_placeholder_3" value: "1 128 1" }
 98 |       }
 99 |     ]
100 |   }
101 | }
102 | ```
103 | 


--------------------------------------------------------------------------------
/docs/zh_CN/model_configuration.md:
--------------------------------------------------------------------------------
  1 | <!--
  2 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
  3 | #
  4 | # Redistribution and use in source and binary forms, with or without
  5 | # modification, are permitted provided that the following conditions
  6 | # are met:
  7 | #  * Redistributions of source code must retain the above copyright
  8 | #    notice, this list of conditions and the following disclaimer.
  9 | #  * Redistributions in binary form must reproduce the above copyright
 10 | #    notice, this list of conditions and the following disclaimer in the
 11 | #    documentation and/or other materials provided with the distribution.
 12 | #  * Neither the name of NVIDIA CORPORATION nor the names of its
 13 | #    contributors may be used to endorse or promote products derived
 14 | #    from this software without specific prior written permission.
 15 | #
 16 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 17 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 18 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 19 | # PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 20 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 21 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 22 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 23 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 24 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 25 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 26 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 27 | -->
 28 | 
 29 | # 模型配置
 30 | 模型存储库中的每个模型都必须包含一个模型配置，该配置提供了关于模型的必要和可选信息。这些配置信息一般写在 *config.pbtxt* 文件中，[ModelConfig protobuf](https://github.com/triton-inference-server/common/blob/main/protobuf/model_config.proto)格式。
 31 | 
 32 | ## 模型通用最小配置
 33 | 详细的模型通用配置请看官网文档: [model_configuration](https://github.com/triton-inference-server/server/blob/main/docs/model_configuration.md).Triton的最小模型配置必须包括: *platform* 或 *backend* 属性、*max_batch_size* 属性和模型的输入输出.
 34 | 
 35 | 例如一个Paddle模型，有两个输入*input0* 和 *input1*，一个输出*output0*，输入输出都是float32类型的tensor，最大batch为8.则最小的配置如下:
 36 | 
 37 | ```
 38 |   backend: "paddle"
 39 |   max_batch_size: 8
 40 |   input [
 41 |     {
 42 |       name: "input0"
 43 |       data_type: TYPE_FP32
 44 |       dims: [ 16 ]
 45 |     },
 46 |     {
 47 |       name: "input1"
 48 |       data_type: TYPE_FP32
 49 |       dims: [ 16 ]
 50 |     }
 51 |   ]
 52 |   output [
 53 |     {
 54 |       name: "output0"
 55 |       data_type: TYPE_FP32
 56 |       dims: [ 16 ]
 57 |     }
 58 |   ]
 59 | ```
 60 | 
 61 | ### Name, Platform and Backend
 62 | 模型配置中 *name* 属性是可选的。如果模型没有在配置中指定，则使用模型的目录名；如果指定了该属性，它必须要跟模型的目录名一致。
 63 | 
 64 | 使用 *paddle backend*，没有*platform*属性可以配置，必须配置*backend*属性为*paddle*。
 65 | 
 66 | ```
 67 | backend: "paddle"
 68 | ```
 69 | 
 70 | ### Paddle Backend特有配置
 71 | 
 72 | Paddle后端目前支持*cpu*和*gpu*推理，*cpu*上支持开启*oneDNN*和*ORT*加速，*gpu*上支持开启*TensorRT*加速。
 73 | 
 74 | 
 75 | #### Paddle Native配置
 76 | Paddle后端中，使用*Native*推理只需配置 *Instance Groups*，决定模型运行在CPU还是GPU上。
 77 | 
 78 | **Native CPU**
 79 | ```
 80 |   instance_group [
 81 |     {
 82 |       #创建两个CPU实例
 83 |       count: 2      
 84 |       kind: KIND_CPU
 85 |     }
 86 |   ]
 87 | ```
 88 | 
 89 | **Native GPU**
 90 | 在*GPU 0*上部署2个实例，在*GPU1*和*GPU*上分别不是1个实例
 91 | 
 92 | ```
 93 |   instance_group [
 94 |     {
 95 |       count: 2
 96 |       kind: KIND_GPU
 97 |       gpus: [ 0 ]
 98 |     },
 99 |     {
100 |       count: 1
101 |       kind: KIND_GPU
102 |       gpus: [ 1, 2 ]
103 |     }
104 |   ]
105 | ```
106 | 
107 | ### Paddle oneDNN配置
108 | oneDNN(原MKL-DNN)是由英特尔开发的开源深度学习软件包，支持神经网络在CPU上的高性能计算，在Paddle后端中通过如下配置打开oneDNN加速:
109 | ```
110 | instance_group [ { kind: KIND_CPU }]
111 | 
112 | optimization { 
113 |   execution_accelerators { 
114 |     cpu_execution_accelerator : [ 
115 |       { 
116 |         name : "mkldnn"
117 |         # 设置op计算的线程数为4
118 |         parameters { key: "cpu_threads" value: "4" }
119 |         # 缓存OneDNN最新10种输入shape
120 |         parameters { key: "capacity" value: "10" }
121 |         # 使用int8量化
122 |         parameters { key: "use_int8" value: "0" }
123 |       }
124 |     ]
125 |   }
126 | }
127 | ```
128 | 
129 | ### Paddle ORT配置
130 | ONNX Runtime是由微软开源的一款推理引擎，Paddle Inference通过Paddle2ONNX集成ONNX Runtime作为推理的后端之一，在Paddle后端中通过如下配置打开ONNX Runtime加速:
131 | 
132 | ```
133 | instance_group [ { kind: KIND_CPU }]
134 | 
135 | optimization { 
136 |   execution_accelerators { 
137 |     cpu_execution_accelerator : [ 
138 |       { 
139 |         name : "ort"
140 |         # 设置op计算的线程数为4
141 |         parameters { key: "cpu_threads" value: "4" }
142 |       }
143 |     ]
144 |   }
145 | }
146 | ```
147 | 
148 | ### Paddle TensorRT配置
149 | 
150 | TensorRT 是一个针对 NVIDIA GPU 及 Jetson 系列硬件的高性能机器学习推理 SDK，可以使得深度学习模型在这些硬件上的部署获得更好的性能。Paddle Inference 以子图方式集成了 TensorRT，将可用 TensorRT 加速的算子组成子图供给 TensorRT，以获取 TensorRT 加速的同时，保留 PaddlePaddle 即训即推的能力。
151 | 
152 | TensorRT的配置选项需要写在这个配置中: ``optimization {execution_accelerators {gpu_execution_accelerator{...}}}``
153 | 
154 | 一共有四个选项:``tensorrt``, ``min_shape``, ``max_shape``, ``opt_shape``.
155 | 
156 | ##### tensorrt选项
157 | 
158 | 在``tensorrt``中能够设置``precision``, ``min_graph_size``, ``max_batch_size``, ``workspace_size``, ``enable_tensorrt_oss``, ``is_dynamic``. 
159 | 详细参数解释请看官网文档[Paddle Inference Docs](https://paddle-inference.readthedocs.io/en/latest/api_reference/cxx_api_doc/Config/GPUConfig.html#tensorrt)
160 | 
161 | |Parameters         |Available options                                          |
162 | |-------------------|-----------------------------------------------------------|
163 | |precision          |``"trt_fp32"``, ``"trt_fp16"``, ``"trt_int8"``|
164 | |min_graph_size     |``"1"`` ~ ``"2147483647"``                                 |
165 | |max_batch_size     |``"1"`` ~ ``"2147483647"``                                 |
166 | |workspace_size     |``"1"`` ~ ``"2147483647"``                                 |
167 | |enable_tensorrt_oss|``"0"``, ``"1"``                                           |
168 | |is_dynamic         |``"0"``, ``"1"``                                           |
169 | 
170 | #### min_shape, max_shape, opt_shape选项
171 | 当且仅当开启动态shape时(*is_dynamic*为*1*)，每个输入需要设置最大形状(*max_shape*)、最小形状(*min_shape*)和最常见形状(*opt_shape*)。其中字典*parameters*中*key*为输入的名字，*value*为对应输入的最大、最小、最常见shape。
172 | 
173 | #### TensorRT动态shape例子
174 | ```
175 | optimization {
176 |   execution_accelerators {
177 |     gpu_execution_accelerator : [
178 |       {
179 |         name : "tensorrt"
180 |         # 使用TensorRT的FP16推理
181 |         parameters { key: "precision" value: "trt_fp16" }
182 |         # 设置TensorRT的子图最小op数为3
183 |         parameters { key: "min_graph_size" value: "3" }
184 |         parameters { key: "workspace_size" value: "1073741824" }
185 |         # 不使用变长
186 |         parameters { key: "enable_tensorrt_oss" value: "0" }
187 |         # 开启动态shape
188 |         parameters { key: "is_dynamic" value: "1" }
189 |       },
190 |       {
191 |         name : "min_shape"
192 |         parameters { key: "eval_placeholder_0" value: "1" }
193 |         parameters { key: "eval_placeholder_1" value: "1" }
194 |         parameters { key: "eval_placeholder_2" value: "1" }
195 |         parameters { key: "eval_placeholder_3" value: "1 1 1" }
196 |       },
197 |       {
198 |         name : "max_shape"
199 |         parameters { key: "eval_placeholder_0" value: "4096" }
200 |         parameters { key: "eval_placeholder_1" value: "4096" }
201 |         parameters { key: "eval_placeholder_2" value: "129" }
202 |         parameters { key: "eval_placeholder_3" value: "1 128 1" }
203 |       },
204 |       {
205 |         name : "opt_shape"
206 |         parameters { key: "eval_placeholder_0" value: "128" }
207 |         parameters { key: "eval_placeholder_1" value: "128" }
208 |         parameters { key: "eval_placeholder_2" value: "2" }
209 |         parameters { key: "eval_placeholder_3" value: "1 128 1" }
210 |       }
211 |     ]
212 |   }
213 | }
214 | ```
215 | 


--------------------------------------------------------------------------------
/docs/zh_CN/model_repository.md:
--------------------------------------------------------------------------------
 1 | 
 2 | # 模型仓库(Model Repository)
 3 | Triton Inference Server启动服务时指定模型仓库中一个或多个模型部署服务。当服务运行时，可以用[Model Management](https://github.com/triton-inference-server/server/blob/main/docs/model_management.md)中描述的方式修改服务中的模型。
 4 | 从服务器启动时指定的一个或多个模型存储库中为模型提供服务
 5 | 
 6 | ## 仓库结构
 7 | 模型仓库路径通过Triton启动时的*--model-repository*选项指定，可以多次指定*--model-repository*选项来加载多个仓库。例如:
 8 | 
 9 | ```
10 | $ tritonserver --model-repository=<model-repository-path>
11 | ```
12 | 
13 | 模型仓库的结构必须按以下的格式创建:
14 | ```
15 |   <model-repository-path>/
16 |     <model-name>/
17 |       [config.pbtxt]
18 |       [<output-labels-file> ...]
19 |       <version>/
20 |         <model-definition-file>
21 |       <version>/
22 |         <model-definition-file>
23 |       ...
24 |     <model-name>/
25 |       [config.pbtxt]
26 |       [<output-labels-file> ...]
27 |       <version>/
28 |         <model-definition-file>
29 |       <version>/
30 |         <model-definition-file>
31 |       ...
32 |     ...
33 | ```
34 | 在最顶层`<model-repository-path>`模型仓库目录下，必须有0个或多个`<model-name>`模型名字的子目录。每个`<model-name>`模型名字子目录包含部署模型相应的信息，多个表示模型版本的数字子目录和一个描述模型配置的*config.pbtxt*文件。
35 | 
36 | Paddle模型存在版本号子目录中，必须为`model.pdmodel`文件和`model.pdiparams`文件。
37 | 
38 | ## 模型版本
39 | 每个模型在仓库中可以有一个或多个可用的版本，模型目录中以数字命名的子目录就是对应的版本，数字即版本号。没有以数字命名的子目录，或以*0*开头的子目录都会被忽略。模型配置文件中可以指定[版本策略](https://github.com/triton-inference-server/server/blob/main/docs/model_configuration.md#version-policy)，控制Triton启动模型目录中的哪个版本。
40 | 
41 | ## 模型仓库示例
42 | Paddle Backend需要的模型必须是2.0版本以上导出的推理模型，模型包含`model.pdmodel`和`model.pdiparams`两个文件放在版本目录中。
43 | 
44 | 一个使用Paddle Backend部署的最小模型仓库目录示例:
45 | ```
46 |   <model-repository-path>/
47 |     <model-name>/
48 |       config.pbtxt
49 |       1/
50 |         model.pdmodel
51 |         model.pdiparams
52 | 
53 |   # 真实例子:
54 |   models
55 |   └── ResNet50
56 |       ├── 1
57 |       │   ├── model.pdiparams
58 |       │   └── model.pdmodel
59 |       └── config.pbtxt
60 | ```


--------------------------------------------------------------------------------
/examples/fetch_models.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 3 | #
 4 | # Redistribution and use in source and binary forms, with or without
 5 | # modification, are permitted provided that the following conditions
 6 | # are met:
 7 | #  * Redistributions of source code must retain the above copyright
 8 | #    notice, this list of conditions and the following disclaimer.
 9 | #  * Redistributions in binary form must reproduce the above copyright
10 | #    notice, this list of conditions and the following disclaimer in the
11 | #    documentation and/or other materials provided with the distribution.
12 | #  * Neither the name of NVIDIA CORPORATION nor the names of its
13 | #    contributors may be used to endorse or promote products derived
14 | #    from this software without specific prior written permission.
15 | #
16 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
17 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19 | # PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
20 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
21 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
22 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
23 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
24 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 | 
28 | set -e
29 | 
30 | download () {
31 |   FILE_NAME="models.tar.gz"
32 | 
33 |   if test -f ${FILE_NAME};
34 |   then
35 |     echo "${FILE_NAME} exists."
36 |   fi
37 | 
38 |   wget --no-check-certificate https://paddle-inference-dist.bj.bcebos.com/TritonPaddleBackend/models.tar.gz
39 | 
40 |   echo "Finish downloading ${FILE_NAME}"
41 | }
42 | 
43 | download
44 | 
45 | echo 'Extracting models.tar.gz'
46 | tar zxvf models.tar.gz
47 | 


--------------------------------------------------------------------------------
/examples/infer_ernie.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
  3 | #
  4 | # Redistribution and use in source and binary forms, with or without
  5 | # modification, are permitted provided that the following conditions
  6 | # are met:
  7 | #  * Redistributions of source code must retain the above copyright
  8 | #    notice, this list of conditions and the following disclaimer.
  9 | #  * Redistributions in binary form must reproduce the above copyright
 10 | #    notice, this list of conditions and the following disclaimer in the
 11 | #    documentation and/or other materials provided with the distribution.
 12 | #  * Neither the name of NVIDIA CORPORATION nor the names of its
 13 | #    contributors may be used to endorse or promote products derived
 14 | #    from this software without specific prior written permission.
 15 | #
 16 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 17 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 18 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 19 | # PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 20 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 21 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 22 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 23 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 24 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 25 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 26 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 27 | 
 28 | import sys
 29 | import json
 30 | import argparse
 31 | import numpy as np
 32 | 
 33 | import tritonclient.http as httpclient
 34 | from tritonclient.utils import InferenceServerException
 35 | 
 36 | 
 37 | FLAGS = None
 38 | 
 39 | 
 40 | def parse_model_http(model_metadata, model_config):
 41 |     return model_metadata['inputs'], model_metadata['outputs']
 42 | 
 43 | 
 44 | def postprocess(results, output_metadata, batch_size):
 45 |     """
 46 |     Post-process results to show classifications.
 47 |     """
 48 | 
 49 |     output_array = results.as_numpy(output_metadata[0]['name'])
 50 |     return np.argmax(output_array, axis=1)
 51 | 
 52 | 
 53 | def read_input(filename):
 54 |     with open(filename) as file:
 55 |         data = json.load(file)
 56 |         return data
 57 | 
 58 | 
 59 | def requestGenerator(input_metadata, output_metadata, FLAGS, input_data):
 60 | 
 61 |     # Set the input data
 62 |     inputs = list()
 63 | 
 64 |     for input_ in input_metadata:
 65 |         input_name = input_['name']
 66 |         runtime_data = input_data[input_name]
 67 |         data = np.asarray(runtime_data['content'], dtype=np.int32)
 68 |         data = data.reshape(runtime_data['shape'])
 69 |         inputs.append(
 70 |           httpclient.InferInput(input_name, data.shape, input_['datatype']))
 71 |         inputs[-1].set_data_from_numpy(data, binary_data=True)
 72 | 
 73 |     outputs = list()
 74 |     for output in output_metadata:
 75 |         outputs.append(
 76 |             httpclient.InferRequestedOutput(output['name'],
 77 |                                           binary_data=True))
 78 | 
 79 |     yield inputs, outputs, FLAGS.model_name, FLAGS.model_version
 80 | 
 81 | 
 82 | if __name__ == '__main__':
 83 |     parser = argparse.ArgumentParser()
 84 |     parser.add_argument('-v',
 85 |                       '--verbose',
 86 |                       action="store_true",
 87 |                       required=False,
 88 |                       default=False,
 89 |                       help='Enable verbose output')
 90 |     parser.add_argument('-m',
 91 |                         '--model-name',
 92 |                         type=str,
 93 |                         required=True,
 94 |                         help='Name of model')
 95 |     parser.add_argument(
 96 |         '-x',
 97 |         '--model-version',
 98 |         type=str,
 99 |         required=False,
100 |         default="",
101 |         help='Version of model. Default is to use latest version.')
102 |     parser.add_argument('-b',
103 |                         '--batch-size',
104 |                         type=int,
105 |                         required=False,
106 |                         default=1,
107 |                         help='Batch size. Default is 1.')
108 |     parser.add_argument('-u',
109 |                         '--url',
110 |                         type=str,
111 |                         required=False,
112 |                         default='localhost:8000',
113 |                         help='Inference server URL. Default is localhost:8000.')
114 |     parser.add_argument('-i',
115 |                         '--protocol',
116 |                         type=str,
117 |                         required=False,
118 |                         choices=['HTTP'],
119 |                         default='HTTP',
120 |                         help='Protocol used to communicate with ' +
121 |                         'the inference service. Default is HTTP.')
122 |     parser.add_argument('image_filename',
123 |                         type=str,
124 |                         nargs='?',
125 |                         default=None,
126 |                         help='Input image / Input folder.')
127 |     FLAGS = parser.parse_args()
128 | 
129 |     try:
130 |         triton_client = httpclient.InferenceServerClient(
131 |             url=FLAGS.url, verbose=FLAGS.verbose, concurrency=1)
132 |     except Exception as exception:
133 |         print("client creation failed: " + str(exception))
134 |         sys.exit(1)
135 | 
136 |     # Make sure the model matches our requirements, and get some
137 |     # properties of the model that we need for preprocessing
138 |     try:
139 |         model_metadata = triton_client.get_model_metadata(
140 |             model_name=FLAGS.model_name, model_version=FLAGS.model_version)
141 |     except InferenceServerException as e:
142 |         print("failed to retrieve the metadata: " + str(e))
143 |         sys.exit(1)
144 | 
145 |     try:
146 |         model_config = triton_client.get_model_config(
147 |             model_name=FLAGS.model_name, model_version=FLAGS.model_version)
148 |     except InferenceServerException as e:
149 |         print("failed to retrieve the config: " + str(e))
150 |         sys.exit(1)
151 | 
152 |     requests = []
153 |     responses = []
154 |     request_ids = []
155 | 
156 |     input_metadata, output_metadata = parse_model_http(model_metadata, model_config)
157 | 
158 |     json_data = read_input(f'data/perf.{FLAGS.batch_size}.json')
159 |     input_data = json_data['data']
160 |     if 'ground_truth' in json_data:
161 |         ground_truth = json_data['ground_truth']
162 | 
163 |     for idx, batch_data in enumerate(input_data):
164 |         try:
165 |             for inputs, outputs, model_name, model_version in requestGenerator(input_metadata, output_metadata, FLAGS, batch_data):
166 |                 responses.append(
167 |                     triton_client.infer(FLAGS.model_name,
168 |                         inputs,
169 |                         request_id=str(idx),
170 |                         model_version=FLAGS.model_version,
171 |                         outputs=outputs))
172 |         except InferenceServerException as e:
173 |             print("inference failed: " + str(e))
174 |             sys.exit(1)
175 | 
176 |     results = list()
177 |     for response in responses:
178 |         this_id = response.get_response()["id"]
179 |         results.extend(postprocess(response, output_metadata, FLAGS.batch_size))
180 | 
181 |     if 'ground_truth' in json_data:
182 |         print('Accuracy:', sum(np.asarray(ground_truth) == np.asarray(results))/len(ground_truth))
183 | 


--------------------------------------------------------------------------------
/examples/infer_resnet50_v1.5.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
  3 | #
  4 | # Redistribution and use in source and binary forms, with or without
  5 | # modification, are permitted provided that the following conditions
  6 | # are met:
  7 | #  * Redistributions of source code must retain the above copyright
  8 | #    notice, this list of conditions and the following disclaimer.
  9 | #  * Redistributions in binary form must reproduce the above copyright
 10 | #    notice, this list of conditions and the following disclaimer in the
 11 | #    documentation and/or other materials provided with the distribution.
 12 | #  * Neither the name of NVIDIA CORPORATION nor the names of its
 13 | #    contributors may be used to endorse or promote products derived
 14 | #    from this software without specific prior written permission.
 15 | #
 16 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 17 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 18 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 19 | # PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 20 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 21 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 22 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 23 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 24 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 25 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 26 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 27 | 
 28 | import argparse
 29 | from functools import partial
 30 | import os
 31 | import sys
 32 | 
 33 | from PIL import Image
 34 | import numpy as np
 35 | from attrdict import AttrDict
 36 | 
 37 | import tritonclient.grpc as grpcclient
 38 | import tritonclient.grpc.model_config_pb2 as mc
 39 | import tritonclient.http as httpclient
 40 | from tritonclient.utils import InferenceServerException
 41 | from tritonclient.utils import triton_to_np_dtype
 42 | 
 43 | if sys.version_info >= (3, 0):
 44 |     import queue
 45 | else:
 46 |     import Queue as queue
 47 | 
 48 | 
 49 | class UserData:
 50 | 
 51 |     def __init__(self):
 52 |         self._completed_requests = queue.Queue()
 53 | 
 54 | 
 55 | # Callback function used for async_stream_infer()
 56 | def completion_callback(user_data, result, error):
 57 |     # passing error raise and handling out
 58 |     user_data._completed_requests.put((result, error))
 59 | 
 60 | 
 61 | FLAGS = None
 62 | 
 63 | 
 64 | def parse_model(model_metadata, model_config):
 65 |     """
 66 |     Check the configuration of a model to make sure it meets the
 67 |     requirements for an image classification network (as expected by
 68 |     this client)
 69 |     """
 70 |     if len(model_metadata.inputs) != 1:
 71 |         raise Exception("expecting 1 input, got {}".format(
 72 |             len(model_metadata.inputs)))
 73 |     if len(model_metadata.outputs) != 1:
 74 |         raise Exception("expecting 1 output, got {}".format(
 75 |             len(model_metadata.outputs)))
 76 | 
 77 |     if len(model_config.input) != 1:
 78 |         raise Exception(
 79 |             "expecting 1 input in model configuration, got {}".format(
 80 |                 len(model_config.input)))
 81 | 
 82 |     input_metadata = model_metadata.inputs[0]
 83 |     input_config = model_config.input[0]
 84 |     output_metadata = model_metadata.outputs[0]
 85 | 
 86 |     if output_metadata.datatype != "FP32":
 87 |         raise Exception("expecting output datatype to be FP32, model '" +
 88 |                         model_metadata.name + "' output type is " +
 89 |                         output_metadata.datatype)
 90 | 
 91 |     # Output is expected to be a vector. But allow any number of
 92 |     # dimensions as long as all but 1 is size 1 (e.g. { 10 }, { 1, 10
 93 |     # }, { 10, 1, 1 } are all ok). Ignore the batch dimension if there
 94 |     # is one.
 95 |     output_batch_dim = (model_config.max_batch_size > 0)
 96 |     non_one_cnt = 0
 97 |     for dim in output_metadata.shape:
 98 |         if output_batch_dim:
 99 |             output_batch_dim = False
100 |         elif dim > 1:
101 |             non_one_cnt += 1
102 |             if non_one_cnt > 1:
103 |                 raise Exception("expecting model output to be a vector")
104 | 
105 |     # Model input must have 3 dims, either CHW or HWC (not counting
106 |     # the batch dimension), either CHW or HWC
107 |     input_batch_dim = (model_config.max_batch_size > 0)
108 |     expected_input_dims = 3 + (1 if input_batch_dim else 0)
109 |     if len(input_metadata.shape) != expected_input_dims:
110 |         raise Exception(
111 |             "expecting input to have {} dimensions, model '{}' input has {}".
112 |             format(expected_input_dims, model_metadata.name,
113 |                    len(input_metadata.shape)))
114 | 
115 |     if type(input_config.format) == str:
116 |         FORMAT_ENUM_TO_INT = dict(mc.ModelInput.Format.items())
117 |         input_config.format = FORMAT_ENUM_TO_INT[input_config.format]
118 | 
119 |     if ((input_config.format != mc.ModelInput.FORMAT_NCHW) and
120 |         (input_config.format != mc.ModelInput.FORMAT_NHWC) and
121 |         (input_config.format != mc.ModelInput.FORMAT_NONE)):
122 |         raise Exception("unexpected input format " +
123 |                         mc.ModelInput.Format.Name(input_config.format) +
124 |                         ", expecting " +
125 |                         mc.ModelInput.Format.Name(mc.ModelInput.FORMAT_NCHW) +
126 |                         " or " +
127 |                         mc.ModelInput.Format.Name(mc.ModelInput.FORMAT_NHWC))
128 | 
129 |     if input_config.format == mc.ModelInput.FORMAT_NHWC:
130 |         h = input_metadata.shape[1 if input_batch_dim else 0]
131 |         w = input_metadata.shape[2 if input_batch_dim else 1]
132 |         c = input_metadata.shape[3 if input_batch_dim else 2]
133 |     else:
134 |         c = input_metadata.shape[1 if input_batch_dim else 0]
135 |         h = input_metadata.shape[2 if input_batch_dim else 1]
136 |         w = input_metadata.shape[3 if input_batch_dim else 2]
137 | 
138 |     return (model_config.max_batch_size, input_metadata.name,
139 |             output_metadata.name, c, h, w, input_config.format,
140 |             input_metadata.datatype)
141 | 
142 | 
143 | def preprocess(img, format, dtype, c, h, w, scaling, protocol):
144 |     """
145 |     Pre-process an image to meet the size, type and format
146 |     requirements specified by the parameters.
147 |     """
148 |     # np.set_printoptions(threshold='nan')
149 | 
150 |     if c == 1:
151 |         sample_img = img.convert('L')
152 |     else:
153 |         sample_img = img.convert('RGB')
154 | 
155 |     resized_img = sample_img.resize((w, h), Image.BILINEAR)
156 |     resized = np.array(resized_img)
157 |     if resized.ndim == 2:
158 |         resized = resized[:, :, np.newaxis]
159 | 
160 |     npdtype = triton_to_np_dtype(dtype)
161 |     typed = resized.astype(npdtype)
162 | 
163 |     if scaling == 'INCEPTION':
164 |         scaled = (typed / 127.5) - 1
165 |     elif scaling == 'VGG':
166 |         if c == 1:
167 |             scaled = typed - np.asarray((128,), dtype=npdtype)
168 |         else:
169 |             scaled = typed - np.asarray((123, 117, 104), dtype=npdtype)
170 |     elif scaling == 'RESNET':
171 |         scaled = (typed / 255 - np.array([0.485, 0.456, 0.406], dtype=npdtype))/np.array([0.229, 0.224, 0.225], dtype=npdtype)
172 |     else:
173 |         scaled = typed
174 | 
175 |     # Swap to CHW if necessary
176 |     if format == mc.ModelInput.FORMAT_NHWC:
177 |         ordered = scaled
178 |     else:
179 |         ordered = np.transpose(scaled, (2, 0, 1))
180 | 
181 |     # Channels are in RGB order. Currently model configuration data
182 |     # doesn't provide any information as to other channel orderings
183 |     # (like BGR) so we just assume RGB.
184 |     return ordered
185 | 
186 | 
187 | def postprocess(results, output_name, batch_size, batching):
188 |     """
189 |     Post-process results to show classifications.
190 |     """
191 | 
192 |     output_array = results.as_numpy(output_name)
193 |     if len(output_array) != batch_size:
194 |         raise Exception("expected {} results, got {}".format(
195 |             batch_size, len(output_array)))
196 | 
197 |     # Include special handling for non-batching models
198 |     for results in output_array:
199 |         if not batching:
200 |             results = [results]
201 |         for result in results:
202 |             if output_array.dtype.type == np.object_:
203 |                 cls = "".join(chr(x) for x in result).split(':')
204 |             else:
205 |                 cls = result.split(':')
206 |             print("    {} ({})".format(cls[0], cls[1]))
207 | 
208 | 
209 | def requestGenerator(batched_image_data, input_name, output_name, dtype, FLAGS):
210 |     protocol = FLAGS.protocol.lower()
211 | 
212 |     if protocol == "grpc":
213 |         client = grpcclient
214 |     else:
215 |         client = httpclient
216 | 
217 |     # Set the input data
218 |     inputs = [client.InferInput(input_name, batched_image_data.shape, dtype)]
219 |     inputs[0].set_data_from_numpy(batched_image_data)
220 | 
221 |     outputs = [
222 |         client.InferRequestedOutput(output_name, class_count=FLAGS.classes)
223 |     ]
224 | 
225 |     yield inputs, outputs, FLAGS.model_name, FLAGS.model_version
226 | 
227 | 
228 | def convert_http_metadata_config(_metadata, _config):
229 |     _model_metadata = AttrDict(_metadata)
230 |     _model_config = AttrDict(_config)
231 | 
232 |     return _model_metadata, _model_config
233 | 
234 | 
235 | if __name__ == '__main__':
236 |     parser = argparse.ArgumentParser()
237 |     parser.add_argument('-v',
238 |                         '--verbose',
239 |                         action="store_true",
240 |                         required=False,
241 |                         default=False,
242 |                         help='Enable verbose output')
243 |     parser.add_argument('-a',
244 |                         '--async',
245 |                         dest="async_set",
246 |                         action="store_true",
247 |                         required=False,
248 |                         default=False,
249 |                         help='Use asynchronous inference API')
250 |     parser.add_argument('--streaming',
251 |                         action="store_true",
252 |                         required=False,
253 |                         default=False,
254 |                         help='Use streaming inference API. ' +
255 |                         'The flag is only available with gRPC protocol.')
256 |     parser.add_argument('-m',
257 |                         '--model-name',
258 |                         type=str,
259 |                         required=True,
260 |                         help='Name of model')
261 |     parser.add_argument(
262 |         '-x',
263 |         '--model-version',
264 |         type=str,
265 |         required=False,
266 |         default="",
267 |         help='Version of model. Default is to use latest version.')
268 |     parser.add_argument('-b',
269 |                         '--batch-size',
270 |                         type=int,
271 |                         required=False,
272 |                         default=1,
273 |                         help='Batch size. Default is 1.')
274 |     parser.add_argument('-c',
275 |                         '--classes',
276 |                         type=int,
277 |                         required=False,
278 |                         default=1,
279 |                         help='Number of class results to report. Default is 1.')
280 |     parser.add_argument(
281 |         '-s',
282 |         '--scaling',
283 |         type=str,
284 |         choices=['RESNET'],
285 |         required=False,
286 |         default='RESNET',
287 |         help='Type of scaling to apply to image pixels. Default is RESNET.')
288 |     parser.add_argument('-u',
289 |                         '--url',
290 |                         type=str,
291 |                         required=False,
292 |                         default='localhost:8000',
293 |                         help='Inference server URL. Default is localhost:8000.')
294 |     parser.add_argument('-i',
295 |                         '--protocol',
296 |                         type=str,
297 |                         required=False,
298 |                         default='HTTP',
299 |                         help='Protocol (HTTP/gRPC) used to communicate with ' +
300 |                         'the inference service. Default is HTTP.')
301 |     parser.add_argument('image_filename',
302 |                         type=str,
303 |                         nargs='?',
304 |                         default=None,
305 |                         help='Input image / Input folder.')
306 |     FLAGS = parser.parse_args()
307 | 
308 |     if FLAGS.streaming and FLAGS.protocol.lower() != "grpc":
309 |         raise Exception("Streaming is only allowed with gRPC protocol")
310 | 
311 |     try:
312 |         if FLAGS.protocol.lower() == "grpc":
313 |             # Create gRPC client for communicating with the server
314 |             triton_client = grpcclient.InferenceServerClient(
315 |                 url=FLAGS.url, verbose=FLAGS.verbose)
316 |         else:
317 |             # Specify large enough concurrency to handle the
318 |             # the number of requests.
319 |             concurrency = 20 if FLAGS.async_set else 1
320 |             triton_client = httpclient.InferenceServerClient(
321 |                 url=FLAGS.url, verbose=FLAGS.verbose, concurrency=concurrency)
322 |     except Exception as e:
323 |         print("client creation failed: " + str(e))
324 |         sys.exit(1)
325 | 
326 |     # Make sure the model matches our requirements, and get some
327 |     # properties of the model that we need for preprocessing
328 |     try:
329 |         model_metadata = triton_client.get_model_metadata(
330 |             model_name=FLAGS.model_name, model_version=FLAGS.model_version)
331 |     except InferenceServerException as e:
332 |         print("failed to retrieve the metadata: " + str(e))
333 |         sys.exit(1)
334 | 
335 |     try:
336 |         model_config = triton_client.get_model_config(
337 |             model_name=FLAGS.model_name, model_version=FLAGS.model_version)
338 |     except InferenceServerException as e:
339 |         print("failed to retrieve the config: " + str(e))
340 |         sys.exit(1)
341 | 
342 |     if FLAGS.protocol.lower() == "grpc":
343 |         model_config = model_config.config
344 |     else:
345 |         model_metadata, model_config = convert_http_metadata_config(
346 |             model_metadata, model_config)
347 | 
348 |     max_batch_size, input_name, output_name, c, h, w, format, dtype = parse_model(
349 |         model_metadata, model_config)
350 | 
351 |     filenames = []
352 |     if os.path.isdir(FLAGS.image_filename):
353 |         filenames = [
354 |             os.path.join(FLAGS.image_filename, f)
355 |             for f in os.listdir(FLAGS.image_filename)
356 |             if os.path.isfile(os.path.join(FLAGS.image_filename, f))
357 |         ]
358 |     else:
359 |         filenames = [
360 |             FLAGS.image_filename,
361 |         ]
362 | 
363 |     filenames.sort()
364 | 
365 |     # Preprocess the images into input data according to model
366 |     # requirements
367 |     image_data = []
368 |     for filename in filenames:
369 |         img = Image.open(filename)
370 |         image_data.append(
371 |             preprocess(img, format, dtype, c, h, w, FLAGS.scaling,
372 |                        FLAGS.protocol.lower()))
373 | 
374 |     # Send requests of FLAGS.batch_size images. If the number of
375 |     # images isn't an exact multiple of FLAGS.batch_size then just
376 |     # start over with the first images until the batch is filled.
377 |     requests = []
378 |     responses = []
379 |     result_filenames = []
380 |     request_ids = []
381 |     image_idx = 0
382 |     last_request = False
383 |     user_data = UserData()
384 | 
385 |     # Holds the handles to the ongoing HTTP async requests.
386 |     async_requests = []
387 | 
388 |     sent_count = 0
389 | 
390 |     if FLAGS.streaming:
391 |         triton_client.start_stream(partial(completion_callback, user_data))
392 | 
393 |     while not last_request:
394 |         input_filenames = []
395 |         repeated_image_data = []
396 | 
397 |         for idx in range(FLAGS.batch_size):
398 |             input_filenames.append(filenames[image_idx])
399 |             repeated_image_data.append(image_data[image_idx])
400 |             image_idx = (image_idx + 1) % len(image_data)
401 |             if image_idx == 0:
402 |                 last_request = True
403 | 
404 |         if max_batch_size > 0:
405 |             batched_image_data = np.stack(repeated_image_data, axis=0)
406 |         else:
407 |             batched_image_data = repeated_image_data[0]
408 | 
409 |         # Send request
410 |         try:
411 |             for inputs, outputs, model_name, model_version in requestGenerator(
412 |                     batched_image_data, input_name, output_name, dtype, FLAGS):
413 |                 sent_count += 1
414 |                 if FLAGS.streaming:
415 |                     triton_client.async_stream_infer(
416 |                         FLAGS.model_name,
417 |                         inputs,
418 |                         request_id=str(sent_count),
419 |                         model_version=FLAGS.model_version,
420 |                         outputs=outputs)
421 |                 elif FLAGS.async_set:
422 |                     if FLAGS.protocol.lower() == "grpc":
423 |                         triton_client.async_infer(
424 |                             FLAGS.model_name,
425 |                             inputs,
426 |                             partial(completion_callback, user_data),
427 |                             request_id=str(sent_count),
428 |                             model_version=FLAGS.model_version,
429 |                             outputs=outputs)
430 |                     else:
431 |                         async_requests.append(
432 |                             triton_client.async_infer(
433 |                                 FLAGS.model_name,
434 |                                 inputs,
435 |                                 request_id=str(sent_count),
436 |                                 model_version=FLAGS.model_version,
437 |                                 outputs=outputs))
438 |                 else:
439 |                     responses.append(
440 |                         triton_client.infer(FLAGS.model_name,
441 |                                             inputs,
442 |                                             request_id=str(sent_count),
443 |                                             model_version=FLAGS.model_version,
444 |                                             outputs=outputs))
445 | 
446 |         except InferenceServerException as e:
447 |             print("inference failed: " + str(e))
448 |             if FLAGS.streaming:
449 |                 triton_client.stop_stream()
450 |             sys.exit(1)
451 | 
452 |     if FLAGS.streaming:
453 |         triton_client.stop_stream()
454 | 
455 |     if FLAGS.protocol.lower() == "grpc":
456 |         if FLAGS.streaming or FLAGS.async_set:
457 |             processed_count = 0
458 |             while processed_count < sent_count:
459 |                 (results, error) = user_data._completed_requests.get()
460 |                 processed_count += 1
461 |                 if error is not None:
462 |                     print("inference failed: " + str(error))
463 |                     sys.exit(1)
464 |                 responses.append(results)
465 |     else:
466 |         if FLAGS.async_set:
467 |             # Collect results from the ongoing async requests
468 |             # for HTTP Async requests.
469 |             for async_request in async_requests:
470 |                 responses.append(async_request.get_result())
471 | 
472 |     for response in responses:
473 |         if FLAGS.protocol.lower() == "grpc":
474 |             this_id = response.get_response().id
475 |         else:
476 |             this_id = response.get_response()["id"]
477 |         print("Request {}, batch size {}".format(this_id, FLAGS.batch_size))
478 |         postprocess(response, output_name, FLAGS.batch_size, max_batch_size > 0)
479 | 
480 |     print("PASS")
481 | 


--------------------------------------------------------------------------------
/examples/infer_resnet50_v1.5.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 3 | #
 4 | # Redistribution and use in source and binary forms, with or without
 5 | # modification, are permitted provided that the following conditions
 6 | # are met:
 7 | #  * Redistributions of source code must retain the above copyright
 8 | #    notice, this list of conditions and the following disclaimer.
 9 | #  * Redistributions in binary form must reproduce the above copyright
10 | #    notice, this list of conditions and the following disclaimer in the
11 | #    documentation and/or other materials provided with the distribution.
12 | #  * Neither the name of NVIDIA CORPORATION nor the names of its
13 | #    contributors may be used to endorse or promote products derived
14 | #    from this software without specific prior written permission.
15 | #
16 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
17 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19 | # PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
20 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
21 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
22 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
23 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
24 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 | 
28 | docker run -it --rm \
29 |            --net=host \
30 |            -v`pwd`:/workspace/examples \
31 |            nvcr.io/nvidia/tritonserver:21.04-py3-sdk \
32 |            /bin/bash -c \
33 |            "python examples/infer_resnet50_v1.5.py -m ResNet50-v1.5 -c 3 examples/$1"
34 | 


--------------------------------------------------------------------------------
/examples/models/ERNIE/config.pbtxt:
--------------------------------------------------------------------------------
 1 | name: "ERNIE"
 2 | backend: "paddle"
 3 | max_batch_size: 64
 4 | input [
 5 |   {
 6 |     name: "input_ids"
 7 |     data_type: TYPE_INT64
 8 |     dims: [ -1 ],
 9 |   },
10 |   {
11 |     name: "token_type_ids"
12 |     data_type: TYPE_INT64
13 |     dims: [ -1 ],
14 |   }
15 | ]
16 | output [
17 |   {
18 |     name: "linear_113.tmp_1"
19 |     data_type: TYPE_FP32
20 |     dims: [ 15 ]
21 |   }
22 | ]
23 | 
24 | instance_group [ 
25 |   { 
26 |     count: 1
27 |     kind: KIND_GPU
28 |     #gpus: [ 0 ]
29 |   }
30 | ]
31 | 
32 | optimization { 
33 |   execution_accelerators { 
34 |     gpu_execution_accelerator : [ 
35 |       { 
36 |         name : "tensorrt"
37 |         parameters { key: "precision" value: "trt_fp32" }
38 |         parameters { key: "min_graph_size" value: "3" }
39 |         parameters { key: "max_batch_size" value: "16" }
40 |         parameters { key: "workspace_size" value: "2147483647" }
41 |         parameters { key: "enable_tensorrt_oss" value: "0" }
42 |         parameters { key: "is_dynamic" value: "1" }
43 |       },
44 |       {
45 |         name : "min_shape"
46 |         parameters { key: "input_ids" value: "1 2" }
47 |         parameters { key: "token_type_ids" value: "1 2" }
48 |       },
49 |       {
50 |         name : "max_shape"
51 |         parameters { key: "input_ids" value: "16 128" }
52 |         parameters { key: "token_type_ids" value: "16 128" }
53 |       },
54 |       {
55 |         name : "opt_shape"
56 |         parameters { key: "input_ids" value: "16 128" }
57 |         parameters { key: "token_type_ids" value: "16 128" }
58 |       }
59 |     ]
60 |   }
61 | }
62 | 


--------------------------------------------------------------------------------
/examples/models/ResNet50-v1.5/config.pbtxt:
--------------------------------------------------------------------------------
 1 | name: "ResNet50-v1.5"
 2 | backend: "paddle"
 3 | max_batch_size: 128
 4 | 
 5 | input [
 6 |   {
 7 |     name: "x0"
 8 |     data_type: TYPE_FP32
 9 |     dims: [ 3, 224, 224 ]
10 |   }
11 | ]
12 | 
13 | output [
14 |   {
15 |     name: "save_infer_model/scale_0.tmp_1"
16 |     data_type: TYPE_FP32
17 |     dims: [ 1000 ]
18 |   }
19 | ]
20 | 
21 | instance_group [ 
22 |   { 
23 |     count: 1
24 |     kind: KIND_GPU
25 |     gpus: [0]
26 |   }
27 | ]
28 | 
29 | dynamic_batching {
30 |     preferred_batch_size: [ 64, 128 ]
31 |     max_queue_delay_microseconds: 0
32 | }
33 | 
34 | 


--------------------------------------------------------------------------------
/examples/perf_ernie.sh:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 2 | #
 3 | # Redistribution and use in source and binary forms, with or without
 4 | # modification, are permitted provided that the following conditions
 5 | # are met:
 6 | #  * Redistributions of source code must retain the above copyright
 7 | #    notice, this list of conditions and the following disclaimer.
 8 | #  * Redistributions in binary form must reproduce the above copyright
 9 | #    notice, this list of conditions and the following disclaimer in the
10 | #    documentation and/or other materials provided with the distribution.
11 | #  * Neither the name of NVIDIA CORPORATION nor the names of its
12 | #    contributors may be used to endorse or promote products derived
13 | #    from this software without specific prior written permission.
14 | #
15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 | # PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | 
27 | docker run -it --rm \
28 |            --net=host \
29 |            -v `pwd`/perf_data:/workspace/data \
30 |            nvcr.io/nvidia/tritonserver:21.10-py3-sdk \
31 |            /bin/bash -c \
32 |            'for b in 1 2 4 8 16; do perf_analyzer -m ERNIE --shape input_ids:128 --shape token_type_ids:128 --input-data zero -b ${b}; done'
33 | 


--------------------------------------------------------------------------------
/examples/perf_resnet50_v1.5.sh:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 2 | #
 3 | # Redistribution and use in source and binary forms, with or without
 4 | # modification, are permitted provided that the following conditions
 5 | # are met:
 6 | #  * Redistributions of source code must retain the above copyright
 7 | #    notice, this list of conditions and the following disclaimer.
 8 | #  * Redistributions in binary form must reproduce the above copyright
 9 | #    notice, this list of conditions and the following disclaimer in the
10 | #    documentation and/or other materials provided with the distribution.
11 | #  * Neither the name of NVIDIA CORPORATION nor the names of its
12 | #    contributors may be used to endorse or promote products derived
13 | #    from this software without specific prior written permission.
14 | #
15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 | # PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | 
27 | docker run -it --rm \
28 |            --net=host \
29 |            nvcr.io/nvidia/tritonserver:21.10-py3-sdk \
30 |            /bin/bash -c \
31 |            'for b in 1 2 4 8 16 32 64 128; do perf_analyzer -m ResNet50-v1.5 -b $b; done'
32 | 


--------------------------------------------------------------------------------
/paddle-lib/Dockerfile:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 2 | #
 3 | # Redistribution and use in source and binary forms, with or without
 4 | # modification, are permitted provided that the following conditions
 5 | # are met:
 6 | #  * Redistributions of source code must retain the above copyright
 7 | #    notice, this list of conditions and the following disclaimer.
 8 | #  * Redistributions in binary form must reproduce the above copyright
 9 | #    notice, this list of conditions and the following disclaimer in the
10 | #    documentation and/or other materials provided with the distribution.
11 | #  * Neither the name of NVIDIA CORPORATION nor the names of its
12 | #    contributors may be used to endorse or promote products derived
13 | #    from this software without specific prior written permission.
14 | #
15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 | # PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | 
27 | FROM nvcr.io/nvidia/tritonserver:21.10-py3
28 | 
29 | ENV DEBIAN_FRONTEND=noninteractive
30 | 
31 | RUN apt-key del 7fa2af80 \
32 |     && wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.0-1_all.deb \
33 |     && dpkg -i cuda-keyring_1.0-1_all.deb
34 | 
35 | RUN apt-get update \
36 |  && apt-get install -y --no-install-recommends \
37 |         cmake \
38 |         patchelf \
39 |         python3-dev \
40 |         unzip \
41 |         gcc-8 \
42 |         g++-8 \
43 |         libgl1 \
44 |         libssl-dev
45 | 
46 | RUN git clone 'https://github.com/PaddlePaddle/Paddle.git'
47 | WORKDIR /opt/tritonserver/Paddle
48 | RUN git pull && git checkout release/2.3
49 | 
50 | RUN python3 -m pip install pyyaml && mkdir build-env && \
51 |     cd build-env && \
52 |     cmake .. -DWITH_PYTHON=OFF \
53 |              -DWITH_GPU=ON \
54 |              -DWITH_TESTING=OFF \
55 |              -DWITH_INFERENCE_API_TEST=OFF \
56 |              -DCMAKE_BUILD_TYPE=Release \
57 |              -DCUDA_ARCH_NAME=Auto \
58 |              -DON_INFER=ON \
59 |              -DWITH_MKL=ON \
60 |              -DWITH_TENSORRT=ON \
61 |              -DWITH_ONNXRUNTIME=ON \
62 |              -DCMAKE_C_COMPILER=`which gcc-8` -DCMAKE_CXX_COMPILER=`which g++-8` && \
63 |     make -j`nproc`
64 | 


--------------------------------------------------------------------------------
/paddle-lib/build_paddle.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 3 | #
 4 | # Redistribution and use in source and binary forms, with or without
 5 | # modification, are permitted provided that the following conditions
 6 | # are met:
 7 | #  * Redistributions of source code must retain the above copyright
 8 | #    notice, this list of conditions and the following disclaimer.
 9 | #  * Redistributions in binary form must reproduce the above copyright
10 | #    notice, this list of conditions and the following disclaimer in the
11 | #    documentation and/or other materials provided with the distribution.
12 | #  * Neither the name of NVIDIA CORPORATION nor the names of its
13 | #    contributors may be used to endorse or promote products derived
14 | #    from this software without specific prior written permission.
15 | #
16 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
17 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19 | # PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
20 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
21 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
22 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
23 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
24 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 | 
28 | set -xe
29 | 
30 | docker build -t paddle-build .
31 | docker create --rm --name triton_paddle_build paddle-build:latest
32 | 
33 | docker cp triton_paddle_build:/opt/tritonserver/Paddle/build-env/paddle_inference_install_dir/paddle .
34 | docker cp triton_paddle_build:/opt/tritonserver/Paddle/build-env/paddle_inference_install_dir/third_party/install/paddle2onnx .
35 | docker cp triton_paddle_build:/opt/tritonserver/Paddle/build-env/paddle_inference_install_dir/third_party/install/onnxruntime .
36 | docker cp triton_paddle_build:/opt/tritonserver/Paddle/build-env/paddle_inference_install_dir/third_party/install/mkldnn .
37 | docker cp triton_paddle_build:/opt/tritonserver/Paddle/build-env/paddle_inference_install_dir/third_party/install/mklml .
38 | 
39 | rm paddle/lib/libpaddle_inference.a
40 | docker rm triton_paddle_build


--------------------------------------------------------------------------------
/scripts/build_paddle_backend.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 3 | #
 4 | # Redistribution and use in source and binary forms, with or without
 5 | # modification, are permitted provided that the following conditions
 6 | # are met:
 7 | #  * Redistributions of source code must retain the above copyright
 8 | #    notice, this list of conditions and the following disclaimer.
 9 | #  * Redistributions in binary form must reproduce the above copyright
10 | #    notice, this list of conditions and the following disclaimer in the
11 | #    documentation and/or other materials provided with the distribution.
12 | #  * Neither the name of NVIDIA CORPORATION nor the names of its
13 | #    contributors may be used to endorse or promote products derived
14 | #    from this software without specific prior written permission.
15 | #
16 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
17 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19 | # PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
20 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
21 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
22 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
23 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
24 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 | if [ ! -d "./cmake-3.18.6-Linux-x86_64/" ]; then
28 |     wget https://github.com/Kitware/CMake/releases/download/v3.18.6/cmake-3.18.6-Linux-x86_64.tar.gz
29 |     tar -zxvf cmake-3.18.6-Linux-x86_64.tar.gz
30 |     rm -rf cmake-3.18.6-Linux-x86_64.tar.gz
31 | fi
32 | 
33 | docker run -it --rm \
34 |            -v`pwd`:/workspace/paddle_backend \
35 |            nvcr.io/nvidia/tritonserver:21.10-py3 \
36 |            bash -c \
37 |            'cd /workspace/paddle_backend; rm -rf build; mkdir build; cd build;apt-get update; apt-get install -y --no-install-recommends rapidjson-dev;export PATH=/workspace/paddle_backend/cmake-3.18.6-Linux-x86_64/bin:$PATH;cmake .. -DPADDLE_INFERENCE_DIR=../paddle-lib/paddle -DTRITON_COMMON_REPO_TAG=r21.10 -DTRITON_CORE_REPO_TAG=r21.10 -DTRITON_BACKEND_REPO_TAG=r21.10; make -j`nproc`'
38 | 


--------------------------------------------------------------------------------
/scripts/launch_triton_server.sh:
--------------------------------------------------------------------------------
 1 | #/bin/bash
 2 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 3 | #
 4 | # Redistribution and use in source and binary forms, with or without
 5 | # modification, are permitted provided that the following conditions
 6 | # are met:
 7 | #  * Redistributions of source code must retain the above copyright
 8 | #    notice, this list of conditions and the following disclaimer.
 9 | #  * Redistributions in binary form must reproduce the above copyright
10 | #    notice, this list of conditions and the following disclaimer in the
11 | #    documentation and/or other materials provided with the distribution.
12 | #  * Neither the name of NVIDIA CORPORATION nor the names of its
13 | #    contributors may be used to endorse or promote products derived
14 | #    from this software without specific prior written permission.
15 | #
16 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
17 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19 | # PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
20 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
21 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
22 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
23 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
24 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 | 
28 | set -xe
29 | 
30 | docker build -t tritonserver-paddle .
31 | 
32 | docker run --gpus=all --rm -it \
33 |            --net=host \
34 |            -e CUDA_VISIBLE_DEVICES=0 \
35 |            -v `pwd`/examples/models:/workspace/models \
36 |            tritonserver-paddle:latest /bin/bash -c \
37 |            '/opt/tritonserver/bin/tritonserver --model-repository=/workspace/models'
38 | 


--------------------------------------------------------------------------------
/src/libtriton_paddle.ldscript:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 2 | #
 3 | # Redistribution and use in source and binary forms, with or without
 4 | # modification, are permitted provided that the following conditions
 5 | # are met:
 6 | #  * Redistributions of source code must retain the above copyright
 7 | #    notice, this list of conditions and the following disclaimer.
 8 | #  * Redistributions in binary form must reproduce the above copyright
 9 | #    notice, this list of conditions and the following disclaimer in the
10 | #    documentation and/or other materials provided with the distribution.
11 | #  * Neither the name of NVIDIA CORPORATION nor the names of its
12 | #    contributors may be used to endorse or promote products derived
13 | #    from this software without specific prior written permission.
14 | #
15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 | # PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | {
27 |   global:
28 |     TRITONBACKEND_*;
29 |   local: *;
30 | };
31 | 


--------------------------------------------------------------------------------
/src/paddle.cc:
--------------------------------------------------------------------------------
   1 | // Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
   2 | //
   3 | // Redistribution and use in source and binary forms, with or without
   4 | // modification, are permitted provided that the following conditions
   5 | // are met:
   6 | //  * Redistributions of source code must retain the above copyright
   7 | //    notice, this list of conditions and the following disclaimer.
   8 | //  * Redistributions in binary form must reproduce the above copyright
   9 | //    notice, this list of conditions and the following disclaimer in the
  10 | //    documentation and/or other materials provided with the distribution.
  11 | //  * Neither the name of NVIDIA CORPORATION nor the names of its
  12 | //    contributors may be used to endorse or promote products derived
  13 | //    from this software without specific prior written permission.
  14 | //
  15 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
  16 | // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  17 | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  18 | // PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
  19 | // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  20 | // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  21 | // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  22 | // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  23 | // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  24 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  25 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26 | 
  27 | #include <algorithm>
  28 | #include <memory>
  29 | #include <numeric>
  30 | 
  31 | #include "paddle_backend_utils.h"
  32 | #include "paddle_inference_api.h"
  33 | #include "triton/backend/backend_common.h"
  34 | #include "triton/backend/backend_input_collector.h"
  35 | #include "triton/backend/backend_model.h"
  36 | #include "triton/backend/backend_model_instance.h"
  37 | #include "triton/backend/backend_output_responder.h"
  38 | 
  39 | struct TRITONPADDLE_Tensor;
  40 | 
  41 | // Paddle Predictor Wrapper
  42 | struct TRITONPADDLE_Model;
  43 | 
  44 | class ModelImpl {
  45 |  public:
  46 |   ModelImpl(
  47 |       const char* model_path, const char* param_path,
  48 |       TRITONPADDLE_Config* config, const int32_t device_id, cudaStream_t stream);
  49 |   ~ModelImpl() = default;
  50 |   void CollectShapeRun(paddle_infer::Predictor* predictor,
  51 |                        const std::map<std::string, std::vector<int>>& shape);
  52 |   void CollectTensorRtShapeRange(const char* model_path, const char* param_path,
  53 |                                  TRITONPADDLE_Config* config,
  54 |                                  const int32_t device_id);
  55 |   TRITONPADDLE_Error* Run();
  56 | 
  57 |   TRITONPADDLE_Error* GetInputPtr(
  58 |       const char* name, const TRITONPADDLE_DataType dtype,
  59 |       const TRITONPADDLE_Shape& shape, char** ptr);
  60 | 
  61 |   TRITONPADDLE_Error* GetOutputMetadata(
  62 |       const char* name, TRITONPADDLE_DataType* dtype, TRITONPADDLE_Shape* shape,
  63 |       char** ptr);
  64 | 
  65 |   TRITONPADDLE_Error* ZeroCopyRun();
  66 | 
  67 |  private:
  68 |   // TODO(wilber): unique_ptr?
  69 |   std::unique_ptr<paddle_infer::Config> analysis_config_;
  70 |   std::shared_ptr<paddle_infer::Predictor> predictor_;
  71 |   paddle_infer::PlaceType place_type_;
  72 |   std::string shape_range_info_;
  73 | };
  74 | 
  75 | void ModelImpl::CollectShapeRun(paddle_infer::Predictor* predictor,
  76 |                                 const std::map<std::string, std::vector<int>>& shape) {
  77 |   auto input_names = predictor->GetInputNames();
  78 |   auto input_type = predictor->GetInputTypes();
  79 |   for(auto name : input_names) {
  80 |     if(shape.find(name) == shape.end() or
  81 |        input_type.find(name) == input_type.end()) {
  82 |       TRITONPADDLE_Error* error = TRITONPADDLE_ErrorNew(
  83 |           std::string("Paddle Input name [") + std::string(name) +
  84 |           std::string("] is not one of the trt dynamic_shape"));
  85 |       THROW_IF_TRITONPADDLE_ERROR(error);
  86 |     }
  87 | 
  88 |     auto tensor = predictor->GetInputHandle(name);
  89 |     auto shape_value = shape.at(name);
  90 |     int shape_num = std::accumulate(shape_value.begin(), shape_value.end(), 1,
  91 |                                     std::multiplies<int>());
  92 |     tensor->Reshape(shape_value);
  93 |     auto dtype = input_type[name];
  94 |     switch (dtype) {
  95 |       case paddle_infer::DataType::FLOAT32: {
  96 |         std::vector<float> input_data(shape_num, 1.0);
  97 |         tensor->CopyFromCpu(input_data.data());
  98 |         break;
  99 |       }
 100 |       case paddle_infer::DataType::INT32: {
 101 |         std::vector<int> input_data(shape_num, 1);
 102 |         tensor->CopyFromCpu(input_data.data());
 103 |         break;
 104 |       }
 105 |       case paddle_infer::DataType::INT64: {
 106 |         std::vector<int64_t> input_data(shape_num, 1);
 107 |         tensor->CopyFromCpu(input_data.data());
 108 |         break;
 109 |       }
 110 |       case paddle_infer::DataType::FLOAT16: {
 111 |         std::vector<phi::dtype::float16> input_data(shape_num, (phi::dtype::float16)1.0);
 112 |         tensor->CopyFromCpu(input_data.data());
 113 |         break;
 114 |       }
 115 |       default: {
 116 |         TRITONPADDLE_Error* error = TRITONPADDLE_ErrorNew(std::string(
 117 |             "input data Paddle backend only supports FP32/INT32/INT64 currently"));
 118 |         THROW_IF_TRITONPADDLE_ERROR(error);
 119 |         break;
 120 |       }
 121 |     }
 122 |   }
 123 |   predictor->Run();
 124 | }
 125 | 
 126 | void ModelImpl::CollectTensorRtShapeRange(const char* model_path, const char* param_path,
 127 |                                           TRITONPADDLE_Config* config,
 128 |                                           const int32_t device_id) {
 129 |   paddle_infer::Config analysis_config;
 130 |   if (param_path == nullptr) {
 131 |     analysis_config.SetModel(model_path, "");
 132 |   } else {
 133 |     analysis_config.SetModel(model_path, param_path);
 134 |   }
 135 |   // analysis_config.EnableUseGpu(100, device_id);
 136 |   analysis_config.CollectShapeRangeInfo(shape_range_info_);
 137 |   auto predictor = paddle_infer::CreatePredictor(analysis_config);
 138 |   CollectShapeRun(predictor.get(), config->dynamic_min_shape_);
 139 |   CollectShapeRun(predictor.get(), config->dynamic_max_shape_);
 140 |   CollectShapeRun(predictor.get(), config->dynamic_opt_shape_);
 141 | }
 142 | 
 143 | ModelImpl::ModelImpl(
 144 |     const char* model_path, const char* param_path, TRITONPADDLE_Config* config,
 145 |     const int32_t device_id, cudaStream_t stream)
 146 | {
 147 |   analysis_config_.reset(new paddle_infer::Config());
 148 | 
 149 |   if (param_path == nullptr) {
 150 |     analysis_config_->SetModel(model_path, "");
 151 |   } else {
 152 |     analysis_config_->SetModel(model_path, param_path);
 153 |   }
 154 | 
 155 |   // default settings
 156 |   analysis_config_->SwitchSpecifyInputNames(true);
 157 |   analysis_config_->SwitchIrOptim(true);
 158 |   analysis_config_->EnableMemoryOptim();
 159 |   analysis_config_->SwitchUseFeedFetchOps(false);
 160 | 
 161 |   if (config->use_cpu_) {
 162 |     place_type_ = paddle_infer::PlaceType::kCPU;
 163 |     analysis_config_->SetCpuMathLibraryNumThreads(config->cpu_math_library_num_threads_);
 164 |     if(config->use_ort_) {
 165 |       analysis_config_->EnableONNXRuntime();
 166 |       analysis_config_->EnableORTOptimization();
 167 |     } else if(config->use_mkldnn_) {
 168 |       analysis_config_->EnableMKLDNN();
 169 |       analysis_config_->SetMkldnnCacheCapacity(config->mkldnn_capacity_);
 170 |       // Release/2.3 don't support mkldnn_int8
 171 |       // if(config->use_mkldnn_int8_)
 172 |       //   analysis_config_->EnableMkldnnInt8();
 173 |     }
 174 |   } else {
 175 |     place_type_ = paddle_infer::PlaceType::kGPU;
 176 |     analysis_config_->EnableUseGpu(100, device_id);
 177 |     analysis_config_->SetExecStream((void*)stream);
 178 | 
 179 |     paddle::AnalysisConfig::Precision compute_precision;
 180 |     compute_precision = paddle::AnalysisConfig::Precision::kFloat32;
 181 |     if (config->precision_ == TRITONPADDLE_MODE_FP32) {
 182 |       compute_precision = paddle::AnalysisConfig::Precision::kFloat32;
 183 |     } else if (config->precision_ == TRITONPADDLE_MODE_FP16) {
 184 |       compute_precision = paddle::AnalysisConfig::Precision::kHalf;
 185 |     } else if (config->precision_ == TRITONPADDLE_MODE_INT8) {
 186 |       compute_precision = paddle::AnalysisConfig::Precision::kInt8;
 187 |     } else {
 188 |       TRITONPADDLE_Error* error = TRITONPADDLE_ErrorNew(
 189 |           "unknown precision type when setting tensorrt compute precision.");
 190 |       THROW_IF_TRITONPADDLE_ERROR(error);
 191 |     }
 192 | 
 193 |     if (config->use_trt_) {
 194 |       analysis_config_->EnableTensorRtEngine(
 195 |         config->workspace_size_, config->max_batch_size_,
 196 |         config->min_graph_size_, compute_precision, false, false);
 197 |       if (config->enable_tensorrt_oss_) {
 198 |         analysis_config_->EnableVarseqlen();
 199 |       }
 200 |       if (config->is_dynamic_) {
 201 |         shape_range_info_ = triton::backend::JoinPath({config->model_dir_, "shape_range_info.pbtxt"});
 202 |         if (!config->disenable_trt_tune_) {
 203 |           CollectTensorRtShapeRange(model_path, param_path, config, device_id);
 204 |         }
 205 |         analysis_config_->EnableTunedTensorRtDynamicShape(shape_range_info_);
 206 |       }
 207 |     }
 208 |   }
 209 |   predictor_ = std::move(paddle_infer::CreatePredictor(*analysis_config_.get()));
 210 | }
 211 | 
 212 | TRITONPADDLE_Error*
 213 | ModelImpl::Run()
 214 | {
 215 |   predictor_->Run();
 216 | 
 217 |   // TODO: paddle predictor stream controll
 218 |   if(analysis_config_->use_gpu())
 219 |     cudaDeviceSynchronize();
 220 |   return nullptr;
 221 | }
 222 | 
 223 | TRITONPADDLE_Error*
 224 | ModelImpl::GetInputPtr(
 225 |     const char* name, const TRITONPADDLE_DataType dtype,
 226 |     const TRITONPADDLE_Shape& shape, char** ptr)
 227 | {
 228 |   auto input_names = predictor_->GetInputNames();
 229 | 
 230 |   // check whether the given name is in predictor_ input names
 231 |   if (std::find(input_names.begin(), input_names.end(), std::string(name)) ==
 232 |       input_names.end()) {
 233 |     return TRITONPADDLE_ErrorNew(
 234 |         std::string("Input name [") + std::string(name) +
 235 |         std::string("] is not one of the Paddle predictor input"));
 236 |   }
 237 | 
 238 |   auto tensor = predictor_->GetInputHandle(name);
 239 |   tensor->Reshape(shape.CompatibleShape());
 240 |   switch (dtype) {
 241 |     case TRITONPADDLE_TYPE_FP32:
 242 |       *ptr = reinterpret_cast<char*>(
 243 |           tensor->mutable_data<float>(place_type_));
 244 |       break;
 245 |     case TRITONPADDLE_TYPE_INT32:
 246 |       *ptr = reinterpret_cast<char*>(
 247 |           tensor->mutable_data<int32_t>(place_type_));
 248 |       break;
 249 |     case TRITONPADDLE_TYPE_INT64:
 250 |       *ptr = reinterpret_cast<char*>(
 251 |           tensor->mutable_data<int64_t>(place_type_));
 252 |       break;
 253 |     case TRITONPADDLE_TYPE_FP16:
 254 |       *ptr = reinterpret_cast<char*>(
 255 |           tensor->mutable_data<phi::dtype::float16>(place_type_));
 256 |       break;
 257 |     default:
 258 |       return TRITONPADDLE_ErrorNew(std::string(
 259 |           "Paddle backend only supports FP32/INT32/INT64 currently"));
 260 |   }
 261 | 
 262 |   return nullptr;
 263 | }
 264 | 
 265 | TRITONPADDLE_Error*
 266 | ModelImpl::GetOutputMetadata(
 267 |     const char* name, TRITONPADDLE_DataType* dtype, TRITONPADDLE_Shape* shape,
 268 |     char** ptr)
 269 | {
 270 |   auto output_names = predictor_->GetOutputNames();
 271 | 
 272 |   // check whether the given name is in predictor_ output names
 273 |   if (std::find(output_names.begin(), output_names.end(), std::string(name)) ==
 274 |       output_names.end()) {
 275 |     return TRITONPADDLE_ErrorNew(
 276 |         std::string("Output name [") + std::string(name) +
 277 |         std::string("] is not one of the Paddle predictor input"));
 278 |   }
 279 | 
 280 |   auto tensor = predictor_->GetOutputHandle(name);
 281 |   auto tensor_type = tensor->type();
 282 |   auto tensor_shape = tensor->shape();
 283 | 
 284 |   *dtype = ConvertDataType(tensor_type);
 285 |   *shape = TRITONPADDLE_Shape(tensor_shape);
 286 | 
 287 |   switch (*dtype) {
 288 |     case TRITONPADDLE_TYPE_FP32:
 289 |       *ptr = reinterpret_cast<char*>(
 290 |           tensor->mutable_data<float>(place_type_));
 291 |       break;
 292 |     case TRITONPADDLE_TYPE_INT64:
 293 |       *ptr = reinterpret_cast<char*>(
 294 |           tensor->mutable_data<int64_t>(place_type_));
 295 |       break;
 296 |     case TRITONPADDLE_TYPE_INT32:
 297 |       *ptr = reinterpret_cast<char*>(
 298 |           tensor->mutable_data<int32_t>(place_type_));
 299 |       break;
 300 |     case TRITONPADDLE_TYPE_FP16:
 301 |       *ptr = reinterpret_cast<char*>(
 302 |           tensor->mutable_data<phi::dtype::float16>(place_type_));
 303 |       break;
 304 |     /*
 305 |     case TRITONPADDLE_TYPE_INT8:
 306 |       *ptr = reinterpret_cast<char*>(
 307 |           tensor->mutable_data<int8_t>(place_type_));
 308 |       break;
 309 |     case TRITONPADDLE_TYPE_UINT8:
 310 |       *ptr = reinterpret_cast<char*>(
 311 |           tensor->mutable_data<uint8_t>(place_type_));
 312 |       break;
 313 |     */
 314 |     default:
 315 |       return TRITONPADDLE_ErrorNew(std::string(
 316 |           "Paddle backend currently only support FP32/INT32/INT64"));
 317 |   }
 318 | 
 319 |   return nullptr;
 320 | }
 321 | 
 322 | TRITONSERVER_Error*
 323 | TRITONPADDLE_ModelCreate(
 324 |     TRITONPADDLE_Model** model, const char* model_path, const char* param_path,
 325 |     TRITONPADDLE_Config* config, const int32_t device_id, cudaStream_t stream)
 326 | {
 327 |   try {
 328 |     ModelImpl* model_impl =
 329 |         new ModelImpl(model_path, param_path, config, device_id, stream);
 330 |     *model = reinterpret_cast<TRITONPADDLE_Model*>(model_impl);
 331 |   }
 332 |   catch (const TRITONPADDLE_Exception& ex) {
 333 |     RETURN_IF_TRITONPADDLE_ERROR(ex.err_);
 334 |   }
 335 |   return nullptr;
 336 | }
 337 | 
 338 | void
 339 | TRITONPADDLE_ModelDelete(TRITONPADDLE_Model* model)
 340 | {
 341 |   if (model != nullptr) {
 342 |     ModelImpl* mi = reinterpret_cast<ModelImpl*>(model);
 343 |     delete mi;
 344 |   }
 345 | }
 346 | 
 347 | TRITONPADDLE_Error*
 348 | TRITONPADDLE_ModelRun(TRITONPADDLE_Model* model)
 349 | {
 350 |   ModelImpl* m = reinterpret_cast<ModelImpl*>(model);
 351 |   return m->Run();
 352 | }
 353 | 
 354 | class TensorImpl {
 355 |  public:
 356 |   TensorImpl(
 357 |       const char* name, TRITONPADDLE_DataType dtype,
 358 |       const TRITONPADDLE_Shape& shape, char* data_ptr);
 359 |   ~TensorImpl() = default;
 360 | 
 361 |   const std::string& Name() const { return name_; }
 362 |   TRITONPADDLE_DataType DataType() const { return dtype_; }
 363 |   TRITONPADDLE_Shape Shape() const { return shape_; }
 364 | 
 365 |   char* Base() const { return base_; }
 366 |   size_t ByteSize() const { return byte_size_; }
 367 | 
 368 |  private:
 369 |   const std::string name_;
 370 |   const TRITONPADDLE_DataType dtype_;
 371 |   const TRITONPADDLE_Shape shape_;
 372 | 
 373 |   char* base_;
 374 |   size_t byte_size_;
 375 | };
 376 | 
 377 | TensorImpl::TensorImpl(
 378 |     const char* name, TRITONPADDLE_DataType dtype,
 379 |     const TRITONPADDLE_Shape& shape, char* data_ptr)
 380 |     : name_(name), dtype_(dtype), shape_(shape), base_(data_ptr)
 381 | {
 382 |   byte_size_ = shape.NumElements() * TRITONPADDLE_DataTypeByteSize(dtype);
 383 | }
 384 | 
 385 | TRITONPADDLE_Tensor*
 386 | TRITONPADDLE_TensorNew(
 387 |     TRITONPADDLE_Model* model, const char* name, TRITONPADDLE_DataType dtype,
 388 |     const TRITONPADDLE_Shape& shape)
 389 | {
 390 |   char* data_ptr;
 391 |   ModelImpl* m = reinterpret_cast<ModelImpl*>(model);
 392 |   auto err = m->GetInputPtr(name, dtype, shape, &data_ptr);
 393 |   if (err != nullptr) {
 394 |     return nullptr;
 395 |   }
 396 | 
 397 |   TensorImpl* tensor = new TensorImpl(name, dtype, shape, data_ptr);
 398 |   return reinterpret_cast<TRITONPADDLE_Tensor*>(tensor);
 399 | }
 400 | 
 401 | TRITONPADDLE_Tensor*
 402 | TRITONPADDLE_TensorNew(TRITONPADDLE_Model* model, const char* name)
 403 | {
 404 |   char* data_ptr;
 405 |   TRITONPADDLE_DataType dtype;
 406 |   TRITONPADDLE_Shape shape;
 407 | 
 408 |   ModelImpl* m = reinterpret_cast<ModelImpl*>(model);
 409 |   auto err = m->GetOutputMetadata(name, &dtype, &shape, &data_ptr);
 410 |   if (err != nullptr) {
 411 |     return nullptr;
 412 |   }
 413 | 
 414 |   TensorImpl* tensor = new TensorImpl(name, dtype, shape, data_ptr);
 415 |   return reinterpret_cast<TRITONPADDLE_Tensor*>(tensor);
 416 | }
 417 | 
 418 | char*
 419 | TRITONPADDLE_TensorData(TRITONPADDLE_Tensor* tensor)
 420 | {
 421 |   TensorImpl* t = reinterpret_cast<TensorImpl*>(tensor);
 422 |   return t->Base();
 423 | }
 424 | 
 425 | size_t
 426 | TRITONPADDLE_TensorDataByteSize(TRITONPADDLE_Tensor* tensor)
 427 | {
 428 |   TensorImpl* t = reinterpret_cast<TensorImpl*>(tensor);
 429 |   return t->ByteSize();
 430 | }
 431 | 
 432 | TRITONPADDLE_DataType
 433 | TRITONPADDLE_TensorDataType(TRITONPADDLE_Tensor* tensor)
 434 | {
 435 |   TensorImpl* t = reinterpret_cast<TensorImpl*>(tensor);
 436 |   return t->DataType();
 437 | }
 438 | 
 439 | TRITONPADDLE_Shape
 440 | TRITONPADDLE_TensorShape(TRITONPADDLE_Tensor* tensor)
 441 | {
 442 |   TensorImpl* t = reinterpret_cast<TensorImpl*>(tensor);
 443 |   return t->Shape();
 444 | }
 445 | 
 446 | namespace triton { namespace backend { namespace paddle {
 447 | 
 448 | using TRITONPADDLEModelHandle = std::shared_ptr<TRITONPADDLE_Model>;
 449 | 
 450 | class ModelState : public BackendModel {
 451 |  public:
 452 |   static TRITONSERVER_Error* Create(
 453 |       TRITONBACKEND_Model* triton_model, ModelState** state);
 454 |   virtual ~ModelState() = default;
 455 |   TRITONPADDLE_Config* PaddleConfig() { return &config_; }
 456 | 
 457 |  private:
 458 |   ModelState(TRITONBACKEND_Model* triton_model);
 459 | 
 460 |   // Auto-complete the model configuration
 461 |   TRITONSERVER_Error* AutoCompleteConfig();
 462 | 
 463 |   // Validate that model configuration is supported by this backend
 464 |   TRITONSERVER_Error* ValidateModelConfig();
 465 | 
 466 |   TRITONPADDLE_Config config_;
 467 | };
 468 | 
 469 | TRITONSERVER_Error*
 470 | ModelState::Create(TRITONBACKEND_Model* triton_model, ModelState** state)
 471 | {
 472 |   try {
 473 |     *state = new ModelState(triton_model);
 474 |   }
 475 |   catch (const BackendModelException& ex) {
 476 |     RETURN_ERROR_IF_TRUE(
 477 |         ex.err_ == nullptr, TRITONSERVER_ERROR_INTERNAL,
 478 |         std::string("unexpected nullptr in BackendModelException"));
 479 |     RETURN_IF_ERROR(ex.err_);
 480 |   }
 481 | 
 482 |   // Auto-complete the configuration if requested...
 483 |   bool auto_complete_config = false;
 484 |   RETURN_IF_ERROR(TRITONBACKEND_ModelAutoCompleteConfig(
 485 |       triton_model, &auto_complete_config));
 486 |   if (auto_complete_config) {
 487 |     RETURN_IF_ERROR((*state)->AutoCompleteConfig());
 488 | 
 489 |     triton::common::TritonJson::WriteBuffer json_buffer;
 490 |     (*state)->ModelConfig().Write(&json_buffer);
 491 | 
 492 |     TRITONSERVER_Message* message;
 493 |     RETURN_IF_ERROR(TRITONSERVER_MessageNewFromSerializedJson(
 494 |         &message, json_buffer.Base(), json_buffer.Size()));
 495 |     RETURN_IF_ERROR(TRITONBACKEND_ModelSetConfig(
 496 |         triton_model, 1 /* config_version */, message));
 497 |   }
 498 | 
 499 |   RETURN_IF_ERROR((*state)->ValidateModelConfig());
 500 | 
 501 |   return nullptr;  // success
 502 | }
 503 | 
 504 | ModelState::ModelState(TRITONBACKEND_Model* triton_model)
 505 |     : BackendModel(triton_model)
 506 | {
 507 | 
 508 |   triton::common::TritonJson::Value optimization;
 509 |   if (not ModelConfig().Find("optimization", &optimization)) {
 510 |     return;
 511 |   }
 512 | 
 513 |   triton::common::TritonJson::Value eas;
 514 |   if (not optimization.Find("execution_accelerators", &eas)) {
 515 |     return;
 516 |   }
 517 | 
 518 |   // CPU execution providers
 519 |   {
 520 |     triton::common::TritonJson::Value cpu_eas;
 521 |     if (eas.Find("cpu_execution_accelerator", &cpu_eas)) {
 522 |       for (size_t idx = 0; idx < cpu_eas.ArraySize(); idx++) {
 523 |         triton::common::TritonJson::Value ea;
 524 |         THROW_IF_BACKEND_MODEL_ERROR(cpu_eas.IndexAsObject(idx, &ea));
 525 |         std::string name;
 526 |         THROW_IF_BACKEND_MODEL_ERROR(ea.MemberAsString("name", &name));
 527 |         if (name == "mkldnn") {
 528 |           config_.use_mkldnn_ = true;
 529 |         } else if (name == "ort") {
 530 |           config_.use_ort_ = true;
 531 |         } else if (name != "") {
 532 |           TRITONSERVER_Error* error = TRITONSERVER_ErrorNew(
 533 |               TRITONSERVER_ERROR_INVALID_ARG,
 534 |               std::string(
 535 |                   "unknown cpu_execution_accelerator name '" + name +
 536 |                   "' is provided. Available choices are [mkldnn, ort]")
 537 |                   .c_str());
 538 |           THROW_IF_BACKEND_MODEL_ERROR(error);
 539 |         }
 540 |         triton::common::TritonJson::Value params;
 541 |         if (ea.Find("parameters", &params)) {
 542 |           std::vector<std::string> param_keys;
 543 |           THROW_IF_BACKEND_MODEL_ERROR(params.Members(&param_keys));
 544 |           for (const auto& param_key : param_keys) {
 545 |             std::string value_string;
 546 |             if (param_key == "cpu_threads") {
 547 |               THROW_IF_BACKEND_MODEL_ERROR(
 548 |                   params.MemberAsString(param_key.c_str(), &value_string));
 549 |               THROW_IF_BACKEND_MODEL_ERROR(
 550 |                   ParseIntValue(value_string, &config_.cpu_math_library_num_threads_));
 551 |             } else if (param_key == "capacity") {
 552 |               THROW_IF_BACKEND_MODEL_ERROR(
 553 |                   params.MemberAsString(param_key.c_str(), &value_string));
 554 |               THROW_IF_BACKEND_MODEL_ERROR(
 555 |                   ParseIntValue(value_string, &config_.mkldnn_capacity_));
 556 |             } else if (param_key == "use_int8") {
 557 |               THROW_IF_BACKEND_MODEL_ERROR(
 558 |                   params.MemberAsString(param_key.c_str(), &value_string));
 559 |               THROW_IF_BACKEND_MODEL_ERROR(
 560 |                   ParseBoolValue(value_string, &config_.use_mkldnn_int8_));
 561 |             }
 562 |           }
 563 |         }
 564 |       }
 565 |     }
 566 |   }
 567 | 
 568 |   // GPU execution providers
 569 |   {
 570 |     triton::common::TritonJson::Value gpu_eas;
 571 |     if (eas.Find("gpu_execution_accelerator", &gpu_eas)) {
 572 |       for (size_t idx = 0; idx < gpu_eas.ArraySize(); idx++) {
 573 |         triton::common::TritonJson::Value ea;
 574 |         THROW_IF_BACKEND_MODEL_ERROR(gpu_eas.IndexAsObject(idx, &ea));
 575 |         std::string name;
 576 |         THROW_IF_BACKEND_MODEL_ERROR(ea.MemberAsString("name", &name));
 577 | 
 578 |         if (name == "tensorrt") {
 579 |           config_.use_trt_ = true;
 580 |           triton::common::TritonJson::Value params;
 581 |           if (ea.Find("parameters", &params)) {
 582 |             std::vector<std::string> param_keys;
 583 |             THROW_IF_BACKEND_MODEL_ERROR(params.Members(&param_keys));
 584 |             for (const auto& param_key : param_keys) {
 585 |               std::string value_string;
 586 |               if (param_key == "precision") {
 587 |                 THROW_IF_BACKEND_MODEL_ERROR(
 588 |                     params.MemberAsString(param_key.c_str(), &value_string));
 589 |                 std::transform(
 590 |                     value_string.begin(), value_string.end(), value_string.begin(),
 591 |                     ::tolower);
 592 |                 if (value_string == "trt_fp32") {
 593 |                   config_.precision_ = TRITONPADDLE_MODE_FP32;
 594 |                 } else if (value_string == "trt_fp16") {
 595 |                   config_.precision_ = TRITONPADDLE_MODE_FP16;
 596 |                 } else if (value_string == "trt_int8") {
 597 |                   config_.precision_ = TRITONPADDLE_MODE_INT8;
 598 |                 } else {
 599 |                   TRITONSERVER_Error* error = TRITONSERVER_ErrorNew(
 600 |                       TRITONSERVER_ERROR_INVALID_ARG,
 601 |                       std::string(
 602 |                           "unknown precision type '" + value_string +
 603 |                           "' is provided. Available choices are [fluid, trt_fp32, "
 604 |                           "trt_fp16, trt_int8]")
 605 |                           .c_str());
 606 |                   THROW_IF_BACKEND_MODEL_ERROR(error);
 607 |                 }
 608 |               } else if (param_key == "min_graph_size") {
 609 |                 THROW_IF_BACKEND_MODEL_ERROR(
 610 |                     params.MemberAsString(param_key.c_str(), &value_string));
 611 |                 THROW_IF_BACKEND_MODEL_ERROR(
 612 |                     ParseLongLongValue(value_string, &config_.min_graph_size_));
 613 |               } else if (param_key == "workspace_size") {
 614 |                 THROW_IF_BACKEND_MODEL_ERROR(
 615 |                     params.MemberAsString(param_key.c_str(), &value_string));
 616 |                 THROW_IF_BACKEND_MODEL_ERROR(
 617 |                     ParseLongLongValue(value_string, &config_.workspace_size_));
 618 |               } else if (param_key == "max_batch_size") {
 619 |                 THROW_IF_BACKEND_MODEL_ERROR(
 620 |                     params.MemberAsString(param_key.c_str(), &value_string));
 621 |                 THROW_IF_BACKEND_MODEL_ERROR(
 622 |                     ParseLongLongValue(value_string, &config_.max_batch_size_));
 623 |               } else if (param_key == "enable_tensorrt_oss") {
 624 |                 THROW_IF_BACKEND_MODEL_ERROR(
 625 |                     params.MemberAsString(param_key.c_str(), &value_string));
 626 |                 THROW_IF_BACKEND_MODEL_ERROR(
 627 |                     ParseBoolValue(value_string, &config_.enable_tensorrt_oss_));
 628 |               } else if (param_key == "is_dynamic") {
 629 |                 THROW_IF_BACKEND_MODEL_ERROR(
 630 |                     params.MemberAsString(param_key.c_str(), &value_string));
 631 |                 THROW_IF_BACKEND_MODEL_ERROR(
 632 |                     ParseBoolValue(value_string, &config_.is_dynamic_));
 633 |               } else if (param_key == "disenable_trt_tune") {
 634 |                 THROW_IF_BACKEND_MODEL_ERROR(
 635 |                     params.MemberAsString(param_key.c_str(), &value_string));
 636 |                 THROW_IF_BACKEND_MODEL_ERROR(
 637 |                     ParseBoolValue(value_string, &config_.disenable_trt_tune_));
 638 |               } else {
 639 |                 TRITONSERVER_Error* error = TRITONSERVER_ErrorNew(
 640 |                     TRITONSERVER_ERROR_INVALID_ARG,
 641 |                     std::string(
 642 |                         "unknown parameter '" + param_key +
 643 |                         "' is provided for GPU execution accelerator "
 644 |                         "config. Available choices are [precision, "
 645 |                         "min_graph_size, workspace_size, max_batch_size, "
 646 |                         "enable_tensorrt_oss, is_dynamic]")
 647 |                         .c_str());
 648 |                 THROW_IF_BACKEND_MODEL_ERROR(error);
 649 |               }
 650 |             }
 651 |           }
 652 |         } else if (
 653 |             name == "min_shape" or name == "max_shape" or name == "opt_shape") {
 654 |           triton::common::TritonJson::Value params;
 655 |           if (ea.Find("parameters", &params)) {
 656 |             std::vector<std::string> input_names;
 657 |             THROW_IF_BACKEND_MODEL_ERROR(params.Members(&input_names));
 658 |             for (const auto& input_name : input_names) {
 659 |               std::string str_shape;
 660 |               THROW_IF_BACKEND_MODEL_ERROR(
 661 |                   params.MemberAsString(input_name.c_str(), &str_shape));
 662 |               if (name == "min_shape") {
 663 |                 config_.dynamic_min_shape_[input_name] =
 664 |                     TRITONPADDLE_Shape(str_shape).CompatibleShape();
 665 |               } else if (name == "max_shape") {
 666 |                 config_.dynamic_max_shape_[input_name] =
 667 |                     TRITONPADDLE_Shape(str_shape).CompatibleShape();
 668 |               } else {
 669 |                 config_.dynamic_opt_shape_[input_name] =
 670 |                     TRITONPADDLE_Shape(str_shape).CompatibleShape();
 671 |               }
 672 |             }
 673 |           }
 674 |         } else {
 675 |           TRITONSERVER_Error* error = TRITONSERVER_ErrorNew(
 676 |               TRITONSERVER_ERROR_INVALID_ARG,
 677 |               std::string(
 678 |                   "unknown name '" + name +
 679 |                   "' is provided for GPU execution accelerator "
 680 |                   "Available choices are [config, min_shape, max_shape, opt_shape]")
 681 |                   .c_str());
 682 |           THROW_IF_BACKEND_MODEL_ERROR(error);
 683 |         }
 684 |       }
 685 |     }
 686 |   }
 687 | }
 688 | 
 689 | TRITONSERVER_Error*
 690 | ModelState::AutoCompleteConfig()
 691 | {
 692 |   // Auto-complete configuration if requests
 693 |   LOG_MESSAGE(
 694 |       TRITONSERVER_LOG_WARN,
 695 |       (std::string("skipping model configuration auto-complete for '") +
 696 |        Name() + "': not supported for paddle backend")
 697 |           .c_str());
 698 | 
 699 |   return nullptr;  // success
 700 | }
 701 | 
 702 | TRITONSERVER_Error*
 703 | ModelState::ValidateModelConfig()
 704 | {
 705 |   triton::common::TritonJson::WriteBuffer buffer;
 706 |   RETURN_IF_ERROR(ModelConfig().PrettyWrite(&buffer));
 707 |   LOG_MESSAGE(
 708 |       TRITONSERVER_LOG_VERBOSE,
 709 |       (std::string("model configuration:\n") + buffer.Contents()).c_str());
 710 | 
 711 |   triton::common::TritonJson::Value ios;
 712 |   RETURN_IF_ERROR(ModelConfig().MemberAsArray("input", &ios));
 713 |   for (size_t i = 0; i < ios.ArraySize(); i++) {
 714 |     triton::common::TritonJson::Value io;
 715 |     RETURN_IF_ERROR(ios.IndexAsObject(i, &io));
 716 |     std::string io_name;
 717 |     RETURN_IF_ERROR(io.MemberAsString("name", &io_name));
 718 |     // Check datatypes
 719 |     std::string io_dtype;
 720 |     RETURN_IF_ERROR(io.MemberAsString("data_type", &io_dtype));
 721 |     RETURN_ERROR_IF_TRUE(
 722 |         ConvertDataType(io_dtype) ==
 723 |             TRITONPADDLE_DataType::TRITONPADDLE_TYPE_INVALID,
 724 |         TRITONSERVER_ERROR_INVALID_ARG,
 725 |         std::string("unsupported datatype '") + io_dtype + "' for tensor '" +
 726 |             io_name + "' for model '" + Name() + "'");
 727 |   }
 728 |   RETURN_IF_ERROR(ModelConfig().MemberAsArray("output", &ios));
 729 |   for (size_t i = 0; i < ios.ArraySize(); i++) {
 730 |     triton::common::TritonJson::Value io;
 731 |     RETURN_IF_ERROR(ios.IndexAsObject(i, &io));
 732 |     std::string io_name;
 733 |     RETURN_IF_ERROR(io.MemberAsString("name", &io_name));
 734 |     // Check datatypes
 735 |     std::string io_dtype;
 736 |     RETURN_IF_ERROR(io.MemberAsString("data_type", &io_dtype));
 737 |     RETURN_ERROR_IF_TRUE(
 738 |         ConvertDataType(io_dtype) ==
 739 |             TRITONPADDLE_DataType::TRITONPADDLE_TYPE_INVALID,
 740 |         TRITONSERVER_ERROR_INVALID_ARG,
 741 |         std::string("unsupported datatype '") + io_dtype + "' for tensor '" +
 742 |             io_name + "' for model '" + Name() + "'");
 743 |   }
 744 | 
 745 |   return nullptr;  // success
 746 | }
 747 | 
 748 | class ModelInstanceState : public BackendModelInstance {
 749 |  public:
 750 |   static TRITONSERVER_Error* Create(
 751 |       ModelState* model_state,
 752 |       TRITONBACKEND_ModelInstance* triton_model_instance,
 753 |       ModelInstanceState** state);
 754 |   virtual ~ModelInstanceState() = default;
 755 | 
 756 |   // Get the state of the model that corresponds to this instance.
 757 |   ModelState* StateForModel() const { return model_state_; }
 758 | 
 759 |   void ProcessRequests(
 760 |       TRITONBACKEND_Request** requests, const uint32_t request_count);
 761 | 
 762 |  private:
 763 |   ModelInstanceState(
 764 |       ModelState* model_state,
 765 |       TRITONBACKEND_ModelInstance* triton_model_instance);
 766 | 
 767 |   TRITONSERVER_Error* DetermineModelAndParamsPath(
 768 |       const std::string& model_dir, std::string* model_path,
 769 |       std::string* param_path);
 770 | 
 771 |   void SetInputTensors(
 772 |       size_t total_batch_size, TRITONBACKEND_Request** requests,
 773 |       const uint32_t request_count,
 774 |       std::vector<TRITONBACKEND_Response*>* responses);
 775 | 
 776 |   void ReadOutputTensors(
 777 |       size_t total_batch_size, const std::vector<std::string>& output_names,
 778 |       TRITONBACKEND_Request** requests, const uint32_t request_count,
 779 |       std::vector<TRITONBACKEND_Response*>* responses);
 780 | 
 781 |   ModelState* model_state_;
 782 |   TRITONPADDLEModelHandle triton_paddle_model_;
 783 | };
 784 | 
 785 | TRITONSERVER_Error*
 786 | ModelInstanceState::Create(
 787 |     ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance,
 788 |     ModelInstanceState** state)
 789 | {
 790 |   try {
 791 |     *state = new ModelInstanceState(model_state, triton_model_instance);
 792 |   }
 793 |   catch (const BackendModelInstanceException& ex) {
 794 |     RETURN_ERROR_IF_TRUE(
 795 |         ex.err_ == nullptr, TRITONSERVER_ERROR_INTERNAL,
 796 |         std::string("unexpected nullptr in BackendModelInstanceException"));
 797 |     RETURN_IF_ERROR(ex.err_);
 798 |   }
 799 | 
 800 |   return nullptr;  // success
 801 | }
 802 | 
 803 | TRITONSERVER_Error*
 804 | ModelInstanceState::DetermineModelAndParamsPath(
 805 |     const std::string& model_dir, std::string* model_path,
 806 |     std::string* param_path)
 807 | {
 808 |   bool exists;
 809 |   *model_path = JoinPath({model_dir, "model.pdmodel"});
 810 |   RETURN_IF_ERROR(FileExists(*model_path, &exists));
 811 |   if (not exists) {
 812 |     return TRITONSERVER_ErrorNew(
 813 |         TRITONSERVER_ERROR_NOT_FOUND,
 814 |         std::string(
 815 |             "Paddle model should be named as 'model.pdmodel'").c_str());
 816 |   }
 817 | 
 818 |   *param_path = JoinPath({model_dir, "model.pdiparams"});
 819 |   RETURN_IF_ERROR(FileExists(*param_path, &exists));
 820 |   if (not exists) {
 821 |     LOG_MESSAGE(
 822 |       TRITONSERVER_LOG_INFO,
 823 |       (std::string("Paddle params should be named as 'model.pdiparams' or not provided.").c_str()));
 824 |     *param_path = "";
 825 |   }
 826 | 
 827 |   return nullptr;
 828 | }
 829 | 
 830 | ModelInstanceState::ModelInstanceState(
 831 |     ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance)
 832 |     : BackendModelInstance(model_state, triton_model_instance),
 833 |       model_state_(model_state)
 834 | {
 835 |   auto config = model_state->PaddleConfig();
 836 |   auto model_dir = JoinPath(
 837 |       {model_state->RepositoryPath(), std::to_string(model_state->Version())});
 838 |   config->model_dir_ = model_dir;
 839 | 
 840 |   std::string model_path;
 841 |   std::string param_path;
 842 |   THROW_IF_BACKEND_INSTANCE_ERROR(
 843 |       DetermineModelAndParamsPath(model_dir, &model_path, &param_path));
 844 | 
 845 |   switch (Kind()) {
 846 |     case TRITONSERVER_INSTANCEGROUPKIND_CPU:
 847 |       config->use_cpu_ = true;
 848 |       break;
 849 |     case TRITONSERVER_INSTANCEGROUPKIND_GPU:
 850 |       config->use_cpu_ = false;
 851 |       break;
 852 |     default:
 853 |       throw BackendModelInstanceException(TRITONSERVER_ErrorNew(
 854 |           TRITONSERVER_ERROR_INTERNAL,
 855 |           (std::string("unexpected instance kind for ") + name_ +
 856 |            ", paddle_backend only supports CPU/GPU.")
 857 |               .c_str()));
 858 |   }
 859 | 
 860 |   TRITONPADDLE_Model* triton_paddle_model = nullptr;
 861 |   THROW_IF_BACKEND_INSTANCE_ERROR(TRITONPADDLE_ModelCreate(
 862 |       &triton_paddle_model, model_path.c_str(),
 863 |       param_path.empty() ? nullptr : param_path.c_str(),
 864 |       config, DeviceId(), CudaStream()));
 865 |   triton_paddle_model_.reset(triton_paddle_model, TRITONPADDLE_ModelDelete);
 866 | }
 867 | 
 868 | void
 869 | ModelInstanceState::SetInputTensors(
 870 |     size_t total_batch_size, TRITONBACKEND_Request** requests,
 871 |     const uint32_t request_count,
 872 |     std::vector<TRITONBACKEND_Response*>* responses)
 873 | {
 874 | // TRITONSERVER_Error*
 875 | // ModelInstanceState::SetInputTensors(
 876 | //     size_t total_batch_size, TRITONBACKEND_Request** requests,
 877 | //     const uint32_t request_count,
 878 | //     std::vector<TRITONBACKEND_Response*>* responses,
 879 | //     BackendInputCollector* collector, std::vector<const char*>* input_names,
 880 | //     bool* cuda_copy)
 881 | // {
 882 |   bool cuda_copy = false;
 883 |   BackendInputCollector collector(
 884 |       requests, request_count, responses,
 885 |       StateForModel()->TritonMemoryManager(),
 886 |       StateForModel()->EnablePinnedInput(), CudaStream());
 887 | 
 888 |   const int max_batch_size = model_state_->MaxBatchSize();
 889 | 
 890 |   // All requests must have equally-sized input tensors so use any
 891 |   // request as the representative for the input tensors.
 892 |   uint32_t input_count;
 893 |   RESPOND_ALL_AND_RETURN_IF_ERROR(
 894 |       responses, request_count,
 895 |       TRITONBACKEND_RequestInputCount(requests[0], &input_count));
 896 | 
 897 |   for (uint32_t input_idx = 0; input_idx < input_count; ++input_idx) {
 898 |     TRITONBACKEND_Input* input;
 899 |     RESPOND_ALL_AND_RETURN_IF_ERROR(
 900 |         responses, request_count,
 901 |         TRITONBACKEND_RequestInputByIndex(requests[0], input_idx, &input));
 902 | 
 903 |     const char* name;
 904 |     TRITONSERVER_DataType datatype;
 905 |     const int64_t* shape;
 906 |     uint32_t dims_count;
 907 |     RESPOND_ALL_AND_RETURN_IF_ERROR(
 908 |         responses, request_count,
 909 |         TRITONBACKEND_InputProperties(
 910 |             input, &name, &datatype, &shape, &dims_count, nullptr, nullptr));
 911 | 
 912 |     // The shape for the entire input patch, [total_batch_size, ...]
 913 |     std::vector<int64_t> batchn_shape(shape, shape + dims_count);
 914 | 
 915 |     if (max_batch_size != 0) {
 916 |       batchn_shape[0] = total_batch_size;
 917 |     }
 918 | 
 919 |     TRITONPADDLE_Tensor* tensor = TRITONPADDLE_TensorNew(
 920 |         triton_paddle_model_.get(), name, ConvertDataType(datatype),
 921 |         TRITONPADDLE_Shape(batchn_shape));
 922 | 
 923 |     if (tensor == nullptr) {
 924 |       auto err = TRITONSERVER_ErrorNew(
 925 |           TRITONSERVER_ERROR_INTERNAL,
 926 |           (std::string("Failed to create input tensor '") + name +
 927 |            "' with shape " + backend::ShapeToString(batchn_shape) +
 928 |            " and data type " + TRITONSERVER_DataTypeString(datatype) +
 929 |            " for '" + Name() + "'")
 930 |               .c_str());
 931 |       SendErrorForResponses(responses, request_count, err);
 932 |       return;
 933 |     }
 934 | 
 935 |     if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) {
 936 |       collector.ProcessTensor(
 937 |           name, TRITONPADDLE_TensorData(tensor),
 938 |           TRITONPADDLE_TensorDataByteSize(tensor), TRITONSERVER_MEMORY_GPU,
 939 |           DeviceId());
 940 |     }
 941 |     else {
 942 |         collector.ProcessTensor(
 943 |           name, TRITONPADDLE_TensorData(tensor),
 944 |           TRITONPADDLE_TensorDataByteSize(tensor), TRITONSERVER_MEMORY_CPU,
 945 |           0);
 946 |     }
 947 |   }
 948 | 
 949 |   cuda_copy |= collector.Finalize();
 950 |   if (cuda_copy) {
 951 |     cudaStreamSynchronize(CudaStream());
 952 |   }
 953 | }
 954 | 
 955 | void
 956 | ModelInstanceState::ReadOutputTensors(
 957 |     size_t total_batch_size, const std::vector<std::string>& output_names,
 958 |     TRITONBACKEND_Request** requests, const uint32_t request_count,
 959 |     std::vector<TRITONBACKEND_Response*>* responses)
 960 | {
 961 |   BackendOutputResponder responder(
 962 |       requests, request_count, responses, StateForModel()->MaxBatchSize(),
 963 |       StateForModel()->TritonMemoryManager(),
 964 |       StateForModel()->EnablePinnedOutput(), CudaStream());
 965 | 
 966 |   bool cuda_copy = false;
 967 |   for (size_t idx = 0; idx < output_names.size(); ++idx) {
 968 |     const std::string& name = output_names[idx];
 969 | 
 970 |     TRITONPADDLE_Tensor* tensor =
 971 |         TRITONPADDLE_TensorNew(triton_paddle_model_.get(), name.c_str());
 972 | 
 973 |     if (tensor == nullptr) {
 974 |       auto err = TRITONSERVER_ErrorNew(
 975 |           TRITONSERVER_ERROR_INTERNAL,
 976 |           (std::string("Failed to create output tensor '") + name + " for '" +
 977 |            Name() + "'")
 978 |               .c_str());
 979 |       SendErrorForResponses(responses, request_count, err);
 980 |       return;
 981 |     }
 982 | 
 983 |     auto dtype = ConvertDataType(TRITONPADDLE_TensorDataType(tensor));
 984 |     auto shape = TRITONPADDLE_TensorShape(tensor).Shape();
 985 | 
 986 |     if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) {
 987 |       responder.ProcessTensor(
 988 |           name, dtype, shape, TRITONPADDLE_TensorData(tensor),
 989 |           TRITONSERVER_MEMORY_GPU, DeviceId());
 990 |     } else {
 991 |       responder.ProcessTensor(
 992 |           name, dtype, shape, TRITONPADDLE_TensorData(tensor),
 993 |           TRITONSERVER_MEMORY_CPU, 0);
 994 |     }
 995 |   }
 996 | 
 997 |   cuda_copy |= responder.Finalize();
 998 |   if (cuda_copy) {
 999 |     cudaStreamSynchronize(CudaStream());
1000 |   }
1001 | 
1002 | }
1003 | 
1004 | void
1005 | ModelInstanceState::ProcessRequests(
1006 |     TRITONBACKEND_Request** requests, const uint32_t request_count)
1007 | {
1008 |   LOG_MESSAGE(
1009 |       TRITONSERVER_LOG_VERBOSE,
1010 |       (std::string("TRITONBACKEND_ModelExecute: Running ") + Name() + " with " +
1011 |        std::to_string(request_count) + " requests")
1012 |           .c_str());
1013 | 
1014 |   uint64_t exec_start_ns = 0;
1015 |   SET_TIMESTAMP(exec_start_ns);
1016 | 
1017 |   const int max_batch_size = model_state_->MaxBatchSize();
1018 | 
1019 |   // For each request collect the total batch size for this inference
1020 |   // execution. The batch-size, number of inputs, and size of each
1021 |   // input has already been checked so don't need to do that here.
1022 |   size_t total_batch_size = 0;
1023 |   for (size_t i = 0; i < request_count; ++i) {
1024 |     // If we get a nullptr request then something is badly wrong. Fail
1025 |     // and release all requests.
1026 |     if (requests[i] == nullptr) {
1027 |       RequestsRespondWithError(
1028 |           requests, request_count,
1029 |           TRITONSERVER_ErrorNew(
1030 |               TRITONSERVER_ERROR_INTERNAL,
1031 |               std::string(
1032 |                   "null request given to Paddle backend for '" + Name() + "'")
1033 |                   .c_str()));
1034 |       return;
1035 |     }
1036 | 
1037 |     if (max_batch_size > 0) {
1038 |       // Retrieve the batch size from one of the inputs, if the model
1039 |       // supports batching, the first dimension size is batch size
1040 |       TRITONBACKEND_Input* input;
1041 |       TRITONSERVER_Error* err =
1042 |           TRITONBACKEND_RequestInputByIndex(requests[i], 0 /* index */, &input);
1043 |       if (err == nullptr) {
1044 |         const int64_t* shape;
1045 |         err = TRITONBACKEND_InputProperties(
1046 |             input, nullptr, nullptr, &shape, nullptr, nullptr, nullptr);
1047 |         total_batch_size += shape[0];
1048 |       }
1049 |       if (err != nullptr) {
1050 |         RequestsRespondWithError(requests, request_count, err);
1051 |         return;
1052 |       }
1053 |     } else {
1054 |       total_batch_size += 1;
1055 |     }
1056 |   }
1057 | 
1058 |   // If there are no valid requests then no need to run the
1059 |   // inference. This should never happen unless called with an empty
1060 |   // 'requests' for some reason.
1061 |   if (total_batch_size == 0) {
1062 |     return;
1063 |   }
1064 | 
1065 |   // Make sure the maximum batch size is not exceeded. The
1066 |   // total_batch_size must be 1 for models that don't support batching
1067 |   // (i.e. max_batch_size == 0). If max_batch_size is exceeded then
1068 |   // scheduler has done something badly wrong so fail and release all
1069 |   // requests
1070 |   if ((total_batch_size != 1) and
1071 |       (total_batch_size > static_cast<size_t>(max_batch_size))) {
1072 |     RequestsRespondWithError(
1073 |         requests, request_count,
1074 |         TRITONSERVER_ErrorNew(
1075 |             TRITONSERVER_ERROR_INTERNAL,
1076 |             std::string(
1077 |                 "batch size " + std::to_string(total_batch_size) + " for '" +
1078 |                 Name() + "', max allowed is " + std::to_string(max_batch_size))
1079 |                 .c_str()));
1080 |     return;
1081 |   }
1082 | 
1083 |   // At this point we are committed to running inference with all
1084 |   // 'requests'. Create a response for each request. During input
1085 |   // processing if there is an error with any request that error will
1086 |   // be sent immediately with the corresponding response (and the
1087 |   // response pointer will then be nullptr). The request object
1088 |   // itself will not be released until after all inferencing is done
1089 |   // (below) as we may need to access the request object when
1090 |   // determine how to process outputs (for example, even if we don't
1091 |   // need the outputs for a request that has an error,  we do need to
1092 |   // know the size of those outputs associated with the request so we
1093 |   // can skip them in the output tensors).
1094 |   std::vector<TRITONBACKEND_Response*> responses;
1095 |   responses.reserve(request_count);
1096 | 
1097 |   for (size_t i = 0; i < request_count; ++i) {
1098 |     TRITONBACKEND_Response* response;
1099 |     auto err = TRITONBACKEND_ResponseNew(&response, requests[i]);
1100 |     if (err == nullptr) {
1101 |       responses.emplace_back(response);
1102 |     } else {
1103 |       responses.emplace_back(nullptr);
1104 |       LOG_MESSAGE(TRITONSERVER_LOG_ERROR, "Fail to create response");
1105 |       TRITONSERVER_ErrorDelete(err);
1106 |     }
1107 |   }
1108 | 
1109 |   SetInputTensors(total_batch_size, requests, request_count, &responses);
1110 | 
1111 |   // Collect the names of requested outputs. Do not include outputs
1112 |   // for requests that have already responded with an error.
1113 |   // TODO: understand here
1114 |   std::vector<std::string> required_outputs;
1115 |   std::vector<std::vector<std::string>> request_required_outputs(request_count);
1116 |   for (size_t idx = 0; idx < request_count; ++idx) {
1117 |     const auto& request = requests[idx];
1118 |     auto& response = responses[idx];
1119 |     if (response != nullptr) {
1120 |       uint32_t output_count;
1121 |       RESPOND_AND_SET_NULL_IF_ERROR(
1122 |           &response, TRITONBACKEND_RequestOutputCount(request, &output_count));
1123 |       if (response != nullptr) {
1124 |         for (uint32_t output_idx = 0; output_idx < output_count; ++output_idx) {
1125 |           const char* output_name;
1126 |           RESPOND_AND_SET_NULL_IF_ERROR(
1127 |               &response, TRITONBACKEND_RequestOutputName(
1128 |                              request, output_idx, &output_name));
1129 | 
1130 |           if (response != nullptr) {
1131 |             required_outputs.push_back(output_name);
1132 |             request_required_outputs[idx].push_back(output_name);
1133 |           }
1134 |         }
1135 |       }
1136 |     }
1137 |   }
1138 | 
1139 |   uint64_t compute_start_ns = 0;
1140 |   SET_TIMESTAMP(compute_start_ns);
1141 | 
1142 |   TRITONPADDLE_ModelRun(triton_paddle_model_.get());
1143 | 
1144 |   uint64_t compute_end_ns = 0;
1145 |   SET_TIMESTAMP(compute_end_ns);
1146 | 
1147 |   ReadOutputTensors(
1148 |       total_batch_size, required_outputs, requests, request_count, &responses);
1149 | 
1150 |   uint64_t exec_end_ns = 0;
1151 |   SET_TIMESTAMP(exec_end_ns);
1152 | 
1153 |   // Send all the responses that haven't already been sent because of
1154 |   // an earlier error. Note that the responses are not set to nullptr
1155 |   // here as we need that indication below to determine if the request
1156 |   // we successful or not.
1157 |   for (auto& response : responses) {
1158 |     if (response != nullptr) {
1159 |       LOG_IF_ERROR(
1160 |           TRITONBACKEND_ResponseSend(
1161 |               response, TRITONSERVER_RESPONSE_COMPLETE_FINAL, nullptr),
1162 |           "failed to send Paddle backend response");
1163 |     }
1164 |   }
1165 | 
1166 |   // Report statistics for each request.
1167 |   for (uint32_t r = 0; r < request_count; ++r) {
1168 |     auto& request = requests[r];
1169 |     LOG_IF_ERROR(
1170 |         TRITONBACKEND_ModelInstanceReportStatistics(
1171 |             TritonModelInstance(), request,
1172 |             (responses[r] != nullptr) /* success */, exec_start_ns,
1173 |             compute_start_ns, compute_end_ns, exec_end_ns),
1174 |         "failed reporting request statistics");
1175 | 
1176 |     LOG_IF_ERROR(
1177 |         TRITONBACKEND_RequestRelease(request, TRITONSERVER_REQUEST_RELEASE_ALL),
1178 |         "failed releasing request");
1179 |   }
1180 | 
1181 |   // TODO: Report the entire batch statistics.
1182 |   LOG_IF_ERROR(
1183 |       TRITONBACKEND_ModelInstanceReportBatchStatistics(
1184 |           TritonModelInstance(), total_batch_size, exec_start_ns,
1185 |           compute_start_ns, compute_end_ns, exec_end_ns),
1186 |       "failed reporting batch request statistics");
1187 | 
1188 |   LOG_MESSAGE(
1189 |       TRITONSERVER_LOG_VERBOSE,
1190 |       (std::string("TRITONBACKEND_ModelExecute: model ") + Name() +
1191 |        " released " + std::to_string(request_count) + " requests")
1192 |           .c_str());
1193 | }
1194 | 
1195 | extern "C" {
1196 | 
1197 | TRITONSERVER_Error*
1198 | TRITONBACKEND_Initialize(TRITONBACKEND_Backend* backend)
1199 | {
1200 |   const char* cname;
1201 |   RETURN_IF_ERROR(TRITONBACKEND_BackendName(backend, &cname));
1202 |   std::string name(cname);
1203 | 
1204 |   LOG_MESSAGE(
1205 |       TRITONSERVER_LOG_INFO,
1206 |       (std::string("TRITONBACKEND_Initialize: ") + name).c_str());
1207 | 
1208 |   uint32_t api_version_major, api_version_minor;
1209 |   RETURN_IF_ERROR(
1210 |       TRITONBACKEND_ApiVersion(&api_version_major, &api_version_minor));
1211 | 
1212 |   LOG_MESSAGE(
1213 |       TRITONSERVER_LOG_INFO,
1214 |       (std::string("Triton TRITONBACKEND API version: ") +
1215 |        std::to_string(api_version_major) + "." +
1216 |        std::to_string(api_version_minor))
1217 |           .c_str());
1218 | 
1219 |   LOG_MESSAGE(
1220 |       TRITONSERVER_LOG_INFO,
1221 |       (std::string("'") + name + "' TRITONBACKEND API version: " +
1222 |        std::to_string(TRITONBACKEND_API_VERSION_MAJOR) + "." +
1223 |        std::to_string(TRITONBACKEND_API_VERSION_MINOR))
1224 |           .c_str());
1225 | 
1226 |   if ((api_version_major != TRITONBACKEND_API_VERSION_MAJOR) ||
1227 |       (api_version_minor < TRITONBACKEND_API_VERSION_MINOR)) {
1228 |     return TRITONSERVER_ErrorNew(
1229 |         TRITONSERVER_ERROR_UNSUPPORTED,
1230 |         (std::string("Triton TRITONBACKEND API version: ") +
1231 |          std::to_string(api_version_major) + "." +
1232 |          std::to_string(api_version_minor) + " does not support '" + name +
1233 |          "' TRITONBACKEND API version: " +
1234 |          std::to_string(TRITONBACKEND_API_VERSION_MAJOR) + "." +
1235 |          std::to_string(TRITONBACKEND_API_VERSION_MINOR))
1236 |             .c_str());
1237 |   }
1238 | 
1239 |   // The backend configuration may contain information needed by the
1240 |   // backend, such a command-line arguments.
1241 |   TRITONSERVER_Message* backend_config_message;
1242 |   RETURN_IF_ERROR(
1243 |       TRITONBACKEND_BackendConfig(backend, &backend_config_message));
1244 | 
1245 |   const char* buffer;
1246 |   size_t byte_size;
1247 |   RETURN_IF_ERROR(TRITONSERVER_MessageSerializeToJson(
1248 |       backend_config_message, &buffer, &byte_size));
1249 |   LOG_MESSAGE(
1250 |       TRITONSERVER_LOG_INFO,
1251 |       (std::string("backend configuration:\n") + buffer).c_str());
1252 | 
1253 |   return nullptr;  // success
1254 | }
1255 | 
1256 | TRITONSERVER_Error*
1257 | TRITONBACKEND_ModelInitialize(TRITONBACKEND_Model* model)
1258 | {
1259 |   const char* cname;
1260 |   RETURN_IF_ERROR(TRITONBACKEND_ModelName(model, &cname));
1261 |   std::string name(cname);
1262 | 
1263 |   uint64_t version;
1264 |   RETURN_IF_ERROR(TRITONBACKEND_ModelVersion(model, &version));
1265 | 
1266 |   LOG_MESSAGE(
1267 |       TRITONSERVER_LOG_INFO,
1268 |       (std::string("TRITONBACKEND_ModelInitialize: ") + name + " (version " +
1269 |        std::to_string(version) + ")")
1270 |           .c_str());
1271 | 
1272 |   // Create a ModelState object and associate it with the
1273 |   // TRITONBACKEND_Model.
1274 |   ModelState* model_state = nullptr;
1275 |   RETURN_IF_ERROR(ModelState::Create(model, &model_state));
1276 |   RETURN_IF_ERROR(
1277 |       TRITONBACKEND_ModelSetState(model, reinterpret_cast<void*>(model_state)));
1278 | 
1279 |   return nullptr;  // success
1280 | }
1281 | 
1282 | TRITONSERVER_Error*
1283 | TRITONBACKEND_ModelFinalize(TRITONBACKEND_Model* model)
1284 | {
1285 |   void* vstate;
1286 |   RETURN_IF_ERROR(TRITONBACKEND_ModelState(model, &vstate));
1287 |   ModelState* model_state = reinterpret_cast<ModelState*>(vstate);
1288 | 
1289 |   LOG_MESSAGE(
1290 |       TRITONSERVER_LOG_INFO, "TRITONBACKEND_ModelFinalize: delete model state");
1291 | 
1292 |   delete model_state;
1293 | 
1294 |   return nullptr;  // success
1295 | }
1296 | 
1297 | TRITONSERVER_Error*
1298 | TRITONBACKEND_ModelInstanceInitialize(TRITONBACKEND_ModelInstance* instance)
1299 | {
1300 |   const char* cname;
1301 |   RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceName(instance, &cname));
1302 |   std::string name(cname);
1303 | 
1304 |   int32_t device_id;
1305 |   RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceDeviceId(instance, &device_id));
1306 |   TRITONSERVER_InstanceGroupKind kind;
1307 |   RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceKind(instance, &kind));
1308 | 
1309 |   LOG_MESSAGE(
1310 |       TRITONSERVER_LOG_INFO,
1311 |       (std::string("TRITONBACKEND_ModelInstanceInitialize: ") + name + " (" +
1312 |        TRITONSERVER_InstanceGroupKindString(kind) + " device " +
1313 |        std::to_string(device_id) + ")")
1314 |           .c_str());
1315 | 
1316 |   // Get the model state associated with this instance's model
1317 |   TRITONBACKEND_Model* model;
1318 |   RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceModel(instance, &model));
1319 | 
1320 |   void* vmodelstate;
1321 |   RETURN_IF_ERROR(TRITONBACKEND_ModelState(model, &vmodelstate));
1322 |   ModelState* model_state = reinterpret_cast<ModelState*>(vmodelstate);
1323 | 
1324 |   // With each instance we create a ModelInstanceState object and
1325 |   // associate it with the TRITONBACKEND_ModelInstance.
1326 |   ModelInstanceState* instance_state;
1327 |   RETURN_IF_ERROR(
1328 |       ModelInstanceState::Create(model_state, instance, &instance_state));
1329 |   RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceSetState(
1330 |       instance, reinterpret_cast<void*>(instance_state)));
1331 | 
1332 |   return nullptr;
1333 | }
1334 | 
1335 | TRITONSERVER_Error*
1336 | TRITONBACKEND_ModelInstanceFinalize(TRITONBACKEND_ModelInstance* instance)
1337 | {
1338 |   void* vstate;
1339 |   RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceState(instance, &vstate));
1340 |   ModelInstanceState* instance_state =
1341 |       reinterpret_cast<ModelInstanceState*>(vstate);
1342 | 
1343 |   LOG_MESSAGE(
1344 |       TRITONSERVER_LOG_INFO,
1345 |       "TRITONBACKEND_ModelInstanceFinalize: delete instance state");
1346 | 
1347 |   delete instance_state;
1348 | 
1349 |   return nullptr;
1350 | }
1351 | 
1352 | TRITONSERVER_Error*
1353 | TRITONBACKEND_ModelInstanceExecute(
1354 |     TRITONBACKEND_ModelInstance* instance, TRITONBACKEND_Request** requests,
1355 |     const uint32_t request_count)
1356 | {
1357 |   // Triton will not call this function simultaneously for the same
1358 |   // 'instance'. But since this backend could be used by multiple
1359 |   // instances from multiple models the implementation needs to handle
1360 |   // multiple calls to this function at the same time (with different
1361 |   // 'instance' objects). Suggested practice for this is to use only
1362 |   // function-local and model-instance-specific state (obtained from
1363 |   // 'instance'), which is what we do here.
1364 |   ModelInstanceState* instance_state;
1365 |   RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceState(
1366 |       instance, reinterpret_cast<void**>(&instance_state)));
1367 |   ModelState* model_state = instance_state->StateForModel();
1368 | 
1369 |   LOG_MESSAGE(
1370 |       TRITONSERVER_LOG_VERBOSE,
1371 |       (std::string("model ") + model_state->Name() + ", instance " +
1372 |        instance_state->Name() + ", executing " + std::to_string(request_count) +
1373 |        " requests")
1374 |           .c_str());
1375 | 
1376 |   // At this point we accept ownership of 'requests', which means that
1377 |   // even if something goes wrong we must still return success from
1378 |   // this function. If something does go wrong in processing a
1379 |   // particular request then we send an error response just for the
1380 |   // specific request.
1381 |   instance_state->ProcessRequests(requests, request_count);
1382 | 
1383 |   return nullptr;  // success
1384 | }
1385 | 
1386 | }  // extern "C"
1387 | }}}  // namespace triton::backend::paddle
1388 | 


--------------------------------------------------------------------------------
/src/paddle_backend_utils.cc:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
  2 | //
  3 | // Redistribution and use in source and binary forms, with or without
  4 | // modification, are permitted provided that the following conditions
  5 | // are met:
  6 | //  * Redistributions of source code must retain the above copyright
  7 | //    notice, this list of conditions and the following disclaimer.
  8 | //  * Redistributions in binary form must reproduce the above copyright
  9 | //    notice, this list of conditions and the following disclaimer in the
 10 | //    documentation and/or other materials provided with the distribution.
 11 | //  * Neither the name of NVIDIA CORPORATION nor the names of its
 12 | //    contributors may be used to endorse or promote products derived
 13 | //    from this software without specific prior written permission.
 14 | //
 15 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 16 | // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 17 | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 18 | // PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 19 | // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 20 | // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 21 | // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 22 | // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 23 | // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 24 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 25 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 26 | 
 27 | #include "paddle_backend_utils.h"
 28 | 
 29 | #include <algorithm>
 30 | #include <functional>
 31 | #include <iterator>
 32 | #include <numeric>
 33 | #include <sstream>
 34 | 
 35 | // namespace triton { namespace backend { namespace paddle {
 36 | 
 37 | template TRITONPADDLE_Shape::TRITONPADDLE_Shape(
 38 |     const std::vector<int64_t>& shape);
 39 | template TRITONPADDLE_Shape::TRITONPADDLE_Shape(
 40 |     const std::vector<int32_t>& shape);
 41 | 
 42 | template <typename T>
 43 | TRITONPADDLE_Shape::TRITONPADDLE_Shape(const std::vector<T>& shape)
 44 | {
 45 |   shape_ = std::vector<value_type>(shape.cbegin(), shape.cend());
 46 |   numel_ = std::accumulate(
 47 |       shape_.cbegin(), shape_.cend(), 1, std::multiplies<value_type>());
 48 | }
 49 | 
 50 | TRITONPADDLE_Shape::TRITONPADDLE_Shape(const std::string& str)
 51 | {
 52 |   std::vector<std::string> str_shape;
 53 |   std::istringstream in(str);
 54 |   std::copy(
 55 |       std::istream_iterator<std::string>(in),
 56 |       std::istream_iterator<std::string>(), std::back_inserter(str_shape));
 57 | 
 58 |   std::transform(
 59 |       str_shape.cbegin(), str_shape.cend(), std::back_inserter(shape_),
 60 |       [](const std::string& str) -> value_type {
 61 |         return static_cast<value_type>(std::stoll(str));
 62 |       });
 63 | }
 64 | 
 65 | std::vector<int32_t>
 66 | TRITONPADDLE_Shape::CompatibleShape() const
 67 | {
 68 |   return std::vector<int32_t>(shape_.cbegin(), shape_.cend());
 69 | }
 70 | 
 71 | TRITONPADDLE_DataType
 72 | ConvertDataType(TRITONSERVER_DataType dtype)
 73 | {
 74 |   switch (dtype) {
 75 |     case TRITONSERVER_TYPE_INVALID:
 76 |       return TRITONPADDLE_TYPE_INVALID;
 77 |     case TRITONSERVER_TYPE_UINT8:
 78 |       return TRITONPADDLE_TYPE_UINT8;
 79 |     case TRITONSERVER_TYPE_INT8:
 80 |       return TRITONPADDLE_TYPE_INT8;
 81 |     case TRITONSERVER_TYPE_INT32:
 82 |       return TRITONPADDLE_TYPE_INT32;
 83 |     case TRITONSERVER_TYPE_INT64:
 84 |       return TRITONPADDLE_TYPE_INT64;
 85 |     case TRITONSERVER_TYPE_FP32:
 86 |       return TRITONPADDLE_TYPE_FP32;
 87 |     case TRITONSERVER_TYPE_FP16:
 88 |       return TRITONPADDLE_TYPE_FP16;
 89 |     default:
 90 |       break;
 91 |   }
 92 |   return TRITONPADDLE_TYPE_INVALID;
 93 | }
 94 | 
 95 | TRITONSERVER_DataType
 96 | ConvertDataType(TRITONPADDLE_DataType dtype)
 97 | {
 98 |   switch (dtype) {
 99 |     case TRITONPADDLE_TYPE_INVALID:
100 |       return TRITONSERVER_TYPE_INVALID;
101 |     case TRITONPADDLE_TYPE_UINT8:
102 |       return TRITONSERVER_TYPE_UINT8;
103 |     case TRITONPADDLE_TYPE_INT8:
104 |       return TRITONSERVER_TYPE_INT8;
105 |     case TRITONPADDLE_TYPE_INT32:
106 |       return TRITONSERVER_TYPE_INT32;
107 |     case TRITONPADDLE_TYPE_INT64:
108 |       return TRITONSERVER_TYPE_INT64;
109 |     case TRITONPADDLE_TYPE_FP32:
110 |       return TRITONSERVER_TYPE_FP32;
111 |     case TRITONPADDLE_TYPE_FP16:
112 |       return TRITONSERVER_TYPE_FP16;
113 |     default:
114 |       break;
115 |   }
116 |   return TRITONSERVER_TYPE_INVALID;
117 | }
118 | 
119 | TRITONPADDLE_DataType
120 | ConvertDataType(::paddle_infer::DataType dtype)
121 | {
122 |   switch (dtype) {
123 |     case ::paddle_infer::DataType::FLOAT32:
124 |       return TRITONPADDLE_TYPE_FP32;
125 |     case ::paddle_infer::DataType::INT64:
126 |       return TRITONPADDLE_TYPE_INT64;
127 |     case ::paddle_infer::DataType::INT32:
128 |       return TRITONPADDLE_TYPE_INT32;
129 |     case ::paddle_infer::DataType::UINT8:
130 |       return TRITONPADDLE_TYPE_UINT8;
131 |     // case ::paddle_infer::DataType::INT8:
132 |     //   return TRITONPADDLE_TYPE_INT8;
133 |     default:
134 |       break;
135 |   }
136 |   return TRITONPADDLE_TYPE_INVALID;
137 | }
138 | 
139 | TRITONPADDLE_DataType
140 | ConvertDataType(const std::string& dtype)
141 | {
142 |   if (dtype == "TYPE_INVALID") {
143 |     return TRITONPADDLE_DataType::TRITONPADDLE_TYPE_INVALID;
144 |   } else if (dtype == "TYPE_FP32") {
145 |     return TRITONPADDLE_DataType::TRITONPADDLE_TYPE_FP32;
146 |   } else if (dtype == "TYPE_UINT8") {
147 |     return TRITONPADDLE_DataType::TRITONPADDLE_TYPE_UINT8;
148 |   } else if (dtype == "TYPE_INT8") {
149 |     return TRITONPADDLE_DataType::TRITONPADDLE_TYPE_INT8;
150 |   } else if (dtype == "TYPE_INT32") {
151 |     return TRITONPADDLE_DataType::TRITONPADDLE_TYPE_INT32;
152 |   } else if (dtype == "TYPE_INT64") {
153 |     return TRITONPADDLE_DataType::TRITONPADDLE_TYPE_INT64;
154 |   } else if (dtype == "TYPE_FP16") {
155 |     return TRITONPADDLE_DataType::TRITONPADDLE_TYPE_FP16;
156 |   } 
157 |   return TRITONPADDLE_DataType::TRITONPADDLE_TYPE_INVALID;
158 | }
159 | 
160 | size_t
161 | TRITONPADDLE_DataTypeByteSize(TRITONPADDLE_DataType dtype)
162 | {
163 |   switch (dtype) {
164 |     case TRITONPADDLE_DataType::TRITONPADDLE_TYPE_FP32:
165 |       return sizeof(float);
166 |     case TRITONPADDLE_DataType::TRITONPADDLE_TYPE_INT64:
167 |       return sizeof(int64_t);
168 |     case TRITONPADDLE_DataType::TRITONPADDLE_TYPE_INT32:
169 |       return sizeof(int32_t);
170 |     case TRITONPADDLE_DataType::TRITONPADDLE_TYPE_UINT8:
171 |       return sizeof(uint8_t);
172 |     case TRITONPADDLE_DataType::TRITONPADDLE_TYPE_INT8:
173 |       return sizeof(int8_t);
174 |     case TRITONPADDLE_DataType::TRITONPADDLE_TYPE_FP16:
175 |       return sizeof(phi::dtype::float16);
176 |     default:
177 |       break;
178 |   }
179 |   return 0;  // Should not happened, TODO: Error handling
180 | }
181 | 
182 | /* Error message */
183 | 
184 | TRITONPADDLE_Error*
185 | TRITONPADDLE_ErrorNew(const std::string& str)
186 | {
187 |   TRITONPADDLE_Error* error = new TRITONPADDLE_Error();
188 |   error->msg_ = new char[str.size() + 1];
189 |   std::strcpy(error->msg_, str.c_str());
190 |   return error;
191 | }
192 | 
193 | void
194 | TRITONPADDLE_ErrorDelete(TRITONPADDLE_Error* error)
195 | {
196 |   if (error == nullptr) {
197 |     return;
198 |   }
199 | 
200 |   delete[] error->msg_;
201 |   delete error;
202 | }
203 | 
204 | TRITONPADDLE_Config::TRITONPADDLE_Config()
205 |     : use_trt_(false), max_batch_size_(1), workspace_size_(1 << 30), min_graph_size_(5),
206 |       precision_(TRITONPADDLE_MODE_FP32), is_dynamic_(false),
207 |       enable_tensorrt_oss_(false), disenable_trt_tune_(false), use_cpu_(false),
208 |       use_mkldnn_(false), use_ort_(false), use_mkldnn_int8_(false),
209 |       cpu_math_library_num_threads_(1), mkldnn_capacity_(10), model_dir_("")
210 | {
211 | }
212 | 
213 | // }}}
214 | 


--------------------------------------------------------------------------------
/src/paddle_backend_utils.h:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
  2 | //
  3 | // Redistribution and use in source and binary forms, with or without
  4 | // modification, are permitted provided that the following conditions
  5 | // are met:
  6 | //  * Redistributions of source code must retain the above copyright
  7 | //    notice, this list of conditions and the following disclaimer.
  8 | //  * Redistributions in binary form must reproduce the above copyright
  9 | //    notice, this list of conditions and the following disclaimer in the
 10 | //    documentation and/or other materials provided with the distribution.
 11 | //  * Neither the name of NVIDIA CORPORATION nor the names of its
 12 | //    contributors may be used to endorse or promote products derived
 13 | //    from this software without specific prior written permission.
 14 | //
 15 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 16 | // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 17 | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 18 | // PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 19 | // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 20 | // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 21 | // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 22 | // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 23 | // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 24 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 25 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 26 | 
 27 | #pragma once
 28 | 
 29 | #include <cstdint>
 30 | #include <cstring>
 31 | #include <string>
 32 | #include <vector>
 33 | 
 34 | #include "paddle_inference_api.h"
 35 | #include "experimental/phi/common/float16.h"
 36 | #include "triton/core/tritonserver.h"
 37 | 
 38 | // namespace triton { namespace backend { namespace paddle {
 39 | 
 40 | #define RESPOND_ALL_AND_RETURN_IF_ERROR(RESPONSES, RESPONSES_COUNT, X) \
 41 |   do {                                                                 \
 42 |     TRITONSERVER_Error* raarie_err__ = (X);                            \
 43 |     if (raarie_err__ != nullptr) {                                     \
 44 |       SendErrorForResponses(RESPONSES, RESPONSES_COUNT, raarie_err__); \
 45 |       return;                                                          \
 46 |     }                                                                  \
 47 |   } while (false)
 48 | 
 49 | #define RETURN_IF_TRITONPADDLE_ERROR(ERR)                                    \
 50 |   do {                                                                       \
 51 |     TRITONPADDLE_Error* error__ = (ERR);                                     \
 52 |     if (error__ != nullptr) {                                                \
 53 |       auto status =                                                          \
 54 |           TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INTERNAL, error__->msg_); \
 55 |       TRITONPADDLE_ErrorDelete(error__);                                     \
 56 |       return status;                                                         \
 57 |     }                                                                        \
 58 |   } while (false)
 59 | 
 60 | #define THROW_IF_TRITONPADDLE_ERROR(X)         \
 61 |   do {                                         \
 62 |     TRITONPADDLE_Error* tie_err__ = (X);       \
 63 |     if (tie_err__ != nullptr) {                \
 64 |       throw TRITONPADDLE_Exception(tie_err__); \
 65 |     }                                          \
 66 |   } while (false)
 67 | 
 68 | typedef struct {
 69 |   char* msg_;
 70 | } TRITONPADDLE_Error;
 71 | 
 72 | struct TRITONPADDLE_Exception {
 73 |   TRITONPADDLE_Exception(TRITONPADDLE_Error* err) : err_(err) {}
 74 |   TRITONPADDLE_Error* err_;
 75 | };
 76 | 
 77 | TRITONPADDLE_Error* TRITONPADDLE_ErrorNew(const std::string& str);
 78 | 
 79 | void TRITONPADDLE_ErrorDelete(TRITONPADDLE_Error* error);
 80 | 
 81 | // TRITONPADDLE TYPE
 82 | // TODO: Full all possible type?
 83 | typedef enum {
 84 |   TRITONPADDLE_TYPE_FP32,
 85 |   TRITONPADDLE_TYPE_INT64,
 86 |   TRITONPADDLE_TYPE_INT32,
 87 |   TRITONPADDLE_TYPE_UINT8,
 88 |   TRITONPADDLE_TYPE_INT8,
 89 |   TRITONPADDLE_TYPE_FP16,
 90 |   TRITONPADDLE_TYPE_INVALID
 91 | } TRITONPADDLE_DataType;
 92 | 
 93 | // TRITONPADDLE SHAPE
 94 | class TRITONPADDLE_Shape {
 95 |  public:
 96 |   using value_type = int64_t;
 97 | 
 98 |   TRITONPADDLE_Shape() = default;
 99 |   TRITONPADDLE_Shape(const std::string& str);
100 |   template <typename T>
101 |   TRITONPADDLE_Shape(const std::vector<T>& shape);
102 |   size_t NumElements() const { return numel_; };
103 | 
104 |   std::vector<int32_t> CompatibleShape() const;
105 |   std::vector<value_type> Shape() const { return shape_; };
106 | 
107 |  private:
108 |   std::vector<value_type> shape_;
109 |   size_t numel_;
110 | };
111 | 
112 | TRITONPADDLE_DataType ConvertDataType(TRITONSERVER_DataType dtype);
113 | 
114 | TRITONPADDLE_DataType ConvertDataType(::paddle_infer::DataType dtype);
115 | 
116 | TRITONPADDLE_DataType ConvertDataType(const std::string& dtype);
117 | 
118 | TRITONSERVER_DataType ConvertDataType(TRITONPADDLE_DataType dtype);
119 | 
120 | size_t TRITONPADDLE_DataTypeByteSize(TRITONPADDLE_DataType dtype);
121 | 
122 | // TRITON PADDLE MODE
123 | typedef enum {
124 |   TRITONPADDLE_MODE_FP32,
125 |   TRITONPADDLE_MODE_FP16,
126 |   TRITONPADDLE_MODE_INT8,
127 | } TRITONPADDLE_Precision;
128 | 
129 | // TRITON PADDLE CONFIG
130 | class TRITONPADDLE_Config {
131 |  public:
132 |   TRITONPADDLE_Config();
133 |   // trt
134 |   bool use_trt_;
135 |   int64_t max_batch_size_;
136 |   int64_t workspace_size_;
137 |   int64_t min_graph_size_;
138 |   TRITONPADDLE_Precision precision_;
139 |   bool is_dynamic_;
140 |   bool enable_tensorrt_oss_;
141 |   bool disenable_trt_tune_;
142 |   // cpu
143 |   bool use_cpu_;
144 |   bool use_mkldnn_;
145 |   bool use_ort_;
146 |   bool use_mkldnn_int8_;
147 |   int cpu_math_library_num_threads_;
148 |   int mkldnn_capacity_;
149 |   std::string model_dir_;
150 | 
151 |   std::map<std::string, std::vector<int>> dynamic_min_shape_;
152 |   std::map<std::string, std::vector<int>> dynamic_max_shape_;
153 |   std::map<std::string, std::vector<int>> dynamic_opt_shape_;
154 | };
155 | 
156 | // }}}
157 | 


--------------------------------------------------------------------------------