├── .clang-format ├── .github └── workflows │ └── pre-commit.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CMakeLists.txt ├── LICENSE ├── README.md ├── cmake └── TritonPyTorchBackendConfig.cmake.in ├── pyproject.toml ├── src ├── libtorch.cc ├── libtorch_utils.cc ├── libtorch_utils.h ├── libtriton_pytorch.ldscript └── model.py └── tools └── gen_pb_exec_env.sh /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | BasedOnStyle: Google 3 | 4 | IndentWidth: 2 5 | ColumnLimit: 80 6 | ContinuationIndentWidth: 4 7 | UseTab: Never 8 | MaxEmptyLinesToKeep: 2 9 | 10 | SortIncludes: true 11 | CompactNamespaces: true 12 | ReflowComments: true 13 | 14 | DerivePointerAlignment: false 15 | PointerAlignment: Left 16 | 17 | AllowShortIfStatementsOnASingleLine: false 18 | AllowShortBlocksOnASingleLine: false 19 | AllowShortFunctionsOnASingleLine: Inline 20 | 21 | AlwaysBreakAfterReturnType: TopLevelDefinitions 22 | AlignAfterOpenBracket: AlwaysBreak 23 | BreakBeforeBraces: Custom 24 | BraceWrapping: 25 | AfterClass: false 26 | AfterControlStatement: false 27 | AfterEnum: false 28 | AfterFunction: true 29 | AfterNamespace: false 30 | AfterStruct: false 31 | AfterUnion: false 32 | BeforeCatch: true 33 | 34 | BinPackArguments: true 35 | BinPackParameters: true 36 | ConstructorInitializerAllOnOneLineOrOnePerLine: false 37 | 38 | IndentCaseLabels: true 39 | 40 | -------------------------------------------------------------------------------- /.github/workflows/pre-commit.yml: -------------------------------------------------------------------------------- 1 | # Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
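# This workflow runs the hooks defined in .pre-commit-config.yaml (isort, black,
# flake8, clang-format, codespell, and assorted sanity checks) against every
# pull request.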
26 | 27 | name: pre-commit 28 | 29 | on: 30 | pull_request: 31 | 32 | jobs: 33 | pre-commit: 34 | runs-on: ubuntu-22.04 35 | steps: 36 | - uses: actions/checkout@v3 37 | - uses: actions/setup-python@v3 38 | - uses: pre-commit/action@v3.0.0 39 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /build 2 | /.vscode 3 | *.so 4 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
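# The same hooks can be run locally before opening a pull request (standard
# pre-commit CLI usage; shown here as a hint, not a requirement):
#
#   pip install pre-commit
#   pre-commit install          # run the hooks automatically on each commit
#   pre-commit run --all-files  # run the full suite once over the repository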
26 | 27 | repos: 28 | - repo: https://github.com/timothycrosley/isort 29 | rev: 5.12.0 30 | hooks: 31 | - id: isort 32 | additional_dependencies: [toml] 33 | - repo: https://github.com/psf/black 34 | rev: 23.1.0 35 | hooks: 36 | - id: black 37 | types_or: [python, cython] 38 | - repo: https://github.com/PyCQA/flake8 39 | rev: 5.0.4 40 | hooks: 41 | - id: flake8 42 | args: [--max-line-length=88, --select=C,E,F,W,B,B950, --extend-ignore = E203,E501] 43 | types_or: [python, cython] 44 | - repo: https://github.com/pre-commit/mirrors-clang-format 45 | rev: v16.0.5 46 | hooks: 47 | - id: clang-format 48 | types_or: [c, c++, cuda, proto, textproto, java] 49 | args: ["-fallback-style=none", "-style=file", "-i"] 50 | - repo: https://github.com/codespell-project/codespell 51 | rev: v2.2.4 52 | hooks: 53 | - id: codespell 54 | additional_dependencies: [tomli] 55 | args: ["--toml", "pyproject.toml"] 56 | exclude: (?x)^(.*stemmer.*|.*stop_words.*|^CHANGELOG.md$) 57 | # More details about these pre-commit hooks here: 58 | # https://pre-commit.com/hooks.html 59 | - repo: https://github.com/pre-commit/pre-commit-hooks 60 | rev: v4.4.0 61 | hooks: 62 | - id: check-case-conflict 63 | - id: check-executables-have-shebangs 64 | - id: check-merge-conflict 65 | - id: check-json 66 | - id: check-toml 67 | - id: check-yaml 68 | - id: check-shebang-scripts-are-executable 69 | - id: end-of-file-fixer 70 | types_or: [c, c++, cuda, proto, textproto, java, python] 71 | - id: mixed-line-ending 72 | - id: requirements-txt-fixer 73 | - id: trailing-whitespace 74 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright 2019-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | cmake_minimum_required (VERSION 3.18) 28 | 29 | project(tritonpytorchbackend LANGUAGES C CXX) 30 | 31 | # Use C++17 standard as Triton's minimum required. 
32 | set(TRITON_MIN_CXX_STANDARD 17 CACHE STRING "The minimum C++ standard which features are requested to build this target.") 33 | 34 | # 35 | # Options 36 | # 37 | # To build the PyTorch backend you must either: 38 | # 39 | # - Point to the already built PyTorch and Torchvision using 40 | # TRITON_PYTORCH_INCLUDE_PATHS and TRITON_PYTORCH_LIB_PATHS 41 | # 42 | # or: 43 | # 44 | # - Set TRITON_PYTORCH_DOCKER_IMAGE to use the docker image of 45 | # PyTorch to base the build off. 46 | # 47 | 48 | option(TRITON_ENABLE_GPU "Enable GPU support in backend" ON) 49 | option(TRITON_ENABLE_STATS "Include statistics collections in backend" ON) 50 | option(TRITON_ENABLE_NVTX "Include nvtx markers collection in backend." OFF) 51 | option(TRITON_PYTORCH_ENABLE_TORCHTRT "Enable TorchTRT support" OFF) 52 | option(TRITON_PYTORCH_ENABLE_TORCHVISION "Enable Torchvision support" ON) 53 | 54 | set(TRITON_PYTORCH_DOCKER_IMAGE "" CACHE STRING "Docker image containing the PyTorch build required by backend.") 55 | set(TRITON_PYTORCH_INCLUDE_PATHS "" CACHE PATH "Paths to Torch includes") 56 | set(TRITON_PYTORCH_LIB_PATHS "" CACHE PATH "Paths to Torch libraries") 57 | 58 | set(TRITON_REPO_ORGANIZATION "https://github.com/triton-inference-server" CACHE STRING "Git repository to pull from") 59 | set(TRITON_BACKEND_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/backend repo") 60 | set(TRITON_CORE_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/core repo") 61 | set(TRITON_COMMON_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/common repo") 62 | 63 | if(NOT CMAKE_BUILD_TYPE) 64 | set(CMAKE_BUILD_TYPE Release) 65 | endif() 66 | 67 | set(TRITON_PYTORCH_DOCKER_BUILD OFF) 68 | if(TRITON_PYTORCH_LIB_PATHS STREQUAL "") 69 | if(TRITON_PYTORCH_DOCKER_IMAGE STREQUAL "") 70 | message(FATAL_ERROR "Using the PyTorch docker based build requires TRITON_PYTORCH_DOCKER_IMAGE") 71 | endif() 72 | set(TRITON_PYTORCH_DOCKER_BUILD ON) 73 | message(STATUS "Using PyTorch docker: ${TRITON_PYTORCH_DOCKER_IMAGE}") 74 | else() 75 | # Look for installed Torch-TRT package in lib paths 76 | if(TRITON_PYTORCH_ENABLE_TORCHTRT AND NOT EXISTS "${TRITON_PYTORCH_LIB_PATHS}/libtorchtrt_runtime.so") 77 | message(WARNING "TRITON_PYTORCH_ENABLE_TORCHTRT is on, but TRITON_PYTORCH_LIB_PATHS does not contain Torch-TRT package") 78 | endif() 79 | 80 | # Look for installed Torchvision package in lib paths 81 | find_library( LIBTORCHVISION libtorchvision.so libtorchvision.so.1 PATHS ${TRITON_PYTORCH_LIB_PATHS} ) 82 | if(NOT ${LIBTORCHVISION}) 83 | message(WARNING "TRITON_PYTORCH_ENABLE_TORCHVISION is on, but TRITON_PYTORCH_LIB_PATHS does not contain Torchvision package") 84 | endif(NOT ${LIBTORCHVISION}) 85 | endif() 86 | 87 | # Python.h needed by torch headers. 
88 | find_package(Python3 REQUIRED COMPONENTS Development.Module) 89 | 90 | set(RHEL_BUILD OFF) 91 | set(LIB_DIR "lib") 92 | set(LIBTORCH_LIBS_PATH "/usr/local/lib") 93 | set(PY_INSTALL_PATH "/usr/local/lib/python3.12/dist-packages") 94 | if(LINUX) 95 | file(STRINGS "/etc/os-release" DISTRO_ID_LIKE REGEX "ID_LIKE") 96 | if(${DISTRO_ID_LIKE} MATCHES "rhel|centos") 97 | set(RHEL_BUILD ON) 98 | set(LIB_DIR "lib64") 99 | set(PY_INSTALL_PATH "/opt/_internal/cpython-3.12.1/lib/python3.12/site-packages") 100 | if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86_64") 101 | set(LIBTORCH_LIBS_PATH "/opt/_internal/cpython-3.12.1/lib") 102 | endif(${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86_64") 103 | endif(${DISTRO_ID_LIKE} MATCHES "rhel|centos") 104 | endif(LINUX) 105 | 106 | # 107 | # Dependencies 108 | # 109 | # FetchContent's composability isn't very good. We must include the 110 | # transitive closure of all repos so that we can override the tag. 111 | # 112 | include(FetchContent) 113 | 114 | FetchContent_Declare( 115 | repo-common 116 | GIT_REPOSITORY ${TRITON_REPO_ORGANIZATION}/common.git 117 | GIT_TAG ${TRITON_COMMON_REPO_TAG} 118 | GIT_SHALLOW ON 119 | ) 120 | FetchContent_Declare( 121 | repo-core 122 | GIT_REPOSITORY ${TRITON_REPO_ORGANIZATION}/core.git 123 | GIT_TAG ${TRITON_CORE_REPO_TAG} 124 | GIT_SHALLOW ON 125 | ) 126 | FetchContent_Declare( 127 | repo-backend 128 | GIT_REPOSITORY ${TRITON_REPO_ORGANIZATION}/backend.git 129 | GIT_TAG ${TRITON_BACKEND_REPO_TAG} 130 | GIT_SHALLOW ON 131 | ) 132 | FetchContent_MakeAvailable(repo-common repo-core repo-backend) 133 | 134 | # 135 | # CUDA 136 | # 137 | if(${TRITON_ENABLE_GPU}) 138 | find_package(CUDAToolkit REQUIRED) 139 | else() 140 | if (${TRITON_PYTORCH_ENABLE_TORCHTRT}) 141 | message(FATAL_ERROR "TRITON_PYTORCH_ENABLE_TORCHTRT is ON when TRITON_ENABLE_GPU is OFF") 142 | endif() 143 | endif() # TRITON_ENABLE_GPU 144 | 145 | if(${TRITON_ENABLE_NVTX}) 146 | add_definitions(-DTRITON_ENABLE_NVTX=1) 147 | endif() # TRITON_ENABLE_NVTX 148 | 149 | # 150 | # Shared library implementing the Triton Backend API 151 | # 152 | configure_file(src/libtriton_pytorch.ldscript libtriton_pytorch.ldscript COPYONLY) 153 | 154 | set(PT_LIBS 155 | "libc10.so" 156 | "libc10_cuda.so" 157 | "libtorch.so" 158 | "libtorch_cpu.so" 159 | "libtorch_cuda.so" 160 | "libtorch_cuda_linalg.so" 161 | "libtorch_global_deps.so" 162 | "libjpeg.so.62" 163 | ) 164 | 165 | if (${TRITON_PYTORCH_ENABLE_TORCHVISION}) 166 | set(PT_LIBS 167 | ${PT_LIBS} 168 | $,libtorchvision.so,libtorchvision.so.1> 169 | ) 170 | endif() # TRITON_PYTORCH_ENABLE_TORCHVISION 171 | 172 | if (${TRITON_PYTORCH_ENABLE_TORCHTRT}) 173 | set(PT_LIBS 174 | ${PT_LIBS} 175 | "libtorchtrt_runtime.so" 176 | ) 177 | endif() # TRITON_PYTORCH_ENABLE_TORCHTRT 178 | 179 | if (CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "aarch64") 180 | set(LIBS_ARCH "aarch64") 181 | set(LIBTORCH_LIBS 182 | "libopenblas.so.0" 183 | "libnvpl_blas_core.so.0" 184 | "libnvpl_blas_ilp64_gomp.so.0" 185 | "libnvpl_blas_ilp64_seq.so.0" 186 | "libnvpl_blas_lp64_gomp.so.0" 187 | "libnvpl_blas_lp64_seq.so.0" 188 | "libnvpl_lapack_core.so.0" 189 | "libnvpl_lapack_ilp64_gomp.so.0" 190 | "libnvpl_lapack_ilp64_seq.so.0" 191 | "libnvpl_lapack_lp64_gomp.so.0" 192 | "libnvpl_lapack_lp64_seq.so.0" 193 | ) 194 | else() 195 | set(LIBS_ARCH "x86_64") 196 | set(LIBTORCH_LIBS 197 | "libmkl_avx2.so.1" 198 | "libmkl_avx512.so.1" 199 | "libmkl_core.so.1" 200 | "libmkl_def.so.1" 201 | "libmkl_gnu_thread.so.1" 202 | "libmkl_intel_lp64.so.1" 203 | "libmkl_intel_thread.so.1" 204 | 
"libmkl_rt.so.1" 205 | "libmkl_sequential.so.1" 206 | "libmkl_vml_def.so.1" 207 | ) 208 | endif() 209 | set(OPENCV_LIBS 210 | "libopencv_video.so" 211 | "libopencv_videoio.so" 212 | "libopencv_highgui.so" 213 | "libopencv_imgcodecs.so" 214 | "libopencv_imgproc.so" 215 | "libopencv_core.so" 216 | "libopencv_calib3d.so" 217 | "libopencv_flann.so" 218 | "libopencv_features2d.so" 219 | $,libjpeg.so.62,libjpeg.so> 220 | $,libpng16.so.16,libpng16.so> 221 | ) 222 | 223 | # The patchelf commands ensure the MKL libraries are loaded correctly during runtime 224 | # Without these, the framework/backend complains of missing libraries / symbols and 225 | # in some cases leads to segmentation faults. 226 | if (${TRITON_PYTORCH_DOCKER_BUILD}) 227 | string(REPLACE ";" " " LIBTORCH_LIBS_STR "${LIBTORCH_LIBS}") 228 | string(RANDOM 8 "abcdefghijklmnopqrstuvwxyz" random_id) 229 | 230 | add_custom_command( 231 | OUTPUT 232 | ${PT_LIBS} 233 | ${LIBTORCH_LIBS} 234 | ${OPENCV_LIBS} 235 | LICENSE.pytorch 236 | include/torch 237 | include/torchvision 238 | COMMAND ${CMAKE_COMMAND} -E make_directory "include/torchvision" 239 | COMMAND docker pull ${TRITON_PYTORCH_DOCKER_IMAGE} 240 | COMMAND docker rm pytorch_backend_ptlib || echo "error ignored..." || true 241 | COMMAND docker create --name pytorch_backend_ptlib ${TRITON_PYTORCH_DOCKER_IMAGE} 242 | COMMAND /bin/sh -c "for i in ${LIBTORCH_LIBS_STR} ; do echo copying $i && docker cp -L pytorch_backend_ptlib:${LIBTORCH_LIBS_PATH}/$i $i ; done" 243 | COMMAND docker cp pytorch_backend_ptlib:${PY_INSTALL_PATH}/torch/lib/libc10.so libc10.so 244 | COMMAND docker cp pytorch_backend_ptlib:${PY_INSTALL_PATH}/torch/lib/libc10_cuda.so libc10_cuda.so 245 | COMMAND docker cp pytorch_backend_ptlib:${PY_INSTALL_PATH}/torch/lib/libtorch.so libtorch.so 246 | COMMAND docker cp pytorch_backend_ptlib:${PY_INSTALL_PATH}/torch/lib/libtorch_cpu.so libtorch_cpu.so 247 | COMMAND docker cp pytorch_backend_ptlib:${PY_INSTALL_PATH}/torch/lib/libtorch_cuda.so libtorch_cuda.so 248 | COMMAND docker cp pytorch_backend_ptlib:${PY_INSTALL_PATH}/torch/lib/libtorch_cuda_linalg.so libtorch_cuda_linalg.so 249 | COMMAND docker cp pytorch_backend_ptlib:${PY_INSTALL_PATH}/torch/lib/libtorch_global_deps.so libtorch_global_deps.so 250 | COMMAND docker cp pytorch_backend_ptlib:${PY_INSTALL_PATH}/torch/lib/libcaffe2_nvrtc.so libcaffe2_nvrtc.so 251 | # TODO: Revisit when not needed by making it part of cuda base container. 252 | COMMAND docker cp -L pytorch_backend_ptlib:/usr/local/cuda/lib64/libcusparseLt.so libcusparseLt.so; 253 | COMMAND /bin/sh -c "if [ ${TRITON_PYTORCH_ENABLE_TORCHVISION} = 'ON' ]; then if [ ${RHEL_BUILD} = 'ON' ]; then docker cp -a -L pytorch_backend_ptlib:/usr/local/lib64/libtorchvision.so libtorchvision.so; else docker cp -a -L pytorch_backend_ptlib:/usr/local/${LIB_DIR}/libtorchvision.so.1 libtorchvision.so.1; fi; fi" 254 | COMMAND /bin/sh -c "if [ ${TRITON_PYTORCH_ENABLE_TORCHVISION} = 'ON' ]; then docker cp pytorch_backend_ptlib:/opt/pytorch/vision/torchvision/csrc include/torchvision/torchvision; fi" 255 | COMMAND /bin/sh -c "if [ ${TRITON_PYTORCH_ENABLE_TORCHTRT} = 'ON' ]; then docker cp pytorch_backend_ptlib:/usr/local/lib/python3.12/dist-packages/torch_tensorrt/lib/libtorchtrt_runtime.so libtorchtrt_runtime.so; fi" 256 | COMMAND docker cp pytorch_backend_ptlib:${PY_INSTALL_PATH}/torch_tensorrt/bin/torchtrtc torchtrtc || echo "error ignored..." 
|| true 257 | COMMAND docker cp pytorch_backend_ptlib:/opt/pytorch/pytorch/LICENSE LICENSE.pytorch 258 | COMMAND docker cp pytorch_backend_ptlib:${PY_INSTALL_PATH}/torch/include include/torch 259 | COMMAND docker cp pytorch_backend_ptlib:/opt/pytorch/pytorch/torch/csrc/jit/codegen include/torch/torch/csrc/jit/. 260 | COMMAND docker cp -L pytorch_backend_ptlib:/usr/local/${LIB_DIR}/libopencv_videoio.so libopencv_videoio.so 261 | COMMAND docker cp -L pytorch_backend_ptlib:/usr/local/${LIB_DIR}/libopencv_highgui.so libopencv_highgui.so 262 | COMMAND docker cp -L pytorch_backend_ptlib:/usr/local/${LIB_DIR}/libopencv_video.so libopencv_video.so 263 | COMMAND docker cp -L pytorch_backend_ptlib:/usr/local/${LIB_DIR}/libopencv_imgcodecs.so libopencv_imgcodecs.so 264 | COMMAND docker cp -L pytorch_backend_ptlib:/usr/local/${LIB_DIR}/libopencv_imgproc.so libopencv_imgproc.so 265 | COMMAND docker cp -L pytorch_backend_ptlib:/usr/local/${LIB_DIR}/libopencv_core.so libopencv_core.so 266 | COMMAND docker cp -L pytorch_backend_ptlib:/usr/local/${LIB_DIR}/libopencv_calib3d.so libopencv_calib3d.so 267 | COMMAND docker cp -L pytorch_backend_ptlib:/usr/local/${LIB_DIR}/libopencv_features2d.so libopencv_features2d.so 268 | COMMAND docker cp -L pytorch_backend_ptlib:/usr/local/${LIB_DIR}/libopencv_flann.so libopencv_flann.so 269 | COMMAND /bin/sh -c "if [ ${RHEL_BUILD} = 'ON' ]; then docker cp -L pytorch_backend_ptlib:/usr/lib64/libjpeg.so.62 libjpeg.so.62; else docker cp -L pytorch_backend_ptlib:/usr/local/lib/libjpeg.so.62 libjpeg.so.62 && docker cp pytorch_backend_ptlib:/usr/lib/${LIBS_ARCH}-linux-gnu/libjpeg.so.8.2.2 libjpeg.so; fi;" 270 | COMMAND /bin/sh -c "if [ ${RHEL_BUILD} = 'ON' ]; then docker cp -L pytorch_backend_ptlib:/usr/lib64/libpng16.so.16 libpng16.so.16; else docker cp -L pytorch_backend_ptlib:/usr/lib/${LIBS_ARCH}-linux-gnu/libpng16.so libpng16.so; fi;" 271 | COMMAND /bin/sh -c "if [ -f libmkl_def.so.1 ]; then patchelf --add-needed libmkl_gnu_thread.so.1 libmkl_def.so.1; fi" 272 | COMMAND /bin/sh -c "if [ -f libmkl_def.so.1 ]; then patchelf --add-needed libmkl_core.so.1 libmkl_def.so.1; fi" 273 | COMMAND /bin/sh -c "if [ -f libmkl_avx2.so.1 ]; then patchelf --add-needed libmkl_gnu_thread.so.1 libmkl_avx2.so.1; fi" 274 | COMMAND /bin/sh -c "if [ -f libmkl_avx2.so.1 ]; then patchelf --add-needed libmkl_core.so.1 libmkl_avx2.so.1; fi" 275 | COMMAND /bin/sh -c "if [ -f libmkl_avx512.so.1 ]; then patchelf --add-needed libmkl_gnu_thread.so.1 libmkl_avx512.so.1; fi" 276 | COMMAND /bin/sh -c "if [ -f libmkl_avx512.so.1 ]; then patchelf --add-needed libmkl_core.so.1 libmkl_avx512.so.1; fi" 277 | COMMAND /bin/sh -c "if [ -f libmkl_vml_def.so.1 ]; then patchelf --add-needed libmkl_gnu_thread.so.1 libmkl_vml_def.so.1; fi" 278 | COMMAND /bin/sh -c "if [ -f libmkl_vml_def.so.1 ]; then patchelf --add-needed libmkl_intel_thread.so.1 libmkl_vml_def.so.1; fi" 279 | COMMAND /bin/sh -c "if [ -f libmkl_vml_def.so.1 ]; then patchelf --add-needed libmkl_core.so.1 libmkl_vml_def.so.1; fi" 280 | COMMAND /bin/sh -c "if [ -f libmkl_intel_thread.so.1 ]; then patchelf --add-needed libmkl_intel_lp64.so.1 libmkl_intel_thread.so.1; fi" 281 | COMMAND /bin/sh -c "if [ ${TRITON_PYTORCH_ENABLE_TORCHVISION} = 'ON' ]; then if [ ${RHEL_BUILD} = 'OFF' ]; then ln -s libtorchvision.so.1 libtorchvision.so; fi; fi;" 282 | COMMAND docker rm pytorch_backend_ptlib 283 | COMMENT "Extracting pytorch and torchvision libraries and includes from ${TRITON_PYTORCH_DOCKER_IMAGE}" 284 | VERBATIM 285 | ) 286 | add_custom_target(ptlib_target 
DEPENDS ${PT_LIBS} ${LIBTORCH_LIBS} ${OPENCV_LIBS}) 287 | add_library(ptlib SHARED IMPORTED GLOBAL) 288 | add_dependencies(ptlib ptlib_target) 289 | 290 | # Just one of the libs are enough to ensure the docker build 291 | set_target_properties( 292 | ptlib 293 | PROPERTIES 294 | IMPORTED_LOCATION libtorch.so 295 | ) 296 | endif() # TRITON_PYTORCH_DOCKER_BUILD 297 | 298 | add_library( 299 | triton-pytorch-backend SHARED 300 | src/libtorch.cc 301 | src/libtorch_utils.cc 302 | src/libtorch_utils.h 303 | ) 304 | 305 | add_library( 306 | TritonPyTorchBackend::triton-pytorch-backend ALIAS triton-pytorch-backend 307 | ) 308 | 309 | target_include_directories( 310 | triton-pytorch-backend 311 | PRIVATE 312 | ${CMAKE_CURRENT_SOURCE_DIR}/src 313 | ${Python3_INCLUDE_DIRS} 314 | ) 315 | 316 | if (${TRITON_PYTORCH_DOCKER_BUILD}) 317 | target_include_directories( 318 | triton-pytorch-backend 319 | PRIVATE 320 | ${CMAKE_CURRENT_BINARY_DIR}/include/torch 321 | ${CMAKE_CURRENT_BINARY_DIR}/include/torch/torch/csrc/api/include 322 | ${CMAKE_CURRENT_BINARY_DIR}/include/torchvision 323 | ) 324 | else() 325 | target_include_directories( 326 | triton-pytorch-backend 327 | PRIVATE ${TRITON_PYTORCH_INCLUDE_PATHS} 328 | ) 329 | endif() # TRITON_PYTORCH_DOCKER_BUILD 330 | 331 | # Need to turn off -Werror due to Torchvision vision.h extern initialization 332 | # Unfortunately gcc does not provide a specific flag to ignore the specific 333 | # warning: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=45977 334 | target_compile_features(triton-pytorch-backend PRIVATE cxx_std_${TRITON_MIN_CXX_STANDARD}) 335 | target_compile_options( 336 | triton-pytorch-backend PRIVATE 337 | $<$,$,$>: 338 | -Wall -Wextra -Wno-unused-parameter -Wno-type-limits> 339 | ) 340 | 341 | if(${TRITON_ENABLE_GPU}) 342 | target_compile_definitions( 343 | triton-pytorch-backend 344 | PRIVATE TRITON_ENABLE_GPU=1 345 | ) 346 | endif() # TRITON_ENABLE_GPU 347 | 348 | set_target_properties( 349 | triton-pytorch-backend 350 | PROPERTIES 351 | POSITION_INDEPENDENT_CODE ON 352 | OUTPUT_NAME triton_pytorch 353 | SKIP_BUILD_RPATH TRUE 354 | BUILD_WITH_INSTALL_RPATH TRUE 355 | INSTALL_RPATH_USE_LINK_PATH FALSE 356 | INSTALL_RPATH "$\{ORIGIN\}" 357 | LINK_DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/libtriton_pytorch.ldscript 358 | LINK_FLAGS "-Wl,--no-as-needed,--version-script libtriton_pytorch.ldscript" 359 | ) 360 | 361 | # Need to turn off unused-but-set-variable due to Torchvision 362 | # Need to turn off unknown-pragmas due to ATen OpenMP 363 | set_target_properties( 364 | triton-pytorch-backend 365 | PROPERTIES COMPILE_FLAGS 366 | "-Wno-unknown-pragmas -Wno-unused-but-set-variable" 367 | ) 368 | 369 | if (${TRITON_PYTORCH_DOCKER_BUILD}) 370 | add_dependencies( 371 | triton-pytorch-backend 372 | ptlib 373 | ) 374 | endif() # TRITON_PYTORCH_DOCKER_BUILD 375 | 376 | message(STATUS "Torchvision support is ${TRITON_PYTORCH_ENABLE_TORCHVISION}") 377 | message(STATUS "Torch-TRT support is ${TRITON_PYTORCH_ENABLE_TORCHTRT}") 378 | 379 | set(TRITON_PYTORCH_LDFLAGS "") 380 | if (${TRITON_PYTORCH_DOCKER_BUILD}) 381 | set(TRITON_PYTORCH_LIBS "${CMAKE_CURRENT_BINARY_DIR}/libtorch.so") 382 | 383 | if (${TRITON_PYTORCH_ENABLE_TORCHVISION}) 384 | set(TRITON_PYTORCH_LIBS 385 | ${TRITON_PYTORCH_LIBS} 386 | "${CMAKE_CURRENT_BINARY_DIR}/$,libtorchvision.so,libtorchvision.so.1>") 387 | endif() # TRITON_PYTORCH_ENABLE_TORCHVISION 388 | 389 | if (${TRITON_PYTORCH_ENABLE_TORCHTRT}) 390 | set(TRITON_PYTORCH_LIBS 391 | ${TRITON_PYTORCH_LIBS} 392 | 
"${CMAKE_CURRENT_BINARY_DIR}/libtorchtrt_runtime.so") 393 | endif() # TRITON_PYTORCH_ENABLE_TORCHTRT 394 | else() 395 | set (TRITON_PYTORCH_LIBS "-ltorch") 396 | 397 | if (${TRITON_PYTORCH_ENABLE_TORCHVISION}) 398 | set(TRITON_PYTORCH_LIBS 399 | ${TRITON_PYTORCH_LIBS} 400 | "-ltorchvision" 401 | ) 402 | endif() # TRITON_PYTORCH_ENABLE_TORCHVISION 403 | 404 | if (${TRITON_PYTORCH_ENABLE_TORCHTRT}) 405 | set(TRITON_PYTORCH_LIBS 406 | ${TRITON_PYTORCH_LIBS} 407 | "-ltorchtrt_runtime" 408 | ) 409 | endif() # TRITON_PYTORCH_ENABLE_TORCHTRT 410 | 411 | FOREACH(p ${TRITON_PYTORCH_LIB_PATHS}) 412 | set(TRITON_PYTORCH_LDFLAGS ${TRITON_PYTORCH_LDFLAGS} "-L${p}") 413 | ENDFOREACH(p) 414 | endif() # TRITON_PYTORCH_DOCKER_BUILD 415 | 416 | target_link_libraries( 417 | triton-pytorch-backend 418 | PRIVATE 419 | triton-core-serverapi # from repo-core 420 | triton-core-backendapi # from repo-core 421 | triton-core-serverstub # from repo-core 422 | triton-backend-utils # from repo-backend 423 | ${TRITON_PYTORCH_LDFLAGS} 424 | ${TRITON_PYTORCH_LIBS} 425 | ) 426 | 427 | if(${TRITON_ENABLE_GPU}) 428 | target_link_libraries( 429 | triton-pytorch-backend 430 | PRIVATE 431 | CUDA::cudart 432 | ) 433 | endif() # TRITON_ENABLE_GPU 434 | 435 | # 436 | # Install 437 | # 438 | include(GNUInstallDirs) 439 | set(INSTALL_CONFIGDIR ${CMAKE_INSTALL_LIBDIR}/cmake/TritonPyTorchBackend) 440 | 441 | install( 442 | TARGETS 443 | triton-pytorch-backend 444 | EXPORT 445 | triton-pytorch-backend-targets 446 | LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/pytorch 447 | ARCHIVE DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/pytorch 448 | ) 449 | 450 | if (${TRITON_PYTORCH_DOCKER_BUILD}) 451 | set(PT_LIB_PATHS "") 452 | FOREACH(plib ${PT_LIBS} ${LIBTORCH_LIBS} ${OPENCV_LIBS}) 453 | set(PT_LIB_PATHS ${PT_LIB_PATHS} "${CMAKE_CURRENT_BINARY_DIR}/${plib}") 454 | ENDFOREACH(plib) 455 | 456 | install( 457 | FILES 458 | ${PT_LIB_PATHS} 459 | ${CMAKE_CURRENT_BINARY_DIR}/libcusparseLt.so 460 | ${CMAKE_CURRENT_BINARY_DIR}/LICENSE.pytorch 461 | DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/pytorch 462 | ) 463 | 464 | if (${TRITON_PYTORCH_ENABLE_TORCHTRT}) 465 | install( 466 | FILES 467 | ${CMAKE_CURRENT_BINARY_DIR}/torchtrtc 468 | DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/pytorch 469 | ) 470 | endif() # TRITON_PYTORCH_ENABLE_TORCHTRT 471 | 472 | FOREACH(plib ${PT_LIBS} ${LIBTORCH_LIBS} ${OPENCV_LIBS}) 473 | install( 474 | CODE 475 | "EXECUTE_PROCESS( 476 | COMMAND patchelf --set-rpath \$ORIGIN ${plib} 477 | RESULT_VARIABLE PATCHELF_STATUS 478 | WORKING_DIRECTORY ${CMAKE_INSTALL_PREFIX}/backends/pytorch) 479 | if(PATCHELF_STATUS AND NOT PATCHELF_STATUS EQUAL 0) 480 | message(FATAL_ERROR \"FAILED: to run patchelf\") 481 | endif()" 482 | ) 483 | ENDFOREACH(plib) 484 | 485 | set(OPENCV_VERSION "406") 486 | install( 487 | CODE 488 | "EXECUTE_PROCESS( 489 | COMMAND ln -sf libopencv_video.so libopencv_video.so.${OPENCV_VERSION} 490 | COMMAND ln -sf libopencv_videoio.so libopencv_videoio.so.${OPENCV_VERSION} 491 | COMMAND ln -sf libopencv_highgui.so libopencv_highgui.so.${OPENCV_VERSION} 492 | COMMAND ln -sf libopencv_imgcodecs.so libopencv_imgcodecs.so.${OPENCV_VERSION} 493 | COMMAND ln -sf libopencv_imgproc.so libopencv_imgproc.so.${OPENCV_VERSION} 494 | COMMAND ln -sf libopencv_core.so libopencv_core.so.${OPENCV_VERSION} 495 | COMMAND ln -sf libopencv_calib3d.so libopencv_calib3d.so.${OPENCV_VERSION} 496 | COMMAND ln -sf libopencv_features2d.so libopencv_features2d.so.${OPENCV_VERSION} 497 | COMMAND ln -sf libopencv_flann.so 
libopencv_flann.so.${OPENCV_VERSION} 498 | COMMAND ln -sf libpng16.so libpng16.so.16 499 | COMMAND ln -sf libjpeg.so libjpeg.so.8 500 | COMMAND ln -sf libcusparseLt.so libcusparseLt.so.0 501 | RESULT_VARIABLE LINK_STATUS 502 | WORKING_DIRECTORY ${CMAKE_INSTALL_PREFIX}/backends/pytorch) 503 | if(LINK_STATUS AND NOT LINK_STATUS EQUAL 0) 504 | message(FATAL_ERROR \"FAILED: to create links\") 505 | endif()" 506 | ) 507 | else() 508 | FOREACH(plib ${PT_LIBS}) 509 | set(PT_LIB_PATHS ${PT_LIB_PATHS} "${TRITON_PYTORCH_LIB_PATHS}/${plib}") 510 | ENDFOREACH(plib) 511 | 512 | install( 513 | FILES 514 | ${PT_LIB_PATHS} 515 | DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/pytorch 516 | ) 517 | 518 | FOREACH(plib ${PT_LIBS}) 519 | install( 520 | CODE 521 | "EXECUTE_PROCESS( 522 | COMMAND patchelf --set-rpath \$ORIGIN ${plib} 523 | RESULT_VARIABLE PATCHELF_STATUS 524 | WORKING_DIRECTORY ${CMAKE_INSTALL_PREFIX}/backends/pytorch) 525 | if(PATCHELF_STATUS AND NOT PATCHELF_STATUS EQUAL 0) 526 | message(FATAL_ERROR \"FAILED: to run patchelf\") 527 | endif()" 528 | ) 529 | ENDFOREACH(plib) 530 | endif() # TRITON_PYTORCH_DOCKER_BUILD 531 | 532 | install( 533 | EXPORT 534 | triton-pytorch-backend-targets 535 | FILE 536 | TritonPyTorchBackendTargets.cmake 537 | NAMESPACE 538 | TritonPyTorchBackend:: 539 | DESTINATION 540 | ${INSTALL_CONFIGDIR} 541 | ) 542 | 543 | install( 544 | FILES 545 | src/model.py 546 | DESTINATION 547 | ${CMAKE_INSTALL_PREFIX}/backends/pytorch 548 | ) 549 | 550 | include(CMakePackageConfigHelpers) 551 | configure_package_config_file( 552 | ${CMAKE_CURRENT_LIST_DIR}/cmake/TritonPyTorchBackendConfig.cmake.in 553 | ${CMAKE_CURRENT_BINARY_DIR}/TritonPyTorchBackendConfig.cmake 554 | INSTALL_DESTINATION ${INSTALL_CONFIGDIR} 555 | ) 556 | 557 | install( 558 | FILES 559 | ${CMAKE_CURRENT_BINARY_DIR}/TritonPyTorchBackendConfig.cmake 560 | DESTINATION ${INSTALL_CONFIGDIR} 561 | ) 562 | 563 | # 564 | # Export from build tree 565 | # 566 | export( 567 | EXPORT triton-pytorch-backend-targets 568 | FILE ${CMAKE_CURRENT_BINARY_DIR}/TritonPyTorchBackendTargets.cmake 569 | NAMESPACE TritonPyTorchBackend:: 570 | ) 571 | 572 | export(PACKAGE TritonPyTorchBackend) 573 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 2 | 3 | Redistribution and use in source and binary forms, with or without 4 | modification, are permitted provided that the following conditions 5 | are met: 6 | * Redistributions of source code must retain the above copyright 7 | notice, this list of conditions and the following disclaimer. 8 | * Redistributions in binary form must reproduce the above copyright 9 | notice, this list of conditions and the following disclaimer in the 10 | documentation and/or other materials provided with the distribution. 11 | * Neither the name of NVIDIA CORPORATION nor the names of its 12 | contributors may be used to endorse or promote products derived 13 | from this software without specific prior written permission. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 28 | 29 | [![License](https://img.shields.io/badge/License-BSD3-lightgrey.svg)](https://opensource.org/licenses/BSD-3-Clause) 30 | 31 | # PyTorch (LibTorch) Backend 32 | 33 | The Triton backend for [PyTorch](https://github.com/pytorch/pytorch). 34 | You can learn more about Triton backends in the [backend 35 | repo](https://github.com/triton-inference-server/backend). Ask 36 | questions or report problems on the [issues 37 | page](https://github.com/triton-inference-server/server/issues). 38 | This backend is designed to run [TorchScript](https://pytorch.org/docs/stable/jit.html) 39 | models using the PyTorch C++ API. All models created in PyTorch 40 | using the python API must be traced/scripted to produce a TorchScript 41 | model. 42 | 43 | Where can I ask general questions about Triton and Triton backends? 44 | Be sure to read all the information below as well as the [general 45 | Triton documentation](https://github.com/triton-inference-server/server#triton-inference-server) 46 | available in the main [server](https://github.com/triton-inference-server/server) 47 | repo. If you don't find your answer there you can ask questions on the 48 | main Triton [issues page](https://github.com/triton-inference-server/server/issues). 49 | 50 | ## Build the PyTorch Backend 51 | 52 | Use a recent cmake to build. First install the required dependencies. 53 | 54 | ``` 55 | $ apt-get install rapidjson-dev python3-dev python3-pip 56 | $ pip3 install patchelf==0.17.2 57 | ``` 58 | 59 | An appropriate PyTorch container from [NGC](https://ngc.nvidia.com) must be used. 60 | For example, to build a backend that uses the 23.04 version of the PyTorch 61 | container from NGC: 62 | 63 | ``` 64 | $ mkdir build 65 | $ cd build 66 | $ cmake -DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install -DTRITON_PYTORCH_DOCKER_IMAGE="nvcr.io/nvidia/pytorch:23.04-py3" .. 67 | $ make install 68 | ``` 69 | 70 | The following required Triton repositories will be pulled and used in 71 | the build. By default, the "main" branch/tag will be used for each repo 72 | but the listed CMake argument can be used to override. 73 | 74 | * triton-inference-server/backend: -DTRITON_BACKEND_REPO_TAG=[tag] 75 | * triton-inference-server/core: -DTRITON_CORE_REPO_TAG=[tag] 76 | * triton-inference-server/common: -DTRITON_COMMON_REPO_TAG=[tag] 77 | 78 | ## Build the PyTorch Backend With Custom PyTorch 79 | 80 | Currently, Triton requires that a specially patched version of 81 | PyTorch be used with the PyTorch backend. The full source for 82 | these PyTorch versions are available as Docker images from 83 | [NGC](https://ngc.nvidia.com). For example, the PyTorch version 84 | compatible with the 22.12 release of Triton is available as 85 | nvcr.io/nvidia/pytorch:22.12-py3. 
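As described below, the LibTorch and Torchvision headers and libraries need to
be copied out of this container into local directories. A minimal illustrative
sketch of one way to do that is shown here; the container paths (notably the
Python `dist-packages` location) vary between releases, and the authoritative
list of files is the set of `docker cp` commands in this repository's
`CMakeLists.txt`.

```
$ docker create --name pt_src nvcr.io/nvidia/pytorch:22.12-py3
$ # Adjust the Python version in this path to match the container.
$ PYPKG=/usr/local/lib/python3.8/dist-packages
$ mkdir -p torchvision
$ docker cp pt_src:${PYPKG}/torch/include ./torch
$ docker cp pt_src:${PYPKG}/torch/lib ./torch/lib
$ docker cp pt_src:/opt/pytorch/vision/torchvision/csrc ./torchvision/torchvision
$ docker rm pt_src
```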
86 | 87 | Copy over the LibTorch and Torchvision headers and libraries from the 88 | [PyTorch NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) 89 | into local directories. You can see which headers and libraries 90 | are needed/copied from the docker. 91 | 92 | ``` 93 | $ mkdir build 94 | $ cd build 95 | $ cmake -DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install -DTRITON_PYTORCH_INCLUDE_PATHS="/torch;/torch/torch/csrc/api/include;/torchvision" -DTRITON_PYTORCH_LIB_PATHS="" .. 96 | $ make install 97 | ``` 98 | 99 | ## Using the PyTorch Backend 100 | 101 | ### Parameters 102 | 103 | Triton exposes some flags to control the execution mode of the TorchScript models through 104 | the Parameters section of the model's `config.pbtxt` file. 105 | 106 | * `DISABLE_OPTIMIZED_EXECUTION`: Boolean flag to disable the optimized execution 107 | of TorchScript models. By default, the optimized execution is always enabled. 108 | 109 | The initial calls to a loaded TorchScript model take extremely long. Due to this longer 110 | model warmup [issue](https://github.com/pytorch/pytorch/issues/57894), Triton also allows 111 | execution of models without these optimizations. In some models, optimized execution 112 | does not benefit performance as seen [here](https://github.com/pytorch/pytorch/issues/19978) 113 | and in other cases impacts performance negatively, as seen [here](https://github.com/pytorch/pytorch/issues/53824). 114 | 115 | The section of model config file specifying this parameter will look like: 116 | 117 | ``` 118 | parameters: { 119 | key: "DISABLE_OPTIMIZED_EXECUTION" 120 | value: { 121 | string_value: "true" 122 | } 123 | } 124 | ``` 125 | 126 | * `INFERENCE_MODE`: Boolean flag to enable the Inference Mode execution 127 | of TorchScript models. By default, the inference mode is enabled. 128 | 129 | [InferenceMode](https://pytorch.org/cppdocs/notes/inference_mode.html) is a new 130 | RAII guard analogous to NoGradMode to be used when you are certain your operations 131 | will have no interactions with autograd. Compared to NoGradMode, code run under 132 | this mode gets better performance by disabling autograd. 133 | 134 | Please note that in some models, InferenceMode might not benefit performance 135 | and in fewer cases might impact performance negatively. 136 | 137 | The section of model config file specifying this parameter will look like: 138 | 139 | ``` 140 | parameters: { 141 | key: "INFERENCE_MODE" 142 | value: { 143 | string_value: "true" 144 | } 145 | } 146 | ``` 147 | 148 | * `DISABLE_CUDNN`: Boolean flag to disable the cuDNN library. By default, cuDNN is enabled. 149 | 150 | [cuDNN](https://developer.nvidia.com/cudnn) is a GPU-accelerated library of primitives for 151 | deep neural networks. cuDNN provides highly tuned implementations for standard routines. 152 | 153 | Typically, models run with cuDNN enabled are faster. However there are some exceptions 154 | where using cuDNN can be slower, cause higher memory usage or result in errors. 155 | 156 | 157 | The section of model config file specifying this parameter will look like: 158 | 159 | ``` 160 | parameters: { 161 | key: "DISABLE_CUDNN" 162 | value: { 163 | string_value: "true" 164 | } 165 | } 166 | ``` 167 | 168 | * `ENABLE_WEIGHT_SHARING`: Boolean flag to enable model instances on the same device to 169 | share weights. This optimization should not be used with stateful models. If not specified, 170 | weight sharing is disabled. 
171 | 172 | The section of model config file specifying this parameter will look like: 173 | 174 | ``` 175 | parameters: { 176 | key: "ENABLE_WEIGHT_SHARING" 177 | value: { 178 | string_value: "true" 179 | } 180 | } 181 | ``` 182 | 183 | * `ENABLE_CACHE_CLEANING`: Boolean flag to enable CUDA cache cleaning after each model execution. 184 | If not specified, cache cleaning is disabled. This flag has no effect if model is on CPU. 185 | Setting this flag to true will negatively impact the performance due to additional CUDA cache 186 | cleaning operation after each model execution. Therefore, you should only use this flag if you 187 | serve multiple models with Triton and encounter CUDA out of memory issue during model executions. 188 | 189 | The section of model config file specifying this parameter will look like: 190 | 191 | ``` 192 | parameters: { 193 | key: "ENABLE_CACHE_CLEANING" 194 | value: { 195 | string_value:"true" 196 | } 197 | } 198 | ``` 199 | 200 | * `INTER_OP_THREAD_COUNT`: 201 | 202 | PyTorch allows using multiple CPU threads during TorchScript model inference. 203 | One or more inference threads execute a model’s forward pass on the given 204 | inputs. Each inference thread invokes a JIT interpreter that executes the ops 205 | of a model inline, one by one. This parameter sets the size of this thread 206 | pool. The default value of this setting is the number of cpu cores. Please refer 207 | to [this](https://pytorch.org/docs/stable/notes/cpu_threading_torchscript_inference.html) 208 | document on how to set this parameter properly. 209 | 210 | The section of model config file specifying this parameter will look like: 211 | 212 | ``` 213 | parameters: { 214 | key: "INTER_OP_THREAD_COUNT" 215 | value: { 216 | string_value:"1" 217 | } 218 | } 219 | ``` 220 | 221 | * `INTRA_OP_THREAD_COUNT`: 222 | 223 | In addition to the inter-op parallelism, PyTorch can also utilize multiple threads 224 | within the ops (intra-op parallelism). This can be useful in many cases, including 225 | element-wise ops on large tensors, convolutions, GEMMs, embedding lookups and 226 | others. The default value for this setting is the number of CPU cores. Please refer 227 | to [this](https://pytorch.org/docs/stable/notes/cpu_threading_torchscript_inference.html) 228 | document on how to set this parameter properly. 229 | 230 | The section of model config file specifying this parameter will look like: 231 | 232 | ``` 233 | parameters: { 234 | key: "INTRA_OP_THREAD_COUNT" 235 | value: { 236 | string_value:"1" 237 | } 238 | } 239 | ``` 240 | 241 | * Additional Optimizations: Three additional boolean parameters are available to disable 242 | certain Torch optimizations that can sometimes cause latency regressions in models with 243 | complex execution modes and dynamic shapes. If not specified, all are enabled by default. 244 | 245 | `ENABLE_JIT_EXECUTOR` 246 | 247 | `ENABLE_JIT_PROFILING` 248 | 249 | ### Support 250 | 251 | #### Model Instance Group Kind 252 | 253 | The PyTorch backend supports the following kinds of 254 | [Model Instance Groups](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#instance-groups) 255 | where the input tensors are placed as follows: 256 | 257 | * `KIND_GPU`: Inputs are prepared on the GPU device associated with the model 258 | instance. 259 | 260 | * `KIND_CPU`: Inputs are prepared on the CPU. 261 | 262 | * `KIND_MODEL`: Inputs are prepared on the CPU. 
When loading the model, the 263 | backend does not choose the GPU device for the model; instead, it respects the 264 | device(s) specified in the model and uses them as they are during inference. 265 | This is useful when the model internally utilizes multiple GPUs, as demonstrated 266 | in this 267 | [example model](https://github.com/triton-inference-server/server/blob/main/qa/L0_libtorch_instance_group_kind_model/gen_models.py). 268 | If no device is specified in the model, the backend uses the first available 269 | GPU device. This feature is available starting in the 23.06 release. 270 | 271 | ### Important Notes 272 | 273 | * The execution of a PyTorch model on the GPU is asynchronous in nature. See 274 | [here](https://pytorch.org/docs/stable/notes/cuda.html#asynchronous-execution) 275 | for more details. Consequently, an error in PyTorch model execution may 276 | be raised during the next few inference requests to the server. Setting the 277 | environment variable `CUDA_LAUNCH_BLOCKING=1` when launching the server will 278 | help in correctly debugging failing cases by forcing synchronous execution. 279 | * The PyTorch model in such cases may or may not recover from the failed 280 | state, and a restart of the server may be required to continue serving 281 | successfully. 282 | 283 | * PyTorch does not support a Tensor of Strings, but it does support models that 284 | accept a List of Strings as input(s) / produce a List of Strings as output(s). 285 | For these models, Triton allows users to pass String input(s)/receive String 286 | output(s) using the String datatype. As a limitation of using a List instead of 287 | a Tensor for String I/O, only 1-dimensional input(s)/output(s) are supported 288 | for I/O of String type. 289 | 290 | * In a multi-GPU environment, a potential runtime issue can occur when using 291 | [Tracing](https://pytorch.org/docs/stable/generated/torch.jit.trace.html) 292 | to generate a 293 | [TorchScript](https://pytorch.org/docs/stable/jit.html) model. This issue 294 | arises due to a device mismatch between the model instance and the tensor. By 295 | default, Triton creates a single execution instance of the model for each 296 | available GPU. The runtime error occurs when a request is sent to a model 297 | instance with a different GPU device from the one used during the TorchScript 298 | generation process. To address this problem, it is highly recommended to use 299 | [Scripting](https://pytorch.org/docs/stable/generated/torch.jit.script.html#torch.jit.script) 300 | instead of Tracing for model generation in a multi-GPU environment. Scripting 301 | avoids the device mismatch issue and ensures compatibility with different GPUs 302 | when used with Triton. However, if using Tracing is unavoidable, there is a 303 | workaround available: you can explicitly specify the GPU device for the model 304 | instance in the 305 | [model configuration](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#instance-groups) 306 | to ensure that the model instance and the tensors used for inference are 307 | assigned to the same GPU device on which the model was traced. 308 | 309 | # PyTorch 2.0 Backend \[Experimental\] 310 | 311 | > [!WARNING] 312 | > *This feature is subject to change and removal.* 313 | 314 | Starting from 24.01, PyTorch models can be served directly via the 315 | [Python runtime](src/model.py). By default, Triton will use the 316 | [LibTorch runtime](#pytorch-libtorch-backend) for PyTorch models.
To use Python 317 | runtime, provide the following 318 | [runtime setting](https://github.com/triton-inference-server/backend/blob/main/README.md#backend-shared-library) 319 | in the model configuration: 320 | 321 | ``` 322 | runtime: "model.py" 323 | ``` 324 | 325 | ## Dependencies 326 | 327 | ### Python backend dependency 328 | 329 | This feature depends on 330 | [Python backend](https://github.com/triton-inference-server/python_backend), 331 | see 332 | [Python-based Backends](https://github.com/triton-inference-server/backend/blob/main/docs/python_based_backends.md) 333 | for more details. 334 | 335 | ### PyTorch dependency 336 | 337 | This feature will take advantage of the 338 | [`torch.compile`](https://pytorch.org/docs/stable/generated/torch.compile.html#torch-compile) 339 | optimization, make sure the 340 | [PyTorch 2.0+ pip package](https://pypi.org/project/torch) is available in the 341 | same Python environment. 342 | 343 | Alternatively, a [Python Execution Environment](#using-custom-python-execution-environments) 344 | with the PyTorch dependency may be used. It can be created with the 345 | [provided script](tools/gen_pb_exec_env.sh). The resulting 346 | `pb_exec_env_model.py.tar.gz` file should be placed at the same 347 | [backend shared library](https://github.com/triton-inference-server/backend/blob/main/README.md#backend-shared-library) 348 | directory as the [Python runtime](src/model.py). 349 | 350 | ## Model Layout 351 | 352 | ### PyTorch 2.0 models 353 | 354 | The model repository should look like: 355 | 356 | ``` 357 | model_repository/ 358 | `-- model_directory 359 | |-- 1 360 | | |-- model.py 361 | | `-- [model.pt] 362 | `-- config.pbtxt 363 | ``` 364 | 365 | The `model.py` contains the class definition of the PyTorch model. The class 366 | should extend the 367 | [`torch.nn.Module`](https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module). 368 | The `model.pt` may be optionally provided which contains the saved 369 | [`state_dict`](https://pytorch.org/tutorials/beginner/saving_loading_models.html#saving-loading-model-for-inference) 370 | of the model. 371 | 372 | ### TorchScript models 373 | 374 | The model repository should look like: 375 | 376 | ``` 377 | model_repository/ 378 | `-- model_directory 379 | |-- 1 380 | | `-- model.pt 381 | `-- config.pbtxt 382 | ``` 383 | 384 | The `model.pt` is the TorchScript model file. 385 | 386 | ## Customization 387 | 388 | The following PyTorch settings may be customized by setting parameters on the 389 | `config.pbtxt`. 390 | 391 | [`torch.set_num_threads(int)`](https://pytorch.org/docs/stable/generated/torch.set_num_threads.html#torch.set_num_threads) 392 | - Key: NUM_THREADS 393 | - Value: The number of threads used for intraop parallelism on CPU. 394 | 395 | [`torch.set_num_interop_threads(int)`](https://pytorch.org/docs/stable/generated/torch.set_num_interop_threads.html#torch.set_num_interop_threads) 396 | - Key: NUM_INTEROP_THREADS 397 | - Value: The number of threads used for interop parallelism (e.g. in JIT 398 | interpreter) on CPU. 399 | 400 | [`torch.compile()` parameters](https://pytorch.org/docs/stable/generated/torch.compile.html#torch-compile) 401 | - Key: TORCH_COMPILE_OPTIONAL_PARAMETERS 402 | - Value: Any of following parameter(s) encoded as a JSON object. 403 | - fullgraph (*bool*): Whether it is ok to break model into several subgraphs. 404 | - dynamic (*bool*): Use dynamic shape tracing. 405 | - backend (*str*): The backend to be used. 
406 | - mode (*str*): Can be either "default", "reduce-overhead" or "max-autotune". 407 | - options (*dict*): A dictionary of options to pass to the backend. 408 | - disable (*bool*): Turn `torch.compile()` into a no-op for testing. 409 | 410 | For example: 411 | ``` 412 | parameters: { 413 | key: "NUM_THREADS" 414 | value: { string_value: "4" } 415 | } 416 | parameters: { 417 | key: "TORCH_COMPILE_OPTIONAL_PARAMETERS" 418 | value: { string_value: "{\"disable\": true}" } 419 | } 420 | ``` 421 | 422 | ## Limitations 423 | 424 | The following are a few known limitations of this feature: 425 | - Python functions optimizable by `torch.compile` may not be served directly in 426 | the `model.py` file; they need to be enclosed by a class extending the 427 | [`torch.nn.Module`](https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module). 428 | - Model weights cannot be shared across multiple instances on the same GPU 429 | device. 430 | - When using `KIND_MODEL` as the model instance kind, the default device of the 431 | first parameter on the model is used. 432 | -------------------------------------------------------------------------------- /cmake/TritonPyTorchBackendConfig.cmake.in: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
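# This package configuration is consumed via find_package(TritonPyTorchBackend)
# after installation. Illustrative downstream usage (the consuming target name
# is hypothetical):
#
#   find_package(TritonPyTorchBackend REQUIRED)
#   target_link_libraries(my_consumer PRIVATE
#     TritonPyTorchBackend::triton-pytorch-backend)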
26 | 27 | include(CMakeFindDependencyMacro) 28 | 29 | get_filename_component( 30 | TRITONPYTORCHBACKEND_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH 31 | ) 32 | 33 | list(APPEND CMAKE_MODULE_PATH ${TRITONPYTORCHBACKEND_CMAKE_DIR}) 34 | 35 | if(NOT TARGET TritonPyTorchBackend::triton-pytorch-backend) 36 | include("${TRITONPYTORCHBACKEND_CMAKE_DIR}/TritonPyTorchBackendTargets.cmake") 37 | endif() 38 | 39 | set(TRITONPYTORCHBACKEND_LIBRARIES TritonPyTorchBackend::triton-pytorch-backend) 40 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | # Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | [tool.codespell] 28 | # note: pre-commit passes explicit lists of files here, which this skip file list doesn't override - 29 | # this is only to allow you to run codespell interactively 30 | skip = "./.git,./.github" 31 | # ignore short words, and typename parameters like OffsetT 32 | ignore-regex = "\\b(.{1,4}|[A-Z]\\w*T)\\b" 33 | # use the 'clear' dictionary for unambiguous spelling mistakes 34 | builtin = "clear" 35 | # disable warnings about binary files and wrong encoding 36 | quiet-level = 3 37 | 38 | [tool.isort] 39 | profile = "black" 40 | use_parentheses = true 41 | multi_line_output = 3 42 | include_trailing_comma = true 43 | force_grid_wrap = 0 44 | ensure_newline_before_comments = true 45 | line_length = 88 46 | balanced_wrapping = true 47 | indent = " " 48 | skip = ["build"] 49 | 50 | -------------------------------------------------------------------------------- /src/libtorch.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2019-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
2 | // 3 | // Redistribution and use in source and binary forms, with or without 4 | // modification, are permitted provided that the following conditions 5 | // are met: 6 | // * Redistributions of source code must retain the above copyright 7 | // notice, this list of conditions and the following disclaimer. 8 | // * Redistributions in binary form must reproduce the above copyright 9 | // notice, this list of conditions and the following disclaimer in the 10 | // documentation and/or other materials provided with the distribution. 11 | // * Neither the name of NVIDIA CORPORATION nor the names of its 12 | // contributors may be used to endorse or promote products derived 13 | // from this software without specific prior written permission. 14 | // 15 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | #include 28 | 29 | #include 30 | #include 31 | 32 | #include "libtorch_utils.h" 33 | #include "triton/backend/backend_common.h" 34 | #include "triton/backend/backend_input_collector.h" 35 | #include "triton/backend/backend_memory.h" 36 | #include "triton/backend/backend_model.h" 37 | #include "triton/backend/backend_model_instance.h" 38 | #include "triton/backend/backend_output_responder.h" 39 | #include "triton/common/nvtx.h" 40 | #include "triton/core/tritonbackend.h" 41 | 42 | #ifdef TRITON_PYTORCH_ENABLE_TORCHVISION 43 | // Suppress warnings in torch headers 44 | #pragma GCC diagnostic push 45 | #pragma GCC diagnostic ignored "-Wsign-compare" 46 | #pragma warning(push, 0) 47 | #include 48 | #include // Torchvision header 49 | #pragma warning(pop) 50 | #pragma GCC diagnostic pop 51 | #endif // TRITON_PYTORCH_ENABLE_TORCHVISION 52 | 53 | #ifdef TRITON_ENABLE_GPU 54 | #include 55 | #include 56 | #include 57 | #endif // TRITON_ENABLE_GPU 58 | 59 | // for thread control 60 | // https://pytorch.org/docs/stable/notes/cpu_threading_torchscript_inference.html#runtime-api 61 | // https://github.com/pytorch/pytorch/blob/v2.2.1-rc3/aten/src/ATen/Parallel.h#L133 62 | #include 63 | 64 | 65 | // 66 | // PyTorch C++ (LibTorch) Backend that implements the TRITONBACKEND API. 67 | // 68 | 69 | namespace triton { namespace backend { namespace pytorch { 70 | 71 | // 72 | // ModelState 73 | // 74 | // State associated with a model that is using this backend. An object 75 | // of this class is created and associated with each 76 | // TRITONBACKEND_Model. 77 | // 78 | class ModelState : public BackendModel { 79 | public: 80 | static TRITONSERVER_Error* Create( 81 | TRITONBACKEND_Model* triton_model, ModelState** state); 82 | virtual ~ModelState() = default; 83 | 84 | // Load a TorchScript model using 'artifact_name' as the name for the 85 | // TorchScript file. 
Return in 'model_path' the full path to the
86 |   // TorchScript file, return in 'torch_model' the Torch Module
87 |   // representing the model.
88 |   TRITONSERVER_Error* LoadModel(
89 |       const std::string& artifact_name, const torch::Device device,
90 |       std::string* model_path, const TRITONSERVER_InstanceGroupKind& kind,
91 |       std::shared_ptr<torch::jit::Module>* torch_model);
92 | 
93 |   bool EnabledOptimizedExecution() { return enable_optimized_execution_; }
94 |   const std::pair<bool, bool>& EnabledTensorExprFuser() const
95 |   {
96 |     return enable_tensor_fuser_pair_;
97 |   }
98 |   const std::pair<bool, bool>& EnabledJitProfiling() const
99 |   {
100 |     return enable_jit_profiling_pair_;
101 |   }
102 |   const std::pair<bool, bool>& EnabledJitExecutor() const
103 |   {
104 |     return enable_jit_executor_pair_;
105 |   }
106 |   bool EnabledInferenceMode() { return enable_inference_mode_; }
107 |   bool EnabledCudnn() { return enable_cudnn_; }
108 |   bool EnabledCacheCleaning() { return enable_cache_cleaning_; }
109 | 
110 |   bool EnabledWeightSharing() { return enable_weight_sharing_; }
111 |   const std::map<std::string, std::pair<int64_t, int64_t>>& ModelOutputs()
112 |   {
113 |     return model_outputs_;
114 |   }
115 | 
116 |  private:
117 |   ModelState(TRITONBACKEND_Model* triton_model);
118 |   TRITONSERVER_Error* AutoCompleteConfig();
119 | 
120 |   // Parses and validates parameters in config
121 |   TRITONSERVER_Error* ParseParameters();
122 | 
123 |   // Flag to indicate whether optimized execution is enabled. Defaults to true.
124 |   bool enable_optimized_execution_;
125 | 
126 |   // Flag to indicate whether inference mode is enabled. Defaults to true.
127 |   bool enable_inference_mode_;
128 | 
129 |   // Flag to indicate whether cudnn is enabled. Defaults to true.
130 |   bool enable_cudnn_;
131 | 
132 |   // Flag to indicate whether cache cleaning after each run is enabled.
133 |   // Defaults to false.
134 |   bool enable_cache_cleaning_;
135 | 
136 |   // Flag to indicate whether weight sharing is enabled. Defaults to false.
137 |   bool enable_weight_sharing_;
138 | 
139 |   // Flag pairs to indicate if various JIT settings are set and
140 |   // enabled respectively. Defaults to (false, true). Default behavior
141 |   // is to do nothing if not explicitly set.
142 |   std::pair<bool, bool> enable_tensor_fuser_pair_;
143 |   std::pair<bool, bool> enable_jit_profiling_pair_;
144 |   std::pair<bool, bool> enable_jit_executor_pair_;
145 | 
146 |   // Model mapping for shared TorchScript model across all instances on the
147 |   // same device. The key is a pair of isGPU and device index.
148 |   std::map<
149 |       std::pair<bool, int64_t>, std::shared_ptr<torch::jit::Module>>
150 |       torch_models_;
151 | 
152 |   // model_outputs is a map that contains unique outputs that the model must
153 |   // provide. The first element of the mapped pair is the model output index
154 |   // and the second is the index in the model state; -1 is used if one is not
155 |   // required. In the model configuration, the 'state' section can intersect
156 |   // with the 'output' section of the model. If an output is specified in both
157 |   // the output section and the state section, it indicates that the backend
158 |   // must return the output state to the client too.
159 | std::map> model_outputs_; 160 | }; 161 | 162 | TRITONSERVER_Error* 163 | ModelState::Create(TRITONBACKEND_Model* triton_model, ModelState** state) 164 | { 165 | try { 166 | *state = new ModelState(triton_model); 167 | } 168 | catch (const BackendModelException& ex) { 169 | RETURN_ERROR_IF_TRUE( 170 | ex.err_ == nullptr, TRITONSERVER_ERROR_INTERNAL, 171 | std::string("unexpected nullptr in BackendModelException")); 172 | RETURN_IF_ERROR(ex.err_); 173 | } 174 | 175 | // Auto-complete the configuration if requested... 176 | bool auto_complete_config = false; 177 | RETURN_IF_ERROR(TRITONBACKEND_ModelAutoCompleteConfig( 178 | triton_model, &auto_complete_config)); 179 | if (auto_complete_config) { 180 | RETURN_IF_ERROR((*state)->AutoCompleteConfig()); 181 | RETURN_IF_ERROR((*state)->SetModelConfig()); 182 | } 183 | 184 | auto& model_outputs = (*state)->model_outputs_; 185 | // Parse the output states in the model configuration 186 | triton::common::TritonJson::Value sequence_batching; 187 | if ((*state)->ModelConfig().Find("sequence_batching", &sequence_batching)) { 188 | triton::common::TritonJson::Value states; 189 | if (sequence_batching.Find("state", &states)) { 190 | for (size_t i = 0; i < states.ArraySize(); i++) { 191 | triton::common::TritonJson::Value state; 192 | RETURN_IF_ERROR(states.IndexAsObject(i, &state)); 193 | std::string output_state_name; 194 | RETURN_IF_ERROR( 195 | state.MemberAsString("output_name", &output_state_name)); 196 | auto it = model_outputs.find(output_state_name); 197 | if (it == model_outputs.end()) { 198 | model_outputs.insert({output_state_name, std::make_pair(-1, i)}); 199 | } else { 200 | it->second.second = i; 201 | } 202 | } 203 | } 204 | } 205 | 206 | // Parse the output names in the model configuration 207 | triton::common::TritonJson::Value outputs; 208 | RETURN_IF_ERROR((*state)->ModelConfig().MemberAsArray("output", &outputs)); 209 | for (size_t i = 0; i < outputs.ArraySize(); i++) { 210 | triton::common::TritonJson::Value output; 211 | THROW_IF_BACKEND_INSTANCE_ERROR(outputs.IndexAsObject(i, &output)); 212 | 213 | // Use names from ModelConfig by reference since the model 214 | // config will persist longer than this inference execution. 215 | std::string output_name; 216 | THROW_IF_BACKEND_INSTANCE_ERROR( 217 | output.MemberAsString("name", &output_name)); 218 | 219 | auto it = model_outputs.find(output_name); 220 | if (it == model_outputs.end()) { 221 | model_outputs.insert({output_name, std::make_pair(i, -1)}); 222 | } else { 223 | it->second.first = i; 224 | } 225 | } 226 | 227 | RETURN_IF_ERROR((*state)->ParseParameters()); 228 | 229 | return nullptr; // success 230 | } 231 | 232 | ModelState::ModelState(TRITONBACKEND_Model* triton_model) 233 | : BackendModel(triton_model), enable_optimized_execution_(true), 234 | enable_inference_mode_(true), enable_cudnn_(true), 235 | enable_cache_cleaning_(false), enable_weight_sharing_(false), 236 | enable_tensor_fuser_pair_({false, true}), 237 | enable_jit_profiling_pair_({false, true}), 238 | enable_jit_executor_pair_({false, true}) 239 | { 240 | } 241 | 242 | TRITONSERVER_Error* 243 | ModelState::LoadModel( 244 | const std::string& artifact_name, const torch::Device device, 245 | std::string* model_path, const TRITONSERVER_InstanceGroupKind& kind, 246 | std::shared_ptr* torch_model) 247 | { 248 | // Find the TorchScript file that describes the model. If the model 249 | // configuration doesn't have an explicit model file specified then 250 | // use the default name ("model.pt"). 
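  // For orientation, the resolved path follows the usual Triton model
  // repository layout (model and version names below are illustrative,
  // not taken from this repository):
  //
  //   <model-repository>/
  //     my_torchscript_model/
  //       config.pbtxt
  //       1/
  //         model.pt   <- default artifact name when none is configured
  //
  // so 'model_path' resolves to something like
  // <model-repository>/my_torchscript_model/1/model.pt.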
251 | std::string cc_model_filename = artifact_name; 252 | if (cc_model_filename.empty()) { 253 | cc_model_filename = "model.pt"; 254 | } 255 | 256 | *model_path = JoinPath( 257 | {RepositoryPath(), std::to_string(Version()), cc_model_filename}); 258 | 259 | { 260 | bool exists; 261 | RETURN_IF_ERROR(FileExists(*model_path, &exists)); 262 | RETURN_ERROR_IF_FALSE( 263 | exists, TRITONSERVER_ERROR_UNAVAILABLE, 264 | std::string("unable to find '") + *model_path + 265 | "' for model instance '" + Name() + "'"); 266 | } 267 | 268 | // If weight sharing is enabled, skip loading model if 269 | // it is already available on the target device 270 | std::pair device_pair; 271 | if (enable_weight_sharing_) { 272 | device_pair = std::make_pair(!device.is_cpu(), device.index()); 273 | auto mit = torch_models_.find(device_pair); 274 | if (mit != torch_models_.end()) { 275 | *torch_model = mit->second; 276 | LOG_MESSAGE( 277 | TRITONSERVER_LOG_INFO, 278 | (std::string("Reusing TorchScript model for instance '") + Name() + 279 | "'") 280 | .c_str()); 281 | return nullptr; // success 282 | } 283 | } 284 | 285 | // Serialize the torch model to string 286 | std::string model_data_str; 287 | RETURN_IF_ERROR(ReadTextFile(*model_path, &model_data_str)); 288 | 289 | // InferenceMode should be used to guard all tensors operations including 290 | // model loading: https://pytorch.org/cppdocs/notes/inference_mode.html 291 | torch::InferenceMode infer_guard(EnabledInferenceMode()); 292 | 293 | try { 294 | std::istringstream model_stream(model_data_str); 295 | if (kind == TRITONSERVER_INSTANCEGROUPKIND_MODEL) { 296 | // Load the model without selecting a device. 297 | torch_model->reset( 298 | new torch::jit::Module(torch::jit::load(model_stream))); 299 | } else { 300 | torch_model->reset( 301 | new torch::jit::Module(torch::jit::load(model_stream, device))); 302 | } 303 | } 304 | catch (const std::exception& ex) { 305 | return TRITONSERVER_ErrorNew( 306 | TRITONSERVER_ERROR_INTERNAL, 307 | ("failed to load model '" + Name() + "': " + ex.what()).c_str()); 308 | } 309 | 310 | if (enable_weight_sharing_) { 311 | if (!((torch_models_.emplace(device_pair, *torch_model)).second)) { 312 | std::string type = device.is_cpu() ? "CPU" : "GPU"; 313 | LOG_MESSAGE( 314 | TRITONSERVER_LOG_WARN, 315 | (std::string("Model already found on target ") + type + " device " + 316 | "(id " + std::to_string(device.index()) + ") for '" + Name() + "'") 317 | .c_str()); 318 | } 319 | } 320 | 321 | return nullptr; // success 322 | } 323 | 324 | TRITONSERVER_Error* 325 | ModelState::AutoCompleteConfig() 326 | { 327 | // Auto-complete configuration is not supported since PyTorch does not 328 | // store/capture sufficient model metadata so just log error instead. 329 | LOG_MESSAGE( 330 | TRITONSERVER_LOG_WARN, 331 | (std::string("skipping model configuration auto-complete for '") + 332 | Name() + "': not supported for pytorch backend") 333 | .c_str()); 334 | 335 | return nullptr; // success 336 | } 337 | 338 | TRITONSERVER_Error* 339 | ModelState::ParseParameters() 340 | { 341 | triton::common::TritonJson::Value params; 342 | bool status = model_config_.Find("parameters", ¶ms); 343 | if (status) { 344 | // If 'DISABLE_OPTIMIZED_EXECUTION' is not present in 'parameters' then no 345 | // update is made to 'enable_optimized_execution_'. 
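    // As a sketch of how these flags reach the backend (the values shown are
    // illustrative), they are passed as string-valued parameters in the
    // model's config.pbtxt and converted by ParseParameter(), e.g.:
    //
    //   parameters: {
    //     key: "DISABLE_OPTIMIZED_EXECUTION"
    //     value: { string_value: "true" }
    //   }
    //   parameters: {
    //     key: "INFERENCE_MODE"
    //     value: { string_value: "true" }
    //   }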
346 | bool disable_optimized_execution = false; 347 | TRITONSERVER_Error* err = ParseParameter( 348 | params, "DISABLE_OPTIMIZED_EXECUTION", &disable_optimized_execution); 349 | if (err != nullptr) { 350 | if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { 351 | return err; 352 | } else { 353 | TRITONSERVER_ErrorDelete(err); 354 | } 355 | } 356 | enable_optimized_execution_ = !disable_optimized_execution; 357 | 358 | LOG_MESSAGE( 359 | TRITONSERVER_LOG_INFO, 360 | (std::string("Optimized execution is ") + 361 | (enable_optimized_execution_ ? "enabled" : "disabled") + 362 | " for model instance '" + Name() + "'") 363 | .c_str()); 364 | 365 | // If 'ENABLE_CACHE_CLEANING' is not present in 'parameters' then 366 | // no update is made to 'enable_cache_cleaning_'. 367 | err = ParseParameter( 368 | params, "ENABLE_CACHE_CLEANING", &enable_cache_cleaning_); 369 | if (err != nullptr) { 370 | if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { 371 | return err; 372 | } else { 373 | TRITONSERVER_ErrorDelete(err); 374 | } 375 | } 376 | 377 | LOG_MESSAGE( 378 | TRITONSERVER_LOG_INFO, 379 | (std::string("Cache Cleaning is ") + 380 | (enable_cache_cleaning_ ? "enabled" : "disabled") + 381 | " for model instance '" + Name() + "'") 382 | .c_str()); 383 | 384 | // If 'INFERENCE_MODE' is not present in 'parameters' then no update is made 385 | // to 'enable_inference_mode_'. 386 | err = ParseParameter(params, "INFERENCE_MODE", &enable_inference_mode_); 387 | if (err != nullptr) { 388 | if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { 389 | return err; 390 | } else { 391 | TRITONSERVER_ErrorDelete(err); 392 | } 393 | } 394 | LOG_MESSAGE( 395 | TRITONSERVER_LOG_INFO, 396 | (std::string("Inference Mode is ") + 397 | (enable_inference_mode_ ? "enabled" : "disabled") + 398 | " for model instance '" + Name() + "'") 399 | .c_str()); 400 | 401 | // If 'DISABLE_CUDNN' is not present in 'parameters' then no update is made 402 | // to 'enable_cudnn_'. 403 | bool disable_cudnn = false; 404 | err = ParseParameter(params, "DISABLE_CUDNN", &disable_cudnn); 405 | if (err != nullptr) { 406 | if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { 407 | return err; 408 | } else { 409 | TRITONSERVER_ErrorDelete(err); 410 | } 411 | } 412 | enable_cudnn_ = !disable_cudnn; 413 | LOG_MESSAGE( 414 | TRITONSERVER_LOG_INFO, 415 | (std::string("cuDNN is ") + (enable_cudnn_ ? "enabled" : "disabled") + 416 | " for model instance '" + Name() + "'") 417 | .c_str()); 418 | 419 | // If 'ENABLE_TENSOR_FUSER' is not present in 'parameters' then no 420 | // update is made to 'enable_tensor_fuser'. 421 | bool enable_tensor_fuser = false; 422 | err = ParseParameter(params, "ENABLE_TENSOR_FUSER", &enable_tensor_fuser); 423 | if (err != nullptr) { 424 | if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { 425 | return err; 426 | } else { 427 | TRITONSERVER_ErrorDelete(err); 428 | } 429 | } else { 430 | enable_tensor_fuser_pair_ = {true, enable_tensor_fuser}; 431 | LOG_MESSAGE( 432 | TRITONSERVER_LOG_INFO, 433 | (std::string("Tensor fuser is ") + 434 | (enable_tensor_fuser ? "enabled" : "disabled") + 435 | " for model instance '" + Name() + "'") 436 | .c_str()); 437 | } 438 | 439 | // If 'ENABLE_WEIGHT_SHARING' is not present in 'parameters' then no 440 | // update is made to 'enable_weight_sharing'. 
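    // Weight sharing only has an effect when several execution instances are
    // placed on the same device, e.g. with an instance_group such as the
    // illustrative configuration below; both instances then reuse the
    // torch::jit::Module cached in 'torch_models_' instead of loading the
    // TorchScript file twice:
    //
    //   instance_group [
    //     {
    //       count: 2
    //       kind: KIND_GPU
    //       gpus: [ 0 ]
    //     }
    //   ]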
441 | err = ParseParameter( 442 | params, "ENABLE_WEIGHT_SHARING", &enable_weight_sharing_); 443 | if (err != nullptr) { 444 | if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { 445 | return err; 446 | } else { 447 | TRITONSERVER_ErrorDelete(err); 448 | } 449 | } else { 450 | LOG_MESSAGE( 451 | TRITONSERVER_LOG_INFO, 452 | (std::string("Weight sharing is ") + 453 | (enable_weight_sharing_ ? "enabled" : "disabled") + 454 | " for model instance '" + Name() + "'") 455 | .c_str()); 456 | } 457 | 458 | // If 'ENABLE_JIT_PROFILING' is not present in 'parameters' then no update 459 | // is made to 'enable_jit_profiling'. 460 | bool enable_jit_profiling = false; 461 | err = ParseParameter(params, "ENABLE_JIT_PROFILING", &enable_jit_profiling); 462 | if (err != nullptr) { 463 | if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { 464 | return err; 465 | } else { 466 | TRITONSERVER_ErrorDelete(err); 467 | } 468 | } else { 469 | enable_jit_profiling_pair_ = {true, enable_jit_profiling}; 470 | LOG_MESSAGE( 471 | TRITONSERVER_LOG_INFO, 472 | (std::string("Jit profiling is ") + 473 | (enable_jit_profiling ? "enabled" : "disabled") + 474 | " for model instance '" + Name() + "'") 475 | .c_str()); 476 | } 477 | 478 | // If 'ENABLE_JIT_EXECUTOR' is not present in 'parameters' then no update is 479 | // made to 'enable_jit_executor'. 480 | bool enable_jit_executor = false; 481 | err = ParseParameter(params, "ENABLE_JIT_EXECUTOR", &enable_jit_executor); 482 | if (err != nullptr) { 483 | if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { 484 | return err; 485 | } else { 486 | TRITONSERVER_ErrorDelete(err); 487 | } 488 | } else { 489 | enable_jit_executor_pair_ = {true, enable_jit_executor}; 490 | LOG_MESSAGE( 491 | TRITONSERVER_LOG_INFO, 492 | (std::string("Jit executor is ") + 493 | (enable_jit_executor ? 
"enabled" : "disabled") + 494 | " for model instance '" + Name() + "'") 495 | .c_str()); 496 | } 497 | 498 | // If 'INTRA_OP_THREAD_COUNT' is not present in 'parameters' then no update 499 | // is made to 'intra_op_thread_count', which by default will take all 500 | // threads 501 | int intra_op_thread_count = -1; 502 | err = 503 | ParseParameter(params, "INTRA_OP_THREAD_COUNT", &intra_op_thread_count); 504 | if (err != nullptr) { 505 | if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { 506 | return err; 507 | } else { 508 | TRITONSERVER_ErrorDelete(err); 509 | } 510 | } else { 511 | if (intra_op_thread_count > 0) { 512 | at::set_num_threads(intra_op_thread_count); 513 | LOG_MESSAGE( 514 | TRITONSERVER_LOG_INFO, 515 | (std::string("Intra op thread count is set to ") + 516 | std::to_string(intra_op_thread_count) + " for model instance '" + 517 | Name() + "'") 518 | .c_str()); 519 | } 520 | } 521 | 522 | // If 'INTER_OP_THREAD_COUNT' is not present in 'parameters' then no update 523 | // is made to 'inter_op_thread_count', which by default will take all 524 | // threads 525 | int inter_op_thread_count = -1; 526 | err = 527 | ParseParameter(params, "INTER_OP_THREAD_COUNT", &inter_op_thread_count); 528 | if (err != nullptr) { 529 | if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { 530 | return err; 531 | } else { 532 | TRITONSERVER_ErrorDelete(err); 533 | } 534 | } else { 535 | if (inter_op_thread_count > 0) { 536 | at::set_num_interop_threads(inter_op_thread_count); 537 | LOG_MESSAGE( 538 | TRITONSERVER_LOG_INFO, 539 | (std::string("Inter op thread count is set to ") + 540 | std::to_string(inter_op_thread_count) + " for model instance '" + 541 | Name() + "'") 542 | .c_str()); 543 | } 544 | } 545 | } 546 | 547 | return nullptr; 548 | } 549 | 550 | // The naming convention followed for inputs/outputs in the model configuration. 551 | // Outputs don't support FORWARD_ARGUMENT. 552 | enum class NamingConvention { 553 | NAMED_INDEX, 554 | FORWARD_ARGUMENT, 555 | STRICT_CONFIG_ORDERING 556 | }; 557 | 558 | // 559 | // ModelInstanceState 560 | // 561 | // State associated with a model instance. An object of this class is 562 | // created and associated with each TRITONBACKEND_ModelInstance. 563 | // 564 | class ModelInstanceState : public BackendModelInstance { 565 | public: 566 | static TRITONSERVER_Error* Create( 567 | ModelState* model_state, 568 | TRITONBACKEND_ModelInstance* triton_model_instance, 569 | ModelInstanceState** state); 570 | virtual ~ModelInstanceState(); 571 | 572 | // Get the state of the model that corresponds to this instance. 573 | ModelState* StateForModel() const { return model_state_; } 574 | 575 | // Execute... 
576 |   void ProcessRequests(
577 |       TRITONBACKEND_Request** requests, const uint32_t request_count);
578 | 
579 |   // Clear CUDA cache
580 |   void ClearCache();
581 | 
582 |  private:
583 |   ModelInstanceState(
584 |       ModelState* model_state,
585 |       TRITONBACKEND_ModelInstance* triton_model_instance);
586 |   TRITONSERVER_Error* ValidateBooleanSequenceControl(
587 |       triton::common::TritonJson::Value& sequence_batching,
588 |       const std::string& control_kind, bool required, bool* have_control);
589 |   TRITONSERVER_Error* ValidateTypedSequenceControl(
590 |       triton::common::TritonJson::Value& sequence_batching,
591 |       const std::string& control_kind, bool required, bool* have_control);
592 |   TRITONSERVER_Error* ValidateInputs(const size_t expected_input_cnt);
593 |   void AddInputToMap(
594 |       NamingConvention naming_convention,
595 |       const std::vector<std::string> allowed_inputs, const std::string& io_name,
596 |       const uint32_t index);
597 |   TRITONSERVER_Error* ValidateOutputs();
598 |   void Execute(
599 |       std::vector<TRITONBACKEND_Response*>* responses,
600 |       const uint32_t response_count,
601 |       std::vector<torch::jit::IValue>* input_tensors,
602 |       std::vector<torch::jit::IValue>* output_tensors);
603 |   TRITONSERVER_Error* SetInputTensors(
604 |       size_t total_batch_size, TRITONBACKEND_Request** requests,
605 |       const uint32_t request_count,
606 |       std::vector<TRITONBACKEND_Response*>* responses,
607 |       BackendInputCollector* collector, std::vector<const char*>* input_names,
608 |       std::vector<torch::jit::IValue>* input_tensors, bool* cuda_copy);
609 |   TRITONSERVER_Error* ReadOutputTensors(
610 |       size_t total_batch_size,
611 |       const std::vector<torch::jit::IValue>& output_tensors,
612 |       TRITONBACKEND_Request** requests, const uint32_t request_count,
613 |       std::vector<TRITONBACKEND_Response*>* responses);
614 |   TRITONSERVER_Error* RecordBackendTimestamp(
615 |       uint64_t* timestamp, void* cuda_event);
616 | 
617 |   // Get the naming convention for inputs/outputs from the model configuration
618 |   TRITONSERVER_Error* GetNamingConvention(
619 |       NamingConvention* naming_convention,
620 |       const std::vector<std::string>& allowed_io);
621 | 
622 |   // Create CUDA events for statistics collection.
623 |   void CreateCudaEvents(const int32_t& device_id);
624 | 
625 |   // Get the appropriate CUDA stream for input and output handling based on the
626 |   // instance group type.
627 |   cudaStream_t GetCudaStreamByInstanceKind();
628 | 
629 |   // Replace the default CUDA stream with the stream we created to ensure proper
630 |   // cuda stream synchronization.
631 |   void SetCurrentCudaStream(
632 |       const cudaStream_t& stream, const int32_t& device_id);
633 | 
634 |   // Get the elapsed time between two CUDA events.
635 |   float GetCudaEventElapsedTime(
636 |       const cudaEvent_t& start_event, const cudaEvent_t& end_event);
637 | 
638 |   ModelState* model_state_;
639 | 
640 |   // The full path to the TorchScript model file.
641 |   std::string model_path_;
642 | 
643 |   std::shared_ptr<torch::jit::Module> torch_model_;
644 |   torch::Device device_;
645 | 
646 |   // Map from configuration name for an input to the index of
647 |   // that input in the model.
648 |   std::unordered_map<std::string, int> input_index_map_;
649 |   uint32_t batch_input_count_ = 0;
650 | 
651 |   // Map from configuration name for an output to the index of
652 |   // that output in the model.
653 |   std::unordered_map<std::string, int> output_index_map_;
654 |   std::unordered_map<std::string, TRITONSERVER_DataType> output_dtype_map_;
655 | 
656 |   // If the input to the model is a dictionary of tensors.
657 |   bool is_dict_input_;
658 | 
659 |   // If the model supports batching.
660 | bool supports_batching_; 661 | 662 | cudaEvent_t compute_input_start_event_; 663 | cudaEvent_t compute_infer_start_event_; 664 | cudaEvent_t compute_output_start_event_; 665 | 666 | // Store the cuda streams created for the 'KIND_MODEL' instance group. 667 | std::vector stream_vec_; 668 | 669 | // The number of available devices. 670 | int device_cnt_; 671 | }; 672 | 673 | TRITONSERVER_Error* 674 | ModelInstanceState::Create( 675 | ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance, 676 | ModelInstanceState** state) 677 | { 678 | try { 679 | *state = new ModelInstanceState(model_state, triton_model_instance); 680 | } 681 | catch (const BackendModelInstanceException& ex) { 682 | RETURN_ERROR_IF_TRUE( 683 | ex.err_ == nullptr, TRITONSERVER_ERROR_INTERNAL, 684 | std::string("unexpected nullptr in BackendModelInstanceException")); 685 | RETURN_IF_ERROR(ex.err_); 686 | } 687 | 688 | return nullptr; // success 689 | } 690 | 691 | ModelInstanceState::ModelInstanceState( 692 | ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance) 693 | : BackendModelInstance(model_state, triton_model_instance), 694 | model_state_(model_state), device_(torch::kCPU), is_dict_input_(false), 695 | device_cnt_(0) 696 | { 697 | if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) { 698 | #ifdef TRITON_ENABLE_GPU 699 | device_ = torch::Device(torch::kCUDA, DeviceId()); 700 | CreateCudaEvents(DeviceId()); 701 | #endif 702 | } 703 | 704 | #ifdef TRITON_ENABLE_GPU 705 | device_cnt_ = torch::cuda::device_count(); 706 | #endif 707 | 708 | THROW_IF_BACKEND_INSTANCE_ERROR(model_state->LoadModel( 709 | ArtifactFilename(), device_, &model_path_, Kind(), &torch_model_)); 710 | 711 | if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) { 712 | #ifdef TRITON_ENABLE_GPU 713 | // Since we cannot determine the exact devices used by the model, we create 714 | // a CUDA stream for every available device to ensure proper synchronization 715 | // of CUDA streams. This approach may have implications when a timestamp is 716 | // captured on a device that is not used by the model. Currently, this issue 717 | // is addressed by synchronizing the CUDA streams before recording 718 | // timestamps to prevent timestamp skewing. However, in the future, any 719 | // modifications to the CUDA stream synchronization logic should be handled 720 | // with caution. 721 | for (int i = 0; i < device_cnt_; i++) { 722 | cudaStream_t stream; 723 | THROW_IF_BACKEND_INSTANCE_ERROR( 724 | CreateCudaStream(i, 0 /* cuda_stream_priority */, &stream)); 725 | stream_vec_.push_back(stream); 726 | } 727 | if (!stream_vec_.empty()) { 728 | // Create CUDA events on the first device that will be used for collecting 729 | // inputs/outputs. 730 | CreateCudaEvents(0); 731 | } 732 | #endif 733 | } 734 | 735 | size_t expected_input_cnt = 0; 736 | { 737 | triton::common::TritonJson::Value inputs; 738 | if (model_state->ModelConfig().Find("input", &inputs)) { 739 | expected_input_cnt = inputs.ArraySize(); 740 | } 741 | 742 | triton::common::TritonJson::Value config_batch_inputs; 743 | if (model_state->ModelConfig().Find("batch_input", &config_batch_inputs)) { 744 | batch_input_count_ = config_batch_inputs.ArraySize(); 745 | expected_input_cnt += batch_input_count_; 746 | } 747 | } 748 | 749 | // If this is a sequence model then make sure that the required 750 | // inputs are present in the model and have the correct shape and 751 | // datatype. 
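  // For example (tensor names here are hypothetical), a sequence model might
  // expose its control tensors as additional forward() arguments:
  //
  //   CONTROL_SEQUENCE_START  -> "START__2"
  //   CONTROL_SEQUENCE_READY  -> "READY__3"
  //
  // The validation below rejects control tensor names whose suffix after
  // "__" is not an integer index.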
752 | triton::common::TritonJson::Value sequence_batching; 753 | if (model_state->ModelConfig().Find( 754 | "sequence_batching", &sequence_batching)) { 755 | bool have_start, have_end, have_ready, have_corrid; 756 | THROW_IF_BACKEND_INSTANCE_ERROR(ValidateBooleanSequenceControl( 757 | sequence_batching, "CONTROL_SEQUENCE_START", false /* required */, 758 | &have_start)); 759 | THROW_IF_BACKEND_INSTANCE_ERROR(ValidateBooleanSequenceControl( 760 | sequence_batching, "CONTROL_SEQUENCE_END", false /* required */, 761 | &have_end)); 762 | THROW_IF_BACKEND_INSTANCE_ERROR(ValidateBooleanSequenceControl( 763 | sequence_batching, "CONTROL_SEQUENCE_READY", false /* required */, 764 | &have_ready)); 765 | THROW_IF_BACKEND_INSTANCE_ERROR(ValidateTypedSequenceControl( 766 | sequence_batching, "CONTROL_SEQUENCE_CORRID", false /* required */, 767 | &have_corrid)); 768 | if (have_start) { 769 | expected_input_cnt += 1; 770 | } 771 | if (have_end) { 772 | expected_input_cnt += 1; 773 | } 774 | if (have_ready) { 775 | expected_input_cnt += 1; 776 | } 777 | if (have_corrid) { 778 | expected_input_cnt += 1; 779 | } 780 | // Add the state inputs to the expected count 781 | triton::common::TritonJson::Value states; 782 | if (sequence_batching.Find("state", &states)) { 783 | expected_input_cnt += states.ArraySize(); 784 | } 785 | } 786 | supports_batching_ = model_state_->MaxBatchSize() > 0; 787 | 788 | THROW_IF_BACKEND_INSTANCE_ERROR(ValidateInputs(expected_input_cnt)); 789 | THROW_IF_BACKEND_INSTANCE_ERROR(ValidateOutputs()); 790 | } 791 | 792 | void 793 | ModelInstanceState::ClearCache() 794 | { 795 | #ifdef TRITON_ENABLE_GPU 796 | if (device_.is_cuda() || 797 | ((Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) && (device_cnt_ > 0))) { 798 | c10::cuda::CUDACachingAllocator::emptyCache(); 799 | } 800 | #endif // TRITON_ENABLE_GPU 801 | } 802 | 803 | ModelInstanceState::~ModelInstanceState() 804 | { 805 | torch_model_.reset(); 806 | ClearCache(); 807 | 808 | if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) { 809 | #ifdef TRITON_ENABLE_GPU 810 | for (size_t i = 0; i < stream_vec_.size(); i++) { 811 | LOG_IF_ERROR( 812 | ConvertCUDAStatusToTritonError( 813 | cudaSetDevice(i), TRITONSERVER_ERROR_INTERNAL, 814 | "Failed to set the device"), 815 | "Failed to set the device"); 816 | 817 | LOG_IF_ERROR( 818 | ConvertCUDAStatusToTritonError( 819 | cudaStreamDestroy(stream_vec_[i]), TRITONSERVER_ERROR_INTERNAL, 820 | "Failed to destroy cuda stream"), 821 | "~ModelInstanceState error: "); 822 | stream_vec_[i] = nullptr; 823 | } 824 | #endif 825 | } 826 | } 827 | 828 | TRITONSERVER_Error* 829 | ModelInstanceState::ValidateBooleanSequenceControl( 830 | triton::common::TritonJson::Value& sequence_batching, 831 | const std::string& control_kind, bool required, bool* have_control) 832 | { 833 | std::string tensor_name; 834 | std::string tensor_datatype; 835 | RETURN_IF_ERROR(GetBooleanSequenceControlProperties( 836 | sequence_batching, model_state_->Name(), control_kind, required, 837 | &tensor_name, &tensor_datatype, nullptr, nullptr, nullptr, nullptr, 838 | nullptr, nullptr)); 839 | *have_control = !tensor_name.empty(); 840 | if (*have_control) { 841 | std::string deliminator = "__"; 842 | int ip_index = 0; 843 | int start_pos = tensor_name.find(deliminator); 844 | if (start_pos == -1) { 845 | return TRITONSERVER_ErrorNew( 846 | TRITONSERVER_ERROR_INTERNAL, 847 | ("input '" + tensor_name + 848 | "' does not follow __ naming convention.") 849 | .c_str()); 850 | } 851 | 852 | // check if the index part of the name is not 
an integer 853 | std::string index_str = tensor_name.substr(start_pos + 2); 854 | for (auto itr = index_str.begin(); itr != index_str.end(); itr++) { 855 | if (std::isdigit(*itr) == 0) { 856 | return TRITONSERVER_ErrorNew( 857 | TRITONSERVER_ERROR_INTERNAL, 858 | ("input '" + tensor_name + 859 | "' does not follow __ naming convention.") 860 | .c_str()); 861 | } 862 | } 863 | 864 | ip_index = std::atoi(tensor_name.substr(start_pos + 2).c_str()); 865 | input_index_map_[tensor_name] = ip_index; 866 | } 867 | 868 | return nullptr; // success 869 | } 870 | 871 | TRITONSERVER_Error* 872 | ModelInstanceState::ValidateTypedSequenceControl( 873 | triton::common::TritonJson::Value& sequence_batching, 874 | const std::string& control_kind, bool required, bool* have_control) 875 | { 876 | std::string tensor_name; 877 | std::string tensor_datatype; 878 | RETURN_IF_ERROR(GetTypedSequenceControlProperties( 879 | sequence_batching, model_state_->Name(), control_kind, required, 880 | &tensor_name, &tensor_datatype)); 881 | *have_control = !tensor_name.empty(); 882 | if (*have_control) { 883 | std::string deliminator = "__"; 884 | int ip_index = 0; 885 | int start_pos = tensor_name.find(deliminator); 886 | if (start_pos == -1) { 887 | return TRITONSERVER_ErrorNew( 888 | TRITONSERVER_ERROR_INTERNAL, 889 | ("input '" + tensor_name + 890 | "' does not follow __ naming convention.") 891 | .c_str()); 892 | } 893 | 894 | // check if the index part of the name is not an integer 895 | std::string index_str = tensor_name.substr(start_pos + 2); 896 | for (auto itr = index_str.begin(); itr != index_str.end(); itr++) { 897 | if (std::isdigit(*itr) == 0) { 898 | return TRITONSERVER_ErrorNew( 899 | TRITONSERVER_ERROR_INTERNAL, 900 | ("input '" + tensor_name + 901 | "' does not follow __ naming convention.") 902 | .c_str()); 903 | } 904 | } 905 | 906 | // check if the data type is supported by PyTorch 907 | if (!ModelConfigDataTypeToTorchType(tensor_datatype).first) { 908 | return TRITONSERVER_ErrorNew( 909 | TRITONSERVER_ERROR_INTERNAL, 910 | ("input '" + tensor_name + "' type '" + tensor_datatype + 911 | "' is not supported by PyTorch.") 912 | .c_str()); 913 | } 914 | 915 | ip_index = std::atoi(tensor_name.substr(start_pos + 2).c_str()); 916 | input_index_map_[tensor_name] = ip_index; 917 | } 918 | 919 | return nullptr; // success 920 | } 921 | 922 | void 923 | ModelInstanceState::AddInputToMap( 924 | NamingConvention naming_convention, 925 | const std::vector allowed_inputs, const std::string& io_name, 926 | const uint32_t index) 927 | { 928 | std::string deliminator = "__"; 929 | 930 | if (is_dict_input_) { 931 | // If dictionary, index is irrelevant but we use the map to store the 932 | // input names since they are the keys for the dictionary 933 | input_index_map_[io_name] = index; 934 | } else { 935 | switch (naming_convention) { 936 | case NamingConvention::FORWARD_ARGUMENT: { 937 | auto itr = 938 | std::find(allowed_inputs.begin(), allowed_inputs.end(), io_name); 939 | if (itr != allowed_inputs.end()) { 940 | input_index_map_[io_name] = 941 | std::distance(allowed_inputs.begin(), itr); 942 | } 943 | return; 944 | } 945 | case NamingConvention::NAMED_INDEX: { 946 | int start_pos = io_name.find(deliminator); 947 | int ip_index = std::atoi(io_name.substr(start_pos + 2).c_str()); 948 | input_index_map_[io_name] = ip_index; 949 | return; 950 | } 951 | case NamingConvention::STRICT_CONFIG_ORDERING: { 952 | input_index_map_[io_name] = index; 953 | return; 954 | } 955 | } 956 | } 957 | } 958 | 959 | TRITONSERVER_Error* 
960 | ModelInstanceState::ValidateInputs(const size_t expected_input_cnt)
961 | {
962 |   // Collect all the expected input tensor names and validate that the model
963 |   // configuration specifies only those.
964 |   std::vector<std::string> allowed_inputs;
965 | 
966 |   const torch::jit::Method& method = torch_model_->get_method("forward");
967 |   const auto& schema = method.function().getSchema();
968 |   const std::vector<c10::Argument>& arguments = schema.arguments();
969 | 
970 |   // Currently, only models with a single input of type Dict(str, Tensor) are
971 |   // supported. If the model expects more than one input then they must all
972 |   // be of type Tensor.
973 |   //
974 |   // Ignore the argument at idx 0 if it is of Class type (self param in forward
975 |   // function)
976 |   size_t start_idx = 0;
977 |   if ((arguments.size() > 0) &&
978 |       (arguments.at(0).type()->kind() == c10::TypeKind::ClassType)) {
979 |     start_idx = 1;
980 |   }
981 |   if ((arguments.size() == (1 + start_idx)) &&
982 |       (arguments.at(start_idx).type()->kind() == c10::TypeKind::DictType)) {
983 |     is_dict_input_ = true;
984 |   } else if (arguments.size() > start_idx) {
985 |     // Return error if multiple inputs are of kind DictType
986 |     for (size_t i = start_idx + 1; i < arguments.size(); i++) {
987 |       if (arguments.at(i).type()->kind() == c10::TypeKind::DictType) {
988 |         return TRITONSERVER_ErrorNew(
989 |             TRITONSERVER_ERROR_INTERNAL,
990 |             "Multiple inputs of kind DictType were detected. Only a single "
991 |             "input of type Dict(str, Tensor) is supported.");
992 |       }
993 |     }
994 | 
995 |     // Return error if any input is not of type Tensor or List
996 |     for (size_t i = start_idx; i < arguments.size(); i++) {
997 |       if ((arguments.at(i).type()->kind() != c10::TypeKind::TensorType) &&
998 |           (arguments.at(i).type()->kind() != c10::TypeKind::ListType)) {
999 |         return TRITONSERVER_ErrorNew(
1000 |             TRITONSERVER_ERROR_INTERNAL,
1001 |             (std::string("An input of type '") + arguments.at(i).type()->str() +
1002 |                 "' was detected in the model.
Only a single input of type " 1003 | "Dict(str, Tensor) or input(s) of type Tensor are supported.") 1004 | .c_str()); 1005 | } 1006 | allowed_inputs.emplace_back(arguments.at(i).name()); 1007 | } 1008 | 1009 | // If all inputs are tensors, match number of expected inputs between model 1010 | // and configuration 1011 | if ((arguments.size() - start_idx) != expected_input_cnt) { 1012 | return TRITONSERVER_ErrorNew( 1013 | TRITONSERVER_ERROR_INVALID_ARG, 1014 | (std::string("unable to load model '") + model_state_->Name() + 1015 | "', configuration expects " + std::to_string(expected_input_cnt) + 1016 | " inputs, model provides " + 1017 | std::to_string(arguments.size() - start_idx)) 1018 | .c_str()); 1019 | } 1020 | } 1021 | 1022 | triton::common::TritonJson::Value ios; 1023 | RETURN_IF_ERROR(model_state_->ModelConfig().MemberAsArray("input", &ios)); 1024 | 1025 | if (ios.ArraySize() == 0) { 1026 | return TRITONSERVER_ErrorNew( 1027 | TRITONSERVER_ERROR_INTERNAL, 1028 | "model configuration must contain at least one input, none were " 1029 | "specified."); 1030 | } 1031 | 1032 | NamingConvention naming_convention; 1033 | RETURN_IF_ERROR(GetNamingConvention(&naming_convention, allowed_inputs)); 1034 | 1035 | for (size_t i = 0; i < ios.ArraySize(); i++) { 1036 | triton::common::TritonJson::Value io; 1037 | RETURN_IF_ERROR(ios.IndexAsObject(i, &io)); 1038 | 1039 | // Validate name 1040 | std::string io_name; 1041 | RETURN_IF_ERROR(io.MemberAsString("name", &io_name)); 1042 | AddInputToMap(naming_convention, allowed_inputs, io_name, i); 1043 | // Validate data type 1044 | std::string io_dtype; 1045 | RETURN_IF_ERROR(io.MemberAsString("data_type", &io_dtype)); 1046 | const auto pr = ModelConfigDataTypeToTorchType(io_dtype); 1047 | if (!pr.first && (io_dtype != "TYPE_STRING")) { 1048 | return TRITONSERVER_ErrorNew( 1049 | TRITONSERVER_ERROR_INTERNAL, 1050 | ("unsupported datatype " + io_dtype + " for input '" + io_name + 1051 | "' for model '" + model_state_->Name() + "'") 1052 | .c_str()); 1053 | } 1054 | 1055 | // Validate shape for String inputs. Only allow 1 dimension. 1056 | if (io_dtype == "TYPE_STRING") { 1057 | // If a reshape is provided for the input then use that when 1058 | // validating the model shapes. 1059 | std::vector dims; 1060 | triton::common::TritonJson::Value reshape; 1061 | if (io.Find("reshape", &reshape)) { 1062 | RETURN_IF_ERROR(ParseShape(reshape, "shape", &dims)); 1063 | } else { 1064 | RETURN_IF_ERROR(ParseShape(io, "dims", &dims)); 1065 | } 1066 | 1067 | if ((dims.size() + (supports_batching_ ? 
1 : 0)) > 1) { 1068 | return TRITONSERVER_ErrorNew( 1069 | TRITONSERVER_ERROR_INTERNAL, 1070 | ("Triton only supports 1 dimensional List of String as input for " 1071 | "'" + 1072 | std::string(io_name) + "' for model '" + model_state_->Name() + 1073 | "'") 1074 | .c_str()); 1075 | } 1076 | } 1077 | } 1078 | triton::common::TritonJson::Value sequence_batching; 1079 | if (model_state_->ModelConfig().Find( 1080 | "sequence_batching", &sequence_batching)) { 1081 | triton::common::TritonJson::Value states; 1082 | if (sequence_batching.Find("state", &states)) { 1083 | for (size_t i = 0; i < states.ArraySize(); i++) { 1084 | triton::common::TritonJson::Value state; 1085 | RETURN_IF_ERROR(states.IndexAsObject(i, &state)); 1086 | std::string state_name; 1087 | RETURN_IF_ERROR(state.MemberAsString("input_name", &state_name)); 1088 | AddInputToMap(naming_convention, allowed_inputs, state_name, i); 1089 | 1090 | // Validate data type 1091 | std::string state_dtype; 1092 | RETURN_IF_ERROR(state.MemberAsString("data_type", &state_dtype)); 1093 | const auto pr = ModelConfigDataTypeToTorchType(state_dtype); 1094 | if (!pr.first && (state_dtype != "TYPE_STRING")) { 1095 | return TRITONSERVER_ErrorNew( 1096 | TRITONSERVER_ERROR_INTERNAL, 1097 | ("unsupported datatype " + state_dtype + " for input state '" + 1098 | state_name + "' for model '" + model_state_->Name() + "'") 1099 | .c_str()); 1100 | } 1101 | 1102 | // Validate shape for String inputs. Only allow 1 dimension. 1103 | if (state_dtype == "TYPE_STRING") { 1104 | std::vector dims; 1105 | if ((dims.size() + (supports_batching_ ? 1 : 0)) > 1) { 1106 | return TRITONSERVER_ErrorNew( 1107 | TRITONSERVER_ERROR_INTERNAL, 1108 | ("Triton only supports 1 dimensional List of String as input " 1109 | "for " 1110 | "'" + 1111 | std::string(state_name) + "' for model '" + 1112 | model_state_->Name() + "'") 1113 | .c_str()); 1114 | } 1115 | } 1116 | } 1117 | } 1118 | } 1119 | 1120 | triton::common::TritonJson::Value batch_inputs; 1121 | RETURN_IF_ERROR( 1122 | model_state_->ModelConfig().MemberAsArray("batch_input", &batch_inputs)); 1123 | size_t i = 0; 1124 | for (const auto& batch_input : StateForModel()->BatchInputs()) { 1125 | for (const auto& input_name : batch_input.TargetNames()) { 1126 | AddInputToMap( 1127 | naming_convention, allowed_inputs, input_name, i + ios.ArraySize()); 1128 | i++; 1129 | } 1130 | } 1131 | 1132 | return nullptr; // success 1133 | } 1134 | 1135 | TRITONSERVER_Error* 1136 | ModelInstanceState::ValidateOutputs() 1137 | { 1138 | triton::common::TritonJson::Value ios; 1139 | RETURN_IF_ERROR(model_state_->ModelConfig().MemberAsArray("output", &ios)); 1140 | std::string deliminator = "__"; 1141 | int op_index = 0; 1142 | 1143 | if (ios.ArraySize() == 0) { 1144 | return TRITONSERVER_ErrorNew( 1145 | TRITONSERVER_ERROR_INTERNAL, 1146 | "model configuration must contain at least one output, none were " 1147 | "specified."); 1148 | } 1149 | 1150 | NamingConvention naming_convention; 1151 | RETURN_IF_ERROR(GetNamingConvention(&naming_convention, {})); 1152 | 1153 | for (size_t i = 0; i < ios.ArraySize(); i++) { 1154 | triton::common::TritonJson::Value io; 1155 | RETURN_IF_ERROR(ios.IndexAsObject(i, &io)); 1156 | 1157 | // Validate name 1158 | std::string io_name; 1159 | RETURN_IF_ERROR(io.MemberAsString("name", &io_name)); 1160 | switch (naming_convention) { 1161 | case NamingConvention::NAMED_INDEX: { 1162 | int start_pos = io_name.find(deliminator); 1163 | op_index = std::atoi(io_name.substr(start_pos + 2).c_str()); 1164 | break; 1165 | } 
1166 | case NamingConvention::STRICT_CONFIG_ORDERING: { 1167 | op_index = i; 1168 | break; 1169 | } 1170 | default: 1171 | break; 1172 | } 1173 | 1174 | // Validate data type 1175 | std::string io_dtype; 1176 | RETURN_IF_ERROR(io.MemberAsString("data_type", &io_dtype)); 1177 | const auto pr = ModelConfigDataTypeToTorchType(io_dtype); 1178 | if (!pr.first && (io_dtype != "TYPE_STRING")) { 1179 | return TRITONSERVER_ErrorNew( 1180 | TRITONSERVER_ERROR_INTERNAL, 1181 | ("unsupported datatype " + io_dtype + " for output '" + io_name + 1182 | "' for model '" + model_state_->Name() + "'") 1183 | .c_str()); 1184 | } 1185 | 1186 | // Validate shape for String outputs. Only allow 1 dimension. 1187 | if (io_dtype == "TYPE_STRING") { 1188 | // If a reshape is provided for the output then use that when 1189 | // validating the model shapes. 1190 | std::vector dims; 1191 | triton::common::TritonJson::Value reshape; 1192 | if (io.Find("reshape", &reshape)) { 1193 | RETURN_IF_ERROR(ParseShape(reshape, "shape", &dims)); 1194 | } else { 1195 | RETURN_IF_ERROR(ParseShape(io, "dims", &dims)); 1196 | } 1197 | 1198 | if ((dims.size() + (supports_batching_ ? 1 : 0)) > 1) { 1199 | return TRITONSERVER_ErrorNew( 1200 | TRITONSERVER_ERROR_INTERNAL, 1201 | ("Triton only supports 1 dimensional List of String as output for " 1202 | "'" + 1203 | std::string(io_name) + "' for model '" + model_state_->Name() + 1204 | "'") 1205 | .c_str()); 1206 | } 1207 | } 1208 | 1209 | output_index_map_[io_name] = op_index; 1210 | output_dtype_map_[io_name] = ConvertTorchTypeToDataType(pr.second); 1211 | } 1212 | 1213 | triton::common::TritonJson::Value sequence_batching; 1214 | if (model_state_->ModelConfig().Find( 1215 | "sequence_batching", &sequence_batching)) { 1216 | triton::common::TritonJson::Value states; 1217 | if (sequence_batching.Find("state", &states)) { 1218 | for (size_t i = 0; i < states.ArraySize(); i++) { 1219 | triton::common::TritonJson::Value state; 1220 | RETURN_IF_ERROR(states.IndexAsObject(i, &state)); 1221 | std::string state_name; 1222 | RETURN_IF_ERROR(state.MemberAsString("output_name", &state_name)); 1223 | std::string state_dtype; 1224 | RETURN_IF_ERROR(state.MemberAsString("data_type", &state_dtype)); 1225 | std::vector dims; 1226 | RETURN_IF_ERROR(ParseShape(state, "dims", &dims)); 1227 | 1228 | // For state, naming convention is enforced to be NAMED_INDEX 1229 | int start_pos = state_name.find(deliminator); 1230 | op_index = std::atoi(state_name.substr(start_pos + 2).c_str()); 1231 | 1232 | const auto pr = ModelConfigDataTypeToTorchType(state_dtype); 1233 | if (!pr.first && (state_dtype != "TYPE_STRING")) { 1234 | return TRITONSERVER_ErrorNew( 1235 | TRITONSERVER_ERROR_INTERNAL, 1236 | ("unsupported datatype " + state_dtype + " for state '" + 1237 | state_name + "' for model '" + model_state_->Name() + "'") 1238 | .c_str()); 1239 | } 1240 | 1241 | // Validate shape for String outputs. Only allow 1 dimension. 1242 | if (state_dtype == "TYPE_STRING") { 1243 | if ((dims.size() + (supports_batching_ ? 
1 : 0)) > 1) { 1244 | return TRITONSERVER_ErrorNew( 1245 | TRITONSERVER_ERROR_INTERNAL, 1246 | ("Triton only supports 1 dimensional List of String as output " 1247 | "for " 1248 | "'" + 1249 | std::string(state_name) + "' for model '" + 1250 | model_state_->Name() + "'") 1251 | .c_str()); 1252 | } 1253 | } 1254 | 1255 | output_index_map_[state_name] = op_index; 1256 | output_dtype_map_[state_name] = ConvertTorchTypeToDataType(pr.second); 1257 | } 1258 | } 1259 | } 1260 | 1261 | return nullptr; // success 1262 | } 1263 | 1264 | void 1265 | ModelInstanceState::ProcessRequests( 1266 | TRITONBACKEND_Request** requests, const uint32_t request_count) 1267 | { 1268 | LOG_MESSAGE( 1269 | TRITONSERVER_LOG_VERBOSE, 1270 | (std::string("TRITONBACKEND_ModelExecute: Running ") + Name() + " with " + 1271 | std::to_string(request_count) + " requests") 1272 | .c_str()); 1273 | 1274 | #ifdef TRITON_ENABLE_GPU 1275 | if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) { 1276 | SetCurrentCudaStream(stream_, DeviceId()); 1277 | } else if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) { 1278 | // Replace the default stream of each device with the one we created. 1279 | for (size_t i = 0; i < stream_vec_.size(); i++) { 1280 | SetCurrentCudaStream(stream_vec_[i], i); 1281 | } 1282 | } 1283 | #endif 1284 | 1285 | NVTX_RANGE(nvtx_, "ProcessRequests " + Name()); 1286 | 1287 | uint64_t exec_start_ns = 0; 1288 | SET_TIMESTAMP(exec_start_ns); 1289 | 1290 | const int max_batch_size = model_state_->MaxBatchSize(); 1291 | 1292 | // For each request collect the total batch size for this inference 1293 | // execution. The batch-size, number of inputs, and size of each 1294 | // input has already been checked so don't need to do that here. 1295 | size_t total_batch_size = 0; 1296 | for (size_t i = 0; i < request_count; i++) { 1297 | // If we get a nullptr request then something is badly wrong. Fail 1298 | // and release all requests. 1299 | if (requests[i] == nullptr) { 1300 | RequestsRespondWithError( 1301 | requests, request_count, 1302 | TRITONSERVER_ErrorNew( 1303 | TRITONSERVER_ERROR_INTERNAL, 1304 | std::string( 1305 | "null request given to PyTorch backend for '" + Name() + "'") 1306 | .c_str())); 1307 | return; 1308 | } 1309 | } 1310 | 1311 | // At this point we are committed to running inference with all 1312 | // 'requests'. Create a response for each request. During input 1313 | // processing if there is an error with any request that error will 1314 | // be sent immediately with the corresponding response (and the 1315 | // response unique_ptr will then be nullptr). The request object 1316 | // itself will not be released until after all inferencing is done 1317 | // (below) as we may need to access the request object when 1318 | // determine how to process outputs (for example, even if we don't 1319 | // need the outputs for a request that has an error, we do need to 1320 | // know the size of those outputs associated with the request so we 1321 | // can skip them in the output tensors). 
1322 | std::vector responses; 1323 | responses.reserve(request_count); 1324 | bool all_response_failed = false; 1325 | 1326 | for (size_t i = 0; i < request_count; i++) { 1327 | TRITONBACKEND_Response* response; 1328 | auto err = TRITONBACKEND_ResponseNew(&response, requests[i]); 1329 | if (err == nullptr) { 1330 | responses.emplace_back(response); 1331 | } else { 1332 | responses.emplace_back(nullptr); 1333 | LOG_MESSAGE(TRITONSERVER_LOG_ERROR, "Fail to create response"); 1334 | TRITONSERVER_ErrorDelete(err); 1335 | } 1336 | } 1337 | 1338 | for (size_t i = 0; i < request_count; i++) { 1339 | if (max_batch_size > 0) { 1340 | // Retrieve the batch size from one of the inputs, if the model 1341 | // supports batching, the first dimension size is batch size. 1342 | TRITONBACKEND_Input* input; 1343 | TRITONSERVER_Error* err = 1344 | TRITONBACKEND_RequestInputByIndex(requests[i], 0 /* index */, &input); 1345 | if (err == nullptr) { 1346 | const int64_t* shape; 1347 | err = TRITONBACKEND_InputProperties( 1348 | input, nullptr, nullptr, &shape, nullptr, nullptr, nullptr); 1349 | total_batch_size += shape[0]; 1350 | } 1351 | if (err != nullptr) { 1352 | RESPOND_ALL_AND_SET_TRUE_IF_ERROR( 1353 | responses, request_count, all_response_failed, err); 1354 | } 1355 | } else { 1356 | total_batch_size += 1; 1357 | } 1358 | } 1359 | 1360 | // If there are no valid payloads then no need to run the inference. 1361 | if (total_batch_size == 0) { 1362 | return; 1363 | } 1364 | 1365 | // Make sure the maximum batch size is not exceeded. The 1366 | // total_batch_size must be 1 for models that don't support batching 1367 | // (i.e. max_batch_size == 0). If max_batch_size is exceeded then 1368 | // scheduler has done something badly wrong so fail and release all 1369 | // requests. 1370 | if (!all_response_failed) { 1371 | if ((total_batch_size != 1) && 1372 | (total_batch_size > (size_t)max_batch_size)) { 1373 | RESPOND_ALL_AND_SET_TRUE_IF_ERROR( 1374 | responses, request_count, all_response_failed, 1375 | TRITONSERVER_ErrorNew( 1376 | TRITONSERVER_ERROR_INTERNAL, 1377 | std::string( 1378 | "batch size " + std::to_string(total_batch_size) + " for '" + 1379 | Name() + "', max allowed is " + 1380 | std::to_string(max_batch_size)) 1381 | .c_str())); 1382 | } 1383 | } 1384 | 1385 | std::vector input_names; 1386 | std::vector input_tensors; 1387 | bool cuda_copy = false; 1388 | std::unique_ptr collector; 1389 | 1390 | // For 'KIND_MODEL', it's fine to use CUDA events to calculate the compute 1391 | // input duration since only one stream will be used for input collection. 
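  // Sketch of the event-based timing used below (simplified): an event is
  // recorded on the collection stream before gathering inputs, before
  // running inference, and before reading outputs; the elapsed time between
  // two events is measured in milliseconds and later scaled to nanoseconds
  // for the statistics report, roughly:
  //
  //   float ms = 0.f;
  //   cudaEventElapsedTime(&ms, start_event, end_event);  // milliseconds
  //   uint64_t ns = static_cast<uint64_t>(ms * 1e6);      // -> nanoseconds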
1392 | if ((Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) || 1393 | ((Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) && (device_cnt_ > 0))) { 1394 | #ifdef TRITON_ENABLE_GPU 1395 | RESPOND_ALL_AND_SET_TRUE_IF_ERROR( 1396 | responses, request_count, all_response_failed, 1397 | ConvertCUDAStatusToTritonError( 1398 | cudaEventRecord( 1399 | compute_input_start_event_, GetCudaStreamByInstanceKind()), 1400 | TRITONSERVER_ERROR_INTERNAL, "Failed to record the event.")); 1401 | #endif 1402 | } 1403 | 1404 | if (!all_response_failed) { 1405 | collector.reset(new BackendInputCollector( 1406 | requests, request_count, &responses, 1407 | model_state_->TritonMemoryManager(), model_state_->EnablePinnedInput(), 1408 | GetCudaStreamByInstanceKind(), nullptr, nullptr, 0, 1409 | HostPolicyName().c_str())); 1410 | RESPOND_ALL_AND_SET_TRUE_IF_ERROR( 1411 | responses, request_count, all_response_failed, 1412 | SetInputTensors( 1413 | total_batch_size, requests, request_count, &responses, 1414 | collector.get(), &input_names, &input_tensors, &cuda_copy)); 1415 | } 1416 | 1417 | #ifdef TRITON_ENABLE_GPU 1418 | if (cuda_copy) { 1419 | cudaStreamSynchronize(GetCudaStreamByInstanceKind()); 1420 | cuda_copy = false; 1421 | } 1422 | #endif 1423 | 1424 | std::vector output_tensors; 1425 | uint64_t compute_start_ns = 0; 1426 | uint64_t compute_infer_start = 0; 1427 | 1428 | RESPOND_ALL_AND_SET_TRUE_IF_ERROR( 1429 | responses, request_count, all_response_failed, 1430 | RecordBackendTimestamp( 1431 | &compute_start_ns, 1432 | reinterpret_cast(&compute_infer_start_event_))); 1433 | 1434 | // For 'KIND_MODEL', capture the timestamp for the compute infer duration. 1435 | if ((Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) && (device_cnt_ > 0)) { 1436 | SET_TIMESTAMP(compute_infer_start); 1437 | } 1438 | 1439 | // Run... 1440 | if (!all_response_failed) { 1441 | Execute(&responses, request_count, &input_tensors, &output_tensors); 1442 | } 1443 | 1444 | // Verify output indices are valid with number of outputs after execution 1445 | bool invalid_index = false; 1446 | int max_index = output_tensors.size() - 1; 1447 | 1448 | if (!all_response_failed) { 1449 | for (const auto& name : model_state_->ModelOutputs()) { 1450 | int op_index = output_index_map_[name.first]; 1451 | if ((op_index < 0) || (op_index > max_index)) { 1452 | RESPOND_ALL_AND_SET_TRUE_IF_ERROR( 1453 | responses, request_count, all_response_failed, 1454 | TRITONSERVER_ErrorNew( 1455 | TRITONSERVER_ERROR_INVALID_ARG, 1456 | std::string( 1457 | "The output " + std::string(name.first) + 1458 | " in the model configuration refers to an output index " 1459 | "which doesn't exist. This model has " + 1460 | std::to_string(max_index + 1) + " outputs") 1461 | .c_str())); 1462 | invalid_index = true; 1463 | break; 1464 | } 1465 | } 1466 | } 1467 | 1468 | #ifdef TRITON_ENABLE_GPU 1469 | if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) { 1470 | // For 'KIND_MODEL', multiple streams will be involved, so we need to call 1471 | // 'cudaStreamSynchronize' before reading the output tensors. 
1472 | for (auto& stream : stream_vec_) { 1473 | cudaStreamSynchronize(stream); 1474 | } 1475 | } 1476 | #endif 1477 | 1478 | uint64_t compute_end_ns = 0; 1479 | uint64_t compute_output_start = 0; 1480 | 1481 | if ((Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) && (device_cnt_ > 0)) { 1482 | #ifdef TRITON_ENABLE_GPU 1483 | SET_TIMESTAMP(compute_output_start); 1484 | #endif 1485 | } else { 1486 | RESPOND_ALL_AND_SET_TRUE_IF_ERROR( 1487 | responses, request_count, all_response_failed, 1488 | RecordBackendTimestamp( 1489 | &compute_end_ns, 1490 | reinterpret_cast(&compute_output_start_event_))); 1491 | } 1492 | 1493 | if (!all_response_failed) { 1494 | if (!invalid_index) { 1495 | RESPOND_ALL_AND_SET_TRUE_IF_ERROR( 1496 | responses, request_count, all_response_failed, 1497 | ReadOutputTensors( 1498 | total_batch_size, output_tensors, requests, request_count, 1499 | &responses)); 1500 | } 1501 | } 1502 | 1503 | uint64_t exec_end_ns = 0; 1504 | SET_TIMESTAMP(exec_end_ns); 1505 | 1506 | // Send all the responses that haven't already been sent because of 1507 | // an earlier error. Note that the responses are not set to nullptr 1508 | // here as we need that indication below to determine if the request 1509 | // we successful or not. 1510 | for (auto& response : responses) { 1511 | if (response != nullptr) { 1512 | LOG_IF_ERROR( 1513 | TRITONBACKEND_ResponseSend( 1514 | response, TRITONSERVER_RESPONSE_COMPLETE_FINAL, nullptr), 1515 | "failed to send PyTorch backend response"); 1516 | } 1517 | } 1518 | 1519 | // We don't need an explicit CUDA syncrhonization here since we have already 1520 | // synchronized the stream in the ReadOutputTensors function. 1521 | if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) { 1522 | #ifdef TRITON_ENABLE_GPU 1523 | float compute_input_duration = GetCudaEventElapsedTime( 1524 | compute_input_start_event_, compute_infer_start_event_); 1525 | float compute_infer_duration = GetCudaEventElapsedTime( 1526 | compute_infer_start_event_, compute_output_start_event_); 1527 | 1528 | compute_start_ns = exec_start_ns + (compute_input_duration * 1e6); 1529 | compute_end_ns = compute_start_ns + (compute_infer_duration * 1e6); 1530 | #endif 1531 | } else if ( 1532 | (Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) && (device_cnt_ > 0)) { 1533 | #ifdef TRITON_ENABLE_GPU 1534 | float compute_input_duration = GetCudaEventElapsedTime( 1535 | compute_input_start_event_, compute_infer_start_event_); 1536 | uint64_t compute_infer_duration = 1537 | compute_output_start - compute_infer_start; 1538 | 1539 | compute_start_ns = exec_start_ns + (compute_input_duration * 1e6); 1540 | compute_end_ns = compute_start_ns + compute_infer_duration; 1541 | #endif 1542 | } 1543 | 1544 | // Report statistics for each request. 1545 | for (uint32_t r = 0; r < request_count; ++r) { 1546 | auto& request = requests[r]; 1547 | LOG_IF_ERROR( 1548 | TRITONBACKEND_ModelInstanceReportStatistics( 1549 | TritonModelInstance(), request, 1550 | (responses[r] != nullptr) /* success */, exec_start_ns, 1551 | compute_start_ns, compute_end_ns, exec_end_ns), 1552 | "failed reporting request statistics"); 1553 | 1554 | LOG_IF_ERROR( 1555 | TRITONBACKEND_RequestRelease(request, TRITONSERVER_REQUEST_RELEASE_ALL), 1556 | "failed releasing request"); 1557 | } 1558 | 1559 | if (!all_response_failed) { 1560 | // Report the entire batch statistics. 
1561 | LOG_IF_ERROR( 1562 | TRITONBACKEND_ModelInstanceReportBatchStatistics( 1563 | TritonModelInstance(), total_batch_size, exec_start_ns, 1564 | compute_start_ns, compute_end_ns, exec_end_ns), 1565 | "failed reporting batch request statistics"); 1566 | } 1567 | } 1568 | 1569 | void 1570 | ModelInstanceState::Execute( 1571 | std::vector* responses, 1572 | const uint32_t response_count, 1573 | std::vector* input_tensors, 1574 | std::vector* output_tensors) 1575 | { 1576 | NVTX_RANGE(nvtx_, "Execute " + Name()); 1577 | 1578 | torch::jit::IValue model_outputs_; 1579 | 1580 | try { 1581 | // enable/disable optimized execution 1582 | torch::jit::setGraphExecutorOptimize( 1583 | model_state_->EnabledOptimizedExecution()); 1584 | 1585 | // enable/disable inference mode - supersedes NoGradGuard 1586 | torch::InferenceMode infer_guard(model_state_->EnabledInferenceMode()); 1587 | 1588 | // enable/disable cudnn 1589 | at::globalContext().setUserEnabledCuDNN(model_state_->EnabledCudnn()); 1590 | 1591 | // JIT. No change is made unless parameter is explicitly set. 1592 | if (std::get<0>(model_state_->EnabledJitProfiling())) { 1593 | torch::jit::getProfilingMode() = 1594 | std::get<1>(model_state_->EnabledJitProfiling()); 1595 | } 1596 | 1597 | if (std::get<0>(model_state_->EnabledJitExecutor())) { 1598 | torch::jit::getExecutorMode() = 1599 | std::get<1>(model_state_->EnabledJitExecutor()); 1600 | } 1601 | 1602 | // Fuser. No change is made unless fuser is explicitly set in 1603 | // parameters. 1604 | if (std::get<0>(model_state_->EnabledTensorExprFuser())) { 1605 | torch::jit::setTensorExprFuserEnabled( 1606 | std::get<1>(model_state_->EnabledTensorExprFuser())); 1607 | } 1608 | 1609 | torch::NoGradGuard no_grad; 1610 | 1611 | // If input is a dictionary, prepare dictionary from 'input_tensors'. 
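    // Sketch (input names are hypothetical): for a forward() that takes a
    // single Dict(str, Tensor), the configured input names become the
    // dictionary keys, so the call below behaves roughly like
    //   module.forward({{"image", tensor0}, {"mask", tensor1}})
    // with each tensor looked up in 'input_tensors' via 'input_index_map_'.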
1612 |     if (is_dict_input_) {
1613 |       torch::Dict<std::string, torch::Tensor> input_dict;
1614 |       for (auto& input_index : input_index_map_) {
1615 |         torch::jit::IValue ival = (*input_tensors)[input_index.second];
1616 |         input_dict.insert(input_index.first, ival.toTensor());
1617 |       }
1618 |       std::vector<torch::jit::IValue> input_dict_ivalue = {input_dict};
1619 |       model_outputs_ = torch_model_->forward(input_dict_ivalue);
1620 |     } else {
1621 |       model_outputs_ = torch_model_->forward(*input_tensors);
1622 |     }
1623 | 
1624 |     if (model_outputs_.isTuple()) {
1625 |       auto model_outputs_tuple = model_outputs_.toTuple();
1626 |       size_t op_index = 0;
1627 |       for (auto& m_op : model_outputs_tuple->elements()) {
1628 |         if (m_op.isList()) {
1629 |           auto list_output = m_op.toList();
1630 |           if (list_output.elementType()->kind() != c10::TypeKind::StringType) {
1631 |             throw std::invalid_argument(
1632 |                 "output at index " + std::to_string(op_index) +
1633 |                 " must be of type Tensor or List[str], received List[" +
1634 |                 list_output.elementType()->str() + "]");
1635 |           }
1636 |           output_tensors->push_back(m_op);
1637 |         } else {
1638 |           auto tensor_output = m_op.toTensor();
1639 |           output_tensors->push_back(m_op);
1640 |         }
1641 |         op_index++;
1642 |       }
1643 |     } else if (model_outputs_.isTensor()) {
1644 |       output_tensors->push_back(model_outputs_);
1645 |     } else if (model_outputs_.isList()) {
1646 |       auto list_output = model_outputs_.toList();
1647 |       if (list_output.elementType()->kind() != c10::TypeKind::StringType) {
1648 |         throw std::invalid_argument(
1649 |             "output must be of type Tensor or List[str], received List[" +
1650 |             list_output.elementType()->str() + "]");
1651 |       }
1652 |       output_tensors->push_back(model_outputs_);
1653 |     } else {
1654 |       throw std::invalid_argument(
1655 |           "output must be of type Tensor, List[str] or Tuple containing one of "
1656 |           "these two types. It should not be a List / Dictionary of Tensors or "
1657 |           "a Scalar");
1658 |     }
1659 |   }
1660 |   catch (std::exception& ex) {
1661 |     SendErrorForResponses(
1662 |         responses, response_count,
1663 |         TRITONSERVER_ErrorNew(
1664 |             TRITONSERVER_ERROR_INTERNAL,
1665 |             ("PyTorch execute failure: " + std::string(ex.what())).c_str()));
1666 |   }
1667 | }
1668 | 
1669 | TRITONSERVER_Error*
1670 | ModelInstanceState::GetNamingConvention(
1671 |     NamingConvention* naming_convention,
1672 |     const std::vector<std::string>& allowed_ios)
1673 | {
1674 |   // Rules for (non-Dictionary) input tensor names:
1675 |   // 1. Must be in 'allowed_inputs' (arguments in the forward function)
1676 |   // 2. Must follow the naming convention i.e. <name>__<index>
1677 |   // 3. If neither of the above conditions are satisfied, enforce strict
1678 |   //    ordering of model inputs.
1679 |   //
1680 |   // Rules for output tensor names:
1681 |   // 1. Must follow the naming convention i.e. <name>__<index>
1682 |   // 2. If not, we enforce strict ordering of model outputs.
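  // Concretely (tensor names below are hypothetical): for a model defined as
  // forward(self, input0, input1), the configured inputs may be named
  // "input0"/"input1" (FORWARD_ARGUMENT), "INPUT__0"/"INPUT__1" (NAMED_INDEX,
  // where the trailing integer selects the argument position), or anything
  // else, in which case the declaration order in the model configuration is
  // used (STRICT_CONFIG_ORDERING). Outputs such as "OUTPUT__0"/"OUTPUT__1"
  // select the corresponding element of the value returned by the model.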
1683 | std::string deliminator = "__"; 1684 | std::string io_kind = "input"; 1685 | *naming_convention = NamingConvention::FORWARD_ARGUMENT; 1686 | 1687 | // symbolizes output 1688 | if (allowed_ios.size() == 0) { 1689 | io_kind = "output"; 1690 | *naming_convention = NamingConvention::NAMED_INDEX; 1691 | } 1692 | 1693 | triton::common::TritonJson::Value ios; 1694 | RETURN_IF_ERROR( 1695 | model_state_->ModelConfig().MemberAsArray(io_kind.c_str(), &ios)); 1696 | 1697 | if (io_kind == "input") { 1698 | for (size_t i = 0; i < ios.ArraySize(); i++) { 1699 | triton::common::TritonJson::Value io; 1700 | RETURN_IF_ERROR(ios.IndexAsObject(i, &io)); 1701 | 1702 | // Validate name 1703 | std::string io_name; 1704 | RETURN_IF_ERROR(io.MemberAsString("name", &io_name)); 1705 | auto itr = std::find(allowed_ios.begin(), allowed_ios.end(), io_name); 1706 | if (itr == allowed_ios.end()) { 1707 | *naming_convention = NamingConvention::NAMED_INDEX; 1708 | break; 1709 | } 1710 | } 1711 | } 1712 | 1713 | // If not, check if inputs follow INDEX 1714 | if (*naming_convention == NamingConvention::NAMED_INDEX) { 1715 | for (size_t i = 0; i < ios.ArraySize(); i++) { 1716 | triton::common::TritonJson::Value io; 1717 | RETURN_IF_ERROR(ios.IndexAsObject(i, &io)); 1718 | 1719 | // Validate name 1720 | std::string io_name; 1721 | RETURN_IF_ERROR(io.MemberAsString("name", &io_name)); 1722 | int start_pos = io_name.find(deliminator); 1723 | if (start_pos == -1) { 1724 | *naming_convention = NamingConvention::STRICT_CONFIG_ORDERING; 1725 | break; 1726 | } else { 1727 | // check if the index part of the name is not an integer 1728 | std::string index_str = io_name.substr(start_pos + 2); 1729 | bool is_int = true; 1730 | for (auto itr = index_str.begin(); itr != index_str.end(); itr++) { 1731 | if (std::isdigit(*itr) == 0) { 1732 | is_int = false; 1733 | } 1734 | } 1735 | 1736 | if (!is_int) { 1737 | if (io_kind == "input") { 1738 | LOG_MESSAGE( 1739 | TRITONSERVER_LOG_WARN, 1740 | ("input '" + io_name + 1741 | "' or previous input(s) are neither an input argument to the " 1742 | "model '" + 1743 | model_state_->Name() + 1744 | "' nor do they follow the __ naming convention. " 1745 | "Falling back to enforcing strict ordering from model " 1746 | "configuration.") 1747 | .c_str()); 1748 | } else { 1749 | LOG_MESSAGE( 1750 | TRITONSERVER_LOG_WARN, 1751 | ("output '" + io_name + 1752 | "' or previous output(s) of the model '" + 1753 | model_state_->Name() + 1754 | "' do not follow the __ naming convention. " 1755 | "Falling back to enforcing strict ordering from model " 1756 | "configuration.") 1757 | .c_str()); 1758 | } 1759 | *naming_convention = NamingConvention::STRICT_CONFIG_ORDERING; 1760 | break; 1761 | } 1762 | } 1763 | } 1764 | } 1765 | 1766 | triton::common::TritonJson::Value sequence_batching; 1767 | if (model_state_->ModelConfig().Find( 1768 | "sequence_batching", &sequence_batching)) { 1769 | // If we need to manage state for the model, then we need to check 1770 | // the naming of the state adheres to both the input and output conventions 1771 | triton::common::TritonJson::Value states; 1772 | if (sequence_batching.Find("state", &states)) { 1773 | if (*naming_convention != NamingConvention::NAMED_INDEX) { 1774 | return TRITONSERVER_ErrorNew( 1775 | TRITONSERVER_ERROR_INVALID_ARG, 1776 | ("PyTorch model '" + model_state_->Name() + 1777 | "' is using sequence batching with state but not all inputs and " 1778 | "outputs follow the __ naming convention. 
") 1779 | .c_str()); 1780 | } 1781 | } 1782 | 1783 | for (size_t i = 0; i < states.ArraySize(); i++) { 1784 | triton::common::TritonJson::Value state; 1785 | RETURN_IF_ERROR(states.IndexAsObject(i, &state)); 1786 | std::string name_entry = 1787 | io_kind == "input" ? "input_name" : "output_name"; 1788 | std::string state_name; 1789 | RETURN_IF_ERROR(state.MemberAsString(name_entry.c_str(), &state_name)); 1790 | int start_pos = state_name.find(deliminator); 1791 | if (start_pos == -1) { 1792 | return TRITONSERVER_ErrorNew( 1793 | TRITONSERVER_ERROR_INVALID_ARG, 1794 | ("PyTorch model '" + model_state_->Name() + 1795 | "' is using sequence batching with state but state '" + 1796 | state_name + 1797 | "' does not follow the __ naming convention. ") 1798 | .c_str()); 1799 | } else { 1800 | // check if the index part of the name is not an integer 1801 | std::string index_str = state_name.substr(start_pos + 2); 1802 | bool is_int = true; 1803 | for (auto itr = index_str.begin(); itr != index_str.end(); itr++) { 1804 | if (std::isdigit(*itr) == 0) { 1805 | is_int = false; 1806 | } 1807 | } 1808 | if (!is_int) { 1809 | return TRITONSERVER_ErrorNew( 1810 | TRITONSERVER_ERROR_INVALID_ARG, 1811 | ("PyTorch model '" + model_state_->Name() + 1812 | "' is using sequence batching with state but state '" + 1813 | state_name + 1814 | "' does not follow the __ naming convention. ") 1815 | .c_str()); 1816 | } 1817 | } 1818 | } 1819 | } 1820 | 1821 | return nullptr; // success 1822 | } 1823 | 1824 | // This function will return a tensor's contents as a contiguous 1825 | // chunk in system memory. In some cases this will require copying the data. 1826 | // If that happens, 'contiguous_buffer' will be set to hold the contiguous 1827 | // chunk and 'cuda_copy' will be set to indicate whether CUDA copy is 1828 | // conducted. The data copy can be avoided if the input is already in 1829 | // a contiguous chunk and the input is located in memory type and id 1830 | // specified. 
1831 | TRITONSERVER_Error* 1832 | GetContiguousInputContent( 1833 | TRITONBACKEND_Input* rinput, const uint32_t buffer_count, 1834 | const char** content, size_t* content_byte_size, 1835 | std::vector* contiguous_buffer, cudaStream_t stream, bool* cuda_copy) 1836 | { 1837 | *cuda_copy = false; 1838 | 1839 | // Check input buffers to see if data copy is necessary 1840 | size_t chunk_count = 0; 1841 | bool type_mismatch = false; 1842 | uint64_t total_byte_size = 0; 1843 | for (size_t idx = 0; idx < buffer_count; ++idx) { 1844 | TRITONSERVER_MemoryType src_memory_type; 1845 | int64_t src_memory_type_id; 1846 | size_t src_byte_size; 1847 | const void* src_ptr; 1848 | 1849 | RETURN_IF_ERROR(TRITONBACKEND_InputBuffer( 1850 | rinput, idx, &src_ptr, &src_byte_size, &src_memory_type, 1851 | &src_memory_type_id)); 1852 | 1853 | if (src_ptr != nullptr) { 1854 | chunk_count++; 1855 | total_byte_size += src_byte_size; 1856 | type_mismatch |= (src_memory_type == TRITONSERVER_MEMORY_GPU); 1857 | } 1858 | } 1859 | 1860 | if (chunk_count == 0) { 1861 | *content = nullptr; 1862 | *content_byte_size = 0; 1863 | } else if ((chunk_count == 1) && !type_mismatch) { 1864 | TRITONSERVER_MemoryType src_memory_type; 1865 | int64_t src_memory_type_id; 1866 | RETURN_IF_ERROR(TRITONBACKEND_InputBuffer( 1867 | rinput, 0, (const void**)content, content_byte_size, &src_memory_type, 1868 | &src_memory_type_id)); 1869 | } else { 1870 | contiguous_buffer->resize(total_byte_size); 1871 | 1872 | size_t offset = 0; 1873 | for (size_t i = 0; i < chunk_count; i++) { 1874 | bool cuda_used; 1875 | TRITONSERVER_MemoryType src_memory_type; 1876 | int64_t src_memory_type_id; 1877 | size_t src_byte_size; 1878 | const void* src_ptr; 1879 | 1880 | RETURN_IF_ERROR(TRITONBACKEND_InputBuffer( 1881 | rinput, i, &src_ptr, &src_byte_size, &src_memory_type, 1882 | &src_memory_type_id)); 1883 | RETURN_IF_ERROR(CopyBuffer( 1884 | "Contiguous input", src_memory_type, src_memory_type_id, 1885 | TRITONSERVER_MEMORY_CPU, 0, src_byte_size, src_ptr, 1886 | contiguous_buffer->data() + offset, stream, &cuda_used)); 1887 | *cuda_copy |= cuda_used; 1888 | offset += src_byte_size; 1889 | } 1890 | 1891 | *content = contiguous_buffer->data(); 1892 | *content_byte_size = total_byte_size; 1893 | } 1894 | 1895 | return nullptr; // success 1896 | } 1897 | 1898 | void 1899 | FillStringTensor(torch::List* input_list, const size_t cnt) 1900 | { 1901 | for (size_t c = 0; c < cnt; ++c) { 1902 | input_list->push_back(""); 1903 | } 1904 | } 1905 | 1906 | bool 1907 | SetStringInputTensor( 1908 | torch::List* input_list, TRITONBACKEND_Input* input, 1909 | const char* name, const uint32_t buffer_count, 1910 | const size_t request_element_cnt, TRITONBACKEND_Response** response, 1911 | cudaStream_t stream, const char* host_policy_name) 1912 | { 1913 | bool cuda_copy = false; 1914 | 1915 | // For string data type, we always need to have the data on CPU so 1916 | // that we can read string length and construct the string 1917 | // properly. So if the request's input tensor is not in CPU need to 1918 | // copy it there. 
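The BYTES (string) inputs handled here use Triton's standard serialization: each element is a 4-byte length followed by the raw bytes, with no terminator. A minimal parser, assuming the server's native little-endian byte order:

    import struct

    def deserialize_bytes_tensor(buf: bytes) -> list[bytes]:
        elements, offset = [], 0
        while offset < len(buf):
            (length,) = struct.unpack_from("<I", buf, offset)  # 4-byte element length
            offset += 4
            elements.append(buf[offset:offset + length])       # raw bytes, no terminator
            offset += length
        return elements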
1919 | const char* content = nullptr; 1920 | size_t content_byte_size = 0; 1921 | 1922 | std::vector contiguous_buffer; 1923 | auto err = GetContiguousInputContent( 1924 | input, buffer_count, &content, &content_byte_size, &contiguous_buffer, 1925 | stream, &cuda_copy); 1926 | if (err != nullptr) { 1927 | RESPOND_AND_SET_NULL_IF_ERROR(response, err); 1928 | FillStringTensor(input_list, request_element_cnt); 1929 | return cuda_copy; 1930 | } 1931 | 1932 | #ifdef TRITON_ENABLE_GPU 1933 | if (cuda_copy) { 1934 | cudaStreamSynchronize(stream); 1935 | cuda_copy = false; 1936 | } 1937 | #endif // TRITON_ENABLE_GPU 1938 | 1939 | std::vector> str_list; 1940 | err = ValidateStringBuffer( 1941 | content, content_byte_size, request_element_cnt, name, &str_list); 1942 | // Set string values. 1943 | for (const auto& [addr, len] : str_list) { 1944 | input_list->push_back(std::string(addr, len)); 1945 | } 1946 | 1947 | size_t element_cnt = str_list.size(); 1948 | if (err != nullptr) { 1949 | RESPOND_AND_SET_NULL_IF_ERROR(response, err); 1950 | FillStringTensor(input_list, request_element_cnt - element_cnt); 1951 | } 1952 | return cuda_copy; 1953 | } 1954 | 1955 | bool 1956 | SetStringBuffer( 1957 | torch::List* tensor, TRITONBACKEND_Response** response, 1958 | TRITONBACKEND_Output* response_output, TRITONBACKEND_State* response_state, 1959 | const size_t tensor_element_count, cudaStream_t stream, 1960 | std::string* serialized, bool state) 1961 | { 1962 | bool cuda_copy = false; 1963 | 1964 | // Serialize the output tensor strings. Each string is serialized as 1965 | // a 4-byte length followed by the string itself with no 1966 | // null-terminator. 1967 | serialized->clear(); 1968 | for (size_t e = 0; e < tensor_element_count; ++e) { 1969 | std::string str = tensor->get(e).to(); 1970 | const char* cstr = str.c_str(); 1971 | size_t len = str.length(); 1972 | serialized->append(reinterpret_cast(&len), sizeof(uint32_t)); 1973 | if (len > 0) { 1974 | serialized->append(cstr, len); 1975 | } 1976 | } 1977 | 1978 | // Allocate a buffer large enough to hold the serialized tensor. 1979 | TRITONSERVER_MemoryType actual_memory_type = TRITONSERVER_MEMORY_CPU; 1980 | int64_t actual_memory_type_id = 0; 1981 | 1982 | TRITONSERVER_Error* err; 1983 | void* buffer; 1984 | 1985 | if (!state) { 1986 | auto err = TRITONBACKEND_OutputBuffer( 1987 | response_output, &buffer, serialized->size(), &actual_memory_type, 1988 | &actual_memory_type_id); 1989 | if (err != nullptr) { 1990 | RESPOND_AND_SET_NULL_IF_ERROR(response, err); 1991 | return cuda_copy; 1992 | } 1993 | } else { 1994 | auto err = TRITONBACKEND_StateBuffer( 1995 | response_state, &buffer, serialized->size(), &actual_memory_type, 1996 | &actual_memory_type_id); 1997 | if (err != nullptr) { 1998 | RESPOND_AND_SET_NULL_IF_ERROR(response, err); 1999 | return cuda_copy; 2000 | } 2001 | } 2002 | // Copy the serialized tensor into the allocated buffer. 
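The inverse of the parsing sketch shown earlier for inputs: serialize a list of byte strings the same way this function does, as a 4-byte length followed by the bytes of each element (again assuming little-endian):

    import struct

    def serialize_bytes_tensor(elements: list[bytes]) -> bytes:
        out = bytearray()
        for e in elements:
            out += struct.pack("<I", len(e))  # 4-byte length prefix
            out += e                          # string payload, no null terminator
        return bytes(out)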
2003 | bool cuda_used = false; 2004 | err = CopyBuffer( 2005 | "String output", TRITONSERVER_MEMORY_CPU /* src_memory_type */, 2006 | 0 /* src_memory_type_id */, actual_memory_type, actual_memory_type_id, 2007 | serialized->size(), reinterpret_cast(serialized->c_str()), 2008 | buffer, stream, &cuda_used); 2009 | cuda_copy |= cuda_used; 2010 | 2011 | if (err != nullptr) { 2012 | RESPOND_AND_SET_NULL_IF_ERROR(response, err); 2013 | return cuda_copy; 2014 | } 2015 | 2016 | if (state) { 2017 | RESPOND_AND_SET_NULL_IF_ERROR( 2018 | response, TRITONBACKEND_StateUpdate(response_state)); 2019 | } 2020 | 2021 | return cuda_copy; 2022 | } 2023 | 2024 | 2025 | bool 2026 | SetStringOutputBuffer( 2027 | torch::List* tensor, TRITONBACKEND_Response** response, 2028 | TRITONBACKEND_Output* response_output, const size_t tensor_element_count, 2029 | cudaStream_t stream, std::string* serialized) 2030 | { 2031 | return SetStringBuffer( 2032 | tensor, response, response_output, nullptr /* response_state */, 2033 | tensor_element_count, stream, serialized, false /* state */); 2034 | } 2035 | 2036 | bool 2037 | SetStringStateBuffer( 2038 | torch::List* tensor, TRITONBACKEND_Response** response, 2039 | TRITONBACKEND_State* response_state, const size_t tensor_element_count, 2040 | cudaStream_t stream, std::string* serialized) 2041 | { 2042 | return SetStringBuffer( 2043 | tensor, response, nullptr /* response_output */, response_state, 2044 | tensor_element_count, stream, serialized, true /* state */); 2045 | } 2046 | 2047 | 2048 | TRITONSERVER_Error* 2049 | ModelInstanceState::SetInputTensors( 2050 | size_t total_batch_size, TRITONBACKEND_Request** requests, 2051 | const uint32_t request_count, 2052 | std::vector* responses, 2053 | BackendInputCollector* collector, std::vector* input_names, 2054 | std::vector* input_tensors, bool* cuda_copy) 2055 | { 2056 | // InferenceMode should be used to guard all tensors operations 2057 | torch::InferenceMode infer_guard(model_state_->EnabledInferenceMode()); 2058 | 2059 | // All requests must have equally-sized input tensors so use any 2060 | // request as the representative for the input tensors. 2061 | uint32_t input_count; 2062 | RETURN_IF_ERROR(TRITONBACKEND_RequestInputCount(requests[0], &input_count)); 2063 | 2064 | input_tensors->resize(input_count + batch_input_count_); 2065 | 2066 | // The inputs must be in contiguous CPU/GPU memory. 2067 | std::vector> alloc_perference; 2068 | if (device_.is_cpu()) { 2069 | alloc_perference = { 2070 | {TRITONSERVER_MEMORY_CPU_PINNED, 0}, {TRITONSERVER_MEMORY_CPU, 0}}; 2071 | } else { 2072 | alloc_perference = {{TRITONSERVER_MEMORY_GPU, device_.index()}}; 2073 | } 2074 | 2075 | for (uint32_t input_idx = 0; input_idx < input_count; input_idx++) { 2076 | TRITONBACKEND_Input* input; 2077 | RETURN_IF_ERROR( 2078 | TRITONBACKEND_RequestInputByIndex(requests[0], input_idx, &input)); 2079 | 2080 | const char* input_name; 2081 | TRITONSERVER_DataType input_datatype; 2082 | const int64_t* input_shape; 2083 | uint32_t input_dims_count; 2084 | RETURN_IF_ERROR(TRITONBACKEND_InputProperties( 2085 | input, &input_name, &input_datatype, &input_shape, &input_dims_count, 2086 | nullptr, nullptr)); 2087 | 2088 | input_names->emplace_back(input_name); 2089 | 2090 | // The shape for the entire input patch, 2091 | // [total_batch_size, ...] 
for non-ragged input and 2092 | // [total_element_count] for ragged input (non-nested tensor) 2093 | std::vector batchn_shape; 2094 | if (StateForModel()->IsInputRagged(input_name)) { 2095 | batchn_shape = std::vector{0}; 2096 | for (size_t idx = 0; idx < request_count; idx++) { 2097 | TRITONBACKEND_Input* input; 2098 | RESPOND_AND_SET_NULL_IF_ERROR( 2099 | &((*responses)[idx]), 2100 | TRITONBACKEND_RequestInput(requests[idx], input_name, &input)); 2101 | const int64_t* input_shape; 2102 | uint32_t input_dims_count; 2103 | RESPOND_AND_SET_NULL_IF_ERROR( 2104 | &((*responses)[idx]), TRITONBACKEND_InputProperties( 2105 | input, nullptr, nullptr, &input_shape, 2106 | &input_dims_count, nullptr, nullptr)); 2107 | 2108 | int64_t element_cnt = 0; 2109 | RESPOND_AND_SET_NULL_IF_ERROR( 2110 | &((*responses)[idx]), 2111 | GetElementCount(input_shape, input_dims_count, &element_cnt)); 2112 | batchn_shape[0] += element_cnt; 2113 | } 2114 | } else { 2115 | batchn_shape = 2116 | std::vector(input_shape, input_shape + input_dims_count); 2117 | if (supports_batching_) { 2118 | batchn_shape[0] = total_batch_size; 2119 | } 2120 | } 2121 | 2122 | // The input must be in contiguous CPU/GPU memory. 2123 | std::vector> alloc_perference; 2124 | // For 'KIND_MODEL', input will always be in CPU as we don't have a way to 2125 | // query the input types. 2126 | if (device_.is_cpu() || (Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL)) { 2127 | alloc_perference = { 2128 | {TRITONSERVER_MEMORY_CPU_PINNED, 0}, {TRITONSERVER_MEMORY_CPU, 0}}; 2129 | } else { 2130 | alloc_perference = {{TRITONSERVER_MEMORY_GPU, device_.index()}}; 2131 | } 2132 | 2133 | const char* input_buffer; 2134 | size_t batchn_byte_size; 2135 | TRITONSERVER_MemoryType memory_type; 2136 | int64_t memory_type_id; 2137 | RETURN_IF_ERROR(collector->ProcessTensor( 2138 | input_name, nullptr, 0, alloc_perference, &input_buffer, 2139 | &batchn_byte_size, &memory_type, &memory_type_id)); 2140 | 2141 | // Create Torch tensor 2142 | const auto torch_dtype = ConvertDataTypeToTorchType(input_datatype); 2143 | torch::TensorOptions options{torch_dtype.second}; 2144 | auto updated_options = (memory_type == TRITONSERVER_MEMORY_GPU) 2145 | ? options.device(torch::kCUDA, device_.index()) 2146 | : options.device(torch::kCPU); 2147 | 2148 | if (input_datatype == TRITONSERVER_TYPE_BYTES) { 2149 | // Create the PyTorch list to hold the strings. 
2150 | torch::List input_list; 2151 | input_list.reserve(batchn_shape[0]); 2152 | 2153 | for (size_t idx = 0; idx < request_count; idx++) { 2154 | TRITONBACKEND_Input* input; 2155 | RESPOND_AND_SET_NULL_IF_ERROR( 2156 | &((*responses)[idx]), 2157 | TRITONBACKEND_RequestInput(requests[idx], input_name, &input)); 2158 | const int64_t* shape; 2159 | uint32_t dims_count; 2160 | uint32_t buffer_count; 2161 | RESPOND_AND_SET_NULL_IF_ERROR( 2162 | &((*responses)[idx]), 2163 | TRITONBACKEND_InputPropertiesForHostPolicy( 2164 | input, HostPolicyName().c_str(), nullptr, nullptr, &shape, 2165 | &dims_count, nullptr, &buffer_count)); 2166 | 2167 | int64_t batch_element_cnt = 0; 2168 | RESPOND_AND_SET_NULL_IF_ERROR( 2169 | &((*responses)[idx]), 2170 | GetElementCount(shape, dims_count, &batch_element_cnt)); 2171 | 2172 | *cuda_copy |= SetStringInputTensor( 2173 | &input_list, input, input_name, buffer_count, batch_element_cnt, 2174 | &((*responses)[idx]), GetCudaStreamByInstanceKind(), 2175 | HostPolicyName().c_str()); 2176 | } 2177 | 2178 | (*input_tensors)[input_index_map_[input_name]] = input_list; 2179 | } else { 2180 | if (batchn_byte_size) { 2181 | // Remove constness to align with the signature of torch::from_blob() 2182 | torch::Tensor input_tensor = torch::from_blob( 2183 | const_cast(input_buffer), batchn_shape, updated_options); 2184 | (*input_tensors)[input_index_map_[input_name]] = input_tensor; 2185 | } else { 2186 | // torch:from_blob seems not working when the input size is 0 2187 | // create zero-length inputs directly 2188 | torch::Tensor input_tensor = 2189 | torch::zeros(batchn_shape, updated_options); 2190 | (*input_tensors)[input_index_map_[input_name]] = input_tensor; 2191 | } 2192 | } 2193 | } 2194 | 2195 | for (const auto& batch_input : StateForModel()->BatchInputs()) { 2196 | std::vector shape; 2197 | collector->BatchInputShape(batch_input, &shape); 2198 | 2199 | for (const auto& input_name : batch_input.TargetNames()) { 2200 | input_names->emplace_back(input_name.c_str()); 2201 | 2202 | const char* dst_buffer; 2203 | size_t dst_buffer_byte_size; 2204 | TRITONSERVER_MemoryType dst_memory_type; 2205 | int64_t dst_memory_type_id; 2206 | 2207 | RESPOND_ALL_AND_SET_NULL_IF_ERROR( 2208 | (*responses), responses->size(), 2209 | collector->ProcessBatchInput( 2210 | batch_input, nullptr, 0, alloc_perference, &dst_buffer, 2211 | &dst_buffer_byte_size, &dst_memory_type, &dst_memory_type_id)); 2212 | 2213 | const auto torch_dtype = 2214 | ConvertDataTypeToTorchType(batch_input.DataType()); 2215 | torch::TensorOptions options{torch_dtype.second}; 2216 | auto updated_options = (dst_memory_type == TRITONSERVER_MEMORY_GPU) 2217 | ? options.device(torch::kCUDA, device_.index()) 2218 | : options.device(torch::kCPU); 2219 | 2220 | if (dst_buffer_byte_size) { 2221 | torch::Tensor input_tensor = torch::from_blob( 2222 | const_cast(dst_buffer), shape, updated_options); 2223 | (*input_tensors)[input_index_map_[input_name]] = input_tensor; 2224 | } else { 2225 | // special handle when input has zero size 2226 | torch::Tensor input_tensor = torch::zeros(shape, updated_options); 2227 | (*input_tensors)[input_index_map_[input_name]] = input_tensor; 2228 | } 2229 | } 2230 | } 2231 | 2232 | // Finalize... 
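The non-string path above wraps the buffer gathered by the collector in a tensor without copying (torch::from_blob), and falls back to an explicit zero-size tensor when the buffer is empty. A rough Python analogue using torch.frombuffer, with a hypothetical float32 buffer:

    import torch

    def tensor_from_buffer(buf: bytearray, shape, dtype=torch.float32):
        if len(buf) == 0:
            return torch.zeros(shape, dtype=dtype)  # mirrors the torch::zeros special case
        # frombuffer shares memory with 'buf' rather than copying it
        return torch.frombuffer(buf, dtype=dtype).reshape(shape)

    t = tensor_from_buffer(bytearray(4 * 6), (2, 3))  # six float32 elements, shape [2, 3]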
2233 | *cuda_copy |= collector->Finalize(); 2234 | 2235 | return nullptr; 2236 | } 2237 | 2238 | TRITONSERVER_Error* 2239 | ModelInstanceState::ReadOutputTensors( 2240 | size_t total_batch_size, 2241 | const std::vector& output_tensors, 2242 | TRITONBACKEND_Request** requests, const uint32_t request_count, 2243 | std::vector* responses) 2244 | { 2245 | NVTX_RANGE(nvtx_, "ReadOutputTensors " + Name()); 2246 | 2247 | BackendOutputResponder responder( 2248 | requests, request_count, responses, model_state_->TritonMemoryManager(), 2249 | model_state_->MaxBatchSize() > 0, model_state_->EnablePinnedInput(), 2250 | GetCudaStreamByInstanceKind()); 2251 | 2252 | bool cuda_copy = false; 2253 | // The serialized string buffer must be valid until output copies are done 2254 | std::vector> string_buffer; 2255 | for (auto& output : model_state_->ModelOutputs()) { 2256 | int op_index = output_index_map_[output.first]; 2257 | auto name = output.first; 2258 | auto output_tensor_pair = output.second; 2259 | 2260 | if (output_tensors[op_index].isTensor()) { 2261 | torch::Tensor output_flat; 2262 | try { 2263 | output_flat = 2264 | output_tensors[op_index].toTensor().contiguous().flatten(); 2265 | } 2266 | catch (std::exception& ex) { 2267 | RETURN_IF_ERROR(TRITONSERVER_ErrorNew( 2268 | TRITONSERVER_ERROR_INTERNAL, 2269 | (std::string("output tensor '") + name + "' is not found") 2270 | .c_str())); 2271 | } 2272 | 2273 | // Verify output datatype matches datatype from model config 2274 | TRITONSERVER_DataType output_dtype = 2275 | ConvertTorchTypeToDataType(output_flat.scalar_type()); 2276 | TRITONSERVER_DataType config_datatype = output_dtype_map_[name]; 2277 | if (config_datatype != output_dtype) { 2278 | RETURN_IF_ERROR(TRITONSERVER_ErrorNew( 2279 | TRITONSERVER_ERROR_INVALID_ARG, 2280 | (std::string("configuration expects datatype TYPE_") + 2281 | TRITONSERVER_DataTypeString(config_datatype) + " for output '" + 2282 | name + "', model provides TYPE_" + 2283 | TRITONSERVER_DataTypeString(output_dtype)) 2284 | .c_str())); 2285 | } 2286 | 2287 | const char* output_buffer = 2288 | static_cast(output_flat.data_ptr()); 2289 | 2290 | // Output tensors may not reside on the same device as model 2291 | torch::Device tensor_device = output_flat.device(); 2292 | const auto memory_type = (tensor_device.type() == torch::kCPU) 2293 | ? TRITONSERVER_MEMORY_CPU 2294 | : TRITONSERVER_MEMORY_GPU; 2295 | const auto memory_id = 2296 | (tensor_device.type() == torch::kCPU) ? 
0 : tensor_device.index(); 2297 | 2298 | // Batch output doesn't support string data type yet, as it is not trivial 2299 | // to parse string output 2300 | const BatchOutput* batch_output = StateForModel()->FindBatchOutput(name); 2301 | if (batch_output == nullptr) { 2302 | // Get output shape 2303 | std::vector batchn_shape; 2304 | auto shape = output_tensors[op_index].toTensor().sizes(); 2305 | for (auto itr = shape.begin(); itr != shape.end(); itr++) { 2306 | batchn_shape.push_back(*itr); 2307 | } 2308 | 2309 | if (batchn_shape.size() == 0) { 2310 | return TRITONSERVER_ErrorNew( 2311 | TRITONSERVER_ERROR_INVALID_ARG, 2312 | (std::string("output '") + name + 2313 | "' is a scalar which is not supported.") 2314 | .c_str()); 2315 | } 2316 | if (output_tensor_pair.first != -1) { 2317 | responder.ProcessTensor( 2318 | name, output_dtype, batchn_shape, output_buffer, memory_type, 2319 | memory_id); 2320 | } 2321 | if (output_tensor_pair.second != -1) { 2322 | std::vector states; 2323 | states = responder.ProcessStateTensor( 2324 | name, output_dtype, batchn_shape, output_buffer, memory_type, 2325 | memory_id); 2326 | // Update the states 2327 | for (auto& state : states) { 2328 | RETURN_IF_ERROR(TRITONBACKEND_StateUpdate(state)); 2329 | } 2330 | } 2331 | 2332 | } else { 2333 | responder.ProcessBatchOutput( 2334 | name, *batch_output, output_buffer, memory_type, memory_id); 2335 | } 2336 | } else if (output_tensors[op_index].isList()) { 2337 | // Custom handling for string/bytes tensor... 2338 | torch::List output_list = 2339 | output_tensors[op_index].toList(); 2340 | 2341 | // Get output shape 2342 | std::vector batchn_shape{(int64_t)output_list.size()}; 2343 | 2344 | for (size_t idx = 0; idx < responses->size(); idx++) { 2345 | auto& request = requests[idx]; 2346 | auto& response = (*responses)[idx]; 2347 | 2348 | if (supports_batching_ != 0) { 2349 | TRITONBACKEND_Input* input; 2350 | TRITONBACKEND_RequestInputByIndex(request, 0 /* index*/, &input); 2351 | const int64_t* shape; 2352 | TRITONBACKEND_InputProperties( 2353 | input, nullptr, nullptr, &shape, nullptr, nullptr, nullptr); 2354 | batchn_shape[0] = shape[0]; 2355 | } 2356 | 2357 | int64_t tensor_element_cnt = 0; 2358 | RETURN_IF_ERROR(GetElementCount(batchn_shape, &tensor_element_cnt)); 2359 | 2360 | // Only need an response tensor for requested outputs. 
2361 | if (response != nullptr) { 2362 | if (output_tensor_pair.first != -1) { 2363 | TRITONBACKEND_Output* response_output; 2364 | RESPOND_AND_SET_NULL_IF_ERROR( 2365 | &response, TRITONBACKEND_ResponseOutput( 2366 | response, &response_output, name.c_str(), 2367 | TRITONSERVER_TYPE_BYTES, batchn_shape.data(), 2368 | batchn_shape.size())); 2369 | string_buffer.emplace_back(new std::string()); 2370 | cuda_copy |= SetStringOutputBuffer( 2371 | &output_list, &response, response_output, tensor_element_cnt, 2372 | GetCudaStreamByInstanceKind(), string_buffer.back().get()); 2373 | } 2374 | } 2375 | if (output_tensor_pair.second != -1) { 2376 | TRITONBACKEND_State* response_state; 2377 | RESPOND_AND_SET_NULL_IF_ERROR( 2378 | &response, TRITONBACKEND_StateNew( 2379 | &response_state, request, name.c_str(), 2380 | TRITONSERVER_TYPE_BYTES, batchn_shape.data(), 2381 | batchn_shape.size())); 2382 | 2383 | string_buffer.emplace_back(new std::string()); 2384 | cuda_copy |= SetStringStateBuffer( 2385 | &output_list, &response, response_state, tensor_element_cnt, 2386 | GetCudaStreamByInstanceKind(), string_buffer.back().get()); 2387 | } 2388 | } 2389 | } else { 2390 | return TRITONSERVER_ErrorNew( 2391 | TRITONSERVER_ERROR_INVALID_ARG, 2392 | (std::string("output '") + name + 2393 | "' must be of type Tensor or List[str].") 2394 | .c_str()); 2395 | } 2396 | } 2397 | 2398 | // Finalize and wait for any pending buffer copies. 2399 | cuda_copy |= responder.Finalize(); 2400 | 2401 | #ifdef TRITON_ENABLE_GPU 2402 | // We have to always synchronize the stream. This is to make sure that 2403 | // the events on the cuda stream are synchronized. Otherwise, the events 2404 | // are only guaranteed to be synchronized if the model provides the output 2405 | // on GPU. 2406 | cudaStreamSynchronize(GetCudaStreamByInstanceKind()); 2407 | #endif 2408 | 2409 | return nullptr; 2410 | } 2411 | 2412 | TRITONSERVER_Error* 2413 | ModelInstanceState::RecordBackendTimestamp( 2414 | uint64_t* timestamp, void* cuda_event) 2415 | { 2416 | if ((Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) || 2417 | ((Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) && (device_cnt_ > 0))) { 2418 | #ifdef TRITON_ENABLE_GPU 2419 | cudaEvent_t* lcuda_event = reinterpret_cast(cuda_event); 2420 | RETURN_IF_ERROR(ConvertCUDAStatusToTritonError( 2421 | cudaEventRecord(*lcuda_event, GetCudaStreamByInstanceKind()), 2422 | TRITONSERVER_ERROR_INTERNAL, "Failed to record the event.")); 2423 | #endif 2424 | } else { 2425 | SET_TIMESTAMP(*timestamp); 2426 | } 2427 | return nullptr; 2428 | } 2429 | 2430 | void 2431 | ModelInstanceState::CreateCudaEvents(const int32_t& device_id) 2432 | { 2433 | #ifdef TRITON_ENABLE_GPU 2434 | // Need to set the CUDA context so that the context that events are 2435 | // created on match with contexts that events are recorded with. 
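The events created here are later used to time the input/inference/output phases on the instance's stream. The same pattern is available from Python, which can be handy when cross-checking the reported statistics (requires a CUDA device):

    import torch

    def time_on_gpu_ms(fn):
        start = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)
        start.record()
        fn()
        end.record()
        torch.cuda.synchronize()          # ensure both events have completed
        return start.elapsed_time(end)    # milliseconds, like cudaEventElapsedTime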
2436 | THROW_IF_BACKEND_INSTANCE_ERROR(ConvertCUDAStatusToTritonError( 2437 | cudaSetDevice(device_id), TRITONSERVER_ERROR_INTERNAL, 2438 | "Failed to set the device")); 2439 | THROW_IF_BACKEND_INSTANCE_ERROR(ConvertCUDAStatusToTritonError( 2440 | cudaEventCreate(&compute_input_start_event_), TRITONSERVER_ERROR_INTERNAL, 2441 | "Failed to create cuda event")); 2442 | THROW_IF_BACKEND_INSTANCE_ERROR(ConvertCUDAStatusToTritonError( 2443 | cudaEventCreate(&compute_infer_start_event_), TRITONSERVER_ERROR_INTERNAL, 2444 | "Failed to create cuda event")); 2445 | THROW_IF_BACKEND_INSTANCE_ERROR(ConvertCUDAStatusToTritonError( 2446 | cudaEventCreate(&compute_output_start_event_), 2447 | TRITONSERVER_ERROR_INTERNAL, "Failed to create cuda event")); 2448 | #endif 2449 | } 2450 | 2451 | cudaStream_t 2452 | ModelInstanceState::GetCudaStreamByInstanceKind() 2453 | { 2454 | #ifdef TRITON_ENABLE_GPU 2455 | if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) { 2456 | return stream_; 2457 | } else if ( 2458 | (Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) && 2459 | !stream_vec_.empty()) { 2460 | return stream_vec_[0]; 2461 | } 2462 | #endif 2463 | return nullptr; 2464 | } 2465 | 2466 | void 2467 | ModelInstanceState::SetCurrentCudaStream( 2468 | const cudaStream_t& stream, const int& device_id) 2469 | { 2470 | #ifdef TRITON_ENABLE_GPU 2471 | at::cuda::CUDAStream torch_stream = 2472 | at::cuda::getStreamFromExternal(stream, device_id); 2473 | // This function replaces the default stream with the stream we created. It 2474 | // is not necessary to change the current device to the desired device when 2475 | // replacing the default stream for that device. See the documentation here: 2476 | // https://pytorch.org/cppdocs/api/function_namespacec10_1_1cuda_1a6ed50cc0fc16cc7014d9c2f4c3bd098d.html 2477 | at::cuda::setCurrentCUDAStream(torch_stream); 2478 | #endif 2479 | } 2480 | 2481 | float 2482 | ModelInstanceState::GetCudaEventElapsedTime( 2483 | const cudaEvent_t& start_event, const cudaEvent_t& end_event) 2484 | { 2485 | float duration = 0; 2486 | #ifdef TRITON_ENABLE_GPU 2487 | // [FIXME] in the case of cudaEventElapsedTime failure, should handle 2488 | // stats reporting more gracefully as the durations are inaccurate 2489 | LOG_IF_ERROR( 2490 | ConvertCUDAStatusToTritonError( 2491 | cudaEventElapsedTime(&duration, start_event, end_event), 2492 | TRITONSERVER_ERROR_INTERNAL, "Failed to capture elapsed time"), 2493 | "Failed to capture elapsed time"); 2494 | #endif 2495 | return duration; 2496 | } 2497 | 2498 | ///////////// 2499 | 2500 | extern "C" { 2501 | 2502 | TRITONSERVER_Error* 2503 | TRITONBACKEND_Initialize(TRITONBACKEND_Backend* backend) 2504 | { 2505 | const char* cname; 2506 | RETURN_IF_ERROR(TRITONBACKEND_BackendName(backend, &cname)); 2507 | std::string name(cname); 2508 | 2509 | LOG_MESSAGE( 2510 | TRITONSERVER_LOG_INFO, 2511 | (std::string("TRITONBACKEND_Initialize: ") + name).c_str()); 2512 | 2513 | // Check the backend API version that Triton supports vs. what this 2514 | // backend was compiled against. 2515 | uint32_t api_version_major, api_version_minor; 2516 | RETURN_IF_ERROR( 2517 | TRITONBACKEND_ApiVersion(&api_version_major, &api_version_minor)); 2518 | 2519 | LOG_MESSAGE( 2520 | TRITONSERVER_LOG_INFO, 2521 | (std::string("Triton TRITONBACKEND API version: ") + 2522 | std::to_string(api_version_major) + "." 
+ 2523 | std::to_string(api_version_minor)) 2524 | .c_str()); 2525 | LOG_MESSAGE( 2526 | TRITONSERVER_LOG_INFO, 2527 | (std::string("'") + name + "' TRITONBACKEND API version: " + 2528 | std::to_string(TRITONBACKEND_API_VERSION_MAJOR) + "." + 2529 | std::to_string(TRITONBACKEND_API_VERSION_MINOR)) 2530 | .c_str()); 2531 | 2532 | if ((api_version_major != TRITONBACKEND_API_VERSION_MAJOR) || 2533 | (api_version_minor < TRITONBACKEND_API_VERSION_MINOR)) { 2534 | return TRITONSERVER_ErrorNew( 2535 | TRITONSERVER_ERROR_UNSUPPORTED, 2536 | (std::string("Triton TRITONBACKEND API version: ") + 2537 | std::to_string(api_version_major) + "." + 2538 | std::to_string(api_version_minor) + " does not support '" + name + 2539 | "' TRITONBACKEND API version: " + 2540 | std::to_string(TRITONBACKEND_API_VERSION_MAJOR) + "." + 2541 | std::to_string(TRITONBACKEND_API_VERSION_MINOR)) 2542 | .c_str()); 2543 | } 2544 | 2545 | return nullptr; // success 2546 | } 2547 | 2548 | TRITONSERVER_Error* 2549 | TRITONBACKEND_ModelInitialize(TRITONBACKEND_Model* model) 2550 | { 2551 | const char* cname; 2552 | RETURN_IF_ERROR(TRITONBACKEND_ModelName(model, &cname)); 2553 | std::string name(cname); 2554 | 2555 | uint64_t version; 2556 | RETURN_IF_ERROR(TRITONBACKEND_ModelVersion(model, &version)); 2557 | 2558 | LOG_MESSAGE( 2559 | TRITONSERVER_LOG_INFO, 2560 | (std::string("TRITONBACKEND_ModelInitialize: ") + name + " (version " + 2561 | std::to_string(version) + ")") 2562 | .c_str()); 2563 | 2564 | // Create a ModelState object and associate it with the 2565 | // TRITONBACKEND_Model. 2566 | ModelState* model_state; 2567 | RETURN_IF_ERROR(ModelState::Create(model, &model_state)); 2568 | RETURN_IF_ERROR( 2569 | TRITONBACKEND_ModelSetState(model, reinterpret_cast(model_state))); 2570 | 2571 | return nullptr; // success 2572 | } 2573 | 2574 | TRITONSERVER_Error* 2575 | TRITONBACKEND_ModelFinalize(TRITONBACKEND_Model* model) 2576 | { 2577 | void* vstate; 2578 | RETURN_IF_ERROR(TRITONBACKEND_ModelState(model, &vstate)); 2579 | ModelState* model_state = reinterpret_cast(vstate); 2580 | 2581 | LOG_MESSAGE( 2582 | TRITONSERVER_LOG_INFO, "TRITONBACKEND_ModelFinalize: delete model state"); 2583 | 2584 | delete model_state; 2585 | 2586 | return nullptr; // success 2587 | } 2588 | 2589 | TRITONSERVER_Error* 2590 | TRITONBACKEND_ModelInstanceInitialize(TRITONBACKEND_ModelInstance* instance) 2591 | { 2592 | const char* cname; 2593 | RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceName(instance, &cname)); 2594 | std::string name(cname); 2595 | 2596 | int32_t device_id; 2597 | RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceDeviceId(instance, &device_id)); 2598 | 2599 | TRITONSERVER_InstanceGroupKind kind; 2600 | RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceKind(instance, &kind)); 2601 | 2602 | LOG_MESSAGE( 2603 | TRITONSERVER_LOG_INFO, 2604 | (std::string("TRITONBACKEND_ModelInstanceInitialize: ") + name + " (" + 2605 | TRITONSERVER_InstanceGroupKindString(kind) + " device " + 2606 | std::to_string(device_id) + ")") 2607 | .c_str()); 2608 | 2609 | // Get the model state associated with this instance's model. 2610 | TRITONBACKEND_Model* model; 2611 | RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceModel(instance, &model)); 2612 | 2613 | void* vmodelstate; 2614 | RETURN_IF_ERROR(TRITONBACKEND_ModelState(model, &vmodelstate)); 2615 | ModelState* model_state = reinterpret_cast(vmodelstate); 2616 | 2617 | // Create a ModelInstanceState object and associate it with the 2618 | // TRITONBACKEND_ModelInstance. 
2619 | ModelInstanceState* instance_state; 2620 | RETURN_IF_ERROR( 2621 | ModelInstanceState::Create(model_state, instance, &instance_state)); 2622 | RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceSetState( 2623 | instance, reinterpret_cast(instance_state))); 2624 | 2625 | return nullptr; // success 2626 | } 2627 | 2628 | TRITONSERVER_Error* 2629 | TRITONBACKEND_ModelInstanceFinalize(TRITONBACKEND_ModelInstance* instance) 2630 | { 2631 | void* vstate; 2632 | RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceState(instance, &vstate)); 2633 | ModelInstanceState* instance_state = 2634 | reinterpret_cast(vstate); 2635 | 2636 | LOG_MESSAGE( 2637 | TRITONSERVER_LOG_INFO, 2638 | "TRITONBACKEND_ModelInstanceFinalize: delete instance state"); 2639 | 2640 | delete instance_state; 2641 | 2642 | return nullptr; // success 2643 | } 2644 | 2645 | TRITONSERVER_Error* 2646 | TRITONBACKEND_ModelInstanceExecute( 2647 | TRITONBACKEND_ModelInstance* instance, TRITONBACKEND_Request** requests, 2648 | const uint32_t request_count) 2649 | { 2650 | // Triton will not call this function simultaneously for the same 2651 | // 'instance'. But since this backend could be used by multiple 2652 | // instances from multiple models the implementation needs to handle 2653 | // multiple calls to this function at the same time (with different 2654 | // 'instance' objects). Suggested practice for this is to use only 2655 | // function-local and model-instance-specific state (obtained from 2656 | // 'instance'), which is what we do here. 2657 | ModelInstanceState* instance_state; 2658 | RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceState( 2659 | instance, reinterpret_cast(&instance_state))); 2660 | ModelState* model_state = instance_state->StateForModel(); 2661 | 2662 | // This backend specifies BLOCKING execution policy. That means that 2663 | // we should not return from this function until execution is 2664 | // complete. Triton will automatically release 'instance' on return 2665 | // from this function so that it is again available to be used for 2666 | // another call to TRITONBACKEND_ModelInstanceExecute. 2667 | 2668 | LOG_MESSAGE( 2669 | TRITONSERVER_LOG_VERBOSE, 2670 | (std::string("model ") + model_state->Name() + ", instance " + 2671 | instance_state->Name() + ", executing " + std::to_string(request_count) + 2672 | " requests") 2673 | .c_str()); 2674 | 2675 | // At this point we accept ownership of 'requests', which means that 2676 | // even if something goes wrong we must still return success from 2677 | // this function. If something does go wrong in processing a 2678 | // particular request then we send an error response just for the 2679 | // specific request. 2680 | instance_state->ProcessRequests(requests, request_count); 2681 | 2682 | if (model_state->EnabledCacheCleaning()) { 2683 | instance_state->ClearCache(); 2684 | } 2685 | 2686 | return nullptr; // success 2687 | } 2688 | 2689 | } // extern "C" 2690 | 2691 | }}} // namespace triton::backend::pytorch 2692 | -------------------------------------------------------------------------------- /src/libtorch_utils.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020-24 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
2 | // 3 | // Redistribution and use in source and binary forms, with or without 4 | // modification, are permitted provided that the following conditions 5 | // are met: 6 | // * Redistributions of source code must retain the above copyright 7 | // notice, this list of conditions and the following disclaimer. 8 | // * Redistributions in binary form must reproduce the above copyright 9 | // notice, this list of conditions and the following disclaimer in the 10 | // documentation and/or other materials provided with the distribution. 11 | // * Neither the name of NVIDIA CORPORATION nor the names of its 12 | // contributors may be used to endorse or promote products derived 13 | // from this software without specific prior written permission. 14 | // 15 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | #include "libtorch_utils.h" 28 | 29 | namespace triton { namespace backend { namespace pytorch { 30 | 31 | TRITONSERVER_DataType 32 | ConvertTorchTypeToDataType(const torch::ScalarType& stype) 33 | { 34 | switch (stype) { 35 | case torch::kBool: 36 | return TRITONSERVER_TYPE_BOOL; 37 | case torch::kByte: 38 | return TRITONSERVER_TYPE_UINT8; 39 | case torch::kChar: 40 | return TRITONSERVER_TYPE_INT8; 41 | case torch::kShort: 42 | return TRITONSERVER_TYPE_INT16; 43 | case torch::kInt: 44 | return TRITONSERVER_TYPE_INT32; 45 | case torch::kLong: 46 | return TRITONSERVER_TYPE_INT64; 47 | case torch::kHalf: 48 | return TRITONSERVER_TYPE_FP16; 49 | case torch::kFloat: 50 | return TRITONSERVER_TYPE_FP32; 51 | case torch::kDouble: 52 | return TRITONSERVER_TYPE_FP64; 53 | default: 54 | break; 55 | } 56 | 57 | return TRITONSERVER_TYPE_INVALID; 58 | } 59 | 60 | std::pair 61 | ConvertDataTypeToTorchType(const TRITONSERVER_DataType dtype) 62 | { 63 | torch::ScalarType type = torch::kInt; 64 | switch (dtype) { 65 | case TRITONSERVER_TYPE_BOOL: 66 | type = torch::kBool; 67 | break; 68 | case TRITONSERVER_TYPE_UINT8: 69 | type = torch::kByte; 70 | break; 71 | case TRITONSERVER_TYPE_INT8: 72 | type = torch::kChar; 73 | break; 74 | case TRITONSERVER_TYPE_INT16: 75 | type = torch::kShort; 76 | break; 77 | case TRITONSERVER_TYPE_INT32: 78 | type = torch::kInt; 79 | break; 80 | case TRITONSERVER_TYPE_INT64: 81 | type = torch::kLong; 82 | break; 83 | case TRITONSERVER_TYPE_FP16: 84 | type = torch::kHalf; 85 | break; 86 | case TRITONSERVER_TYPE_FP32: 87 | type = torch::kFloat; 88 | break; 89 | case TRITONSERVER_TYPE_FP64: 90 | type = torch::kDouble; 91 | break; 92 | case TRITONSERVER_TYPE_UINT16: 93 | case TRITONSERVER_TYPE_UINT32: 94 | case TRITONSERVER_TYPE_UINT64: 95 | case TRITONSERVER_TYPE_BYTES: 96 | default: 97 | return std::make_pair(false, type); 98 | } 99 | 100 | return std::make_pair(true, type); 101 | } 102 | 103 | std::pair 104 | 
ModelConfigDataTypeToTorchType(const std::string& data_type_str) 105 | { 106 | torch::ScalarType type = torch::kInt; 107 | 108 | // Must start with "TYPE_". 109 | if (data_type_str.rfind("TYPE_", 0) != 0) { 110 | return std::make_pair(false, type); 111 | } 112 | 113 | const std::string dtype = data_type_str.substr(strlen("TYPE_")); 114 | 115 | if (dtype == "BOOL") { 116 | type = torch::kBool; 117 | } else if (dtype == "UINT8") { 118 | type = torch::kByte; 119 | } else if (dtype == "INT8") { 120 | type = torch::kChar; 121 | } else if (dtype == "INT16") { 122 | type = torch::kShort; 123 | } else if (dtype == "INT32") { 124 | type = torch::kInt; 125 | } else if (dtype == "INT64") { 126 | type = torch::kLong; 127 | } else if (dtype == "FP16") { 128 | type = torch::kHalf; 129 | } else if (dtype == "FP32") { 130 | type = torch::kFloat; 131 | } else if (dtype == "FP64") { 132 | type = torch::kDouble; 133 | } else { 134 | return std::make_pair(false, type); 135 | } 136 | 137 | return std::make_pair(true, type); 138 | } 139 | 140 | TRITONSERVER_Error* 141 | ParseParameter( 142 | triton::common::TritonJson::Value& params, const std::string& mkey, 143 | bool* value) 144 | { 145 | std::string value_str; 146 | RETURN_IF_ERROR(GetParameterValue(params, mkey, &value_str)); 147 | RETURN_IF_ERROR(ParseBoolValue(value_str, value)); 148 | 149 | return nullptr; 150 | } 151 | 152 | TRITONSERVER_Error* 153 | ParseParameter( 154 | triton::common::TritonJson::Value& params, const std::string& mkey, 155 | int* value) 156 | { 157 | std::string value_str; 158 | RETURN_IF_ERROR(GetParameterValue(params, mkey, &value_str)); 159 | RETURN_IF_ERROR(ParseIntValue(value_str, value)); 160 | 161 | return nullptr; 162 | } 163 | 164 | 165 | #ifdef TRITON_ENABLE_GPU 166 | TRITONSERVER_Error* 167 | ConvertCUDAStatusToTritonError( 168 | cudaError_t cuda_error, TRITONSERVER_Error_Code code, const char* msg) 169 | { 170 | if (cuda_error != cudaSuccess) { 171 | return TRITONSERVER_ErrorNew( 172 | code, 173 | (std::string(msg) + ": " + cudaGetErrorString(cuda_error)).c_str()); 174 | } 175 | return nullptr; // success 176 | } 177 | #endif 178 | 179 | }}} // namespace triton::backend::pytorch 180 | -------------------------------------------------------------------------------- /src/libtorch_utils.h: -------------------------------------------------------------------------------- 1 | // Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | // 3 | // Redistribution and use in source and binary forms, with or without 4 | // modification, are permitted provided that the following conditions 5 | // are met: 6 | // * Redistributions of source code must retain the above copyright 7 | // notice, this list of conditions and the following disclaimer. 8 | // * Redistributions in binary form must reproduce the above copyright 9 | // notice, this list of conditions and the following disclaimer in the 10 | // documentation and/or other materials provided with the distribution. 11 | // * Neither the name of NVIDIA CORPORATION nor the names of its 12 | // contributors may be used to endorse or promote products derived 13 | // from this software without specific prior written permission. 14 | // 15 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | // PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | #pragma once 28 | 29 | #include "triton/backend/backend_common.h" 30 | #include "triton/core/tritonserver.h" 31 | 32 | // Suppress warnings in torch headers 33 | #pragma GCC diagnostic push 34 | #pragma GCC diagnostic ignored "-Wsign-compare" 35 | #pragma warning(push, 0) 36 | #include 37 | #include 38 | #include 39 | #include 40 | #include // One-stop header for TorchScript 41 | #pragma warning(pop) 42 | #pragma GCC diagnostic pop 43 | 44 | namespace triton { namespace backend { namespace pytorch { 45 | 46 | TRITONSERVER_DataType ConvertTorchTypeToDataType( 47 | const torch::ScalarType& ttype); 48 | std::pair ConvertDataTypeToTorchType( 49 | const TRITONSERVER_DataType dtype); 50 | std::pair ModelConfigDataTypeToTorchType( 51 | const std::string& data_type_str); 52 | 53 | #ifdef TRITON_ENABLE_GPU 54 | TRITONSERVER_Error* ConvertCUDAStatusToTritonError( 55 | cudaError_t cuda_error, TRITONSERVER_Error_Code code, const char* msg); 56 | #endif 57 | 58 | // If the key 'mkey' is present in 'params' then update 'value' with the 59 | // value associated with that key. If 'mkey' is not present in 'params' then 60 | // no update is made to 'value'. 61 | TRITONSERVER_Error* ParseParameter( 62 | triton::common::TritonJson::Value& params, const std::string& mkey, 63 | bool* value); 64 | 65 | // If the key 'mkey' is present in 'params' then update 'value' with the 66 | // value associated with that key. If 'mkey' is not present in 'params' then 67 | // 'value' is set to 'default_value'. 68 | TRITONSERVER_Error* ParseParameter( 69 | triton::common::TritonJson::Value& params, const std::string& mkey, 70 | int* value); 71 | 72 | }}} // namespace triton::backend::pytorch 73 | -------------------------------------------------------------------------------- /src/libtriton_pytorch.ldscript: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | { 27 | global: 28 | TRITONBACKEND_*; 29 | local: *; 30 | }; 31 | -------------------------------------------------------------------------------- /src/model.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 4 | # 5 | # Redistribution and use in source and binary forms, with or without 6 | # modification, are permitted provided that the following conditions 7 | # are met: 8 | # * Redistributions of source code must retain the above copyright 9 | # notice, this list of conditions and the following disclaimer. 10 | # * Redistributions in binary form must reproduce the above copyright 11 | # notice, this list of conditions and the following disclaimer in the 12 | # documentation and/or other materials provided with the distribution. 13 | # * Neither the name of NVIDIA CORPORATION nor the names of its 14 | # contributors may be used to endorse or promote products derived 15 | # from this software without specific prior written permission. 16 | # 17 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 18 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 20 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 21 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 22 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 23 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 24 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 25 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
28 | 29 | import importlib 30 | import json 31 | import os 32 | 33 | try: 34 | import torch 35 | except ModuleNotFoundError as error: 36 | raise RuntimeError("Missing/Incomplete PyTorch package installation") from error 37 | 38 | import triton_python_backend_utils as pb_utils 39 | 40 | 41 | def _get_model_path(config): 42 | # FIXME: Add support for torch.export IR models (.pt2) 43 | filenames = ["model.py", "model.pt"] 44 | if config["default_model_filename"]: 45 | filenames.insert(0, config["default_model_filename"]) 46 | for filename in filenames: 47 | model_path = os.path.join(pb_utils.get_model_dir(), filename) 48 | if os.path.exists(model_path): 49 | return model_path 50 | raise pb_utils.TritonModelException( 51 | "No model found in " + pb_utils.get_model_dir() + "/" + str(filenames) 52 | ) 53 | 54 | 55 | def _get_model_data_path(model_path): 56 | data_path_extensions = [".pt"] 57 | model_path_no_extension = model_path[: -(len(model_path.split(".")[-1]) + 1)] 58 | for extension in data_path_extensions: 59 | data_path = model_path_no_extension + extension 60 | if os.path.exists(data_path): 61 | return data_path 62 | # data file not provided 63 | return "" 64 | 65 | 66 | def _is_py_class_model(model_path): 67 | return model_path[-3:] == ".py" 68 | 69 | 70 | def _import_module_from_path(module_name, file_path): 71 | spec = importlib.util.spec_from_file_location(module_name, file_path) 72 | module = importlib.util.module_from_spec(spec) 73 | spec.loader.exec_module(module) 74 | return module 75 | 76 | 77 | def _get_model_class_from_module(module): 78 | names = dir(module) 79 | for name in names: 80 | attr = getattr(module, name) 81 | try: 82 | if issubclass(attr, torch.nn.Module): 83 | return attr 84 | except TypeError: 85 | # attr may not be a class 86 | pass 87 | raise pb_utils.TritonModelException("Cannot find a subclass of torch.nn.Module") 88 | 89 | 90 | def _parse_io_config(io_config): 91 | io = [] 92 | for conf in io_config: 93 | io.append({"name": conf["name"]}) 94 | return io 95 | 96 | 97 | def _get_device_name(kind, device_id): 98 | if kind == "GPU": 99 | return "cuda:" + device_id 100 | if kind == "CPU": 101 | return "cpu" 102 | # unspecified device 103 | return "" 104 | 105 | 106 | def _get_device(kind, device_id, model): 107 | device_name = _get_device_name(kind, device_id) 108 | if device_name == "": 109 | for param in model.parameters(): 110 | return param.device 111 | raise pb_utils.TritonModelException("Cannot determine model device") 112 | return torch.device(device_name) 113 | 114 | 115 | def _set_torch_parallelism(config): 116 | log_msg = "" 117 | parallelism_settings = ["NUM_THREADS", "NUM_INTEROP_THREADS"] 118 | for setting in parallelism_settings: 119 | val = "1" 120 | if setting in config["parameters"]: 121 | val = config["parameters"][setting]["string_value"] 122 | getattr(torch, "set_" + setting.lower())(int(val)) 123 | log_msg += setting + " = " + val + "; " 124 | return log_msg 125 | 126 | 127 | def _get_torch_compile_params(config): 128 | params = {} 129 | if "TORCH_COMPILE_OPTIONAL_PARAMETERS" in config["parameters"]: 130 | val = config["parameters"]["TORCH_COMPILE_OPTIONAL_PARAMETERS"]["string_value"] 131 | params = json.loads(val) 132 | if "model" in params: 133 | raise pb_utils.TritonModelException( 134 | "'model' is not an optional parameter for 'torch.compile'" 135 | ) 136 | return params 137 | 138 | 139 | def _gather_torch_tensors(scatter_tensors): 140 | gather_tensors = [] 141 | sections = [] 142 | for i in range(len(scatter_tensors)): 143 | tensors = 
scatter_tensors[i] 144 | for j in range(len(tensors)): 145 | tensor = tensors[j] 146 | if j < len(gather_tensors): 147 | # add to existing tensor 148 | gather_tensors[j] = torch.cat((gather_tensors[j], tensor), 0) 149 | else: 150 | # start a new tensor 151 | gather_tensors.append(tensor) 152 | # record section 153 | section_length = tensors[0].size()[0] 154 | sections.append(section_length) 155 | return gather_tensors, sections 156 | 157 | 158 | def _scatter_torch_tensors(gather_tensors, sections): 159 | scatter_tensors = [] 160 | for j in range(len(gather_tensors)): 161 | scatter_tensor = torch.split(gather_tensors[j], sections) 162 | for i in range(len(scatter_tensor)): 163 | tensor = scatter_tensor[i] 164 | if i < len(scatter_tensors): 165 | # add to existing response 166 | scatter_tensors[i].append(tensor) 167 | else: 168 | # start a new response 169 | scatter_tensors.append([tensor]) 170 | return scatter_tensors 171 | 172 | 173 | class TritonPythonModel: 174 | """Your Python model must use the same class name. Every Python model 175 | that is created must have "TritonPythonModel" as the class name. 176 | """ 177 | 178 | def initialize(self, args): 179 | """`initialize` is called only once when the model is being loaded. 180 | Implementing `initialize` function is optional. This function allows 181 | the model to initialize any state associated with this model. 182 | Parameters 183 | ---------- 184 | args : dict 185 | Both keys and values are strings. The dictionary keys and values are: 186 | * model_config: A JSON string containing the model configuration 187 | * model_instance_kind: A string containing model instance kind 188 | * model_instance_device_id: A string containing model instance device ID 189 | * model_repository: Model repository path 190 | * model_version: Model version 191 | * model_name: Model name 192 | """ 193 | self._model_name = args["model_name"] 194 | for_model = "for '" + self._model_name + "'" 195 | self._logger = pb_utils.Logger 196 | self._logger.log_info("Initializing model instance " + for_model) 197 | 198 | self._model_config = json.loads(args["model_config"]) 199 | self._kind = args["model_instance_kind"] 200 | self._device_id = args["model_instance_device_id"] 201 | self._support_batching = self._model_config["max_batch_size"] > 0 202 | self._inputs = _parse_io_config(self._model_config["input"]) 203 | self._outputs = _parse_io_config(self._model_config["output"]) 204 | 205 | setting_msg = _set_torch_parallelism(self._model_config) 206 | self._logger.log_verbose( 207 | "Torch parallelism settings " + for_model + ": " + setting_msg 208 | ) 209 | 210 | self._infer_mode = torch.inference_mode(mode=True) 211 | self._infer_mode.__enter__() 212 | 213 | params = _get_torch_compile_params(self._model_config) 214 | self._logger.log_verbose( 215 | "'torch.compile' optional parameter(s) " + for_model + ": " + str(params) 216 | ) 217 | if self._support_batching: 218 | self._gather = torch.compile(_gather_torch_tensors, **params) 219 | self._scatter = torch.compile(_scatter_torch_tensors, **params) 220 | 221 | model_path = _get_model_path(self._model_config) 222 | if not _is_py_class_model(model_path): 223 | self._logger.log_info("Loading '" + self._model_name + "' as TorchScript") 224 | self._model = torch.jit.load(model_path) 225 | self._device = _get_device(self._kind, self._device_id, self._model) 226 | self._model.to(self._device) 227 | self._model.eval() 228 | return 229 | 230 | self._model_module = _import_module_from_path(self._model_name, model_path) 231 | 
232 |         self._raw_model = self._model_class()
233 |         self._device = _get_device(self._kind, self._device_id, self._raw_model)
234 |         data_path = _get_model_data_path(model_path)
235 |         if data_path != "":
236 |             self._raw_model.load_state_dict(
237 |                 torch.load(data_path, map_location=self._device)
238 |             )
239 |         else:
240 |             self._logger.log_info("Model parameter file not found " + for_model)
241 |         self._raw_model.to(self._device)
242 |         self._raw_model.eval()
243 |         self._model = torch.compile(self._raw_model, **params)
244 | 
245 |     def execute(self, requests):
246 |         """`execute` MUST be implemented in every Python model. The `execute`
247 |         function receives a list of pb_utils.InferenceRequest as the only
248 |         argument. This function is called when an inference request is made
249 |         for this model. Depending on the batching configuration (e.g. Dynamic
250 |         Batching) used, `requests` may contain multiple requests. Every
251 |         Python model must create one pb_utils.InferenceResponse for every
252 |         pb_utils.InferenceRequest in `requests`. If there is an error, you can
253 |         set the error argument when creating a pb_utils.InferenceResponse.
254 |         Parameters
255 |         ----------
256 |         requests : list
257 |           A list of pb_utils.InferenceRequest
258 |         Returns
259 |         -------
260 |         list
261 |           A list of pb_utils.InferenceResponse. The length of this list must
262 |           be the same as `requests`
263 |         """
264 | 
265 |         responses = []
266 | 
267 |         requests_tensors = []
268 |         for request in requests:
269 |             tensors = []
270 |             for io in self._inputs:
271 |                 tensor = pb_utils.get_input_tensor_by_name(
272 |                     request, io["name"]
273 |                 ).to_dlpack()
274 |                 tensor = torch.from_dlpack(tensor).to(self._device)
275 |                 tensors.append(tensor)
276 |             requests_tensors.append(tensors)
277 | 
278 |         sections = None
279 |         if self._support_batching:
280 |             requests_tensors, sections = self._gather(requests_tensors)
281 |             requests_tensors = [requests_tensors]
282 | 
283 |         responses_tensors = []
284 |         for input_tensors in requests_tensors:
285 |             output_tensors = self._model(*input_tensors)
286 |             if not isinstance(output_tensors, tuple) and not isinstance(
287 |                 output_tensors, list
288 |             ):
289 |                 output_tensors = [output_tensors]
290 |             responses_tensors.append(output_tensors)
291 | 
292 |         if self._support_batching:
293 |             responses_tensors = self._scatter(responses_tensors[0], sections)
294 | 
295 |         for response_tensors in responses_tensors:
296 |             output_tensors = []
297 |             for i in range(len(self._outputs)):
298 |                 io = self._outputs[i]
299 |                 tensor = response_tensors[i].detach()
300 |                 tensor = pb_utils.Tensor.from_dlpack(io["name"], tensor)
301 |                 output_tensors.append(tensor)
302 |             inference_response = pb_utils.InferenceResponse(
303 |                 output_tensors=output_tensors
304 |             )
305 |             responses.append(inference_response)
306 | 
307 |         return responses
308 | 
309 |     def finalize(self):
310 |         """`finalize` is called only once when the model is being unloaded.
311 |         Implementing the `finalize` function is OPTIONAL. This function allows
312 |         the model to perform any necessary cleanups before exit.
313 |         """
314 |         self._logger.log_info("Removing model instance for '" + self._model_name + "'")
315 |         self._infer_mode.__exit__(exc_type=None, exc_value=None, traceback=None)
316 | 
--------------------------------------------------------------------------------
/tools/gen_pb_exec_env.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | #
4 | # Redistribution and use in source and binary forms, with or without
5 | # modification, are permitted provided that the following conditions
6 | # are met:
7 | #  * Redistributions of source code must retain the above copyright
8 | #    notice, this list of conditions and the following disclaimer.
9 | #  * Redistributions in binary form must reproduce the above copyright
10 | #    notice, this list of conditions and the following disclaimer in the
11 | #    documentation and/or other materials provided with the distribution.
12 | #  * Neither the name of NVIDIA CORPORATION nor the names of its
13 | #    contributors may be used to endorse or promote products derived
14 | #    from this software without specific prior written permission.
15 | #
16 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
17 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
20 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
21 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
22 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
23 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
24 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 | 
28 | # install conda
29 | rm -rf ./miniconda
30 | wget https://repo.anaconda.com/miniconda/Miniconda3-py312_24.9.2-0-Linux-x86_64.sh
31 | bash Miniconda3-py312_24.9.2-0-Linux-x86_64.sh -p ./miniconda -b
32 | eval "$(./miniconda/bin/conda shell.bash hook)"
33 | 
34 | # create conda environment
35 | conda create -n pt python=3.12 -y
36 | conda activate pt
37 | conda install -c conda-forge conda-pack -y
38 | 
39 | # pre install step
40 | export PYTHONNOUSERSITE=True
41 | conda install -c conda-forge libstdcxx-ng=14 -y
42 | 
43 | # install PyTorch
44 | conda install pytorch torchvision torchaudio pytorch-cuda=12.1 -c pytorch -c nvidia -y
45 | 
46 | # pack environment
47 | rm -f pb_exec_env_model.py.tar.gz
48 | conda pack -o pb_exec_env_model.py.tar.gz
49 | 
50 | # deactivate conda
51 | conda deactivate
52 | 
--------------------------------------------------------------------------------
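
The Python-class loading path in src/model.py above (initialize → _import_module_from_path → _get_model_class_from_module) accepts a plain torch.nn.Module defined in a model.py file. The sketch below is a hypothetical such file; the class name SimpleLinear and the 4-unit layer size are illustrative only and not part of the backend. The only constraints the code above imposes are that the module contains at least one torch.nn.Module subclass (the first one found in dir() order is used) and that the class is constructible with no arguments.

# Hypothetical Python-class model file ("model.py" placed in the directory
# returned by pb_utils.get_model_dir()). _get_model_class_from_module() scans
# dir(module) for a torch.nn.Module subclass, and initialize() instantiates it
# with no constructor arguments, i.e. SimpleLinear().
import torch


class SimpleLinear(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = torch.nn.Linear(4, 4)  # illustrative shape only

    def forward(self, x):
        # One positional argument per entry in the config's "input" section;
        # return a single tensor, or a tuple/list with one tensor per "output" entry.
        return self.linear(x)

If a model.pt state-dict file sits next to this model.py, _get_model_data_path() finds it and initialize() loads it with load_state_dict(); otherwise the model runs with freshly initialized parameters and a "Model parameter file not found" message is logged.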
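
For the non-Python branch of initialize() (any model file that does not end in .py is handed to torch.jit.load()), a TorchScript artifact has to be produced offline. A minimal sketch, reusing the hypothetical SimpleLinear class above:

# Export a TorchScript version of the hypothetical module above and save it
# under one of the filenames the backend looks for by default ("model.pt").
import torch

scripted = torch.jit.script(SimpleLinear())
torch.jit.save(scripted, "model.pt")

Note that _get_model_path() tries model.py before model.pt, so a TorchScript file is only picked up when no model.py is present or when default_model_filename points at it explicitly.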
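
Several helpers above read optional entries from the parsed model configuration: _set_torch_parallelism() looks for NUM_THREADS and NUM_INTEROP_THREADS (both default to "1"), and _get_torch_compile_params() decodes TORCH_COMPILE_OPTIONAL_PARAMETERS as a JSON object of torch.compile keyword arguments. The sketch below shows what such a configuration looks like after json.loads(args["model_config"]) has run; the concrete values are illustrative, and the required input/output/default_model_filename entries are omitted.

# Illustrative slice of the parsed model configuration consumed in initialize();
# only the optional tuning keys discussed above are shown.
model_config = {
    "max_batch_size": 8,  # > 0 enables the gather/scatter batching path
    "parameters": {
        "NUM_THREADS": {"string_value": "4"},
        "NUM_INTEROP_THREADS": {"string_value": "2"},
        "TORCH_COMPILE_OPTIONAL_PARAMETERS": {
            "string_value": '{"mode": "reduce-overhead"}'
        },
    },
}
# With this config, _set_torch_parallelism() calls torch.set_num_threads(4) and
# torch.set_num_interop_threads(2), and _get_torch_compile_params() returns
# {"mode": "reduce-overhead"}, which is forwarded to torch.compile().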
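
The batched execute() path relies on _gather_torch_tensors() and _scatter_torch_tensors() to fold several requests into a single forward pass and unfold the result afterwards. The round trip below is a small sketch of that behaviour; it assumes the two helper functions from src/model.py have been copied into a scratch script, since importing model.py outside Triton fails on the triton_python_backend_utils dependency.

import torch

# Two requests with one input tensor each; batch sizes 2 and 3.
request_a = [torch.zeros(2, 4)]
request_b = [torch.ones(3, 4)]

# Gather: per-input concatenation along dim 0, plus the per-request batch sizes.
gathered, sections = _gather_torch_tensors([request_a, request_b])
assert gathered[0].shape == (5, 4)
assert sections == [2, 3]

# (A single self._model(*gathered) call runs at this point in execute().)

# Scatter: split the combined output back into one tensor list per request.
scattered = _scatter_torch_tensors(gathered, sections)
assert scattered[0][0].shape == (2, 4)
assert scattered[1][0].shape == (3, 4)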