├── .clang-format ├── .github └── workflows │ └── pre-commit.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CMakeLists.txt ├── LICENSE ├── README.md ├── cmake └── TritonPyTorchBackendConfig.cmake.in ├── pyproject.toml ├── src ├── libtorch.cc ├── libtorch_utils.cc ├── libtorch_utils.h ├── libtriton_pytorch.ldscript └── model.py └── tools └── gen_pb_exec_env.sh /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | BasedOnStyle: Google 3 | 4 | IndentWidth: 2 5 | ColumnLimit: 80 6 | ContinuationIndentWidth: 4 7 | UseTab: Never 8 | MaxEmptyLinesToKeep: 2 9 | 10 | SortIncludes: true 11 | CompactNamespaces: true 12 | ReflowComments: true 13 | 14 | DerivePointerAlignment: false 15 | PointerAlignment: Left 16 | 17 | AllowShortIfStatementsOnASingleLine: false 18 | AllowShortBlocksOnASingleLine: false 19 | AllowShortFunctionsOnASingleLine: Inline 20 | 21 | AlwaysBreakAfterReturnType: TopLevelDefinitions 22 | AlignAfterOpenBracket: AlwaysBreak 23 | BreakBeforeBraces: Custom 24 | BraceWrapping: 25 | AfterClass: false 26 | AfterControlStatement: false 27 | AfterEnum: false 28 | AfterFunction: true 29 | AfterNamespace: false 30 | AfterStruct: false 31 | AfterUnion: false 32 | BeforeCatch: true 33 | 34 | BinPackArguments: true 35 | BinPackParameters: true 36 | ConstructorInitializerAllOnOneLineOrOnePerLine: false 37 | 38 | IndentCaseLabels: true 39 | 40 | -------------------------------------------------------------------------------- /.github/workflows/pre-commit.yml: -------------------------------------------------------------------------------- 1 | # Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
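# This workflow runs the hooks defined in .pre-commit-config.yaml (isort, black,
# flake8, clang-format, codespell, and assorted sanity checks) against every
# pull request.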
26 | 27 | name: pre-commit 28 | 29 | on: 30 | pull_request: 31 | 32 | jobs: 33 | pre-commit: 34 | runs-on: ubuntu-22.04 35 | steps: 36 | - uses: actions/checkout@v3 37 | - uses: actions/setup-python@v3 38 | - uses: pre-commit/action@v3.0.0 39 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /build 2 | /.vscode 3 | *.so 4 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
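# The same hooks can be run locally before opening a pull request (standard
# pre-commit CLI usage; shown here as a hint, not a requirement):
#
#   pip install pre-commit
#   pre-commit install          # run the hooks automatically on each commit
#   pre-commit run --all-files  # run the full suite once over the repository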
26 | 27 | repos: 28 | - repo: https://github.com/timothycrosley/isort 29 | rev: 5.12.0 30 | hooks: 31 | - id: isort 32 | additional_dependencies: [toml] 33 | - repo: https://github.com/psf/black 34 | rev: 23.1.0 35 | hooks: 36 | - id: black 37 | types_or: [python, cython] 38 | - repo: https://github.com/PyCQA/flake8 39 | rev: 5.0.4 40 | hooks: 41 | - id: flake8 42 | args: [--max-line-length=88, --select=C,E,F,W,B,B950, --extend-ignore = E203,E501] 43 | types_or: [python, cython] 44 | - repo: https://github.com/pre-commit/mirrors-clang-format 45 | rev: v16.0.5 46 | hooks: 47 | - id: clang-format 48 | types_or: [c, c++, cuda, proto, textproto, java] 49 | args: ["-fallback-style=none", "-style=file", "-i"] 50 | - repo: https://github.com/codespell-project/codespell 51 | rev: v2.2.4 52 | hooks: 53 | - id: codespell 54 | additional_dependencies: [tomli] 55 | args: ["--toml", "pyproject.toml"] 56 | exclude: (?x)^(.*stemmer.*|.*stop_words.*|^CHANGELOG.md$) 57 | # More details about these pre-commit hooks here: 58 | # https://pre-commit.com/hooks.html 59 | - repo: https://github.com/pre-commit/pre-commit-hooks 60 | rev: v4.4.0 61 | hooks: 62 | - id: check-case-conflict 63 | - id: check-executables-have-shebangs 64 | - id: check-merge-conflict 65 | - id: check-json 66 | - id: check-toml 67 | - id: check-yaml 68 | - id: check-shebang-scripts-are-executable 69 | - id: end-of-file-fixer 70 | types_or: [c, c++, cuda, proto, textproto, java, python] 71 | - id: mixed-line-ending 72 | - id: requirements-txt-fixer 73 | - id: trailing-whitespace 74 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright 2019-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | cmake_minimum_required (VERSION 3.18) 28 | 29 | project(tritonpytorchbackend LANGUAGES C CXX) 30 | 31 | # Use C++17 standard as Triton's minimum required. 
32 | set(TRITON_MIN_CXX_STANDARD 17 CACHE STRING "The minimum C++ standard which features are requested to build this target.") 33 | 34 | # 35 | # Options 36 | # 37 | # To build the PyTorch backend you must either: 38 | # 39 | # - Point to the already built PyTorch and Torchvision using 40 | # TRITON_PYTORCH_INCLUDE_PATHS and TRITON_PYTORCH_LIB_PATHS 41 | # 42 | # or: 43 | # 44 | # - Set TRITON_PYTORCH_DOCKER_IMAGE to use the docker image of 45 | # PyTorch to base the build off. 46 | # 47 | 48 | option(TRITON_ENABLE_GPU "Enable GPU support in backend" ON) 49 | option(TRITON_ENABLE_STATS "Include statistics collections in backend" ON) 50 | option(TRITON_ENABLE_NVTX "Include nvtx markers collection in backend." OFF) 51 | option(TRITON_PYTORCH_ENABLE_TORCHTRT "Enable TorchTRT support" OFF) 52 | option(TRITON_PYTORCH_ENABLE_TORCHVISION "Enable Torchvision support" ON) 53 | 54 | set(TRITON_PYTORCH_DOCKER_IMAGE "" CACHE STRING "Docker image containing the PyTorch build required by backend.") 55 | set(TRITON_PYTORCH_INCLUDE_PATHS "" CACHE PATH "Paths to Torch includes") 56 | set(TRITON_PYTORCH_LIB_PATHS "" CACHE PATH "Paths to Torch libraries") 57 | 58 | set(TRITON_REPO_ORGANIZATION "https://github.com/triton-inference-server" CACHE STRING "Git repository to pull from") 59 | set(TRITON_BACKEND_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/backend repo") 60 | set(TRITON_CORE_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/core repo") 61 | set(TRITON_COMMON_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/common repo") 62 | 63 | if(NOT CMAKE_BUILD_TYPE) 64 | set(CMAKE_BUILD_TYPE Release) 65 | endif() 66 | 67 | set(TRITON_PYTORCH_DOCKER_BUILD OFF) 68 | if(TRITON_PYTORCH_LIB_PATHS STREQUAL "") 69 | if(TRITON_PYTORCH_DOCKER_IMAGE STREQUAL "") 70 | message(FATAL_ERROR "Using the PyTorch docker based build requires TRITON_PYTORCH_DOCKER_IMAGE") 71 | endif() 72 | set(TRITON_PYTORCH_DOCKER_BUILD ON) 73 | message(STATUS "Using PyTorch docker: ${TRITON_PYTORCH_DOCKER_IMAGE}") 74 | else() 75 | # Look for installed Torch-TRT package in lib paths 76 | if(TRITON_PYTORCH_ENABLE_TORCHTRT AND NOT EXISTS "${TRITON_PYTORCH_LIB_PATHS}/libtorchtrt_runtime.so") 77 | message(WARNING "TRITON_PYTORCH_ENABLE_TORCHTRT is on, but TRITON_PYTORCH_LIB_PATHS does not contain Torch-TRT package") 78 | endif() 79 | 80 | # Look for installed Torchvision package in lib paths 81 | find_library( LIBTORCHVISION libtorchvision.so libtorchvision.so.1 PATHS ${TRITON_PYTORCH_LIB_PATHS} ) 82 | if(NOT ${LIBTORCHVISION}) 83 | message(WARNING "TRITON_PYTORCH_ENABLE_TORCHVISION is on, but TRITON_PYTORCH_LIB_PATHS does not contain Torchvision package") 84 | endif(NOT ${LIBTORCHVISION}) 85 | endif() 86 | 87 | # Python.h needed by torch headers. 
88 | find_package(Python3 REQUIRED COMPONENTS Development.Module) 89 | 90 | set(RHEL_BUILD OFF) 91 | set(LIB_DIR "lib") 92 | set(LIBTORCH_LIBS_PATH "/usr/local/lib") 93 | set(PY_INSTALL_PATH "/usr/local/lib/python3.12/dist-packages") 94 | if(LINUX) 95 | file(STRINGS "/etc/os-release" DISTRO_ID_LIKE REGEX "ID_LIKE") 96 | if(${DISTRO_ID_LIKE} MATCHES "rhel|centos") 97 | set(RHEL_BUILD ON) 98 | set(LIB_DIR "lib64") 99 | set(PY_INSTALL_PATH "/opt/_internal/cpython-3.12.1/lib/python3.12/site-packages") 100 | if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86_64") 101 | set(LIBTORCH_LIBS_PATH "/opt/_internal/cpython-3.12.1/lib") 102 | endif(${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86_64") 103 | endif(${DISTRO_ID_LIKE} MATCHES "rhel|centos") 104 | endif(LINUX) 105 | 106 | # 107 | # Dependencies 108 | # 109 | # FetchContent's composability isn't very good. We must include the 110 | # transitive closure of all repos so that we can override the tag. 111 | # 112 | include(FetchContent) 113 | 114 | FetchContent_Declare( 115 | repo-common 116 | GIT_REPOSITORY ${TRITON_REPO_ORGANIZATION}/common.git 117 | GIT_TAG ${TRITON_COMMON_REPO_TAG} 118 | GIT_SHALLOW ON 119 | ) 120 | FetchContent_Declare( 121 | repo-core 122 | GIT_REPOSITORY ${TRITON_REPO_ORGANIZATION}/core.git 123 | GIT_TAG ${TRITON_CORE_REPO_TAG} 124 | GIT_SHALLOW ON 125 | ) 126 | FetchContent_Declare( 127 | repo-backend 128 | GIT_REPOSITORY ${TRITON_REPO_ORGANIZATION}/backend.git 129 | GIT_TAG ${TRITON_BACKEND_REPO_TAG} 130 | GIT_SHALLOW ON 131 | ) 132 | FetchContent_MakeAvailable(repo-common repo-core repo-backend) 133 | 134 | # 135 | # CUDA 136 | # 137 | if(${TRITON_ENABLE_GPU}) 138 | find_package(CUDAToolkit REQUIRED) 139 | else() 140 | if (${TRITON_PYTORCH_ENABLE_TORCHTRT}) 141 | message(FATAL_ERROR "TRITON_PYTORCH_ENABLE_TORCHTRT is ON when TRITON_ENABLE_GPU is OFF") 142 | endif() 143 | endif() # TRITON_ENABLE_GPU 144 | 145 | if(${TRITON_ENABLE_NVTX}) 146 | add_definitions(-DTRITON_ENABLE_NVTX=1) 147 | endif() # TRITON_ENABLE_NVTX 148 | 149 | # 150 | # Shared library implementing the Triton Backend API 151 | # 152 | configure_file(src/libtriton_pytorch.ldscript libtriton_pytorch.ldscript COPYONLY) 153 | 154 | set(PT_LIBS 155 | "libc10.so" 156 | "libc10_cuda.so" 157 | "libtorch.so" 158 | "libtorch_cpu.so" 159 | "libtorch_cuda.so" 160 | "libtorch_cuda_linalg.so" 161 | "libtorch_global_deps.so" 162 | "libjpeg.so.62" 163 | ) 164 | 165 | if (${TRITON_PYTORCH_ENABLE_TORCHVISION}) 166 | set(PT_LIBS 167 | ${PT_LIBS} 168 | $,libtorchvision.so,libtorchvision.so.1> 169 | ) 170 | endif() # TRITON_PYTORCH_ENABLE_TORCHVISION 171 | 172 | if (${TRITON_PYTORCH_ENABLE_TORCHTRT}) 173 | set(PT_LIBS 174 | ${PT_LIBS} 175 | "libtorchtrt_runtime.so" 176 | ) 177 | endif() # TRITON_PYTORCH_ENABLE_TORCHTRT 178 | 179 | if (CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "aarch64") 180 | set(LIBS_ARCH "aarch64") 181 | set(LIBTORCH_LIBS 182 | "libopenblas.so.0" 183 | "libnvpl_blas_core.so.0" 184 | "libnvpl_blas_ilp64_gomp.so.0" 185 | "libnvpl_blas_ilp64_seq.so.0" 186 | "libnvpl_blas_lp64_gomp.so.0" 187 | "libnvpl_blas_lp64_seq.so.0" 188 | "libnvpl_lapack_core.so.0" 189 | "libnvpl_lapack_ilp64_gomp.so.0" 190 | "libnvpl_lapack_ilp64_seq.so.0" 191 | "libnvpl_lapack_lp64_gomp.so.0" 192 | "libnvpl_lapack_lp64_seq.so.0" 193 | ) 194 | else() 195 | set(LIBS_ARCH "x86_64") 196 | set(LIBTORCH_LIBS 197 | "libmkl_avx2.so.1" 198 | "libmkl_avx512.so.1" 199 | "libmkl_core.so.1" 200 | "libmkl_def.so.1" 201 | "libmkl_gnu_thread.so.1" 202 | "libmkl_intel_lp64.so.1" 203 | "libmkl_intel_thread.so.1" 204 | 
"libmkl_rt.so.1" 205 | "libmkl_sequential.so.1" 206 | "libmkl_vml_def.so.1" 207 | ) 208 | endif() 209 | set(OPENCV_LIBS 210 | "libopencv_video.so" 211 | "libopencv_videoio.so" 212 | "libopencv_highgui.so" 213 | "libopencv_imgcodecs.so" 214 | "libopencv_imgproc.so" 215 | "libopencv_core.so" 216 | "libopencv_calib3d.so" 217 | "libopencv_flann.so" 218 | "libopencv_features2d.so" 219 | $,libjpeg.so.62,libjpeg.so> 220 | $,libpng16.so.16,libpng16.so> 221 | ) 222 | 223 | # The patchelf commands ensure the MKL libraries are loaded correctly during runtime 224 | # Without these, the framework/backend complains of missing libraries / symbols and 225 | # in some cases leads to segmentation faults. 226 | if (${TRITON_PYTORCH_DOCKER_BUILD}) 227 | string(REPLACE ";" " " LIBTORCH_LIBS_STR "${LIBTORCH_LIBS}") 228 | string(RANDOM 8 "abcdefghijklmnopqrstuvwxyz" random_id) 229 | 230 | add_custom_command( 231 | OUTPUT 232 | ${PT_LIBS} 233 | ${LIBTORCH_LIBS} 234 | ${OPENCV_LIBS} 235 | LICENSE.pytorch 236 | include/torch 237 | include/torchvision 238 | COMMAND ${CMAKE_COMMAND} -E make_directory "include/torchvision" 239 | COMMAND docker pull ${TRITON_PYTORCH_DOCKER_IMAGE} 240 | COMMAND docker rm pytorch_backend_ptlib || echo "error ignored..." || true 241 | COMMAND docker create --name pytorch_backend_ptlib ${TRITON_PYTORCH_DOCKER_IMAGE} 242 | COMMAND /bin/sh -c "for i in ${LIBTORCH_LIBS_STR} ; do echo copying $i && docker cp -L pytorch_backend_ptlib:${LIBTORCH_LIBS_PATH}/$i $i ; done" 243 | COMMAND docker cp pytorch_backend_ptlib:${PY_INSTALL_PATH}/torch/lib/libc10.so libc10.so 244 | COMMAND docker cp pytorch_backend_ptlib:${PY_INSTALL_PATH}/torch/lib/libc10_cuda.so libc10_cuda.so 245 | COMMAND docker cp pytorch_backend_ptlib:${PY_INSTALL_PATH}/torch/lib/libtorch.so libtorch.so 246 | COMMAND docker cp pytorch_backend_ptlib:${PY_INSTALL_PATH}/torch/lib/libtorch_cpu.so libtorch_cpu.so 247 | COMMAND docker cp pytorch_backend_ptlib:${PY_INSTALL_PATH}/torch/lib/libtorch_cuda.so libtorch_cuda.so 248 | COMMAND docker cp pytorch_backend_ptlib:${PY_INSTALL_PATH}/torch/lib/libtorch_cuda_linalg.so libtorch_cuda_linalg.so 249 | COMMAND docker cp pytorch_backend_ptlib:${PY_INSTALL_PATH}/torch/lib/libtorch_global_deps.so libtorch_global_deps.so 250 | COMMAND docker cp pytorch_backend_ptlib:${PY_INSTALL_PATH}/torch/lib/libcaffe2_nvrtc.so libcaffe2_nvrtc.so 251 | # TODO: Revisit when not needed by making it part of cuda base container. 252 | COMMAND docker cp -L pytorch_backend_ptlib:/usr/local/cuda/lib64/libcusparseLt.so libcusparseLt.so; 253 | COMMAND /bin/sh -c "if [ ${TRITON_PYTORCH_ENABLE_TORCHVISION} = 'ON' ]; then if [ ${RHEL_BUILD} = 'ON' ]; then docker cp -a -L pytorch_backend_ptlib:/usr/local/lib64/libtorchvision.so libtorchvision.so; else docker cp -a -L pytorch_backend_ptlib:/usr/local/${LIB_DIR}/libtorchvision.so.1 libtorchvision.so.1; fi; fi" 254 | COMMAND /bin/sh -c "if [ ${TRITON_PYTORCH_ENABLE_TORCHVISION} = 'ON' ]; then docker cp pytorch_backend_ptlib:/opt/pytorch/vision/torchvision/csrc include/torchvision/torchvision; fi" 255 | COMMAND /bin/sh -c "if [ ${TRITON_PYTORCH_ENABLE_TORCHTRT} = 'ON' ]; then docker cp pytorch_backend_ptlib:/usr/local/lib/python3.12/dist-packages/torch_tensorrt/lib/libtorchtrt_runtime.so libtorchtrt_runtime.so; fi" 256 | COMMAND docker cp pytorch_backend_ptlib:${PY_INSTALL_PATH}/torch_tensorrt/bin/torchtrtc torchtrtc || echo "error ignored..." 
|| true 257 | COMMAND docker cp pytorch_backend_ptlib:/opt/pytorch/pytorch/LICENSE LICENSE.pytorch 258 | COMMAND docker cp pytorch_backend_ptlib:${PY_INSTALL_PATH}/torch/include include/torch 259 | COMMAND docker cp pytorch_backend_ptlib:/opt/pytorch/pytorch/torch/csrc/jit/codegen include/torch/torch/csrc/jit/. 260 | COMMAND docker cp -L pytorch_backend_ptlib:/usr/local/${LIB_DIR}/libopencv_videoio.so libopencv_videoio.so 261 | COMMAND docker cp -L pytorch_backend_ptlib:/usr/local/${LIB_DIR}/libopencv_highgui.so libopencv_highgui.so 262 | COMMAND docker cp -L pytorch_backend_ptlib:/usr/local/${LIB_DIR}/libopencv_video.so libopencv_video.so 263 | COMMAND docker cp -L pytorch_backend_ptlib:/usr/local/${LIB_DIR}/libopencv_imgcodecs.so libopencv_imgcodecs.so 264 | COMMAND docker cp -L pytorch_backend_ptlib:/usr/local/${LIB_DIR}/libopencv_imgproc.so libopencv_imgproc.so 265 | COMMAND docker cp -L pytorch_backend_ptlib:/usr/local/${LIB_DIR}/libopencv_core.so libopencv_core.so 266 | COMMAND docker cp -L pytorch_backend_ptlib:/usr/local/${LIB_DIR}/libopencv_calib3d.so libopencv_calib3d.so 267 | COMMAND docker cp -L pytorch_backend_ptlib:/usr/local/${LIB_DIR}/libopencv_features2d.so libopencv_features2d.so 268 | COMMAND docker cp -L pytorch_backend_ptlib:/usr/local/${LIB_DIR}/libopencv_flann.so libopencv_flann.so 269 | COMMAND /bin/sh -c "if [ ${RHEL_BUILD} = 'ON' ]; then docker cp -L pytorch_backend_ptlib:/usr/lib64/libjpeg.so.62 libjpeg.so.62; else docker cp -L pytorch_backend_ptlib:/usr/local/lib/libjpeg.so.62 libjpeg.so.62 && docker cp pytorch_backend_ptlib:/usr/lib/${LIBS_ARCH}-linux-gnu/libjpeg.so.8.2.2 libjpeg.so; fi;" 270 | COMMAND /bin/sh -c "if [ ${RHEL_BUILD} = 'ON' ]; then docker cp -L pytorch_backend_ptlib:/usr/lib64/libpng16.so.16 libpng16.so.16; else docker cp -L pytorch_backend_ptlib:/usr/lib/${LIBS_ARCH}-linux-gnu/libpng16.so libpng16.so; fi;" 271 | COMMAND /bin/sh -c "if [ -f libmkl_def.so.1 ]; then patchelf --add-needed libmkl_gnu_thread.so.1 libmkl_def.so.1; fi" 272 | COMMAND /bin/sh -c "if [ -f libmkl_def.so.1 ]; then patchelf --add-needed libmkl_core.so.1 libmkl_def.so.1; fi" 273 | COMMAND /bin/sh -c "if [ -f libmkl_avx2.so.1 ]; then patchelf --add-needed libmkl_gnu_thread.so.1 libmkl_avx2.so.1; fi" 274 | COMMAND /bin/sh -c "if [ -f libmkl_avx2.so.1 ]; then patchelf --add-needed libmkl_core.so.1 libmkl_avx2.so.1; fi" 275 | COMMAND /bin/sh -c "if [ -f libmkl_avx512.so.1 ]; then patchelf --add-needed libmkl_gnu_thread.so.1 libmkl_avx512.so.1; fi" 276 | COMMAND /bin/sh -c "if [ -f libmkl_avx512.so.1 ]; then patchelf --add-needed libmkl_core.so.1 libmkl_avx512.so.1; fi" 277 | COMMAND /bin/sh -c "if [ -f libmkl_vml_def.so.1 ]; then patchelf --add-needed libmkl_gnu_thread.so.1 libmkl_vml_def.so.1; fi" 278 | COMMAND /bin/sh -c "if [ -f libmkl_vml_def.so.1 ]; then patchelf --add-needed libmkl_intel_thread.so.1 libmkl_vml_def.so.1; fi" 279 | COMMAND /bin/sh -c "if [ -f libmkl_vml_def.so.1 ]; then patchelf --add-needed libmkl_core.so.1 libmkl_vml_def.so.1; fi" 280 | COMMAND /bin/sh -c "if [ -f libmkl_intel_thread.so.1 ]; then patchelf --add-needed libmkl_intel_lp64.so.1 libmkl_intel_thread.so.1; fi" 281 | COMMAND /bin/sh -c "if [ ${TRITON_PYTORCH_ENABLE_TORCHVISION} = 'ON' ]; then if [ ${RHEL_BUILD} = 'OFF' ]; then ln -s libtorchvision.so.1 libtorchvision.so; fi; fi;" 282 | COMMAND docker rm pytorch_backend_ptlib 283 | COMMENT "Extracting pytorch and torchvision libraries and includes from ${TRITON_PYTORCH_DOCKER_IMAGE}" 284 | VERBATIM 285 | ) 286 | add_custom_target(ptlib_target 
DEPENDS ${PT_LIBS} ${LIBTORCH_LIBS} ${OPENCV_LIBS}) 287 | add_library(ptlib SHARED IMPORTED GLOBAL) 288 | add_dependencies(ptlib ptlib_target) 289 | 290 | # Just one of the libs are enough to ensure the docker build 291 | set_target_properties( 292 | ptlib 293 | PROPERTIES 294 | IMPORTED_LOCATION libtorch.so 295 | ) 296 | endif() # TRITON_PYTORCH_DOCKER_BUILD 297 | 298 | add_library( 299 | triton-pytorch-backend SHARED 300 | src/libtorch.cc 301 | src/libtorch_utils.cc 302 | src/libtorch_utils.h 303 | ) 304 | 305 | add_library( 306 | TritonPyTorchBackend::triton-pytorch-backend ALIAS triton-pytorch-backend 307 | ) 308 | 309 | target_include_directories( 310 | triton-pytorch-backend 311 | PRIVATE 312 | ${CMAKE_CURRENT_SOURCE_DIR}/src 313 | ${Python3_INCLUDE_DIRS} 314 | ) 315 | 316 | if (${TRITON_PYTORCH_DOCKER_BUILD}) 317 | target_include_directories( 318 | triton-pytorch-backend 319 | PRIVATE 320 | ${CMAKE_CURRENT_BINARY_DIR}/include/torch 321 | ${CMAKE_CURRENT_BINARY_DIR}/include/torch/torch/csrc/api/include 322 | ${CMAKE_CURRENT_BINARY_DIR}/include/torchvision 323 | ) 324 | else() 325 | target_include_directories( 326 | triton-pytorch-backend 327 | PRIVATE ${TRITON_PYTORCH_INCLUDE_PATHS} 328 | ) 329 | endif() # TRITON_PYTORCH_DOCKER_BUILD 330 | 331 | # Need to turn off -Werror due to Torchvision vision.h extern initialization 332 | # Unfortunately gcc does not provide a specific flag to ignore the specific 333 | # warning: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=45977 334 | target_compile_features(triton-pytorch-backend PRIVATE cxx_std_${TRITON_MIN_CXX_STANDARD}) 335 | target_compile_options( 336 | triton-pytorch-backend PRIVATE 337 | $<$,$,$>: 338 | -Wall -Wextra -Wno-unused-parameter -Wno-type-limits> 339 | ) 340 | 341 | if(${TRITON_ENABLE_GPU}) 342 | target_compile_definitions( 343 | triton-pytorch-backend 344 | PRIVATE TRITON_ENABLE_GPU=1 345 | ) 346 | endif() # TRITON_ENABLE_GPU 347 | 348 | set_target_properties( 349 | triton-pytorch-backend 350 | PROPERTIES 351 | POSITION_INDEPENDENT_CODE ON 352 | OUTPUT_NAME triton_pytorch 353 | SKIP_BUILD_RPATH TRUE 354 | BUILD_WITH_INSTALL_RPATH TRUE 355 | INSTALL_RPATH_USE_LINK_PATH FALSE 356 | INSTALL_RPATH "$\{ORIGIN\}" 357 | LINK_DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/libtriton_pytorch.ldscript 358 | LINK_FLAGS "-Wl,--no-as-needed,--version-script libtriton_pytorch.ldscript" 359 | ) 360 | 361 | # Need to turn off unused-but-set-variable due to Torchvision 362 | # Need to turn off unknown-pragmas due to ATen OpenMP 363 | set_target_properties( 364 | triton-pytorch-backend 365 | PROPERTIES COMPILE_FLAGS 366 | "-Wno-unknown-pragmas -Wno-unused-but-set-variable" 367 | ) 368 | 369 | if (${TRITON_PYTORCH_DOCKER_BUILD}) 370 | add_dependencies( 371 | triton-pytorch-backend 372 | ptlib 373 | ) 374 | endif() # TRITON_PYTORCH_DOCKER_BUILD 375 | 376 | message(STATUS "Torchvision support is ${TRITON_PYTORCH_ENABLE_TORCHVISION}") 377 | message(STATUS "Torch-TRT support is ${TRITON_PYTORCH_ENABLE_TORCHTRT}") 378 | 379 | set(TRITON_PYTORCH_LDFLAGS "") 380 | if (${TRITON_PYTORCH_DOCKER_BUILD}) 381 | set(TRITON_PYTORCH_LIBS "${CMAKE_CURRENT_BINARY_DIR}/libtorch.so") 382 | 383 | if (${TRITON_PYTORCH_ENABLE_TORCHVISION}) 384 | set(TRITON_PYTORCH_LIBS 385 | ${TRITON_PYTORCH_LIBS} 386 | "${CMAKE_CURRENT_BINARY_DIR}/$,libtorchvision.so,libtorchvision.so.1>") 387 | endif() # TRITON_PYTORCH_ENABLE_TORCHVISION 388 | 389 | if (${TRITON_PYTORCH_ENABLE_TORCHTRT}) 390 | set(TRITON_PYTORCH_LIBS 391 | ${TRITON_PYTORCH_LIBS} 392 | 
"${CMAKE_CURRENT_BINARY_DIR}/libtorchtrt_runtime.so") 393 | endif() # TRITON_PYTORCH_ENABLE_TORCHTRT 394 | else() 395 | set (TRITON_PYTORCH_LIBS "-ltorch") 396 | 397 | if (${TRITON_PYTORCH_ENABLE_TORCHVISION}) 398 | set(TRITON_PYTORCH_LIBS 399 | ${TRITON_PYTORCH_LIBS} 400 | "-ltorchvision" 401 | ) 402 | endif() # TRITON_PYTORCH_ENABLE_TORCHVISION 403 | 404 | if (${TRITON_PYTORCH_ENABLE_TORCHTRT}) 405 | set(TRITON_PYTORCH_LIBS 406 | ${TRITON_PYTORCH_LIBS} 407 | "-ltorchtrt_runtime" 408 | ) 409 | endif() # TRITON_PYTORCH_ENABLE_TORCHTRT 410 | 411 | FOREACH(p ${TRITON_PYTORCH_LIB_PATHS}) 412 | set(TRITON_PYTORCH_LDFLAGS ${TRITON_PYTORCH_LDFLAGS} "-L${p}") 413 | ENDFOREACH(p) 414 | endif() # TRITON_PYTORCH_DOCKER_BUILD 415 | 416 | target_link_libraries( 417 | triton-pytorch-backend 418 | PRIVATE 419 | triton-core-serverapi # from repo-core 420 | triton-core-backendapi # from repo-core 421 | triton-core-serverstub # from repo-core 422 | triton-backend-utils # from repo-backend 423 | ${TRITON_PYTORCH_LDFLAGS} 424 | ${TRITON_PYTORCH_LIBS} 425 | ) 426 | 427 | if(${TRITON_ENABLE_GPU}) 428 | target_link_libraries( 429 | triton-pytorch-backend 430 | PRIVATE 431 | CUDA::cudart 432 | ) 433 | endif() # TRITON_ENABLE_GPU 434 | 435 | # 436 | # Install 437 | # 438 | include(GNUInstallDirs) 439 | set(INSTALL_CONFIGDIR ${CMAKE_INSTALL_LIBDIR}/cmake/TritonPyTorchBackend) 440 | 441 | install( 442 | TARGETS 443 | triton-pytorch-backend 444 | EXPORT 445 | triton-pytorch-backend-targets 446 | LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/pytorch 447 | ARCHIVE DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/pytorch 448 | ) 449 | 450 | if (${TRITON_PYTORCH_DOCKER_BUILD}) 451 | set(PT_LIB_PATHS "") 452 | FOREACH(plib ${PT_LIBS} ${LIBTORCH_LIBS} ${OPENCV_LIBS}) 453 | set(PT_LIB_PATHS ${PT_LIB_PATHS} "${CMAKE_CURRENT_BINARY_DIR}/${plib}") 454 | ENDFOREACH(plib) 455 | 456 | install( 457 | FILES 458 | ${PT_LIB_PATHS} 459 | ${CMAKE_CURRENT_BINARY_DIR}/libcusparseLt.so 460 | ${CMAKE_CURRENT_BINARY_DIR}/LICENSE.pytorch 461 | DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/pytorch 462 | ) 463 | 464 | if (${TRITON_PYTORCH_ENABLE_TORCHTRT}) 465 | install( 466 | FILES 467 | ${CMAKE_CURRENT_BINARY_DIR}/torchtrtc 468 | DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/pytorch 469 | ) 470 | endif() # TRITON_PYTORCH_ENABLE_TORCHTRT 471 | 472 | FOREACH(plib ${PT_LIBS} ${LIBTORCH_LIBS} ${OPENCV_LIBS}) 473 | install( 474 | CODE 475 | "EXECUTE_PROCESS( 476 | COMMAND patchelf --set-rpath \$ORIGIN ${plib} 477 | RESULT_VARIABLE PATCHELF_STATUS 478 | WORKING_DIRECTORY ${CMAKE_INSTALL_PREFIX}/backends/pytorch) 479 | if(PATCHELF_STATUS AND NOT PATCHELF_STATUS EQUAL 0) 480 | message(FATAL_ERROR \"FAILED: to run patchelf\") 481 | endif()" 482 | ) 483 | ENDFOREACH(plib) 484 | 485 | set(OPENCV_VERSION "406") 486 | install( 487 | CODE 488 | "EXECUTE_PROCESS( 489 | COMMAND ln -sf libopencv_video.so libopencv_video.so.${OPENCV_VERSION} 490 | COMMAND ln -sf libopencv_videoio.so libopencv_videoio.so.${OPENCV_VERSION} 491 | COMMAND ln -sf libopencv_highgui.so libopencv_highgui.so.${OPENCV_VERSION} 492 | COMMAND ln -sf libopencv_imgcodecs.so libopencv_imgcodecs.so.${OPENCV_VERSION} 493 | COMMAND ln -sf libopencv_imgproc.so libopencv_imgproc.so.${OPENCV_VERSION} 494 | COMMAND ln -sf libopencv_core.so libopencv_core.so.${OPENCV_VERSION} 495 | COMMAND ln -sf libopencv_calib3d.so libopencv_calib3d.so.${OPENCV_VERSION} 496 | COMMAND ln -sf libopencv_features2d.so libopencv_features2d.so.${OPENCV_VERSION} 497 | COMMAND ln -sf libopencv_flann.so 
libopencv_flann.so.${OPENCV_VERSION} 498 | COMMAND ln -sf libpng16.so libpng16.so.16 499 | COMMAND ln -sf libjpeg.so libjpeg.so.8 500 | COMMAND ln -sf libcusparseLt.so libcusparseLt.so.0 501 | RESULT_VARIABLE LINK_STATUS 502 | WORKING_DIRECTORY ${CMAKE_INSTALL_PREFIX}/backends/pytorch) 503 | if(LINK_STATUS AND NOT LINK_STATUS EQUAL 0) 504 | message(FATAL_ERROR \"FAILED: to create links\") 505 | endif()" 506 | ) 507 | else() 508 | FOREACH(plib ${PT_LIBS}) 509 | set(PT_LIB_PATHS ${PT_LIB_PATHS} "${TRITON_PYTORCH_LIB_PATHS}/${plib}") 510 | ENDFOREACH(plib) 511 | 512 | install( 513 | FILES 514 | ${PT_LIB_PATHS} 515 | DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/pytorch 516 | ) 517 | 518 | FOREACH(plib ${PT_LIBS}) 519 | install( 520 | CODE 521 | "EXECUTE_PROCESS( 522 | COMMAND patchelf --set-rpath \$ORIGIN ${plib} 523 | RESULT_VARIABLE PATCHELF_STATUS 524 | WORKING_DIRECTORY ${CMAKE_INSTALL_PREFIX}/backends/pytorch) 525 | if(PATCHELF_STATUS AND NOT PATCHELF_STATUS EQUAL 0) 526 | message(FATAL_ERROR \"FAILED: to run patchelf\") 527 | endif()" 528 | ) 529 | ENDFOREACH(plib) 530 | endif() # TRITON_PYTORCH_DOCKER_BUILD 531 | 532 | install( 533 | EXPORT 534 | triton-pytorch-backend-targets 535 | FILE 536 | TritonPyTorchBackendTargets.cmake 537 | NAMESPACE 538 | TritonPyTorchBackend:: 539 | DESTINATION 540 | ${INSTALL_CONFIGDIR} 541 | ) 542 | 543 | install( 544 | FILES 545 | src/model.py 546 | DESTINATION 547 | ${CMAKE_INSTALL_PREFIX}/backends/pytorch 548 | ) 549 | 550 | include(CMakePackageConfigHelpers) 551 | configure_package_config_file( 552 | ${CMAKE_CURRENT_LIST_DIR}/cmake/TritonPyTorchBackendConfig.cmake.in 553 | ${CMAKE_CURRENT_BINARY_DIR}/TritonPyTorchBackendConfig.cmake 554 | INSTALL_DESTINATION ${INSTALL_CONFIGDIR} 555 | ) 556 | 557 | install( 558 | FILES 559 | ${CMAKE_CURRENT_BINARY_DIR}/TritonPyTorchBackendConfig.cmake 560 | DESTINATION ${INSTALL_CONFIGDIR} 561 | ) 562 | 563 | # 564 | # Export from build tree 565 | # 566 | export( 567 | EXPORT triton-pytorch-backend-targets 568 | FILE ${CMAKE_CURRENT_BINARY_DIR}/TritonPyTorchBackendTargets.cmake 569 | NAMESPACE TritonPyTorchBackend:: 570 | ) 571 | 572 | export(PACKAGE TritonPyTorchBackend) 573 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 2 | 3 | Redistribution and use in source and binary forms, with or without 4 | modification, are permitted provided that the following conditions 5 | are met: 6 | * Redistributions of source code must retain the above copyright 7 | notice, this list of conditions and the following disclaimer. 8 | * Redistributions in binary form must reproduce the above copyright 9 | notice, this list of conditions and the following disclaimer in the 10 | documentation and/or other materials provided with the distribution. 11 | * Neither the name of NVIDIA CORPORATION nor the names of its 12 | contributors may be used to endorse or promote products derived 13 | from this software without specific prior written permission. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 28 | 29 | [![License](https://img.shields.io/badge/License-BSD3-lightgrey.svg)](https://opensource.org/licenses/BSD-3-Clause) 30 | 31 | # PyTorch (LibTorch) Backend 32 | 33 | The Triton backend for [PyTorch](https://github.com/pytorch/pytorch). 34 | You can learn more about Triton backends in the [backend 35 | repo](https://github.com/triton-inference-server/backend). Ask 36 | questions or report problems on the [issues 37 | page](https://github.com/triton-inference-server/server/issues). 38 | This backend is designed to run [TorchScript](https://pytorch.org/docs/stable/jit.html) 39 | models using the PyTorch C++ API. All models created in PyTorch 40 | using the python API must be traced/scripted to produce a TorchScript 41 | model. 42 | 43 | Where can I ask general questions about Triton and Triton backends? 44 | Be sure to read all the information below as well as the [general 45 | Triton documentation](https://github.com/triton-inference-server/server#triton-inference-server) 46 | available in the main [server](https://github.com/triton-inference-server/server) 47 | repo. If you don't find your answer there you can ask questions on the 48 | main Triton [issues page](https://github.com/triton-inference-server/server/issues). 49 | 50 | ## Build the PyTorch Backend 51 | 52 | Use a recent cmake to build. First install the required dependencies. 53 | 54 | ``` 55 | $ apt-get install rapidjson-dev python3-dev python3-pip 56 | $ pip3 install patchelf==0.17.2 57 | ``` 58 | 59 | An appropriate PyTorch container from [NGC](https://ngc.nvidia.com) must be used. 60 | For example, to build a backend that uses the 23.04 version of the PyTorch 61 | container from NGC: 62 | 63 | ``` 64 | $ mkdir build 65 | $ cd build 66 | $ cmake -DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install -DTRITON_PYTORCH_DOCKER_IMAGE="nvcr.io/nvidia/pytorch:23.04-py3" .. 67 | $ make install 68 | ``` 69 | 70 | The following required Triton repositories will be pulled and used in 71 | the build. By default, the "main" branch/tag will be used for each repo 72 | but the listed CMake argument can be used to override. 73 | 74 | * triton-inference-server/backend: -DTRITON_BACKEND_REPO_TAG=[tag] 75 | * triton-inference-server/core: -DTRITON_CORE_REPO_TAG=[tag] 76 | * triton-inference-server/common: -DTRITON_COMMON_REPO_TAG=[tag] 77 | 78 | ## Build the PyTorch Backend With Custom PyTorch 79 | 80 | Currently, Triton requires that a specially patched version of 81 | PyTorch be used with the PyTorch backend. The full source for 82 | these PyTorch versions are available as Docker images from 83 | [NGC](https://ngc.nvidia.com). For example, the PyTorch version 84 | compatible with the 22.12 release of Triton is available as 85 | nvcr.io/nvidia/pytorch:22.12-py3. 
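As described below, the LibTorch and Torchvision headers and libraries need to
be copied out of this container into local directories. A minimal illustrative
sketch of one way to do that is shown here; the container paths (notably the
Python `dist-packages` location) vary between releases, and the authoritative
list of files is the set of `docker cp` commands in this repository's
`CMakeLists.txt`.

```
$ docker create --name pt_src nvcr.io/nvidia/pytorch:22.12-py3
$ # Adjust the Python version in this path to match the container.
$ PYPKG=/usr/local/lib/python3.8/dist-packages
$ mkdir -p torchvision
$ docker cp pt_src:${PYPKG}/torch/include ./torch
$ docker cp pt_src:${PYPKG}/torch/lib ./torch/lib
$ docker cp pt_src:/opt/pytorch/vision/torchvision/csrc ./torchvision/torchvision
$ docker rm pt_src
```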
86 | 87 | Copy over the LibTorch and Torchvision headers and libraries from the 88 | [PyTorch NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) 89 | into local directories. You can see which headers and libraries 90 | are needed/copied from the docker. 91 | 92 | ``` 93 | $ mkdir build 94 | $ cd build 95 | $ cmake -DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install -DTRITON_PYTORCH_INCLUDE_PATHS="/torch;/torch/torch/csrc/api/include;/torchvision" -DTRITON_PYTORCH_LIB_PATHS="" .. 96 | $ make install 97 | ``` 98 | 99 | ## Using the PyTorch Backend 100 | 101 | ### Parameters 102 | 103 | Triton exposes some flags to control the execution mode of the TorchScript models through 104 | the Parameters section of the model's `config.pbtxt` file. 105 | 106 | * `DISABLE_OPTIMIZED_EXECUTION`: Boolean flag to disable the optimized execution 107 | of TorchScript models. By default, the optimized execution is always enabled. 108 | 109 | The initial calls to a loaded TorchScript model take extremely long. Due to this longer 110 | model warmup [issue](https://github.com/pytorch/pytorch/issues/57894), Triton also allows 111 | execution of models without these optimizations. In some models, optimized execution 112 | does not benefit performance as seen [here](https://github.com/pytorch/pytorch/issues/19978) 113 | and in other cases impacts performance negatively, as seen [here](https://github.com/pytorch/pytorch/issues/53824). 114 | 115 | The section of model config file specifying this parameter will look like: 116 | 117 | ``` 118 | parameters: { 119 | key: "DISABLE_OPTIMIZED_EXECUTION" 120 | value: { 121 | string_value: "true" 122 | } 123 | } 124 | ``` 125 | 126 | * `INFERENCE_MODE`: Boolean flag to enable the Inference Mode execution 127 | of TorchScript models. By default, the inference mode is enabled. 128 | 129 | [InferenceMode](https://pytorch.org/cppdocs/notes/inference_mode.html) is a new 130 | RAII guard analogous to NoGradMode to be used when you are certain your operations 131 | will have no interactions with autograd. Compared to NoGradMode, code run under 132 | this mode gets better performance by disabling autograd. 133 | 134 | Please note that in some models, InferenceMode might not benefit performance 135 | and in fewer cases might impact performance negatively. 136 | 137 | The section of model config file specifying this parameter will look like: 138 | 139 | ``` 140 | parameters: { 141 | key: "INFERENCE_MODE" 142 | value: { 143 | string_value: "true" 144 | } 145 | } 146 | ``` 147 | 148 | * `DISABLE_CUDNN`: Boolean flag to disable the cuDNN library. By default, cuDNN is enabled. 149 | 150 | [cuDNN](https://developer.nvidia.com/cudnn) is a GPU-accelerated library of primitives for 151 | deep neural networks. cuDNN provides highly tuned implementations for standard routines. 152 | 153 | Typically, models run with cuDNN enabled are faster. However there are some exceptions 154 | where using cuDNN can be slower, cause higher memory usage or result in errors. 155 | 156 | 157 | The section of model config file specifying this parameter will look like: 158 | 159 | ``` 160 | parameters: { 161 | key: "DISABLE_CUDNN" 162 | value: { 163 | string_value: "true" 164 | } 165 | } 166 | ``` 167 | 168 | * `ENABLE_WEIGHT_SHARING`: Boolean flag to enable model instances on the same device to 169 | share weights. This optimization should not be used with stateful models. If not specified, 170 | weight sharing is disabled. 
171 | 172 | The section of model config file specifying this parameter will look like: 173 | 174 | ``` 175 | parameters: { 176 | key: "ENABLE_WEIGHT_SHARING" 177 | value: { 178 | string_value: "true" 179 | } 180 | } 181 | ``` 182 | 183 | * `ENABLE_CACHE_CLEANING`: Boolean flag to enable CUDA cache cleaning after each model execution. 184 | If not specified, cache cleaning is disabled. This flag has no effect if model is on CPU. 185 | Setting this flag to true will negatively impact the performance due to additional CUDA cache 186 | cleaning operation after each model execution. Therefore, you should only use this flag if you 187 | serve multiple models with Triton and encounter CUDA out of memory issue during model executions. 188 | 189 | The section of model config file specifying this parameter will look like: 190 | 191 | ``` 192 | parameters: { 193 | key: "ENABLE_CACHE_CLEANING" 194 | value: { 195 | string_value:"true" 196 | } 197 | } 198 | ``` 199 | 200 | * `INTER_OP_THREAD_COUNT`: 201 | 202 | PyTorch allows using multiple CPU threads during TorchScript model inference. 203 | One or more inference threads execute a model’s forward pass on the given 204 | inputs. Each inference thread invokes a JIT interpreter that executes the ops 205 | of a model inline, one by one. This parameter sets the size of this thread 206 | pool. The default value of this setting is the number of cpu cores. Please refer 207 | to [this](https://pytorch.org/docs/stable/notes/cpu_threading_torchscript_inference.html) 208 | document on how to set this parameter properly. 209 | 210 | The section of model config file specifying this parameter will look like: 211 | 212 | ``` 213 | parameters: { 214 | key: "INTER_OP_THREAD_COUNT" 215 | value: { 216 | string_value:"1" 217 | } 218 | } 219 | ``` 220 | 221 | * `INTRA_OP_THREAD_COUNT`: 222 | 223 | In addition to the inter-op parallelism, PyTorch can also utilize multiple threads 224 | within the ops (intra-op parallelism). This can be useful in many cases, including 225 | element-wise ops on large tensors, convolutions, GEMMs, embedding lookups and 226 | others. The default value for this setting is the number of CPU cores. Please refer 227 | to [this](https://pytorch.org/docs/stable/notes/cpu_threading_torchscript_inference.html) 228 | document on how to set this parameter properly. 229 | 230 | The section of model config file specifying this parameter will look like: 231 | 232 | ``` 233 | parameters: { 234 | key: "INTRA_OP_THREAD_COUNT" 235 | value: { 236 | string_value:"1" 237 | } 238 | } 239 | ``` 240 | 241 | * Additional Optimizations: Three additional boolean parameters are available to disable 242 | certain Torch optimizations that can sometimes cause latency regressions in models with 243 | complex execution modes and dynamic shapes. If not specified, all are enabled by default. 244 | 245 | `ENABLE_JIT_EXECUTOR` 246 | 247 | `ENABLE_JIT_PROFILING` 248 | 249 | ### Support 250 | 251 | #### Model Instance Group Kind 252 | 253 | The PyTorch backend supports the following kinds of 254 | [Model Instance Groups](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#instance-groups) 255 | where the input tensors are placed as follows: 256 | 257 | * `KIND_GPU`: Inputs are prepared on the GPU device associated with the model 258 | instance. 259 | 260 | * `KIND_CPU`: Inputs are prepared on the CPU. 261 | 262 | * `KIND_MODEL`: Inputs are prepared on the CPU. 
When loading the model, the 263 | backend does not choose the GPU device for the model; instead, it respects the 264 | device(s) specified in the model and uses them as they are during inference. 265 | This is useful when the model internally utilizes multiple GPUs, as demonstrated 266 | in this 267 | [example model](https://github.com/triton-inference-server/server/blob/main/qa/L0_libtorch_instance_group_kind_model/gen_models.py). 268 | If no device is specified in the model, the backend uses the first available 269 | GPU device. This feature is available starting in the 23.06 release. 270 | 271 | ### Important Notes 272 | 273 | * The execution of a PyTorch model on the GPU is asynchronous in nature. See 274 | [here](https://pytorch.org/docs/stable/notes/cuda.html#asynchronous-execution) 275 | for more details. Consequently, an error in PyTorch model execution may 276 | be raised during the next few inference requests to the server. Setting the 277 | environment variable `CUDA_LAUNCH_BLOCKING=1` when launching the server will 278 | help in correctly debugging failing cases by forcing synchronous execution. 279 | * The PyTorch model in such cases may or may not recover from the failed 280 | state, and a restart of the server may be required to continue serving 281 | successfully. 282 | 283 | * PyTorch does not support a Tensor of Strings, but it does support models that 284 | accept a List of Strings as input(s) / produce a List of Strings as output(s). 285 | For these models, Triton allows users to pass String input(s)/receive String 286 | output(s) using the String datatype. As a limitation of using a List instead of 287 | a Tensor for String I/O, only 1-dimensional input(s)/output(s) are supported 288 | for I/O of String type. 289 | 290 | * In a multi-GPU environment, a potential runtime issue can occur when using 291 | [Tracing](https://pytorch.org/docs/stable/generated/torch.jit.trace.html) 292 | to generate a 293 | [TorchScript](https://pytorch.org/docs/stable/jit.html) model. This issue 294 | arises due to a device mismatch between the model instance and the tensor. By 295 | default, Triton creates a single execution instance of the model for each 296 | available GPU. The runtime error occurs when a request is sent to a model 297 | instance with a different GPU device from the one used during the TorchScript 298 | generation process. To address this problem, it is highly recommended to use 299 | [Scripting](https://pytorch.org/docs/stable/generated/torch.jit.script.html#torch.jit.script) 300 | instead of Tracing for model generation in a multi-GPU environment. Scripting 301 | avoids the device mismatch issue and ensures compatibility with different GPUs 302 | when used with Triton. However, if using Tracing is unavoidable, there is a 303 | workaround available: you can explicitly specify the GPU device for the model 304 | instance in the 305 | [model configuration](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#instance-groups) 306 | to ensure that the model instance and the tensors used for inference are 307 | assigned to the same GPU device on which the model was traced. 308 | 309 | # PyTorch 2.0 Backend \[Experimental\] 310 | 311 | > [!WARNING] 312 | > *This feature is subject to change and removal.* 313 | 314 | Starting from 24.01, PyTorch models can be served directly via the 315 | [Python runtime](src/model.py). By default, Triton will use the 316 | [LibTorch runtime](#pytorch-libtorch-backend) for PyTorch models.
To use Python 317 | runtime, provide the following 318 | [runtime setting](https://github.com/triton-inference-server/backend/blob/main/README.md#backend-shared-library) 319 | in the model configuration: 320 | 321 | ``` 322 | runtime: "model.py" 323 | ``` 324 | 325 | ## Dependencies 326 | 327 | ### Python backend dependency 328 | 329 | This feature depends on 330 | [Python backend](https://github.com/triton-inference-server/python_backend), 331 | see 332 | [Python-based Backends](https://github.com/triton-inference-server/backend/blob/main/docs/python_based_backends.md) 333 | for more details. 334 | 335 | ### PyTorch dependency 336 | 337 | This feature will take advantage of the 338 | [`torch.compile`](https://pytorch.org/docs/stable/generated/torch.compile.html#torch-compile) 339 | optimization, make sure the 340 | [PyTorch 2.0+ pip package](https://pypi.org/project/torch) is available in the 341 | same Python environment. 342 | 343 | Alternatively, a [Python Execution Environment](#using-custom-python-execution-environments) 344 | with the PyTorch dependency may be used. It can be created with the 345 | [provided script](tools/gen_pb_exec_env.sh). The resulting 346 | `pb_exec_env_model.py.tar.gz` file should be placed at the same 347 | [backend shared library](https://github.com/triton-inference-server/backend/blob/main/README.md#backend-shared-library) 348 | directory as the [Python runtime](src/model.py). 349 | 350 | ## Model Layout 351 | 352 | ### PyTorch 2.0 models 353 | 354 | The model repository should look like: 355 | 356 | ``` 357 | model_repository/ 358 | `-- model_directory 359 | |-- 1 360 | | |-- model.py 361 | | `-- [model.pt] 362 | `-- config.pbtxt 363 | ``` 364 | 365 | The `model.py` contains the class definition of the PyTorch model. The class 366 | should extend the 367 | [`torch.nn.Module`](https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module). 368 | The `model.pt` may be optionally provided which contains the saved 369 | [`state_dict`](https://pytorch.org/tutorials/beginner/saving_loading_models.html#saving-loading-model-for-inference) 370 | of the model. 371 | 372 | ### TorchScript models 373 | 374 | The model repository should look like: 375 | 376 | ``` 377 | model_repository/ 378 | `-- model_directory 379 | |-- 1 380 | | `-- model.pt 381 | `-- config.pbtxt 382 | ``` 383 | 384 | The `model.pt` is the TorchScript model file. 385 | 386 | ## Customization 387 | 388 | The following PyTorch settings may be customized by setting parameters on the 389 | `config.pbtxt`. 390 | 391 | [`torch.set_num_threads(int)`](https://pytorch.org/docs/stable/generated/torch.set_num_threads.html#torch.set_num_threads) 392 | - Key: NUM_THREADS 393 | - Value: The number of threads used for intraop parallelism on CPU. 394 | 395 | [`torch.set_num_interop_threads(int)`](https://pytorch.org/docs/stable/generated/torch.set_num_interop_threads.html#torch.set_num_interop_threads) 396 | - Key: NUM_INTEROP_THREADS 397 | - Value: The number of threads used for interop parallelism (e.g. in JIT 398 | interpreter) on CPU. 399 | 400 | [`torch.compile()` parameters](https://pytorch.org/docs/stable/generated/torch.compile.html#torch-compile) 401 | - Key: TORCH_COMPILE_OPTIONAL_PARAMETERS 402 | - Value: Any of following parameter(s) encoded as a JSON object. 403 | - fullgraph (*bool*): Whether it is ok to break model into several subgraphs. 404 | - dynamic (*bool*): Use dynamic shape tracing. 405 | - backend (*str*): The backend to be used. 
406 | - mode (*str*): Can be either "default", "reduce-overhead" or "max-autotune". 407 | - options (*dict*): A dictionary of options to pass to the backend. 408 | - disable (*bool*): Turn `torch.compile()` into a no-op for testing. 409 | 410 | For example: 411 | ``` 412 | parameters: { 413 | key: "NUM_THREADS" 414 | value: { string_value: "4" } 415 | } 416 | parameters: { 417 | key: "TORCH_COMPILE_OPTIONAL_PARAMETERS" 418 | value: { string_value: "{\"disable\": true}" } 419 | } 420 | ``` 421 | 422 | ## Limitations 423 | 424 | The following are a few known limitations of this feature: 425 | - Python functions optimizable by `torch.compile` may not be served directly in 426 | the `model.py` file; they need to be enclosed by a class extending the 427 | [`torch.nn.Module`](https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module). 428 | - Model weights cannot be shared across multiple instances on the same GPU 429 | device. 430 | - When using `KIND_MODEL` as the model instance kind, the default device of the 431 | first parameter on the model is used. 432 | -------------------------------------------------------------------------------- /cmake/TritonPyTorchBackendConfig.cmake.in: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
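# This package configuration is consumed via find_package(TritonPyTorchBackend)
# after installation. Illustrative downstream usage (the consuming target name
# is hypothetical):
#
#   find_package(TritonPyTorchBackend REQUIRED)
#   target_link_libraries(my_consumer PRIVATE
#     TritonPyTorchBackend::triton-pytorch-backend)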
26 | 27 | include(CMakeFindDependencyMacro) 28 | 29 | get_filename_component( 30 | TRITONPYTORCHBACKEND_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH 31 | ) 32 | 33 | list(APPEND CMAKE_MODULE_PATH ${TRITONPYTORCHBACKEND_CMAKE_DIR}) 34 | 35 | if(NOT TARGET TritonPyTorchBackend::triton-pytorch-backend) 36 | include("${TRITONPYTORCHBACKEND_CMAKE_DIR}/TritonPyTorchBackendTargets.cmake") 37 | endif() 38 | 39 | set(TRITONPYTORCHBACKEND_LIBRARIES TritonPyTorchBackend::triton-pytorch-backend) 40 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | # Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | [tool.codespell] 28 | # note: pre-commit passes explicit lists of files here, which this skip file list doesn't override - 29 | # this is only to allow you to run codespell interactively 30 | skip = "./.git,./.github" 31 | # ignore short words, and typename parameters like OffsetT 32 | ignore-regex = "\\b(.{1,4}|[A-Z]\\w*T)\\b" 33 | # use the 'clear' dictionary for unambiguous spelling mistakes 34 | builtin = "clear" 35 | # disable warnings about binary files and wrong encoding 36 | quiet-level = 3 37 | 38 | [tool.isort] 39 | profile = "black" 40 | use_parentheses = true 41 | multi_line_output = 3 42 | include_trailing_comma = true 43 | force_grid_wrap = 0 44 | ensure_newline_before_comments = true 45 | line_length = 88 46 | balanced_wrapping = true 47 | indent = " " 48 | skip = ["build"] 49 | 50 | -------------------------------------------------------------------------------- /src/libtorch.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2019-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
2 | // 3 | // Redistribution and use in source and binary forms, with or without 4 | // modification, are permitted provided that the following conditions 5 | // are met: 6 | // * Redistributions of source code must retain the above copyright 7 | // notice, this list of conditions and the following disclaimer. 8 | // * Redistributions in binary form must reproduce the above copyright 9 | // notice, this list of conditions and the following disclaimer in the 10 | // documentation and/or other materials provided with the distribution. 11 | // * Neither the name of NVIDIA CORPORATION nor the names of its 12 | // contributors may be used to endorse or promote products derived 13 | // from this software without specific prior written permission. 14 | // 15 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | #include 28 | 29 | #include 30 | #include 31 | 32 | #include "libtorch_utils.h" 33 | #include "triton/backend/backend_common.h" 34 | #include "triton/backend/backend_input_collector.h" 35 | #include "triton/backend/backend_memory.h" 36 | #include "triton/backend/backend_model.h" 37 | #include "triton/backend/backend_model_instance.h" 38 | #include "triton/backend/backend_output_responder.h" 39 | #include "triton/common/nvtx.h" 40 | #include "triton/core/tritonbackend.h" 41 | 42 | #ifdef TRITON_PYTORCH_ENABLE_TORCHVISION 43 | // Suppress warnings in torch headers 44 | #pragma GCC diagnostic push 45 | #pragma GCC diagnostic ignored "-Wsign-compare" 46 | #pragma warning(push, 0) 47 | #include 48 | #include // Torchvision header 49 | #pragma warning(pop) 50 | #pragma GCC diagnostic pop 51 | #endif // TRITON_PYTORCH_ENABLE_TORCHVISION 52 | 53 | #ifdef TRITON_ENABLE_GPU 54 | #include 55 | #include 56 | #include 57 | #endif // TRITON_ENABLE_GPU 58 | 59 | // for thread control 60 | // https://pytorch.org/docs/stable/notes/cpu_threading_torchscript_inference.html#runtime-api 61 | // https://github.com/pytorch/pytorch/blob/v2.2.1-rc3/aten/src/ATen/Parallel.h#L133 62 | #include 63 | 64 | 65 | // 66 | // PyTorch C++ (LibTorch) Backend that implements the TRITONBACKEND API. 67 | // 68 | 69 | namespace triton { namespace backend { namespace pytorch { 70 | 71 | // 72 | // ModelState 73 | // 74 | // State associated with a model that is using this backend. An object 75 | // of this class is created and associated with each 76 | // TRITONBACKEND_Model. 77 | // 78 | class ModelState : public BackendModel { 79 | public: 80 | static TRITONSERVER_Error* Create( 81 | TRITONBACKEND_Model* triton_model, ModelState** state); 82 | virtual ~ModelState() = default; 83 | 84 | // Load a TorchScript model using 'artifact_name' as the name for the 85 | // TorchScript file. 
Return in 'model_path' the full path to the
86 |   // TorchScript file, return in 'torch_model' the Torch Module
87 |   // representing the model.
88 |   TRITONSERVER_Error* LoadModel(
89 |       const std::string& artifact_name, const torch::Device device,
90 |       std::string* model_path, const TRITONSERVER_InstanceGroupKind& kind,
91 |       std::shared_ptr<torch::jit::Module>* torch_model);
92 | 
93 |   bool EnabledOptimizedExecution() { return enable_optimized_execution_; }
94 |   const std::pair<bool, bool>& EnabledTensorExprFuser() const
95 |   {
96 |     return enable_tensor_fuser_pair_;
97 |   }
98 |   const std::pair<bool, bool>& EnabledJitProfiling() const
99 |   {
100 |     return enable_jit_profiling_pair_;
101 |   }
102 |   const std::pair<bool, bool>& EnabledJitExecutor() const
103 |   {
104 |     return enable_jit_executor_pair_;
105 |   }
106 |   bool EnabledInferenceMode() { return enable_inference_mode_; }
107 |   bool EnabledCudnn() { return enable_cudnn_; }
108 |   bool EnabledCacheCleaning() { return enable_cache_cleaning_; }
109 | 
110 |   bool EnabledWeightSharing() { return enable_weight_sharing_; }
111 |   const std::map<std::string, std::pair<int64_t, int64_t>>& ModelOutputs()
112 |   {
113 |     return model_outputs_;
114 |   }
115 | 
116 |  private:
117 |   ModelState(TRITONBACKEND_Model* triton_model);
118 |   TRITONSERVER_Error* AutoCompleteConfig();
119 | 
120 |   // Parses and validates parameters in config
121 |   TRITONSERVER_Error* ParseParameters();
122 | 
123 |   // Flag to indicate whether optimized execution is enabled. Defaults to true.
124 |   bool enable_optimized_execution_;
125 | 
126 |   // Flag to indicate whether inference mode is enabled. Defaults to true.
127 |   bool enable_inference_mode_;
128 | 
129 |   // Flag to indicate whether cudnn is enabled. Defaults to true.
130 |   bool enable_cudnn_;
131 | 
132 |   // Flag to indicate whether cache cleaning after each run is enabled.
133 |   // Defaults to false.
134 |   bool enable_cache_cleaning_;
135 | 
136 |   // Flag to indicate whether weight sharing is enabled. Defaults to false.
137 |   bool enable_weight_sharing_;
138 | 
139 |   // Flag pairs to indicate if various JIT settings are set and
140 |   // enabled respectively. Defaults to (false, true). Default behavior
141 |   // is to do nothing if not explicitly set.
142 |   std::pair<bool, bool> enable_tensor_fuser_pair_;
143 |   std::pair<bool, bool> enable_jit_profiling_pair_;
144 |   std::pair<bool, bool> enable_jit_executor_pair_;
145 | 
146 |   // Model mapping for shared TorchScript model across all instances on the
147 |   // same device. The key is a pair of isGPU and device index.
148 |   std::map<
149 |       std::pair<bool, int64_t>, std::shared_ptr<torch::jit::Module>>
150 |       torch_models_;
151 | 
152 |   // model_outputs is a map that contains unique outputs that the model must
153 |   // provide. The first element of the mapped pair is the model output index
154 |   // and the second is the index in the model state; -1 is used if one is not
155 |   // required. In the model configuration, the 'state' section can intersect
156 |   // with the 'output' section of the model. If an output is specified in both
157 |   // the output section and the state section, it indicates that the backend
158 |   // must return the output state to the client too.
159 | std::map> model_outputs_; 160 | }; 161 | 162 | TRITONSERVER_Error* 163 | ModelState::Create(TRITONBACKEND_Model* triton_model, ModelState** state) 164 | { 165 | try { 166 | *state = new ModelState(triton_model); 167 | } 168 | catch (const BackendModelException& ex) { 169 | RETURN_ERROR_IF_TRUE( 170 | ex.err_ == nullptr, TRITONSERVER_ERROR_INTERNAL, 171 | std::string("unexpected nullptr in BackendModelException")); 172 | RETURN_IF_ERROR(ex.err_); 173 | } 174 | 175 | // Auto-complete the configuration if requested... 176 | bool auto_complete_config = false; 177 | RETURN_IF_ERROR(TRITONBACKEND_ModelAutoCompleteConfig( 178 | triton_model, &auto_complete_config)); 179 | if (auto_complete_config) { 180 | RETURN_IF_ERROR((*state)->AutoCompleteConfig()); 181 | RETURN_IF_ERROR((*state)->SetModelConfig()); 182 | } 183 | 184 | auto& model_outputs = (*state)->model_outputs_; 185 | // Parse the output states in the model configuration 186 | triton::common::TritonJson::Value sequence_batching; 187 | if ((*state)->ModelConfig().Find("sequence_batching", &sequence_batching)) { 188 | triton::common::TritonJson::Value states; 189 | if (sequence_batching.Find("state", &states)) { 190 | for (size_t i = 0; i < states.ArraySize(); i++) { 191 | triton::common::TritonJson::Value state; 192 | RETURN_IF_ERROR(states.IndexAsObject(i, &state)); 193 | std::string output_state_name; 194 | RETURN_IF_ERROR( 195 | state.MemberAsString("output_name", &output_state_name)); 196 | auto it = model_outputs.find(output_state_name); 197 | if (it == model_outputs.end()) { 198 | model_outputs.insert({output_state_name, std::make_pair(-1, i)}); 199 | } else { 200 | it->second.second = i; 201 | } 202 | } 203 | } 204 | } 205 | 206 | // Parse the output names in the model configuration 207 | triton::common::TritonJson::Value outputs; 208 | RETURN_IF_ERROR((*state)->ModelConfig().MemberAsArray("output", &outputs)); 209 | for (size_t i = 0; i < outputs.ArraySize(); i++) { 210 | triton::common::TritonJson::Value output; 211 | THROW_IF_BACKEND_INSTANCE_ERROR(outputs.IndexAsObject(i, &output)); 212 | 213 | // Use names from ModelConfig by reference since the model 214 | // config will persist longer than this inference execution. 215 | std::string output_name; 216 | THROW_IF_BACKEND_INSTANCE_ERROR( 217 | output.MemberAsString("name", &output_name)); 218 | 219 | auto it = model_outputs.find(output_name); 220 | if (it == model_outputs.end()) { 221 | model_outputs.insert({output_name, std::make_pair(i, -1)}); 222 | } else { 223 | it->second.first = i; 224 | } 225 | } 226 | 227 | RETURN_IF_ERROR((*state)->ParseParameters()); 228 | 229 | return nullptr; // success 230 | } 231 | 232 | ModelState::ModelState(TRITONBACKEND_Model* triton_model) 233 | : BackendModel(triton_model), enable_optimized_execution_(true), 234 | enable_inference_mode_(true), enable_cudnn_(true), 235 | enable_cache_cleaning_(false), enable_weight_sharing_(false), 236 | enable_tensor_fuser_pair_({false, true}), 237 | enable_jit_profiling_pair_({false, true}), 238 | enable_jit_executor_pair_({false, true}) 239 | { 240 | } 241 | 242 | TRITONSERVER_Error* 243 | ModelState::LoadModel( 244 | const std::string& artifact_name, const torch::Device device, 245 | std::string* model_path, const TRITONSERVER_InstanceGroupKind& kind, 246 | std::shared_ptr* torch_model) 247 | { 248 | // Find the TorchScript file that describes the model. If the model 249 | // configuration doesn't have an explicit model file specified then 250 | // use the default name ("model.pt"). 
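  // For orientation, the resolved path follows the usual Triton model
  // repository layout (model and version names below are illustrative,
  // not taken from this repository):
  //
  //   <model-repository>/
  //     my_torchscript_model/
  //       config.pbtxt
  //       1/
  //         model.pt   <- default artifact name when none is configured
  //
  // so 'model_path' resolves to something like
  // <model-repository>/my_torchscript_model/1/model.pt.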
251 | std::string cc_model_filename = artifact_name; 252 | if (cc_model_filename.empty()) { 253 | cc_model_filename = "model.pt"; 254 | } 255 | 256 | *model_path = JoinPath( 257 | {RepositoryPath(), std::to_string(Version()), cc_model_filename}); 258 | 259 | { 260 | bool exists; 261 | RETURN_IF_ERROR(FileExists(*model_path, &exists)); 262 | RETURN_ERROR_IF_FALSE( 263 | exists, TRITONSERVER_ERROR_UNAVAILABLE, 264 | std::string("unable to find '") + *model_path + 265 | "' for model instance '" + Name() + "'"); 266 | } 267 | 268 | // If weight sharing is enabled, skip loading model if 269 | // it is already available on the target device 270 | std::pair device_pair; 271 | if (enable_weight_sharing_) { 272 | device_pair = std::make_pair(!device.is_cpu(), device.index()); 273 | auto mit = torch_models_.find(device_pair); 274 | if (mit != torch_models_.end()) { 275 | *torch_model = mit->second; 276 | LOG_MESSAGE( 277 | TRITONSERVER_LOG_INFO, 278 | (std::string("Reusing TorchScript model for instance '") + Name() + 279 | "'") 280 | .c_str()); 281 | return nullptr; // success 282 | } 283 | } 284 | 285 | // Serialize the torch model to string 286 | std::string model_data_str; 287 | RETURN_IF_ERROR(ReadTextFile(*model_path, &model_data_str)); 288 | 289 | // InferenceMode should be used to guard all tensors operations including 290 | // model loading: https://pytorch.org/cppdocs/notes/inference_mode.html 291 | torch::InferenceMode infer_guard(EnabledInferenceMode()); 292 | 293 | try { 294 | std::istringstream model_stream(model_data_str); 295 | if (kind == TRITONSERVER_INSTANCEGROUPKIND_MODEL) { 296 | // Load the model without selecting a device. 297 | torch_model->reset( 298 | new torch::jit::Module(torch::jit::load(model_stream))); 299 | } else { 300 | torch_model->reset( 301 | new torch::jit::Module(torch::jit::load(model_stream, device))); 302 | } 303 | } 304 | catch (const std::exception& ex) { 305 | return TRITONSERVER_ErrorNew( 306 | TRITONSERVER_ERROR_INTERNAL, 307 | ("failed to load model '" + Name() + "': " + ex.what()).c_str()); 308 | } 309 | 310 | if (enable_weight_sharing_) { 311 | if (!((torch_models_.emplace(device_pair, *torch_model)).second)) { 312 | std::string type = device.is_cpu() ? "CPU" : "GPU"; 313 | LOG_MESSAGE( 314 | TRITONSERVER_LOG_WARN, 315 | (std::string("Model already found on target ") + type + " device " + 316 | "(id " + std::to_string(device.index()) + ") for '" + Name() + "'") 317 | .c_str()); 318 | } 319 | } 320 | 321 | return nullptr; // success 322 | } 323 | 324 | TRITONSERVER_Error* 325 | ModelState::AutoCompleteConfig() 326 | { 327 | // Auto-complete configuration is not supported since PyTorch does not 328 | // store/capture sufficient model metadata so just log error instead. 329 | LOG_MESSAGE( 330 | TRITONSERVER_LOG_WARN, 331 | (std::string("skipping model configuration auto-complete for '") + 332 | Name() + "': not supported for pytorch backend") 333 | .c_str()); 334 | 335 | return nullptr; // success 336 | } 337 | 338 | TRITONSERVER_Error* 339 | ModelState::ParseParameters() 340 | { 341 | triton::common::TritonJson::Value params; 342 | bool status = model_config_.Find("parameters", ¶ms); 343 | if (status) { 344 | // If 'DISABLE_OPTIMIZED_EXECUTION' is not present in 'parameters' then no 345 | // update is made to 'enable_optimized_execution_'. 
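    // As a sketch of how these flags reach the backend (the values shown are
    // illustrative), they are passed as string-valued parameters in the
    // model's config.pbtxt and converted by ParseParameter(), e.g.:
    //
    //   parameters: {
    //     key: "DISABLE_OPTIMIZED_EXECUTION"
    //     value: { string_value: "true" }
    //   }
    //   parameters: {
    //     key: "INFERENCE_MODE"
    //     value: { string_value: "true" }
    //   }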
346 | bool disable_optimized_execution = false; 347 | TRITONSERVER_Error* err = ParseParameter( 348 | params, "DISABLE_OPTIMIZED_EXECUTION", &disable_optimized_execution); 349 | if (err != nullptr) { 350 | if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { 351 | return err; 352 | } else { 353 | TRITONSERVER_ErrorDelete(err); 354 | } 355 | } 356 | enable_optimized_execution_ = !disable_optimized_execution; 357 | 358 | LOG_MESSAGE( 359 | TRITONSERVER_LOG_INFO, 360 | (std::string("Optimized execution is ") + 361 | (enable_optimized_execution_ ? "enabled" : "disabled") + 362 | " for model instance '" + Name() + "'") 363 | .c_str()); 364 | 365 | // If 'ENABLE_CACHE_CLEANING' is not present in 'parameters' then 366 | // no update is made to 'enable_cache_cleaning_'. 367 | err = ParseParameter( 368 | params, "ENABLE_CACHE_CLEANING", &enable_cache_cleaning_); 369 | if (err != nullptr) { 370 | if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { 371 | return err; 372 | } else { 373 | TRITONSERVER_ErrorDelete(err); 374 | } 375 | } 376 | 377 | LOG_MESSAGE( 378 | TRITONSERVER_LOG_INFO, 379 | (std::string("Cache Cleaning is ") + 380 | (enable_cache_cleaning_ ? "enabled" : "disabled") + 381 | " for model instance '" + Name() + "'") 382 | .c_str()); 383 | 384 | // If 'INFERENCE_MODE' is not present in 'parameters' then no update is made 385 | // to 'enable_inference_mode_'. 386 | err = ParseParameter(params, "INFERENCE_MODE", &enable_inference_mode_); 387 | if (err != nullptr) { 388 | if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { 389 | return err; 390 | } else { 391 | TRITONSERVER_ErrorDelete(err); 392 | } 393 | } 394 | LOG_MESSAGE( 395 | TRITONSERVER_LOG_INFO, 396 | (std::string("Inference Mode is ") + 397 | (enable_inference_mode_ ? "enabled" : "disabled") + 398 | " for model instance '" + Name() + "'") 399 | .c_str()); 400 | 401 | // If 'DISABLE_CUDNN' is not present in 'parameters' then no update is made 402 | // to 'enable_cudnn_'. 403 | bool disable_cudnn = false; 404 | err = ParseParameter(params, "DISABLE_CUDNN", &disable_cudnn); 405 | if (err != nullptr) { 406 | if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { 407 | return err; 408 | } else { 409 | TRITONSERVER_ErrorDelete(err); 410 | } 411 | } 412 | enable_cudnn_ = !disable_cudnn; 413 | LOG_MESSAGE( 414 | TRITONSERVER_LOG_INFO, 415 | (std::string("cuDNN is ") + (enable_cudnn_ ? "enabled" : "disabled") + 416 | " for model instance '" + Name() + "'") 417 | .c_str()); 418 | 419 | // If 'ENABLE_TENSOR_FUSER' is not present in 'parameters' then no 420 | // update is made to 'enable_tensor_fuser'. 421 | bool enable_tensor_fuser = false; 422 | err = ParseParameter(params, "ENABLE_TENSOR_FUSER", &enable_tensor_fuser); 423 | if (err != nullptr) { 424 | if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { 425 | return err; 426 | } else { 427 | TRITONSERVER_ErrorDelete(err); 428 | } 429 | } else { 430 | enable_tensor_fuser_pair_ = {true, enable_tensor_fuser}; 431 | LOG_MESSAGE( 432 | TRITONSERVER_LOG_INFO, 433 | (std::string("Tensor fuser is ") + 434 | (enable_tensor_fuser ? "enabled" : "disabled") + 435 | " for model instance '" + Name() + "'") 436 | .c_str()); 437 | } 438 | 439 | // If 'ENABLE_WEIGHT_SHARING' is not present in 'parameters' then no 440 | // update is made to 'enable_weight_sharing'. 
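    // Weight sharing only has an effect when several execution instances are
    // placed on the same device, e.g. with an instance_group such as the
    // illustrative configuration below; both instances then reuse the
    // torch::jit::Module cached in 'torch_models_' instead of loading the
    // TorchScript file twice:
    //
    //   instance_group [
    //     {
    //       count: 2
    //       kind: KIND_GPU
    //       gpus: [ 0 ]
    //     }
    //   ]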
441 | err = ParseParameter( 442 | params, "ENABLE_WEIGHT_SHARING", &enable_weight_sharing_); 443 | if (err != nullptr) { 444 | if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { 445 | return err; 446 | } else { 447 | TRITONSERVER_ErrorDelete(err); 448 | } 449 | } else { 450 | LOG_MESSAGE( 451 | TRITONSERVER_LOG_INFO, 452 | (std::string("Weight sharing is ") + 453 | (enable_weight_sharing_ ? "enabled" : "disabled") + 454 | " for model instance '" + Name() + "'") 455 | .c_str()); 456 | } 457 | 458 | // If 'ENABLE_JIT_PROFILING' is not present in 'parameters' then no update 459 | // is made to 'enable_jit_profiling'. 460 | bool enable_jit_profiling = false; 461 | err = ParseParameter(params, "ENABLE_JIT_PROFILING", &enable_jit_profiling); 462 | if (err != nullptr) { 463 | if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { 464 | return err; 465 | } else { 466 | TRITONSERVER_ErrorDelete(err); 467 | } 468 | } else { 469 | enable_jit_profiling_pair_ = {true, enable_jit_profiling}; 470 | LOG_MESSAGE( 471 | TRITONSERVER_LOG_INFO, 472 | (std::string("Jit profiling is ") + 473 | (enable_jit_profiling ? "enabled" : "disabled") + 474 | " for model instance '" + Name() + "'") 475 | .c_str()); 476 | } 477 | 478 | // If 'ENABLE_JIT_EXECUTOR' is not present in 'parameters' then no update is 479 | // made to 'enable_jit_executor'. 480 | bool enable_jit_executor = false; 481 | err = ParseParameter(params, "ENABLE_JIT_EXECUTOR", &enable_jit_executor); 482 | if (err != nullptr) { 483 | if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { 484 | return err; 485 | } else { 486 | TRITONSERVER_ErrorDelete(err); 487 | } 488 | } else { 489 | enable_jit_executor_pair_ = {true, enable_jit_executor}; 490 | LOG_MESSAGE( 491 | TRITONSERVER_LOG_INFO, 492 | (std::string("Jit executor is ") + 493 | (enable_jit_executor ? 
"enabled" : "disabled") + 494 | " for model instance '" + Name() + "'") 495 | .c_str()); 496 | } 497 | 498 | // If 'INTRA_OP_THREAD_COUNT' is not present in 'parameters' then no update 499 | // is made to 'intra_op_thread_count', which by default will take all 500 | // threads 501 | int intra_op_thread_count = -1; 502 | err = 503 | ParseParameter(params, "INTRA_OP_THREAD_COUNT", &intra_op_thread_count); 504 | if (err != nullptr) { 505 | if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { 506 | return err; 507 | } else { 508 | TRITONSERVER_ErrorDelete(err); 509 | } 510 | } else { 511 | if (intra_op_thread_count > 0) { 512 | at::set_num_threads(intra_op_thread_count); 513 | LOG_MESSAGE( 514 | TRITONSERVER_LOG_INFO, 515 | (std::string("Intra op thread count is set to ") + 516 | std::to_string(intra_op_thread_count) + " for model instance '" + 517 | Name() + "'") 518 | .c_str()); 519 | } 520 | } 521 | 522 | // If 'INTER_OP_THREAD_COUNT' is not present in 'parameters' then no update 523 | // is made to 'inter_op_thread_count', which by default will take all 524 | // threads 525 | int inter_op_thread_count = -1; 526 | err = 527 | ParseParameter(params, "INTER_OP_THREAD_COUNT", &inter_op_thread_count); 528 | if (err != nullptr) { 529 | if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { 530 | return err; 531 | } else { 532 | TRITONSERVER_ErrorDelete(err); 533 | } 534 | } else { 535 | if (inter_op_thread_count > 0) { 536 | at::set_num_interop_threads(inter_op_thread_count); 537 | LOG_MESSAGE( 538 | TRITONSERVER_LOG_INFO, 539 | (std::string("Inter op thread count is set to ") + 540 | std::to_string(inter_op_thread_count) + " for model instance '" + 541 | Name() + "'") 542 | .c_str()); 543 | } 544 | } 545 | } 546 | 547 | return nullptr; 548 | } 549 | 550 | // The naming convention followed for inputs/outputs in the model configuration. 551 | // Outputs don't support FORWARD_ARGUMENT. 552 | enum class NamingConvention { 553 | NAMED_INDEX, 554 | FORWARD_ARGUMENT, 555 | STRICT_CONFIG_ORDERING 556 | }; 557 | 558 | // 559 | // ModelInstanceState 560 | // 561 | // State associated with a model instance. An object of this class is 562 | // created and associated with each TRITONBACKEND_ModelInstance. 563 | // 564 | class ModelInstanceState : public BackendModelInstance { 565 | public: 566 | static TRITONSERVER_Error* Create( 567 | ModelState* model_state, 568 | TRITONBACKEND_ModelInstance* triton_model_instance, 569 | ModelInstanceState** state); 570 | virtual ~ModelInstanceState(); 571 | 572 | // Get the state of the model that corresponds to this instance. 573 | ModelState* StateForModel() const { return model_state_; } 574 | 575 | // Execute... 
576 |   void ProcessRequests(
577 |       TRITONBACKEND_Request** requests, const uint32_t request_count);
578 | 
579 |   // Clear CUDA cache
580 |   void ClearCache();
581 | 
582 |  private:
583 |   ModelInstanceState(
584 |       ModelState* model_state,
585 |       TRITONBACKEND_ModelInstance* triton_model_instance);
586 |   TRITONSERVER_Error* ValidateBooleanSequenceControl(
587 |       triton::common::TritonJson::Value& sequence_batching,
588 |       const std::string& control_kind, bool required, bool* have_control);
589 |   TRITONSERVER_Error* ValidateTypedSequenceControl(
590 |       triton::common::TritonJson::Value& sequence_batching,
591 |       const std::string& control_kind, bool required, bool* have_control);
592 |   TRITONSERVER_Error* ValidateInputs(const size_t expected_input_cnt);
593 |   void AddInputToMap(
594 |       NamingConvention naming_convention,
595 |       const std::vector<std::string> allowed_inputs, const std::string& io_name,
596 |       const uint32_t index);
597 |   TRITONSERVER_Error* ValidateOutputs();
598 |   void Execute(
599 |       std::vector<TRITONBACKEND_Response*>* responses,
600 |       const uint32_t response_count,
601 |       std::vector<torch::jit::IValue>* input_tensors,
602 |       std::vector<torch::jit::IValue>* output_tensors);
603 |   TRITONSERVER_Error* SetInputTensors(
604 |       size_t total_batch_size, TRITONBACKEND_Request** requests,
605 |       const uint32_t request_count,
606 |       std::vector<TRITONBACKEND_Response*>* responses,
607 |       BackendInputCollector* collector, std::vector<const char*>* input_names,
608 |       std::vector<torch::jit::IValue>* input_tensors, bool* cuda_copy);
609 |   TRITONSERVER_Error* ReadOutputTensors(
610 |       size_t total_batch_size,
611 |       const std::vector<torch::jit::IValue>& output_tensors,
612 |       TRITONBACKEND_Request** requests, const uint32_t request_count,
613 |       std::vector<TRITONBACKEND_Response*>* responses);
614 |   TRITONSERVER_Error* RecordBackendTimestamp(
615 |       uint64_t* timestamp, void* cuda_event);
616 | 
617 |   // Get the naming convention for inputs/outputs from the model configuration
618 |   TRITONSERVER_Error* GetNamingConvention(
619 |       NamingConvention* naming_convention,
620 |       const std::vector<std::string>& allowed_io);
621 | 
622 |   // Create CUDA events for statistics collection.
623 |   void CreateCudaEvents(const int32_t& device_id);
624 | 
625 |   // Get the appropriate CUDA stream for input and output handling based on the
626 |   // instance group type.
627 |   cudaStream_t GetCudaStreamByInstanceKind();
628 | 
629 |   // Replace the default CUDA stream with the stream we created to ensure proper
630 |   // cuda stream synchronization.
631 |   void SetCurrentCudaStream(
632 |       const cudaStream_t& stream, const int32_t& device_id);
633 | 
634 |   // Get the elapsed time between two CUDA events.
635 |   float GetCudaEventElapsedTime(
636 |       const cudaEvent_t& start_event, const cudaEvent_t& end_event);
637 | 
638 |   ModelState* model_state_;
639 | 
640 |   // The full path to the TorchScript model file.
641 |   std::string model_path_;
642 | 
643 |   std::shared_ptr<torch::jit::Module> torch_model_;
644 |   torch::Device device_;
645 | 
646 |   // Map from configuration name for an input to the index of
647 |   // that input in the model.
648 |   std::unordered_map<std::string, int> input_index_map_;
649 |   uint32_t batch_input_count_ = 0;
650 | 
651 |   // Map from configuration name for an output to the index of
652 |   // that output in the model.
653 |   std::unordered_map<std::string, int> output_index_map_;
654 |   std::unordered_map<std::string, TRITONSERVER_DataType> output_dtype_map_;
655 | 
656 |   // If the input to the model is a dictionary of tensors.
657 |   bool is_dict_input_;
658 | 
659 |   // If the model supports batching.
660 | bool supports_batching_; 661 | 662 | cudaEvent_t compute_input_start_event_; 663 | cudaEvent_t compute_infer_start_event_; 664 | cudaEvent_t compute_output_start_event_; 665 | 666 | // Store the cuda streams created for the 'KIND_MODEL' instance group. 667 | std::vector stream_vec_; 668 | 669 | // The number of available devices. 670 | int device_cnt_; 671 | }; 672 | 673 | TRITONSERVER_Error* 674 | ModelInstanceState::Create( 675 | ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance, 676 | ModelInstanceState** state) 677 | { 678 | try { 679 | *state = new ModelInstanceState(model_state, triton_model_instance); 680 | } 681 | catch (const BackendModelInstanceException& ex) { 682 | RETURN_ERROR_IF_TRUE( 683 | ex.err_ == nullptr, TRITONSERVER_ERROR_INTERNAL, 684 | std::string("unexpected nullptr in BackendModelInstanceException")); 685 | RETURN_IF_ERROR(ex.err_); 686 | } 687 | 688 | return nullptr; // success 689 | } 690 | 691 | ModelInstanceState::ModelInstanceState( 692 | ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance) 693 | : BackendModelInstance(model_state, triton_model_instance), 694 | model_state_(model_state), device_(torch::kCPU), is_dict_input_(false), 695 | device_cnt_(0) 696 | { 697 | if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) { 698 | #ifdef TRITON_ENABLE_GPU 699 | device_ = torch::Device(torch::kCUDA, DeviceId()); 700 | CreateCudaEvents(DeviceId()); 701 | #endif 702 | } 703 | 704 | #ifdef TRITON_ENABLE_GPU 705 | device_cnt_ = torch::cuda::device_count(); 706 | #endif 707 | 708 | THROW_IF_BACKEND_INSTANCE_ERROR(model_state->LoadModel( 709 | ArtifactFilename(), device_, &model_path_, Kind(), &torch_model_)); 710 | 711 | if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) { 712 | #ifdef TRITON_ENABLE_GPU 713 | // Since we cannot determine the exact devices used by the model, we create 714 | // a CUDA stream for every available device to ensure proper synchronization 715 | // of CUDA streams. This approach may have implications when a timestamp is 716 | // captured on a device that is not used by the model. Currently, this issue 717 | // is addressed by synchronizing the CUDA streams before recording 718 | // timestamps to prevent timestamp skewing. However, in the future, any 719 | // modifications to the CUDA stream synchronization logic should be handled 720 | // with caution. 721 | for (int i = 0; i < device_cnt_; i++) { 722 | cudaStream_t stream; 723 | THROW_IF_BACKEND_INSTANCE_ERROR( 724 | CreateCudaStream(i, 0 /* cuda_stream_priority */, &stream)); 725 | stream_vec_.push_back(stream); 726 | } 727 | if (!stream_vec_.empty()) { 728 | // Create CUDA events on the first device that will be used for collecting 729 | // inputs/outputs. 730 | CreateCudaEvents(0); 731 | } 732 | #endif 733 | } 734 | 735 | size_t expected_input_cnt = 0; 736 | { 737 | triton::common::TritonJson::Value inputs; 738 | if (model_state->ModelConfig().Find("input", &inputs)) { 739 | expected_input_cnt = inputs.ArraySize(); 740 | } 741 | 742 | triton::common::TritonJson::Value config_batch_inputs; 743 | if (model_state->ModelConfig().Find("batch_input", &config_batch_inputs)) { 744 | batch_input_count_ = config_batch_inputs.ArraySize(); 745 | expected_input_cnt += batch_input_count_; 746 | } 747 | } 748 | 749 | // If this is a sequence model then make sure that the required 750 | // inputs are present in the model and have the correct shape and 751 | // datatype. 
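  // For example (tensor names here are hypothetical), a sequence model might
  // expose its control tensors as additional forward() arguments:
  //
  //   CONTROL_SEQUENCE_START  -> "START__2"
  //   CONTROL_SEQUENCE_READY  -> "READY__3"
  //
  // The validation below rejects control tensor names whose suffix after
  // "__" is not an integer index.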
752 | triton::common::TritonJson::Value sequence_batching; 753 | if (model_state->ModelConfig().Find( 754 | "sequence_batching", &sequence_batching)) { 755 | bool have_start, have_end, have_ready, have_corrid; 756 | THROW_IF_BACKEND_INSTANCE_ERROR(ValidateBooleanSequenceControl( 757 | sequence_batching, "CONTROL_SEQUENCE_START", false /* required */, 758 | &have_start)); 759 | THROW_IF_BACKEND_INSTANCE_ERROR(ValidateBooleanSequenceControl( 760 | sequence_batching, "CONTROL_SEQUENCE_END", false /* required */, 761 | &have_end)); 762 | THROW_IF_BACKEND_INSTANCE_ERROR(ValidateBooleanSequenceControl( 763 | sequence_batching, "CONTROL_SEQUENCE_READY", false /* required */, 764 | &have_ready)); 765 | THROW_IF_BACKEND_INSTANCE_ERROR(ValidateTypedSequenceControl( 766 | sequence_batching, "CONTROL_SEQUENCE_CORRID", false /* required */, 767 | &have_corrid)); 768 | if (have_start) { 769 | expected_input_cnt += 1; 770 | } 771 | if (have_end) { 772 | expected_input_cnt += 1; 773 | } 774 | if (have_ready) { 775 | expected_input_cnt += 1; 776 | } 777 | if (have_corrid) { 778 | expected_input_cnt += 1; 779 | } 780 | // Add the state inputs to the expected count 781 | triton::common::TritonJson::Value states; 782 | if (sequence_batching.Find("state", &states)) { 783 | expected_input_cnt += states.ArraySize(); 784 | } 785 | } 786 | supports_batching_ = model_state_->MaxBatchSize() > 0; 787 | 788 | THROW_IF_BACKEND_INSTANCE_ERROR(ValidateInputs(expected_input_cnt)); 789 | THROW_IF_BACKEND_INSTANCE_ERROR(ValidateOutputs()); 790 | } 791 | 792 | void 793 | ModelInstanceState::ClearCache() 794 | { 795 | #ifdef TRITON_ENABLE_GPU 796 | if (device_.is_cuda() || 797 | ((Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) && (device_cnt_ > 0))) { 798 | c10::cuda::CUDACachingAllocator::emptyCache(); 799 | } 800 | #endif // TRITON_ENABLE_GPU 801 | } 802 | 803 | ModelInstanceState::~ModelInstanceState() 804 | { 805 | torch_model_.reset(); 806 | ClearCache(); 807 | 808 | if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) { 809 | #ifdef TRITON_ENABLE_GPU 810 | for (size_t i = 0; i < stream_vec_.size(); i++) { 811 | LOG_IF_ERROR( 812 | ConvertCUDAStatusToTritonError( 813 | cudaSetDevice(i), TRITONSERVER_ERROR_INTERNAL, 814 | "Failed to set the device"), 815 | "Failed to set the device"); 816 | 817 | LOG_IF_ERROR( 818 | ConvertCUDAStatusToTritonError( 819 | cudaStreamDestroy(stream_vec_[i]), TRITONSERVER_ERROR_INTERNAL, 820 | "Failed to destroy cuda stream"), 821 | "~ModelInstanceState error: "); 822 | stream_vec_[i] = nullptr; 823 | } 824 | #endif 825 | } 826 | } 827 | 828 | TRITONSERVER_Error* 829 | ModelInstanceState::ValidateBooleanSequenceControl( 830 | triton::common::TritonJson::Value& sequence_batching, 831 | const std::string& control_kind, bool required, bool* have_control) 832 | { 833 | std::string tensor_name; 834 | std::string tensor_datatype; 835 | RETURN_IF_ERROR(GetBooleanSequenceControlProperties( 836 | sequence_batching, model_state_->Name(), control_kind, required, 837 | &tensor_name, &tensor_datatype, nullptr, nullptr, nullptr, nullptr, 838 | nullptr, nullptr)); 839 | *have_control = !tensor_name.empty(); 840 | if (*have_control) { 841 | std::string deliminator = "__"; 842 | int ip_index = 0; 843 | int start_pos = tensor_name.find(deliminator); 844 | if (start_pos == -1) { 845 | return TRITONSERVER_ErrorNew( 846 | TRITONSERVER_ERROR_INTERNAL, 847 | ("input '" + tensor_name + 848 | "' does not follow __ naming convention.") 849 | .c_str()); 850 | } 851 | 852 | // check if the index part of the name is not 
an integer 853 | std::string index_str = tensor_name.substr(start_pos + 2); 854 | for (auto itr = index_str.begin(); itr != index_str.end(); itr++) { 855 | if (std::isdigit(*itr) == 0) { 856 | return TRITONSERVER_ErrorNew( 857 | TRITONSERVER_ERROR_INTERNAL, 858 | ("input '" + tensor_name + 859 | "' does not follow __ naming convention.") 860 | .c_str()); 861 | } 862 | } 863 | 864 | ip_index = std::atoi(tensor_name.substr(start_pos + 2).c_str()); 865 | input_index_map_[tensor_name] = ip_index; 866 | } 867 | 868 | return nullptr; // success 869 | } 870 | 871 | TRITONSERVER_Error* 872 | ModelInstanceState::ValidateTypedSequenceControl( 873 | triton::common::TritonJson::Value& sequence_batching, 874 | const std::string& control_kind, bool required, bool* have_control) 875 | { 876 | std::string tensor_name; 877 | std::string tensor_datatype; 878 | RETURN_IF_ERROR(GetTypedSequenceControlProperties( 879 | sequence_batching, model_state_->Name(), control_kind, required, 880 | &tensor_name, &tensor_datatype)); 881 | *have_control = !tensor_name.empty(); 882 | if (*have_control) { 883 | std::string deliminator = "__"; 884 | int ip_index = 0; 885 | int start_pos = tensor_name.find(deliminator); 886 | if (start_pos == -1) { 887 | return TRITONSERVER_ErrorNew( 888 | TRITONSERVER_ERROR_INTERNAL, 889 | ("input '" + tensor_name + 890 | "' does not follow __ naming convention.") 891 | .c_str()); 892 | } 893 | 894 | // check if the index part of the name is not an integer 895 | std::string index_str = tensor_name.substr(start_pos + 2); 896 | for (auto itr = index_str.begin(); itr != index_str.end(); itr++) { 897 | if (std::isdigit(*itr) == 0) { 898 | return TRITONSERVER_ErrorNew( 899 | TRITONSERVER_ERROR_INTERNAL, 900 | ("input '" + tensor_name + 901 | "' does not follow __ naming convention.") 902 | .c_str()); 903 | } 904 | } 905 | 906 | // check if the data type is supported by PyTorch 907 | if (!ModelConfigDataTypeToTorchType(tensor_datatype).first) { 908 | return TRITONSERVER_ErrorNew( 909 | TRITONSERVER_ERROR_INTERNAL, 910 | ("input '" + tensor_name + "' type '" + tensor_datatype + 911 | "' is not supported by PyTorch.") 912 | .c_str()); 913 | } 914 | 915 | ip_index = std::atoi(tensor_name.substr(start_pos + 2).c_str()); 916 | input_index_map_[tensor_name] = ip_index; 917 | } 918 | 919 | return nullptr; // success 920 | } 921 | 922 | void 923 | ModelInstanceState::AddInputToMap( 924 | NamingConvention naming_convention, 925 | const std::vector allowed_inputs, const std::string& io_name, 926 | const uint32_t index) 927 | { 928 | std::string deliminator = "__"; 929 | 930 | if (is_dict_input_) { 931 | // If dictionary, index is irrelevant but we use the map to store the 932 | // input names since they are the keys for the dictionary 933 | input_index_map_[io_name] = index; 934 | } else { 935 | switch (naming_convention) { 936 | case NamingConvention::FORWARD_ARGUMENT: { 937 | auto itr = 938 | std::find(allowed_inputs.begin(), allowed_inputs.end(), io_name); 939 | if (itr != allowed_inputs.end()) { 940 | input_index_map_[io_name] = 941 | std::distance(allowed_inputs.begin(), itr); 942 | } 943 | return; 944 | } 945 | case NamingConvention::NAMED_INDEX: { 946 | int start_pos = io_name.find(deliminator); 947 | int ip_index = std::atoi(io_name.substr(start_pos + 2).c_str()); 948 | input_index_map_[io_name] = ip_index; 949 | return; 950 | } 951 | case NamingConvention::STRICT_CONFIG_ORDERING: { 952 | input_index_map_[io_name] = index; 953 | return; 954 | } 955 | } 956 | } 957 | } 958 | 959 | TRITONSERVER_Error* 
960 | ModelInstanceState::ValidateInputs(const size_t expected_input_cnt)
961 | {
962 |   // Collect all the expected input tensor names and validate that the model
963 |   // configuration specifies only those.
964 |   std::vector<std::string> allowed_inputs;
965 | 
966 |   const torch::jit::Method& method = torch_model_->get_method("forward");
967 |   const auto& schema = method.function().getSchema();
968 |   const std::vector<c10::Argument>& arguments = schema.arguments();
969 | 
970 |   // Currently, only models with a single input of type Dict(str, Tensor) are
971 |   // supported. If the model expects more than one input then they must all
972 |   // be of type Tensor.
973 |   //
974 |   // Ignore the argument at idx 0 if it is of Class type (self param in forward
975 |   // function)
976 |   size_t start_idx = 0;
977 |   if ((arguments.size() > 0) &&
978 |       (arguments.at(0).type()->kind() == c10::TypeKind::ClassType)) {
979 |     start_idx = 1;
980 |   }
981 |   if ((arguments.size() == (1 + start_idx)) &&
982 |       (arguments.at(start_idx).type()->kind() == c10::TypeKind::DictType)) {
983 |     is_dict_input_ = true;
984 |   } else if (arguments.size() > start_idx) {
985 |     // Return error if multiple inputs are of kind DictType
986 |     for (size_t i = start_idx + 1; i < arguments.size(); i++) {
987 |       if (arguments.at(i).type()->kind() == c10::TypeKind::DictType) {
988 |         return TRITONSERVER_ErrorNew(
989 |             TRITONSERVER_ERROR_INTERNAL,
990 |             "Multiple inputs of kind DictType were detected. Only a single "
991 |             "input of type Dict(str, Tensor) is supported.");
992 |       }
993 |     }
994 | 
995 |     // Return error if any input is not of type Tensor or List
996 |     for (size_t i = start_idx; i < arguments.size(); i++) {
997 |       if ((arguments.at(i).type()->kind() != c10::TypeKind::TensorType) &&
998 |           (arguments.at(i).type()->kind() != c10::TypeKind::ListType)) {
999 |         return TRITONSERVER_ErrorNew(
1000 |             TRITONSERVER_ERROR_INTERNAL,
1001 |             (std::string("An input of type '") + arguments.at(i).type()->str() +
1002 |                 "' was detected in the model.
Only a single input of type " 1003 | "Dict(str, Tensor) or input(s) of type Tensor are supported.") 1004 | .c_str()); 1005 | } 1006 | allowed_inputs.emplace_back(arguments.at(i).name()); 1007 | } 1008 | 1009 | // If all inputs are tensors, match number of expected inputs between model 1010 | // and configuration 1011 | if ((arguments.size() - start_idx) != expected_input_cnt) { 1012 | return TRITONSERVER_ErrorNew( 1013 | TRITONSERVER_ERROR_INVALID_ARG, 1014 | (std::string("unable to load model '") + model_state_->Name() + 1015 | "', configuration expects " + std::to_string(expected_input_cnt) + 1016 | " inputs, model provides " + 1017 | std::to_string(arguments.size() - start_idx)) 1018 | .c_str()); 1019 | } 1020 | } 1021 | 1022 | triton::common::TritonJson::Value ios; 1023 | RETURN_IF_ERROR(model_state_->ModelConfig().MemberAsArray("input", &ios)); 1024 | 1025 | if (ios.ArraySize() == 0) { 1026 | return TRITONSERVER_ErrorNew( 1027 | TRITONSERVER_ERROR_INTERNAL, 1028 | "model configuration must contain at least one input, none were " 1029 | "specified."); 1030 | } 1031 | 1032 | NamingConvention naming_convention; 1033 | RETURN_IF_ERROR(GetNamingConvention(&naming_convention, allowed_inputs)); 1034 | 1035 | for (size_t i = 0; i < ios.ArraySize(); i++) { 1036 | triton::common::TritonJson::Value io; 1037 | RETURN_IF_ERROR(ios.IndexAsObject(i, &io)); 1038 | 1039 | // Validate name 1040 | std::string io_name; 1041 | RETURN_IF_ERROR(io.MemberAsString("name", &io_name)); 1042 | AddInputToMap(naming_convention, allowed_inputs, io_name, i); 1043 | // Validate data type 1044 | std::string io_dtype; 1045 | RETURN_IF_ERROR(io.MemberAsString("data_type", &io_dtype)); 1046 | const auto pr = ModelConfigDataTypeToTorchType(io_dtype); 1047 | if (!pr.first && (io_dtype != "TYPE_STRING")) { 1048 | return TRITONSERVER_ErrorNew( 1049 | TRITONSERVER_ERROR_INTERNAL, 1050 | ("unsupported datatype " + io_dtype + " for input '" + io_name + 1051 | "' for model '" + model_state_->Name() + "'") 1052 | .c_str()); 1053 | } 1054 | 1055 | // Validate shape for String inputs. Only allow 1 dimension. 1056 | if (io_dtype == "TYPE_STRING") { 1057 | // If a reshape is provided for the input then use that when 1058 | // validating the model shapes. 1059 | std::vector dims; 1060 | triton::common::TritonJson::Value reshape; 1061 | if (io.Find("reshape", &reshape)) { 1062 | RETURN_IF_ERROR(ParseShape(reshape, "shape", &dims)); 1063 | } else { 1064 | RETURN_IF_ERROR(ParseShape(io, "dims", &dims)); 1065 | } 1066 | 1067 | if ((dims.size() + (supports_batching_ ? 
1 : 0)) > 1) { 1068 | return TRITONSERVER_ErrorNew( 1069 | TRITONSERVER_ERROR_INTERNAL, 1070 | ("Triton only supports 1 dimensional List of String as input for " 1071 | "'" + 1072 | std::string(io_name) + "' for model '" + model_state_->Name() + 1073 | "'") 1074 | .c_str()); 1075 | } 1076 | } 1077 | } 1078 | triton::common::TritonJson::Value sequence_batching; 1079 | if (model_state_->ModelConfig().Find( 1080 | "sequence_batching", &sequence_batching)) { 1081 | triton::common::TritonJson::Value states; 1082 | if (sequence_batching.Find("state", &states)) { 1083 | for (size_t i = 0; i < states.ArraySize(); i++) { 1084 | triton::common::TritonJson::Value state; 1085 | RETURN_IF_ERROR(states.IndexAsObject(i, &state)); 1086 | std::string state_name; 1087 | RETURN_IF_ERROR(state.MemberAsString("input_name", &state_name)); 1088 | AddInputToMap(naming_convention, allowed_inputs, state_name, i); 1089 | 1090 | // Validate data type 1091 | std::string state_dtype; 1092 | RETURN_IF_ERROR(state.MemberAsString("data_type", &state_dtype)); 1093 | const auto pr = ModelConfigDataTypeToTorchType(state_dtype); 1094 | if (!pr.first && (state_dtype != "TYPE_STRING")) { 1095 | return TRITONSERVER_ErrorNew( 1096 | TRITONSERVER_ERROR_INTERNAL, 1097 | ("unsupported datatype " + state_dtype + " for input state '" + 1098 | state_name + "' for model '" + model_state_->Name() + "'") 1099 | .c_str()); 1100 | } 1101 | 1102 | // Validate shape for String inputs. Only allow 1 dimension. 1103 | if (state_dtype == "TYPE_STRING") { 1104 | std::vector dims; 1105 | if ((dims.size() + (supports_batching_ ? 1 : 0)) > 1) { 1106 | return TRITONSERVER_ErrorNew( 1107 | TRITONSERVER_ERROR_INTERNAL, 1108 | ("Triton only supports 1 dimensional List of String as input " 1109 | "for " 1110 | "'" + 1111 | std::string(state_name) + "' for model '" + 1112 | model_state_->Name() + "'") 1113 | .c_str()); 1114 | } 1115 | } 1116 | } 1117 | } 1118 | } 1119 | 1120 | triton::common::TritonJson::Value batch_inputs; 1121 | RETURN_IF_ERROR( 1122 | model_state_->ModelConfig().MemberAsArray("batch_input", &batch_inputs)); 1123 | size_t i = 0; 1124 | for (const auto& batch_input : StateForModel()->BatchInputs()) { 1125 | for (const auto& input_name : batch_input.TargetNames()) { 1126 | AddInputToMap( 1127 | naming_convention, allowed_inputs, input_name, i + ios.ArraySize()); 1128 | i++; 1129 | } 1130 | } 1131 | 1132 | return nullptr; // success 1133 | } 1134 | 1135 | TRITONSERVER_Error* 1136 | ModelInstanceState::ValidateOutputs() 1137 | { 1138 | triton::common::TritonJson::Value ios; 1139 | RETURN_IF_ERROR(model_state_->ModelConfig().MemberAsArray("output", &ios)); 1140 | std::string deliminator = "__"; 1141 | int op_index = 0; 1142 | 1143 | if (ios.ArraySize() == 0) { 1144 | return TRITONSERVER_ErrorNew( 1145 | TRITONSERVER_ERROR_INTERNAL, 1146 | "model configuration must contain at least one output, none were " 1147 | "specified."); 1148 | } 1149 | 1150 | NamingConvention naming_convention; 1151 | RETURN_IF_ERROR(GetNamingConvention(&naming_convention, {})); 1152 | 1153 | for (size_t i = 0; i < ios.ArraySize(); i++) { 1154 | triton::common::TritonJson::Value io; 1155 | RETURN_IF_ERROR(ios.IndexAsObject(i, &io)); 1156 | 1157 | // Validate name 1158 | std::string io_name; 1159 | RETURN_IF_ERROR(io.MemberAsString("name", &io_name)); 1160 | switch (naming_convention) { 1161 | case NamingConvention::NAMED_INDEX: { 1162 | int start_pos = io_name.find(deliminator); 1163 | op_index = std::atoi(io_name.substr(start_pos + 2).c_str()); 1164 | break; 1165 | } 
1166 | case NamingConvention::STRICT_CONFIG_ORDERING: { 1167 | op_index = i; 1168 | break; 1169 | } 1170 | default: 1171 | break; 1172 | } 1173 | 1174 | // Validate data type 1175 | std::string io_dtype; 1176 | RETURN_IF_ERROR(io.MemberAsString("data_type", &io_dtype)); 1177 | const auto pr = ModelConfigDataTypeToTorchType(io_dtype); 1178 | if (!pr.first && (io_dtype != "TYPE_STRING")) { 1179 | return TRITONSERVER_ErrorNew( 1180 | TRITONSERVER_ERROR_INTERNAL, 1181 | ("unsupported datatype " + io_dtype + " for output '" + io_name + 1182 | "' for model '" + model_state_->Name() + "'") 1183 | .c_str()); 1184 | } 1185 | 1186 | // Validate shape for String outputs. Only allow 1 dimension. 1187 | if (io_dtype == "TYPE_STRING") { 1188 | // If a reshape is provided for the output then use that when 1189 | // validating the model shapes. 1190 | std::vector dims; 1191 | triton::common::TritonJson::Value reshape; 1192 | if (io.Find("reshape", &reshape)) { 1193 | RETURN_IF_ERROR(ParseShape(reshape, "shape", &dims)); 1194 | } else { 1195 | RETURN_IF_ERROR(ParseShape(io, "dims", &dims)); 1196 | } 1197 | 1198 | if ((dims.size() + (supports_batching_ ? 1 : 0)) > 1) { 1199 | return TRITONSERVER_ErrorNew( 1200 | TRITONSERVER_ERROR_INTERNAL, 1201 | ("Triton only supports 1 dimensional List of String as output for " 1202 | "'" + 1203 | std::string(io_name) + "' for model '" + model_state_->Name() + 1204 | "'") 1205 | .c_str()); 1206 | } 1207 | } 1208 | 1209 | output_index_map_[io_name] = op_index; 1210 | output_dtype_map_[io_name] = ConvertTorchTypeToDataType(pr.second); 1211 | } 1212 | 1213 | triton::common::TritonJson::Value sequence_batching; 1214 | if (model_state_->ModelConfig().Find( 1215 | "sequence_batching", &sequence_batching)) { 1216 | triton::common::TritonJson::Value states; 1217 | if (sequence_batching.Find("state", &states)) { 1218 | for (size_t i = 0; i < states.ArraySize(); i++) { 1219 | triton::common::TritonJson::Value state; 1220 | RETURN_IF_ERROR(states.IndexAsObject(i, &state)); 1221 | std::string state_name; 1222 | RETURN_IF_ERROR(state.MemberAsString("output_name", &state_name)); 1223 | std::string state_dtype; 1224 | RETURN_IF_ERROR(state.MemberAsString("data_type", &state_dtype)); 1225 | std::vector dims; 1226 | RETURN_IF_ERROR(ParseShape(state, "dims", &dims)); 1227 | 1228 | // For state, naming convention is enforced to be NAMED_INDEX 1229 | int start_pos = state_name.find(deliminator); 1230 | op_index = std::atoi(state_name.substr(start_pos + 2).c_str()); 1231 | 1232 | const auto pr = ModelConfigDataTypeToTorchType(state_dtype); 1233 | if (!pr.first && (state_dtype != "TYPE_STRING")) { 1234 | return TRITONSERVER_ErrorNew( 1235 | TRITONSERVER_ERROR_INTERNAL, 1236 | ("unsupported datatype " + state_dtype + " for state '" + 1237 | state_name + "' for model '" + model_state_->Name() + "'") 1238 | .c_str()); 1239 | } 1240 | 1241 | // Validate shape for String outputs. Only allow 1 dimension. 1242 | if (state_dtype == "TYPE_STRING") { 1243 | if ((dims.size() + (supports_batching_ ? 
1 : 0)) > 1) { 1244 | return TRITONSERVER_ErrorNew( 1245 | TRITONSERVER_ERROR_INTERNAL, 1246 | ("Triton only supports 1 dimensional List of String as output " 1247 | "for " 1248 | "'" + 1249 | std::string(state_name) + "' for model '" + 1250 | model_state_->Name() + "'") 1251 | .c_str()); 1252 | } 1253 | } 1254 | 1255 | output_index_map_[state_name] = op_index; 1256 | output_dtype_map_[state_name] = ConvertTorchTypeToDataType(pr.second); 1257 | } 1258 | } 1259 | } 1260 | 1261 | return nullptr; // success 1262 | } 1263 | 1264 | void 1265 | ModelInstanceState::ProcessRequests( 1266 | TRITONBACKEND_Request** requests, const uint32_t request_count) 1267 | { 1268 | LOG_MESSAGE( 1269 | TRITONSERVER_LOG_VERBOSE, 1270 | (std::string("TRITONBACKEND_ModelExecute: Running ") + Name() + " with " + 1271 | std::to_string(request_count) + " requests") 1272 | .c_str()); 1273 | 1274 | #ifdef TRITON_ENABLE_GPU 1275 | if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) { 1276 | SetCurrentCudaStream(stream_, DeviceId()); 1277 | } else if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) { 1278 | // Replace the default stream of each device with the one we created. 1279 | for (size_t i = 0; i < stream_vec_.size(); i++) { 1280 | SetCurrentCudaStream(stream_vec_[i], i); 1281 | } 1282 | } 1283 | #endif 1284 | 1285 | NVTX_RANGE(nvtx_, "ProcessRequests " + Name()); 1286 | 1287 | uint64_t exec_start_ns = 0; 1288 | SET_TIMESTAMP(exec_start_ns); 1289 | 1290 | const int max_batch_size = model_state_->MaxBatchSize(); 1291 | 1292 | // For each request collect the total batch size for this inference 1293 | // execution. The batch-size, number of inputs, and size of each 1294 | // input has already been checked so don't need to do that here. 1295 | size_t total_batch_size = 0; 1296 | for (size_t i = 0; i < request_count; i++) { 1297 | // If we get a nullptr request then something is badly wrong. Fail 1298 | // and release all requests. 1299 | if (requests[i] == nullptr) { 1300 | RequestsRespondWithError( 1301 | requests, request_count, 1302 | TRITONSERVER_ErrorNew( 1303 | TRITONSERVER_ERROR_INTERNAL, 1304 | std::string( 1305 | "null request given to PyTorch backend for '" + Name() + "'") 1306 | .c_str())); 1307 | return; 1308 | } 1309 | } 1310 | 1311 | // At this point we are committed to running inference with all 1312 | // 'requests'. Create a response for each request. During input 1313 | // processing if there is an error with any request that error will 1314 | // be sent immediately with the corresponding response (and the 1315 | // response unique_ptr will then be nullptr). The request object 1316 | // itself will not be released until after all inferencing is done 1317 | // (below) as we may need to access the request object when 1318 | // determine how to process outputs (for example, even if we don't 1319 | // need the outputs for a request that has an error, we do need to 1320 | // know the size of those outputs associated with the request so we 1321 | // can skip them in the output tensors). 
1322 | std::vector responses; 1323 | responses.reserve(request_count); 1324 | bool all_response_failed = false; 1325 | 1326 | for (size_t i = 0; i < request_count; i++) { 1327 | TRITONBACKEND_Response* response; 1328 | auto err = TRITONBACKEND_ResponseNew(&response, requests[i]); 1329 | if (err == nullptr) { 1330 | responses.emplace_back(response); 1331 | } else { 1332 | responses.emplace_back(nullptr); 1333 | LOG_MESSAGE(TRITONSERVER_LOG_ERROR, "Fail to create response"); 1334 | TRITONSERVER_ErrorDelete(err); 1335 | } 1336 | } 1337 | 1338 | for (size_t i = 0; i < request_count; i++) { 1339 | if (max_batch_size > 0) { 1340 | // Retrieve the batch size from one of the inputs, if the model 1341 | // supports batching, the first dimension size is batch size. 1342 | TRITONBACKEND_Input* input; 1343 | TRITONSERVER_Error* err = 1344 | TRITONBACKEND_RequestInputByIndex(requests[i], 0 /* index */, &input); 1345 | if (err == nullptr) { 1346 | const int64_t* shape; 1347 | err = TRITONBACKEND_InputProperties( 1348 | input, nullptr, nullptr, &shape, nullptr, nullptr, nullptr); 1349 | total_batch_size += shape[0]; 1350 | } 1351 | if (err != nullptr) { 1352 | RESPOND_ALL_AND_SET_TRUE_IF_ERROR( 1353 | responses, request_count, all_response_failed, err); 1354 | } 1355 | } else { 1356 | total_batch_size += 1; 1357 | } 1358 | } 1359 | 1360 | // If there are no valid payloads then no need to run the inference. 1361 | if (total_batch_size == 0) { 1362 | return; 1363 | } 1364 | 1365 | // Make sure the maximum batch size is not exceeded. The 1366 | // total_batch_size must be 1 for models that don't support batching 1367 | // (i.e. max_batch_size == 0). If max_batch_size is exceeded then 1368 | // scheduler has done something badly wrong so fail and release all 1369 | // requests. 1370 | if (!all_response_failed) { 1371 | if ((total_batch_size != 1) && 1372 | (total_batch_size > (size_t)max_batch_size)) { 1373 | RESPOND_ALL_AND_SET_TRUE_IF_ERROR( 1374 | responses, request_count, all_response_failed, 1375 | TRITONSERVER_ErrorNew( 1376 | TRITONSERVER_ERROR_INTERNAL, 1377 | std::string( 1378 | "batch size " + std::to_string(total_batch_size) + " for '" + 1379 | Name() + "', max allowed is " + 1380 | std::to_string(max_batch_size)) 1381 | .c_str())); 1382 | } 1383 | } 1384 | 1385 | std::vector input_names; 1386 | std::vector input_tensors; 1387 | bool cuda_copy = false; 1388 | std::unique_ptr collector; 1389 | 1390 | // For 'KIND_MODEL', it's fine to use CUDA events to calculate the compute 1391 | // input duration since only one stream will be used for input collection. 
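  // Sketch of the event-based timing used below (simplified): an event is
  // recorded on the collection stream before gathering inputs, before
  // running inference, and before reading outputs; the elapsed time between
  // two events is measured in milliseconds and later scaled to nanoseconds
  // for the statistics report, roughly:
  //
  //   float ms = 0.f;
  //   cudaEventElapsedTime(&ms, start_event, end_event);  // milliseconds
  //   uint64_t ns = static_cast<uint64_t>(ms * 1e6);      // -> nanoseconds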
1392 | if ((Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) || 1393 | ((Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) && (device_cnt_ > 0))) { 1394 | #ifdef TRITON_ENABLE_GPU 1395 | RESPOND_ALL_AND_SET_TRUE_IF_ERROR( 1396 | responses, request_count, all_response_failed, 1397 | ConvertCUDAStatusToTritonError( 1398 | cudaEventRecord( 1399 | compute_input_start_event_, GetCudaStreamByInstanceKind()), 1400 | TRITONSERVER_ERROR_INTERNAL, "Failed to record the event.")); 1401 | #endif 1402 | } 1403 | 1404 | if (!all_response_failed) { 1405 | collector.reset(new BackendInputCollector( 1406 | requests, request_count, &responses, 1407 | model_state_->TritonMemoryManager(), model_state_->EnablePinnedInput(), 1408 | GetCudaStreamByInstanceKind(), nullptr, nullptr, 0, 1409 | HostPolicyName().c_str())); 1410 | RESPOND_ALL_AND_SET_TRUE_IF_ERROR( 1411 | responses, request_count, all_response_failed, 1412 | SetInputTensors( 1413 | total_batch_size, requests, request_count, &responses, 1414 | collector.get(), &input_names, &input_tensors, &cuda_copy)); 1415 | } 1416 | 1417 | #ifdef TRITON_ENABLE_GPU 1418 | if (cuda_copy) { 1419 | cudaStreamSynchronize(GetCudaStreamByInstanceKind()); 1420 | cuda_copy = false; 1421 | } 1422 | #endif 1423 | 1424 | std::vector output_tensors; 1425 | uint64_t compute_start_ns = 0; 1426 | uint64_t compute_infer_start = 0; 1427 | 1428 | RESPOND_ALL_AND_SET_TRUE_IF_ERROR( 1429 | responses, request_count, all_response_failed, 1430 | RecordBackendTimestamp( 1431 | &compute_start_ns, 1432 | reinterpret_cast(&compute_infer_start_event_))); 1433 | 1434 | // For 'KIND_MODEL', capture the timestamp for the compute infer duration. 1435 | if ((Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) && (device_cnt_ > 0)) { 1436 | SET_TIMESTAMP(compute_infer_start); 1437 | } 1438 | 1439 | // Run... 1440 | if (!all_response_failed) { 1441 | Execute(&responses, request_count, &input_tensors, &output_tensors); 1442 | } 1443 | 1444 | // Verify output indices are valid with number of outputs after execution 1445 | bool invalid_index = false; 1446 | int max_index = output_tensors.size() - 1; 1447 | 1448 | if (!all_response_failed) { 1449 | for (const auto& name : model_state_->ModelOutputs()) { 1450 | int op_index = output_index_map_[name.first]; 1451 | if ((op_index < 0) || (op_index > max_index)) { 1452 | RESPOND_ALL_AND_SET_TRUE_IF_ERROR( 1453 | responses, request_count, all_response_failed, 1454 | TRITONSERVER_ErrorNew( 1455 | TRITONSERVER_ERROR_INVALID_ARG, 1456 | std::string( 1457 | "The output " + std::string(name.first) + 1458 | " in the model configuration refers to an output index " 1459 | "which doesn't exist. This model has " + 1460 | std::to_string(max_index + 1) + " outputs") 1461 | .c_str())); 1462 | invalid_index = true; 1463 | break; 1464 | } 1465 | } 1466 | } 1467 | 1468 | #ifdef TRITON_ENABLE_GPU 1469 | if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) { 1470 | // For 'KIND_MODEL', multiple streams will be involved, so we need to call 1471 | // 'cudaStreamSynchronize' before reading the output tensors. 
1472 | for (auto& stream : stream_vec_) { 1473 | cudaStreamSynchronize(stream); 1474 | } 1475 | } 1476 | #endif 1477 | 1478 | uint64_t compute_end_ns = 0; 1479 | uint64_t compute_output_start = 0; 1480 | 1481 | if ((Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) && (device_cnt_ > 0)) { 1482 | #ifdef TRITON_ENABLE_GPU 1483 | SET_TIMESTAMP(compute_output_start); 1484 | #endif 1485 | } else { 1486 | RESPOND_ALL_AND_SET_TRUE_IF_ERROR( 1487 | responses, request_count, all_response_failed, 1488 | RecordBackendTimestamp( 1489 | &compute_end_ns, 1490 | reinterpret_cast(&compute_output_start_event_))); 1491 | } 1492 | 1493 | if (!all_response_failed) { 1494 | if (!invalid_index) { 1495 | RESPOND_ALL_AND_SET_TRUE_IF_ERROR( 1496 | responses, request_count, all_response_failed, 1497 | ReadOutputTensors( 1498 | total_batch_size, output_tensors, requests, request_count, 1499 | &responses)); 1500 | } 1501 | } 1502 | 1503 | uint64_t exec_end_ns = 0; 1504 | SET_TIMESTAMP(exec_end_ns); 1505 | 1506 | // Send all the responses that haven't already been sent because of 1507 | // an earlier error. Note that the responses are not set to nullptr 1508 | // here as we need that indication below to determine if the request 1509 | // we successful or not. 1510 | for (auto& response : responses) { 1511 | if (response != nullptr) { 1512 | LOG_IF_ERROR( 1513 | TRITONBACKEND_ResponseSend( 1514 | response, TRITONSERVER_RESPONSE_COMPLETE_FINAL, nullptr), 1515 | "failed to send PyTorch backend response"); 1516 | } 1517 | } 1518 | 1519 | // We don't need an explicit CUDA syncrhonization here since we have already 1520 | // synchronized the stream in the ReadOutputTensors function. 1521 | if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) { 1522 | #ifdef TRITON_ENABLE_GPU 1523 | float compute_input_duration = GetCudaEventElapsedTime( 1524 | compute_input_start_event_, compute_infer_start_event_); 1525 | float compute_infer_duration = GetCudaEventElapsedTime( 1526 | compute_infer_start_event_, compute_output_start_event_); 1527 | 1528 | compute_start_ns = exec_start_ns + (compute_input_duration * 1e6); 1529 | compute_end_ns = compute_start_ns + (compute_infer_duration * 1e6); 1530 | #endif 1531 | } else if ( 1532 | (Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) && (device_cnt_ > 0)) { 1533 | #ifdef TRITON_ENABLE_GPU 1534 | float compute_input_duration = GetCudaEventElapsedTime( 1535 | compute_input_start_event_, compute_infer_start_event_); 1536 | uint64_t compute_infer_duration = 1537 | compute_output_start - compute_infer_start; 1538 | 1539 | compute_start_ns = exec_start_ns + (compute_input_duration * 1e6); 1540 | compute_end_ns = compute_start_ns + compute_infer_duration; 1541 | #endif 1542 | } 1543 | 1544 | // Report statistics for each request. 1545 | for (uint32_t r = 0; r < request_count; ++r) { 1546 | auto& request = requests[r]; 1547 | LOG_IF_ERROR( 1548 | TRITONBACKEND_ModelInstanceReportStatistics( 1549 | TritonModelInstance(), request, 1550 | (responses[r] != nullptr) /* success */, exec_start_ns, 1551 | compute_start_ns, compute_end_ns, exec_end_ns), 1552 | "failed reporting request statistics"); 1553 | 1554 | LOG_IF_ERROR( 1555 | TRITONBACKEND_RequestRelease(request, TRITONSERVER_REQUEST_RELEASE_ALL), 1556 | "failed releasing request"); 1557 | } 1558 | 1559 | if (!all_response_failed) { 1560 | // Report the entire batch statistics. 
1561 | LOG_IF_ERROR( 1562 | TRITONBACKEND_ModelInstanceReportBatchStatistics( 1563 | TritonModelInstance(), total_batch_size, exec_start_ns, 1564 | compute_start_ns, compute_end_ns, exec_end_ns), 1565 | "failed reporting batch request statistics"); 1566 | } 1567 | } 1568 | 1569 | void 1570 | ModelInstanceState::Execute( 1571 | std::vector* responses, 1572 | const uint32_t response_count, 1573 | std::vector* input_tensors, 1574 | std::vector* output_tensors) 1575 | { 1576 | NVTX_RANGE(nvtx_, "Execute " + Name()); 1577 | 1578 | torch::jit::IValue model_outputs_; 1579 | 1580 | try { 1581 | // enable/disable optimized execution 1582 | torch::jit::setGraphExecutorOptimize( 1583 | model_state_->EnabledOptimizedExecution()); 1584 | 1585 | // enable/disable inference mode - supersedes NoGradGuard 1586 | torch::InferenceMode infer_guard(model_state_->EnabledInferenceMode()); 1587 | 1588 | // enable/disable cudnn 1589 | at::globalContext().setUserEnabledCuDNN(model_state_->EnabledCudnn()); 1590 | 1591 | // JIT. No change is made unless parameter is explicitly set. 1592 | if (std::get<0>(model_state_->EnabledJitProfiling())) { 1593 | torch::jit::getProfilingMode() = 1594 | std::get<1>(model_state_->EnabledJitProfiling()); 1595 | } 1596 | 1597 | if (std::get<0>(model_state_->EnabledJitExecutor())) { 1598 | torch::jit::getExecutorMode() = 1599 | std::get<1>(model_state_->EnabledJitExecutor()); 1600 | } 1601 | 1602 | // Fuser. No change is made unless fuser is explicitly set in 1603 | // parameters. 1604 | if (std::get<0>(model_state_->EnabledTensorExprFuser())) { 1605 | torch::jit::setTensorExprFuserEnabled( 1606 | std::get<1>(model_state_->EnabledTensorExprFuser())); 1607 | } 1608 | 1609 | torch::NoGradGuard no_grad; 1610 | 1611 | // If input is a dictionary, prepare dictionary from 'input_tensors'. 
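    // Sketch (input names are hypothetical): for a forward() that takes a
    // single Dict(str, Tensor), the configured input names become the
    // dictionary keys, so the call below behaves roughly like
    //   module.forward({{"image", tensor0}, {"mask", tensor1}})
    // with each tensor looked up in 'input_tensors' via 'input_index_map_'.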
1612 |     if (is_dict_input_) {
1613 |       torch::Dict<std::string, torch::Tensor> input_dict;
1614 |       for (auto& input_index : input_index_map_) {
1615 |         torch::jit::IValue ival = (*input_tensors)[input_index.second];
1616 |         input_dict.insert(input_index.first, ival.toTensor());
1617 |       }
1618 |       std::vector<torch::jit::IValue> input_dict_ivalue = {input_dict};
1619 |       model_outputs_ = torch_model_->forward(input_dict_ivalue);
1620 |     } else {
1621 |       model_outputs_ = torch_model_->forward(*input_tensors);
1622 |     }
1623 | 
1624 |     if (model_outputs_.isTuple()) {
1625 |       auto model_outputs_tuple = model_outputs_.toTuple();
1626 |       size_t op_index = 0;
1627 |       for (auto& m_op : model_outputs_tuple->elements()) {
1628 |         if (m_op.isList()) {
1629 |           auto list_output = m_op.toList();
1630 |           if (list_output.elementType()->kind() != c10::TypeKind::StringType) {
1631 |             throw std::invalid_argument(
1632 |                 "output at index " + std::to_string(op_index) +
1633 |                 " must be of type Tensor or List[str], received List[" +
1634 |                 list_output.elementType()->str() + "]");
1635 |           }
1636 |           output_tensors->push_back(m_op);
1637 |         } else {
1638 |           auto tensor_output = m_op.toTensor();
1639 |           output_tensors->push_back(m_op);
1640 |         }
1641 |         op_index++;
1642 |       }
1643 |     } else if (model_outputs_.isTensor()) {
1644 |       output_tensors->push_back(model_outputs_);
1645 |     } else if (model_outputs_.isList()) {
1646 |       auto list_output = model_outputs_.toList();
1647 |       if (list_output.elementType()->kind() != c10::TypeKind::StringType) {
1648 |         throw std::invalid_argument(
1649 |             "output must be of type Tensor or List[str], received List[" +
1650 |             list_output.elementType()->str() + "]");
1651 |       }
1652 |       output_tensors->push_back(model_outputs_);
1653 |     } else {
1654 |       throw std::invalid_argument(
1655 |           "output must be of type Tensor, List[str] or Tuple containing one of "
1656 |           "these two types. It should not be a List / Dictionary of Tensors or "
1657 |           "a Scalar");
1658 |     }
1659 |   }
1660 |   catch (std::exception& ex) {
1661 |     SendErrorForResponses(
1662 |         responses, response_count,
1663 |         TRITONSERVER_ErrorNew(
1664 |             TRITONSERVER_ERROR_INTERNAL,
1665 |             ("PyTorch execute failure: " + std::string(ex.what())).c_str()));
1666 |   }
1667 | }
1668 | 
1669 | TRITONSERVER_Error*
1670 | ModelInstanceState::GetNamingConvention(
1671 |     NamingConvention* naming_convention,
1672 |     const std::vector<std::string>& allowed_ios)
1673 | {
1674 |   // Rules for (non-Dictionary) input tensor names:
1675 |   // 1. Must be in 'allowed_inputs' (arguments in the forward function)
1676 |   // 2. Must follow the naming convention i.e. <name>__<index>
1677 |   // 3. If neither of the above conditions are satisfied, enforce strict
1678 |   //    ordering of model inputs.
1679 |   //
1680 |   // Rules for output tensor names:
1681 |   // 1. Must follow the naming convention i.e. <name>__<index>
1682 |   // 2. If not, we enforce strict ordering of model outputs.
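  // Concretely (tensor names below are hypothetical): for a model defined as
  // forward(self, input0, input1), the configured inputs may be named
  // "input0"/"input1" (FORWARD_ARGUMENT), "INPUT__0"/"INPUT__1" (NAMED_INDEX,
  // where the trailing integer selects the argument position), or anything
  // else, in which case the declaration order in the model configuration is
  // used (STRICT_CONFIG_ORDERING). Outputs such as "OUTPUT__0"/"OUTPUT__1"
  // select the corresponding element of the value returned by the model.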
1683 | std::string deliminator = "__"; 1684 | std::string io_kind = "input"; 1685 | *naming_convention = NamingConvention::FORWARD_ARGUMENT; 1686 | 1687 | // symbolizes output 1688 | if (allowed_ios.size() == 0) { 1689 | io_kind = "output"; 1690 | *naming_convention = NamingConvention::NAMED_INDEX; 1691 | } 1692 | 1693 | triton::common::TritonJson::Value ios; 1694 | RETURN_IF_ERROR( 1695 | model_state_->ModelConfig().MemberAsArray(io_kind.c_str(), &ios)); 1696 | 1697 | if (io_kind == "input") { 1698 | for (size_t i = 0; i < ios.ArraySize(); i++) { 1699 | triton::common::TritonJson::Value io; 1700 | RETURN_IF_ERROR(ios.IndexAsObject(i, &io)); 1701 | 1702 | // Validate name 1703 | std::string io_name; 1704 | RETURN_IF_ERROR(io.MemberAsString("name", &io_name)); 1705 | auto itr = std::find(allowed_ios.begin(), allowed_ios.end(), io_name); 1706 | if (itr == allowed_ios.end()) { 1707 | *naming_convention = NamingConvention::NAMED_INDEX; 1708 | break; 1709 | } 1710 | } 1711 | } 1712 | 1713 | // If not, check if inputs follow INDEX 1714 | if (*naming_convention == NamingConvention::NAMED_INDEX) { 1715 | for (size_t i = 0; i < ios.ArraySize(); i++) { 1716 | triton::common::TritonJson::Value io; 1717 | RETURN_IF_ERROR(ios.IndexAsObject(i, &io)); 1718 | 1719 | // Validate name 1720 | std::string io_name; 1721 | RETURN_IF_ERROR(io.MemberAsString("name", &io_name)); 1722 | int start_pos = io_name.find(deliminator); 1723 | if (start_pos == -1) { 1724 | *naming_convention = NamingConvention::STRICT_CONFIG_ORDERING; 1725 | break; 1726 | } else { 1727 | // check if the index part of the name is not an integer 1728 | std::string index_str = io_name.substr(start_pos + 2); 1729 | bool is_int = true; 1730 | for (auto itr = index_str.begin(); itr != index_str.end(); itr++) { 1731 | if (std::isdigit(*itr) == 0) { 1732 | is_int = false; 1733 | } 1734 | } 1735 | 1736 | if (!is_int) { 1737 | if (io_kind == "input") { 1738 | LOG_MESSAGE( 1739 | TRITONSERVER_LOG_WARN, 1740 | ("input '" + io_name + 1741 | "' or previous input(s) are neither an input argument to the " 1742 | "model '" + 1743 | model_state_->Name() + 1744 | "' nor do they follow the __ naming convention. " 1745 | "Falling back to enforcing strict ordering from model " 1746 | "configuration.") 1747 | .c_str()); 1748 | } else { 1749 | LOG_MESSAGE( 1750 | TRITONSERVER_LOG_WARN, 1751 | ("output '" + io_name + 1752 | "' or previous output(s) of the model '" + 1753 | model_state_->Name() + 1754 | "' do not follow the __ naming convention. " 1755 | "Falling back to enforcing strict ordering from model " 1756 | "configuration.") 1757 | .c_str()); 1758 | } 1759 | *naming_convention = NamingConvention::STRICT_CONFIG_ORDERING; 1760 | break; 1761 | } 1762 | } 1763 | } 1764 | } 1765 | 1766 | triton::common::TritonJson::Value sequence_batching; 1767 | if (model_state_->ModelConfig().Find( 1768 | "sequence_batching", &sequence_batching)) { 1769 | // If we need to manage state for the model, then we need to check 1770 | // the naming of the state adheres to both the input and output conventions 1771 | triton::common::TritonJson::Value states; 1772 | if (sequence_batching.Find("state", &states)) { 1773 | if (*naming_convention != NamingConvention::NAMED_INDEX) { 1774 | return TRITONSERVER_ErrorNew( 1775 | TRITONSERVER_ERROR_INVALID_ARG, 1776 | ("PyTorch model '" + model_state_->Name() + 1777 | "' is using sequence batching with state but not all inputs and " 1778 | "outputs follow the __ naming convention. 
") 1779 | .c_str()); 1780 | } 1781 | } 1782 | 1783 | for (size_t i = 0; i < states.ArraySize(); i++) { 1784 | triton::common::TritonJson::Value state; 1785 | RETURN_IF_ERROR(states.IndexAsObject(i, &state)); 1786 | std::string name_entry = 1787 | io_kind == "input" ? "input_name" : "output_name"; 1788 | std::string state_name; 1789 | RETURN_IF_ERROR(state.MemberAsString(name_entry.c_str(), &state_name)); 1790 | int start_pos = state_name.find(deliminator); 1791 | if (start_pos == -1) { 1792 | return TRITONSERVER_ErrorNew( 1793 | TRITONSERVER_ERROR_INVALID_ARG, 1794 | ("PyTorch model '" + model_state_->Name() + 1795 | "' is using sequence batching with state but state '" + 1796 | state_name + 1797 | "' does not follow the __ naming convention. ") 1798 | .c_str()); 1799 | } else { 1800 | // check if the index part of the name is not an integer 1801 | std::string index_str = state_name.substr(start_pos + 2); 1802 | bool is_int = true; 1803 | for (auto itr = index_str.begin(); itr != index_str.end(); itr++) { 1804 | if (std::isdigit(*itr) == 0) { 1805 | is_int = false; 1806 | } 1807 | } 1808 | if (!is_int) { 1809 | return TRITONSERVER_ErrorNew( 1810 | TRITONSERVER_ERROR_INVALID_ARG, 1811 | ("PyTorch model '" + model_state_->Name() + 1812 | "' is using sequence batching with state but state '" + 1813 | state_name + 1814 | "' does not follow the __ naming convention. ") 1815 | .c_str()); 1816 | } 1817 | } 1818 | } 1819 | } 1820 | 1821 | return nullptr; // success 1822 | } 1823 | 1824 | // This function will return a tensor's contents as a contiguous 1825 | // chunk in system memory. In some cases this will require copying the data. 1826 | // If that happens, 'contiguous_buffer' will be set to hold the contiguous 1827 | // chunk and 'cuda_copy' will be set to indicate whether CUDA copy is 1828 | // conducted. The data copy can be avoided if the input is already in 1829 | // a contiguous chunk and the input is located in memory type and id 1830 | // specified. 
1831 | TRITONSERVER_Error* 1832 | GetContiguousInputContent( 1833 | TRITONBACKEND_Input* rinput, const uint32_t buffer_count, 1834 | const char** content, size_t* content_byte_size, 1835 | std::vector* contiguous_buffer, cudaStream_t stream, bool* cuda_copy) 1836 | { 1837 | *cuda_copy = false; 1838 | 1839 | // Check input buffers to see if data copy is necessary 1840 | size_t chunk_count = 0; 1841 | bool type_mismatch = false; 1842 | uint64_t total_byte_size = 0; 1843 | for (size_t idx = 0; idx < buffer_count; ++idx) { 1844 | TRITONSERVER_MemoryType src_memory_type; 1845 | int64_t src_memory_type_id; 1846 | size_t src_byte_size; 1847 | const void* src_ptr; 1848 | 1849 | RETURN_IF_ERROR(TRITONBACKEND_InputBuffer( 1850 | rinput, idx, &src_ptr, &src_byte_size, &src_memory_type, 1851 | &src_memory_type_id)); 1852 | 1853 | if (src_ptr != nullptr) { 1854 | chunk_count++; 1855 | total_byte_size += src_byte_size; 1856 | type_mismatch |= (src_memory_type == TRITONSERVER_MEMORY_GPU); 1857 | } 1858 | } 1859 | 1860 | if (chunk_count == 0) { 1861 | *content = nullptr; 1862 | *content_byte_size = 0; 1863 | } else if ((chunk_count == 1) && !type_mismatch) { 1864 | TRITONSERVER_MemoryType src_memory_type; 1865 | int64_t src_memory_type_id; 1866 | RETURN_IF_ERROR(TRITONBACKEND_InputBuffer( 1867 | rinput, 0, (const void**)content, content_byte_size, &src_memory_type, 1868 | &src_memory_type_id)); 1869 | } else { 1870 | contiguous_buffer->resize(total_byte_size); 1871 | 1872 | size_t offset = 0; 1873 | for (size_t i = 0; i < chunk_count; i++) { 1874 | bool cuda_used; 1875 | TRITONSERVER_MemoryType src_memory_type; 1876 | int64_t src_memory_type_id; 1877 | size_t src_byte_size; 1878 | const void* src_ptr; 1879 | 1880 | RETURN_IF_ERROR(TRITONBACKEND_InputBuffer( 1881 | rinput, i, &src_ptr, &src_byte_size, &src_memory_type, 1882 | &src_memory_type_id)); 1883 | RETURN_IF_ERROR(CopyBuffer( 1884 | "Contiguous input", src_memory_type, src_memory_type_id, 1885 | TRITONSERVER_MEMORY_CPU, 0, src_byte_size, src_ptr, 1886 | contiguous_buffer->data() + offset, stream, &cuda_used)); 1887 | *cuda_copy |= cuda_used; 1888 | offset += src_byte_size; 1889 | } 1890 | 1891 | *content = contiguous_buffer->data(); 1892 | *content_byte_size = total_byte_size; 1893 | } 1894 | 1895 | return nullptr; // success 1896 | } 1897 | 1898 | void 1899 | FillStringTensor(torch::List* input_list, const size_t cnt) 1900 | { 1901 | for (size_t c = 0; c < cnt; ++c) { 1902 | input_list->push_back(""); 1903 | } 1904 | } 1905 | 1906 | bool 1907 | SetStringInputTensor( 1908 | torch::List* input_list, TRITONBACKEND_Input* input, 1909 | const char* name, const uint32_t buffer_count, 1910 | const size_t request_element_cnt, TRITONBACKEND_Response** response, 1911 | cudaStream_t stream, const char* host_policy_name) 1912 | { 1913 | bool cuda_copy = false; 1914 | 1915 | // For string data type, we always need to have the data on CPU so 1916 | // that we can read string length and construct the string 1917 | // properly. So if the request's input tensor is not in CPU need to 1918 | // copy it there. 
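The BYTES (string) inputs handled here use Triton's standard serialization: each element is a 4-byte length followed by the raw bytes, with no terminator. A minimal parser, assuming the server's native little-endian byte order:

    import struct

    def deserialize_bytes_tensor(buf: bytes) -> list[bytes]:
        elements, offset = [], 0
        while offset < len(buf):
            (length,) = struct.unpack_from("<I", buf, offset)  # 4-byte element length
            offset += 4
            elements.append(buf[offset:offset + length])       # raw bytes, no terminator
            offset += length
        return elements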
1919 | const char* content = nullptr; 1920 | size_t content_byte_size = 0; 1921 | 1922 | std::vector contiguous_buffer; 1923 | auto err = GetContiguousInputContent( 1924 | input, buffer_count, &content, &content_byte_size, &contiguous_buffer, 1925 | stream, &cuda_copy); 1926 | if (err != nullptr) { 1927 | RESPOND_AND_SET_NULL_IF_ERROR(response, err); 1928 | FillStringTensor(input_list, request_element_cnt); 1929 | return cuda_copy; 1930 | } 1931 | 1932 | #ifdef TRITON_ENABLE_GPU 1933 | if (cuda_copy) { 1934 | cudaStreamSynchronize(stream); 1935 | cuda_copy = false; 1936 | } 1937 | #endif // TRITON_ENABLE_GPU 1938 | 1939 | std::vector> str_list; 1940 | err = ValidateStringBuffer( 1941 | content, content_byte_size, request_element_cnt, name, &str_list); 1942 | // Set string values. 1943 | for (const auto& [addr, len] : str_list) { 1944 | input_list->push_back(std::string(addr, len)); 1945 | } 1946 | 1947 | size_t element_cnt = str_list.size(); 1948 | if (err != nullptr) { 1949 | RESPOND_AND_SET_NULL_IF_ERROR(response, err); 1950 | FillStringTensor(input_list, request_element_cnt - element_cnt); 1951 | } 1952 | return cuda_copy; 1953 | } 1954 | 1955 | bool 1956 | SetStringBuffer( 1957 | torch::List* tensor, TRITONBACKEND_Response** response, 1958 | TRITONBACKEND_Output* response_output, TRITONBACKEND_State* response_state, 1959 | const size_t tensor_element_count, cudaStream_t stream, 1960 | std::string* serialized, bool state) 1961 | { 1962 | bool cuda_copy = false; 1963 | 1964 | // Serialize the output tensor strings. Each string is serialized as 1965 | // a 4-byte length followed by the string itself with no 1966 | // null-terminator. 1967 | serialized->clear(); 1968 | for (size_t e = 0; e < tensor_element_count; ++e) { 1969 | std::string str = tensor->get(e).to(); 1970 | const char* cstr = str.c_str(); 1971 | size_t len = str.length(); 1972 | serialized->append(reinterpret_cast(&len), sizeof(uint32_t)); 1973 | if (len > 0) { 1974 | serialized->append(cstr, len); 1975 | } 1976 | } 1977 | 1978 | // Allocate a buffer large enough to hold the serialized tensor. 1979 | TRITONSERVER_MemoryType actual_memory_type = TRITONSERVER_MEMORY_CPU; 1980 | int64_t actual_memory_type_id = 0; 1981 | 1982 | TRITONSERVER_Error* err; 1983 | void* buffer; 1984 | 1985 | if (!state) { 1986 | auto err = TRITONBACKEND_OutputBuffer( 1987 | response_output, &buffer, serialized->size(), &actual_memory_type, 1988 | &actual_memory_type_id); 1989 | if (err != nullptr) { 1990 | RESPOND_AND_SET_NULL_IF_ERROR(response, err); 1991 | return cuda_copy; 1992 | } 1993 | } else { 1994 | auto err = TRITONBACKEND_StateBuffer( 1995 | response_state, &buffer, serialized->size(), &actual_memory_type, 1996 | &actual_memory_type_id); 1997 | if (err != nullptr) { 1998 | RESPOND_AND_SET_NULL_IF_ERROR(response, err); 1999 | return cuda_copy; 2000 | } 2001 | } 2002 | // Copy the serialized tensor into the allocated buffer. 
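The inverse of the parsing sketch shown earlier for inputs: serialize a list of byte strings the same way this function does, as a 4-byte length followed by the bytes of each element (again assuming little-endian):

    import struct

    def serialize_bytes_tensor(elements: list[bytes]) -> bytes:
        out = bytearray()
        for e in elements:
            out += struct.pack("<I", len(e))  # 4-byte length prefix
            out += e                          # string payload, no null terminator
        return bytes(out)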
2003 | bool cuda_used = false; 2004 | err = CopyBuffer( 2005 | "String output", TRITONSERVER_MEMORY_CPU /* src_memory_type */, 2006 | 0 /* src_memory_type_id */, actual_memory_type, actual_memory_type_id, 2007 | serialized->size(), reinterpret_cast(serialized->c_str()), 2008 | buffer, stream, &cuda_used); 2009 | cuda_copy |= cuda_used; 2010 | 2011 | if (err != nullptr) { 2012 | RESPOND_AND_SET_NULL_IF_ERROR(response, err); 2013 | return cuda_copy; 2014 | } 2015 | 2016 | if (state) { 2017 | RESPOND_AND_SET_NULL_IF_ERROR( 2018 | response, TRITONBACKEND_StateUpdate(response_state)); 2019 | } 2020 | 2021 | return cuda_copy; 2022 | } 2023 | 2024 | 2025 | bool 2026 | SetStringOutputBuffer( 2027 | torch::List* tensor, TRITONBACKEND_Response** response, 2028 | TRITONBACKEND_Output* response_output, const size_t tensor_element_count, 2029 | cudaStream_t stream, std::string* serialized) 2030 | { 2031 | return SetStringBuffer( 2032 | tensor, response, response_output, nullptr /* response_state */, 2033 | tensor_element_count, stream, serialized, false /* state */); 2034 | } 2035 | 2036 | bool 2037 | SetStringStateBuffer( 2038 | torch::List* tensor, TRITONBACKEND_Response** response, 2039 | TRITONBACKEND_State* response_state, const size_t tensor_element_count, 2040 | cudaStream_t stream, std::string* serialized) 2041 | { 2042 | return SetStringBuffer( 2043 | tensor, response, nullptr /* response_output */, response_state, 2044 | tensor_element_count, stream, serialized, true /* state */); 2045 | } 2046 | 2047 | 2048 | TRITONSERVER_Error* 2049 | ModelInstanceState::SetInputTensors( 2050 | size_t total_batch_size, TRITONBACKEND_Request** requests, 2051 | const uint32_t request_count, 2052 | std::vector* responses, 2053 | BackendInputCollector* collector, std::vector* input_names, 2054 | std::vector* input_tensors, bool* cuda_copy) 2055 | { 2056 | // InferenceMode should be used to guard all tensors operations 2057 | torch::InferenceMode infer_guard(model_state_->EnabledInferenceMode()); 2058 | 2059 | // All requests must have equally-sized input tensors so use any 2060 | // request as the representative for the input tensors. 2061 | uint32_t input_count; 2062 | RETURN_IF_ERROR(TRITONBACKEND_RequestInputCount(requests[0], &input_count)); 2063 | 2064 | input_tensors->resize(input_count + batch_input_count_); 2065 | 2066 | // The inputs must be in contiguous CPU/GPU memory. 2067 | std::vector> alloc_perference; 2068 | if (device_.is_cpu()) { 2069 | alloc_perference = { 2070 | {TRITONSERVER_MEMORY_CPU_PINNED, 0}, {TRITONSERVER_MEMORY_CPU, 0}}; 2071 | } else { 2072 | alloc_perference = {{TRITONSERVER_MEMORY_GPU, device_.index()}}; 2073 | } 2074 | 2075 | for (uint32_t input_idx = 0; input_idx < input_count; input_idx++) { 2076 | TRITONBACKEND_Input* input; 2077 | RETURN_IF_ERROR( 2078 | TRITONBACKEND_RequestInputByIndex(requests[0], input_idx, &input)); 2079 | 2080 | const char* input_name; 2081 | TRITONSERVER_DataType input_datatype; 2082 | const int64_t* input_shape; 2083 | uint32_t input_dims_count; 2084 | RETURN_IF_ERROR(TRITONBACKEND_InputProperties( 2085 | input, &input_name, &input_datatype, &input_shape, &input_dims_count, 2086 | nullptr, nullptr)); 2087 | 2088 | input_names->emplace_back(input_name); 2089 | 2090 | // The shape for the entire input patch, 2091 | // [total_batch_size, ...] 
for non-ragged input and 2092 | // [total_element_count] for ragged input (non-nested tensor) 2093 | std::vector batchn_shape; 2094 | if (StateForModel()->IsInputRagged(input_name)) { 2095 | batchn_shape = std::vector{0}; 2096 | for (size_t idx = 0; idx < request_count; idx++) { 2097 | TRITONBACKEND_Input* input; 2098 | RESPOND_AND_SET_NULL_IF_ERROR( 2099 | &((*responses)[idx]), 2100 | TRITONBACKEND_RequestInput(requests[idx], input_name, &input)); 2101 | const int64_t* input_shape; 2102 | uint32_t input_dims_count; 2103 | RESPOND_AND_SET_NULL_IF_ERROR( 2104 | &((*responses)[idx]), TRITONBACKEND_InputProperties( 2105 | input, nullptr, nullptr, &input_shape, 2106 | &input_dims_count, nullptr, nullptr)); 2107 | 2108 | int64_t element_cnt = 0; 2109 | RESPOND_AND_SET_NULL_IF_ERROR( 2110 | &((*responses)[idx]), 2111 | GetElementCount(input_shape, input_dims_count, &element_cnt)); 2112 | batchn_shape[0] += element_cnt; 2113 | } 2114 | } else { 2115 | batchn_shape = 2116 | std::vector(input_shape, input_shape + input_dims_count); 2117 | if (supports_batching_) { 2118 | batchn_shape[0] = total_batch_size; 2119 | } 2120 | } 2121 | 2122 | // The input must be in contiguous CPU/GPU memory. 2123 | std::vector> alloc_perference; 2124 | // For 'KIND_MODEL', input will always be in CPU as we don't have a way to 2125 | // query the input types. 2126 | if (device_.is_cpu() || (Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL)) { 2127 | alloc_perference = { 2128 | {TRITONSERVER_MEMORY_CPU_PINNED, 0}, {TRITONSERVER_MEMORY_CPU, 0}}; 2129 | } else { 2130 | alloc_perference = {{TRITONSERVER_MEMORY_GPU, device_.index()}}; 2131 | } 2132 | 2133 | const char* input_buffer; 2134 | size_t batchn_byte_size; 2135 | TRITONSERVER_MemoryType memory_type; 2136 | int64_t memory_type_id; 2137 | RETURN_IF_ERROR(collector->ProcessTensor( 2138 | input_name, nullptr, 0, alloc_perference, &input_buffer, 2139 | &batchn_byte_size, &memory_type, &memory_type_id)); 2140 | 2141 | // Create Torch tensor 2142 | const auto torch_dtype = ConvertDataTypeToTorchType(input_datatype); 2143 | torch::TensorOptions options{torch_dtype.second}; 2144 | auto updated_options = (memory_type == TRITONSERVER_MEMORY_GPU) 2145 | ? options.device(torch::kCUDA, device_.index()) 2146 | : options.device(torch::kCPU); 2147 | 2148 | if (input_datatype == TRITONSERVER_TYPE_BYTES) { 2149 | // Create the PyTorch list to hold the strings. 
2150 | torch::List input_list; 2151 | input_list.reserve(batchn_shape[0]); 2152 | 2153 | for (size_t idx = 0; idx < request_count; idx++) { 2154 | TRITONBACKEND_Input* input; 2155 | RESPOND_AND_SET_NULL_IF_ERROR( 2156 | &((*responses)[idx]), 2157 | TRITONBACKEND_RequestInput(requests[idx], input_name, &input)); 2158 | const int64_t* shape; 2159 | uint32_t dims_count; 2160 | uint32_t buffer_count; 2161 | RESPOND_AND_SET_NULL_IF_ERROR( 2162 | &((*responses)[idx]), 2163 | TRITONBACKEND_InputPropertiesForHostPolicy( 2164 | input, HostPolicyName().c_str(), nullptr, nullptr, &shape, 2165 | &dims_count, nullptr, &buffer_count)); 2166 | 2167 | int64_t batch_element_cnt = 0; 2168 | RESPOND_AND_SET_NULL_IF_ERROR( 2169 | &((*responses)[idx]), 2170 | GetElementCount(shape, dims_count, &batch_element_cnt)); 2171 | 2172 | *cuda_copy |= SetStringInputTensor( 2173 | &input_list, input, input_name, buffer_count, batch_element_cnt, 2174 | &((*responses)[idx]), GetCudaStreamByInstanceKind(), 2175 | HostPolicyName().c_str()); 2176 | } 2177 | 2178 | (*input_tensors)[input_index_map_[input_name]] = input_list; 2179 | } else { 2180 | if (batchn_byte_size) { 2181 | // Remove constness to align with the signature of torch::from_blob() 2182 | torch::Tensor input_tensor = torch::from_blob( 2183 | const_cast(input_buffer), batchn_shape, updated_options); 2184 | (*input_tensors)[input_index_map_[input_name]] = input_tensor; 2185 | } else { 2186 | // torch:from_blob seems not working when the input size is 0 2187 | // create zero-length inputs directly 2188 | torch::Tensor input_tensor = 2189 | torch::zeros(batchn_shape, updated_options); 2190 | (*input_tensors)[input_index_map_[input_name]] = input_tensor; 2191 | } 2192 | } 2193 | } 2194 | 2195 | for (const auto& batch_input : StateForModel()->BatchInputs()) { 2196 | std::vector shape; 2197 | collector->BatchInputShape(batch_input, &shape); 2198 | 2199 | for (const auto& input_name : batch_input.TargetNames()) { 2200 | input_names->emplace_back(input_name.c_str()); 2201 | 2202 | const char* dst_buffer; 2203 | size_t dst_buffer_byte_size; 2204 | TRITONSERVER_MemoryType dst_memory_type; 2205 | int64_t dst_memory_type_id; 2206 | 2207 | RESPOND_ALL_AND_SET_NULL_IF_ERROR( 2208 | (*responses), responses->size(), 2209 | collector->ProcessBatchInput( 2210 | batch_input, nullptr, 0, alloc_perference, &dst_buffer, 2211 | &dst_buffer_byte_size, &dst_memory_type, &dst_memory_type_id)); 2212 | 2213 | const auto torch_dtype = 2214 | ConvertDataTypeToTorchType(batch_input.DataType()); 2215 | torch::TensorOptions options{torch_dtype.second}; 2216 | auto updated_options = (dst_memory_type == TRITONSERVER_MEMORY_GPU) 2217 | ? options.device(torch::kCUDA, device_.index()) 2218 | : options.device(torch::kCPU); 2219 | 2220 | if (dst_buffer_byte_size) { 2221 | torch::Tensor input_tensor = torch::from_blob( 2222 | const_cast(dst_buffer), shape, updated_options); 2223 | (*input_tensors)[input_index_map_[input_name]] = input_tensor; 2224 | } else { 2225 | // special handle when input has zero size 2226 | torch::Tensor input_tensor = torch::zeros(shape, updated_options); 2227 | (*input_tensors)[input_index_map_[input_name]] = input_tensor; 2228 | } 2229 | } 2230 | } 2231 | 2232 | // Finalize... 
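The non-string path above wraps the buffer gathered by the collector in a tensor without copying (torch::from_blob), and falls back to an explicit zero-size tensor when the buffer is empty. A rough Python analogue using torch.frombuffer, with a hypothetical float32 buffer:

    import torch

    def tensor_from_buffer(buf: bytearray, shape, dtype=torch.float32):
        if len(buf) == 0:
            return torch.zeros(shape, dtype=dtype)  # mirrors the torch::zeros special case
        # frombuffer shares memory with 'buf' rather than copying it
        return torch.frombuffer(buf, dtype=dtype).reshape(shape)

    t = tensor_from_buffer(bytearray(4 * 6), (2, 3))  # six float32 elements, shape [2, 3]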
2233 | *cuda_copy |= collector->Finalize(); 2234 | 2235 | return nullptr; 2236 | } 2237 | 2238 | TRITONSERVER_Error* 2239 | ModelInstanceState::ReadOutputTensors( 2240 | size_t total_batch_size, 2241 | const std::vector& output_tensors, 2242 | TRITONBACKEND_Request** requests, const uint32_t request_count, 2243 | std::vector* responses) 2244 | { 2245 | NVTX_RANGE(nvtx_, "ReadOutputTensors " + Name()); 2246 | 2247 | BackendOutputResponder responder( 2248 | requests, request_count, responses, model_state_->TritonMemoryManager(), 2249 | model_state_->MaxBatchSize() > 0, model_state_->EnablePinnedInput(), 2250 | GetCudaStreamByInstanceKind()); 2251 | 2252 | bool cuda_copy = false; 2253 | // The serialized string buffer must be valid until output copies are done 2254 | std::vector> string_buffer; 2255 | for (auto& output : model_state_->ModelOutputs()) { 2256 | int op_index = output_index_map_[output.first]; 2257 | auto name = output.first; 2258 | auto output_tensor_pair = output.second; 2259 | 2260 | if (output_tensors[op_index].isTensor()) { 2261 | torch::Tensor output_flat; 2262 | try { 2263 | output_flat = 2264 | output_tensors[op_index].toTensor().contiguous().flatten(); 2265 | } 2266 | catch (std::exception& ex) { 2267 | RETURN_IF_ERROR(TRITONSERVER_ErrorNew( 2268 | TRITONSERVER_ERROR_INTERNAL, 2269 | (std::string("output tensor '") + name + "' is not found") 2270 | .c_str())); 2271 | } 2272 | 2273 | // Verify output datatype matches datatype from model config 2274 | TRITONSERVER_DataType output_dtype = 2275 | ConvertTorchTypeToDataType(output_flat.scalar_type()); 2276 | TRITONSERVER_DataType config_datatype = output_dtype_map_[name]; 2277 | if (config_datatype != output_dtype) { 2278 | RETURN_IF_ERROR(TRITONSERVER_ErrorNew( 2279 | TRITONSERVER_ERROR_INVALID_ARG, 2280 | (std::string("configuration expects datatype TYPE_") + 2281 | TRITONSERVER_DataTypeString(config_datatype) + " for output '" + 2282 | name + "', model provides TYPE_" + 2283 | TRITONSERVER_DataTypeString(output_dtype)) 2284 | .c_str())); 2285 | } 2286 | 2287 | const char* output_buffer = 2288 | static_cast(output_flat.data_ptr()); 2289 | 2290 | // Output tensors may not reside on the same device as model 2291 | torch::Device tensor_device = output_flat.device(); 2292 | const auto memory_type = (tensor_device.type() == torch::kCPU) 2293 | ? TRITONSERVER_MEMORY_CPU 2294 | : TRITONSERVER_MEMORY_GPU; 2295 | const auto memory_id = 2296 | (tensor_device.type() == torch::kCPU) ? 
0 : tensor_device.index(); 2297 | 2298 | // Batch output doesn't support string data type yet, as it is not trivial 2299 | // to parse string output 2300 | const BatchOutput* batch_output = StateForModel()->FindBatchOutput(name); 2301 | if (batch_output == nullptr) { 2302 | // Get output shape 2303 | std::vector batchn_shape; 2304 | auto shape = output_tensors[op_index].toTensor().sizes(); 2305 | for (auto itr = shape.begin(); itr != shape.end(); itr++) { 2306 | batchn_shape.push_back(*itr); 2307 | } 2308 | 2309 | if (batchn_shape.size() == 0) { 2310 | return TRITONSERVER_ErrorNew( 2311 | TRITONSERVER_ERROR_INVALID_ARG, 2312 | (std::string("output '") + name + 2313 | "' is a scalar which is not supported.") 2314 | .c_str()); 2315 | } 2316 | if (output_tensor_pair.first != -1) { 2317 | responder.ProcessTensor( 2318 | name, output_dtype, batchn_shape, output_buffer, memory_type, 2319 | memory_id); 2320 | } 2321 | if (output_tensor_pair.second != -1) { 2322 | std::vector states; 2323 | states = responder.ProcessStateTensor( 2324 | name, output_dtype, batchn_shape, output_buffer, memory_type, 2325 | memory_id); 2326 | // Update the states 2327 | for (auto& state : states) { 2328 | RETURN_IF_ERROR(TRITONBACKEND_StateUpdate(state)); 2329 | } 2330 | } 2331 | 2332 | } else { 2333 | responder.ProcessBatchOutput( 2334 | name, *batch_output, output_buffer, memory_type, memory_id); 2335 | } 2336 | } else if (output_tensors[op_index].isList()) { 2337 | // Custom handling for string/bytes tensor... 2338 | torch::List output_list = 2339 | output_tensors[op_index].toList(); 2340 | 2341 | // Get output shape 2342 | std::vector batchn_shape{(int64_t)output_list.size()}; 2343 | 2344 | for (size_t idx = 0; idx < responses->size(); idx++) { 2345 | auto& request = requests[idx]; 2346 | auto& response = (*responses)[idx]; 2347 | 2348 | if (supports_batching_ != 0) { 2349 | TRITONBACKEND_Input* input; 2350 | TRITONBACKEND_RequestInputByIndex(request, 0 /* index*/, &input); 2351 | const int64_t* shape; 2352 | TRITONBACKEND_InputProperties( 2353 | input, nullptr, nullptr, &shape, nullptr, nullptr, nullptr); 2354 | batchn_shape[0] = shape[0]; 2355 | } 2356 | 2357 | int64_t tensor_element_cnt = 0; 2358 | RETURN_IF_ERROR(GetElementCount(batchn_shape, &tensor_element_cnt)); 2359 | 2360 | // Only need an response tensor for requested outputs. 
2361 | if (response != nullptr) { 2362 | if (output_tensor_pair.first != -1) { 2363 | TRITONBACKEND_Output* response_output; 2364 | RESPOND_AND_SET_NULL_IF_ERROR( 2365 | &response, TRITONBACKEND_ResponseOutput( 2366 | response, &response_output, name.c_str(), 2367 | TRITONSERVER_TYPE_BYTES, batchn_shape.data(), 2368 | batchn_shape.size())); 2369 | string_buffer.emplace_back(new std::string()); 2370 | cuda_copy |= SetStringOutputBuffer( 2371 | &output_list, &response, response_output, tensor_element_cnt, 2372 | GetCudaStreamByInstanceKind(), string_buffer.back().get()); 2373 | } 2374 | } 2375 | if (output_tensor_pair.second != -1) { 2376 | TRITONBACKEND_State* response_state; 2377 | RESPOND_AND_SET_NULL_IF_ERROR( 2378 | &response, TRITONBACKEND_StateNew( 2379 | &response_state, request, name.c_str(), 2380 | TRITONSERVER_TYPE_BYTES, batchn_shape.data(), 2381 | batchn_shape.size())); 2382 | 2383 | string_buffer.emplace_back(new std::string()); 2384 | cuda_copy |= SetStringStateBuffer( 2385 | &output_list, &response, response_state, tensor_element_cnt, 2386 | GetCudaStreamByInstanceKind(), string_buffer.back().get()); 2387 | } 2388 | } 2389 | } else { 2390 | return TRITONSERVER_ErrorNew( 2391 | TRITONSERVER_ERROR_INVALID_ARG, 2392 | (std::string("output '") + name + 2393 | "' must be of type Tensor or List[str].") 2394 | .c_str()); 2395 | } 2396 | } 2397 | 2398 | // Finalize and wait for any pending buffer copies. 2399 | cuda_copy |= responder.Finalize(); 2400 | 2401 | #ifdef TRITON_ENABLE_GPU 2402 | // We have to always synchronize the stream. This is to make sure that 2403 | // the events on the cuda stream are synchronized. Otherwise, the events 2404 | // are only guaranteed to be synchronized if the model provides the output 2405 | // on GPU. 2406 | cudaStreamSynchronize(GetCudaStreamByInstanceKind()); 2407 | #endif 2408 | 2409 | return nullptr; 2410 | } 2411 | 2412 | TRITONSERVER_Error* 2413 | ModelInstanceState::RecordBackendTimestamp( 2414 | uint64_t* timestamp, void* cuda_event) 2415 | { 2416 | if ((Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) || 2417 | ((Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) && (device_cnt_ > 0))) { 2418 | #ifdef TRITON_ENABLE_GPU 2419 | cudaEvent_t* lcuda_event = reinterpret_cast(cuda_event); 2420 | RETURN_IF_ERROR(ConvertCUDAStatusToTritonError( 2421 | cudaEventRecord(*lcuda_event, GetCudaStreamByInstanceKind()), 2422 | TRITONSERVER_ERROR_INTERNAL, "Failed to record the event.")); 2423 | #endif 2424 | } else { 2425 | SET_TIMESTAMP(*timestamp); 2426 | } 2427 | return nullptr; 2428 | } 2429 | 2430 | void 2431 | ModelInstanceState::CreateCudaEvents(const int32_t& device_id) 2432 | { 2433 | #ifdef TRITON_ENABLE_GPU 2434 | // Need to set the CUDA context so that the context that events are 2435 | // created on match with contexts that events are recorded with. 
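The events created here are later used to time the input/inference/output phases on the instance's stream. The same pattern is available from Python, which can be handy when cross-checking the reported statistics (requires a CUDA device):

    import torch

    def time_on_gpu_ms(fn):
        start = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)
        start.record()
        fn()
        end.record()
        torch.cuda.synchronize()          # ensure both events have completed
        return start.elapsed_time(end)    # milliseconds, like cudaEventElapsedTime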
2436 | THROW_IF_BACKEND_INSTANCE_ERROR(ConvertCUDAStatusToTritonError( 2437 | cudaSetDevice(device_id), TRITONSERVER_ERROR_INTERNAL, 2438 | "Failed to set the device")); 2439 | THROW_IF_BACKEND_INSTANCE_ERROR(ConvertCUDAStatusToTritonError( 2440 | cudaEventCreate(&compute_input_start_event_), TRITONSERVER_ERROR_INTERNAL, 2441 | "Failed to create cuda event")); 2442 | THROW_IF_BACKEND_INSTANCE_ERROR(ConvertCUDAStatusToTritonError( 2443 | cudaEventCreate(&compute_infer_start_event_), TRITONSERVER_ERROR_INTERNAL, 2444 | "Failed to create cuda event")); 2445 | THROW_IF_BACKEND_INSTANCE_ERROR(ConvertCUDAStatusToTritonError( 2446 | cudaEventCreate(&compute_output_start_event_), 2447 | TRITONSERVER_ERROR_INTERNAL, "Failed to create cuda event")); 2448 | #endif 2449 | } 2450 | 2451 | cudaStream_t 2452 | ModelInstanceState::GetCudaStreamByInstanceKind() 2453 | { 2454 | #ifdef TRITON_ENABLE_GPU 2455 | if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) { 2456 | return stream_; 2457 | } else if ( 2458 | (Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) && 2459 | !stream_vec_.empty()) { 2460 | return stream_vec_[0]; 2461 | } 2462 | #endif 2463 | return nullptr; 2464 | } 2465 | 2466 | void 2467 | ModelInstanceState::SetCurrentCudaStream( 2468 | const cudaStream_t& stream, const int& device_id) 2469 | { 2470 | #ifdef TRITON_ENABLE_GPU 2471 | at::cuda::CUDAStream torch_stream = 2472 | at::cuda::getStreamFromExternal(stream, device_id); 2473 | // This function replaces the default stream with the stream we created. It 2474 | // is not necessary to change the current device to the desired device when 2475 | // replacing the default stream for that device. See the documentation here: 2476 | // https://pytorch.org/cppdocs/api/function_namespacec10_1_1cuda_1a6ed50cc0fc16cc7014d9c2f4c3bd098d.html 2477 | at::cuda::setCurrentCUDAStream(torch_stream); 2478 | #endif 2479 | } 2480 | 2481 | float 2482 | ModelInstanceState::GetCudaEventElapsedTime( 2483 | const cudaEvent_t& start_event, const cudaEvent_t& end_event) 2484 | { 2485 | float duration = 0; 2486 | #ifdef TRITON_ENABLE_GPU 2487 | // [FIXME] in the case of cudaEventElapsedTime failure, should handle 2488 | // stats reporting more gracefully as the durations are inaccurate 2489 | LOG_IF_ERROR( 2490 | ConvertCUDAStatusToTritonError( 2491 | cudaEventElapsedTime(&duration, start_event, end_event), 2492 | TRITONSERVER_ERROR_INTERNAL, "Failed to capture elapsed time"), 2493 | "Failed to capture elapsed time"); 2494 | #endif 2495 | return duration; 2496 | } 2497 | 2498 | ///////////// 2499 | 2500 | extern "C" { 2501 | 2502 | TRITONSERVER_Error* 2503 | TRITONBACKEND_Initialize(TRITONBACKEND_Backend* backend) 2504 | { 2505 | const char* cname; 2506 | RETURN_IF_ERROR(TRITONBACKEND_BackendName(backend, &cname)); 2507 | std::string name(cname); 2508 | 2509 | LOG_MESSAGE( 2510 | TRITONSERVER_LOG_INFO, 2511 | (std::string("TRITONBACKEND_Initialize: ") + name).c_str()); 2512 | 2513 | // Check the backend API version that Triton supports vs. what this 2514 | // backend was compiled against. 2515 | uint32_t api_version_major, api_version_minor; 2516 | RETURN_IF_ERROR( 2517 | TRITONBACKEND_ApiVersion(&api_version_major, &api_version_minor)); 2518 | 2519 | LOG_MESSAGE( 2520 | TRITONSERVER_LOG_INFO, 2521 | (std::string("Triton TRITONBACKEND API version: ") + 2522 | std::to_string(api_version_major) + "." 
+ 2523 | std::to_string(api_version_minor)) 2524 | .c_str()); 2525 | LOG_MESSAGE( 2526 | TRITONSERVER_LOG_INFO, 2527 | (std::string("'") + name + "' TRITONBACKEND API version: " + 2528 | std::to_string(TRITONBACKEND_API_VERSION_MAJOR) + "." + 2529 | std::to_string(TRITONBACKEND_API_VERSION_MINOR)) 2530 | .c_str()); 2531 | 2532 | if ((api_version_major != TRITONBACKEND_API_VERSION_MAJOR) || 2533 | (api_version_minor < TRITONBACKEND_API_VERSION_MINOR)) { 2534 | return TRITONSERVER_ErrorNew( 2535 | TRITONSERVER_ERROR_UNSUPPORTED, 2536 | (std::string("Triton TRITONBACKEND API version: ") + 2537 | std::to_string(api_version_major) + "." + 2538 | std::to_string(api_version_minor) + " does not support '" + name + 2539 | "' TRITONBACKEND API version: " + 2540 | std::to_string(TRITONBACKEND_API_VERSION_MAJOR) + "." + 2541 | std::to_string(TRITONBACKEND_API_VERSION_MINOR)) 2542 | .c_str()); 2543 | } 2544 | 2545 | return nullptr; // success 2546 | } 2547 | 2548 | TRITONSERVER_Error* 2549 | TRITONBACKEND_ModelInitialize(TRITONBACKEND_Model* model) 2550 | { 2551 | const char* cname; 2552 | RETURN_IF_ERROR(TRITONBACKEND_ModelName(model, &cname)); 2553 | std::string name(cname); 2554 | 2555 | uint64_t version; 2556 | RETURN_IF_ERROR(TRITONBACKEND_ModelVersion(model, &version)); 2557 | 2558 | LOG_MESSAGE( 2559 | TRITONSERVER_LOG_INFO, 2560 | (std::string("TRITONBACKEND_ModelInitialize: ") + name + " (version " + 2561 | std::to_string(version) + ")") 2562 | .c_str()); 2563 | 2564 | // Create a ModelState object and associate it with the 2565 | // TRITONBACKEND_Model. 2566 | ModelState* model_state; 2567 | RETURN_IF_ERROR(ModelState::Create(model, &model_state)); 2568 | RETURN_IF_ERROR( 2569 | TRITONBACKEND_ModelSetState(model, reinterpret_cast(model_state))); 2570 | 2571 | return nullptr; // success 2572 | } 2573 | 2574 | TRITONSERVER_Error* 2575 | TRITONBACKEND_ModelFinalize(TRITONBACKEND_Model* model) 2576 | { 2577 | void* vstate; 2578 | RETURN_IF_ERROR(TRITONBACKEND_ModelState(model, &vstate)); 2579 | ModelState* model_state = reinterpret_cast(vstate); 2580 | 2581 | LOG_MESSAGE( 2582 | TRITONSERVER_LOG_INFO, "TRITONBACKEND_ModelFinalize: delete model state"); 2583 | 2584 | delete model_state; 2585 | 2586 | return nullptr; // success 2587 | } 2588 | 2589 | TRITONSERVER_Error* 2590 | TRITONBACKEND_ModelInstanceInitialize(TRITONBACKEND_ModelInstance* instance) 2591 | { 2592 | const char* cname; 2593 | RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceName(instance, &cname)); 2594 | std::string name(cname); 2595 | 2596 | int32_t device_id; 2597 | RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceDeviceId(instance, &device_id)); 2598 | 2599 | TRITONSERVER_InstanceGroupKind kind; 2600 | RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceKind(instance, &kind)); 2601 | 2602 | LOG_MESSAGE( 2603 | TRITONSERVER_LOG_INFO, 2604 | (std::string("TRITONBACKEND_ModelInstanceInitialize: ") + name + " (" + 2605 | TRITONSERVER_InstanceGroupKindString(kind) + " device " + 2606 | std::to_string(device_id) + ")") 2607 | .c_str()); 2608 | 2609 | // Get the model state associated with this instance's model. 2610 | TRITONBACKEND_Model* model; 2611 | RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceModel(instance, &model)); 2612 | 2613 | void* vmodelstate; 2614 | RETURN_IF_ERROR(TRITONBACKEND_ModelState(model, &vmodelstate)); 2615 | ModelState* model_state = reinterpret_cast(vmodelstate); 2616 | 2617 | // Create a ModelInstanceState object and associate it with the 2618 | // TRITONBACKEND_ModelInstance. 
2619 | ModelInstanceState* instance_state; 2620 | RETURN_IF_ERROR( 2621 | ModelInstanceState::Create(model_state, instance, &instance_state)); 2622 | RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceSetState( 2623 | instance, reinterpret_cast(instance_state))); 2624 | 2625 | return nullptr; // success 2626 | } 2627 | 2628 | TRITONSERVER_Error* 2629 | TRITONBACKEND_ModelInstanceFinalize(TRITONBACKEND_ModelInstance* instance) 2630 | { 2631 | void* vstate; 2632 | RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceState(instance, &vstate)); 2633 | ModelInstanceState* instance_state = 2634 | reinterpret_cast(vstate); 2635 | 2636 | LOG_MESSAGE( 2637 | TRITONSERVER_LOG_INFO, 2638 | "TRITONBACKEND_ModelInstanceFinalize: delete instance state"); 2639 | 2640 | delete instance_state; 2641 | 2642 | return nullptr; // success 2643 | } 2644 | 2645 | TRITONSERVER_Error* 2646 | TRITONBACKEND_ModelInstanceExecute( 2647 | TRITONBACKEND_ModelInstance* instance, TRITONBACKEND_Request** requests, 2648 | const uint32_t request_count) 2649 | { 2650 | // Triton will not call this function simultaneously for the same 2651 | // 'instance'. But since this backend could be used by multiple 2652 | // instances from multiple models the implementation needs to handle 2653 | // multiple calls to this function at the same time (with different 2654 | // 'instance' objects). Suggested practice for this is to use only 2655 | // function-local and model-instance-specific state (obtained from 2656 | // 'instance'), which is what we do here. 2657 | ModelInstanceState* instance_state; 2658 | RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceState( 2659 | instance, reinterpret_cast(&instance_state))); 2660 | ModelState* model_state = instance_state->StateForModel(); 2661 | 2662 | // This backend specifies BLOCKING execution policy. That means that 2663 | // we should not return from this function until execution is 2664 | // complete. Triton will automatically release 'instance' on return 2665 | // from this function so that it is again available to be used for 2666 | // another call to TRITONBACKEND_ModelInstanceExecute. 2667 | 2668 | LOG_MESSAGE( 2669 | TRITONSERVER_LOG_VERBOSE, 2670 | (std::string("model ") + model_state->Name() + ", instance " + 2671 | instance_state->Name() + ", executing " + std::to_string(request_count) + 2672 | " requests") 2673 | .c_str()); 2674 | 2675 | // At this point we accept ownership of 'requests', which means that 2676 | // even if something goes wrong we must still return success from 2677 | // this function. If something does go wrong in processing a 2678 | // particular request then we send an error response just for the 2679 | // specific request. 2680 | instance_state->ProcessRequests(requests, request_count); 2681 | 2682 | if (model_state->EnabledCacheCleaning()) { 2683 | instance_state->ClearCache(); 2684 | } 2685 | 2686 | return nullptr; // success 2687 | } 2688 | 2689 | } // extern "C" 2690 | 2691 | }}} // namespace triton::backend::pytorch 2692 | -------------------------------------------------------------------------------- /src/libtorch_utils.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020-24 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
2 | // 3 | // Redistribution and use in source and binary forms, with or without 4 | // modification, are permitted provided that the following conditions 5 | // are met: 6 | // * Redistributions of source code must retain the above copyright 7 | // notice, this list of conditions and the following disclaimer. 8 | // * Redistributions in binary form must reproduce the above copyright 9 | // notice, this list of conditions and the following disclaimer in the 10 | // documentation and/or other materials provided with the distribution. 11 | // * Neither the name of NVIDIA CORPORATION nor the names of its 12 | // contributors may be used to endorse or promote products derived 13 | // from this software without specific prior written permission. 14 | // 15 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | #include "libtorch_utils.h" 28 | 29 | namespace triton { namespace backend { namespace pytorch { 30 | 31 | TRITONSERVER_DataType 32 | ConvertTorchTypeToDataType(const torch::ScalarType& stype) 33 | { 34 | switch (stype) { 35 | case torch::kBool: 36 | return TRITONSERVER_TYPE_BOOL; 37 | case torch::kByte: 38 | return TRITONSERVER_TYPE_UINT8; 39 | case torch::kChar: 40 | return TRITONSERVER_TYPE_INT8; 41 | case torch::kShort: 42 | return TRITONSERVER_TYPE_INT16; 43 | case torch::kInt: 44 | return TRITONSERVER_TYPE_INT32; 45 | case torch::kLong: 46 | return TRITONSERVER_TYPE_INT64; 47 | case torch::kHalf: 48 | return TRITONSERVER_TYPE_FP16; 49 | case torch::kFloat: 50 | return TRITONSERVER_TYPE_FP32; 51 | case torch::kDouble: 52 | return TRITONSERVER_TYPE_FP64; 53 | default: 54 | break; 55 | } 56 | 57 | return TRITONSERVER_TYPE_INVALID; 58 | } 59 | 60 | std::pair 61 | ConvertDataTypeToTorchType(const TRITONSERVER_DataType dtype) 62 | { 63 | torch::ScalarType type = torch::kInt; 64 | switch (dtype) { 65 | case TRITONSERVER_TYPE_BOOL: 66 | type = torch::kBool; 67 | break; 68 | case TRITONSERVER_TYPE_UINT8: 69 | type = torch::kByte; 70 | break; 71 | case TRITONSERVER_TYPE_INT8: 72 | type = torch::kChar; 73 | break; 74 | case TRITONSERVER_TYPE_INT16: 75 | type = torch::kShort; 76 | break; 77 | case TRITONSERVER_TYPE_INT32: 78 | type = torch::kInt; 79 | break; 80 | case TRITONSERVER_TYPE_INT64: 81 | type = torch::kLong; 82 | break; 83 | case TRITONSERVER_TYPE_FP16: 84 | type = torch::kHalf; 85 | break; 86 | case TRITONSERVER_TYPE_FP32: 87 | type = torch::kFloat; 88 | break; 89 | case TRITONSERVER_TYPE_FP64: 90 | type = torch::kDouble; 91 | break; 92 | case TRITONSERVER_TYPE_UINT16: 93 | case TRITONSERVER_TYPE_UINT32: 94 | case TRITONSERVER_TYPE_UINT64: 95 | case TRITONSERVER_TYPE_BYTES: 96 | default: 97 | return std::make_pair(false, type); 98 | } 99 | 100 | return std::make_pair(true, type); 101 | } 102 | 103 | std::pair 104 | 
ModelConfigDataTypeToTorchType(const std::string& data_type_str) 105 | { 106 | torch::ScalarType type = torch::kInt; 107 | 108 | // Must start with "TYPE_". 109 | if (data_type_str.rfind("TYPE_", 0) != 0) { 110 | return std::make_pair(false, type); 111 | } 112 | 113 | const std::string dtype = data_type_str.substr(strlen("TYPE_")); 114 | 115 | if (dtype == "BOOL") { 116 | type = torch::kBool; 117 | } else if (dtype == "UINT8") { 118 | type = torch::kByte; 119 | } else if (dtype == "INT8") { 120 | type = torch::kChar; 121 | } else if (dtype == "INT16") { 122 | type = torch::kShort; 123 | } else if (dtype == "INT32") { 124 | type = torch::kInt; 125 | } else if (dtype == "INT64") { 126 | type = torch::kLong; 127 | } else if (dtype == "FP16") { 128 | type = torch::kHalf; 129 | } else if (dtype == "FP32") { 130 | type = torch::kFloat; 131 | } else if (dtype == "FP64") { 132 | type = torch::kDouble; 133 | } else { 134 | return std::make_pair(false, type); 135 | } 136 | 137 | return std::make_pair(true, type); 138 | } 139 | 140 | TRITONSERVER_Error* 141 | ParseParameter( 142 | triton::common::TritonJson::Value& params, const std::string& mkey, 143 | bool* value) 144 | { 145 | std::string value_str; 146 | RETURN_IF_ERROR(GetParameterValue(params, mkey, &value_str)); 147 | RETURN_IF_ERROR(ParseBoolValue(value_str, value)); 148 | 149 | return nullptr; 150 | } 151 | 152 | TRITONSERVER_Error* 153 | ParseParameter( 154 | triton::common::TritonJson::Value& params, const std::string& mkey, 155 | int* value) 156 | { 157 | std::string value_str; 158 | RETURN_IF_ERROR(GetParameterValue(params, mkey, &value_str)); 159 | RETURN_IF_ERROR(ParseIntValue(value_str, value)); 160 | 161 | return nullptr; 162 | } 163 | 164 | 165 | #ifdef TRITON_ENABLE_GPU 166 | TRITONSERVER_Error* 167 | ConvertCUDAStatusToTritonError( 168 | cudaError_t cuda_error, TRITONSERVER_Error_Code code, const char* msg) 169 | { 170 | if (cuda_error != cudaSuccess) { 171 | return TRITONSERVER_ErrorNew( 172 | code, 173 | (std::string(msg) + ": " + cudaGetErrorString(cuda_error)).c_str()); 174 | } 175 | return nullptr; // success 176 | } 177 | #endif 178 | 179 | }}} // namespace triton::backend::pytorch 180 | -------------------------------------------------------------------------------- /src/libtorch_utils.h: -------------------------------------------------------------------------------- 1 | // Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | // 3 | // Redistribution and use in source and binary forms, with or without 4 | // modification, are permitted provided that the following conditions 5 | // are met: 6 | // * Redistributions of source code must retain the above copyright 7 | // notice, this list of conditions and the following disclaimer. 8 | // * Redistributions in binary form must reproduce the above copyright 9 | // notice, this list of conditions and the following disclaimer in the 10 | // documentation and/or other materials provided with the distribution. 11 | // * Neither the name of NVIDIA CORPORATION nor the names of its 12 | // contributors may be used to endorse or promote products derived 13 | // from this software without specific prior written permission. 14 | // 15 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | // PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | #pragma once 28 | 29 | #include "triton/backend/backend_common.h" 30 | #include "triton/core/tritonserver.h" 31 | 32 | // Suppress warnings in torch headers 33 | #pragma GCC diagnostic push 34 | #pragma GCC diagnostic ignored "-Wsign-compare" 35 | #pragma warning(push, 0) 36 | #include 37 | #include 38 | #include 39 | #include 40 | #include // One-stop header for TorchScript 41 | #pragma warning(pop) 42 | #pragma GCC diagnostic pop 43 | 44 | namespace triton { namespace backend { namespace pytorch { 45 | 46 | TRITONSERVER_DataType ConvertTorchTypeToDataType( 47 | const torch::ScalarType& ttype); 48 | std::pair ConvertDataTypeToTorchType( 49 | const TRITONSERVER_DataType dtype); 50 | std::pair ModelConfigDataTypeToTorchType( 51 | const std::string& data_type_str); 52 | 53 | #ifdef TRITON_ENABLE_GPU 54 | TRITONSERVER_Error* ConvertCUDAStatusToTritonError( 55 | cudaError_t cuda_error, TRITONSERVER_Error_Code code, const char* msg); 56 | #endif 57 | 58 | // If the key 'mkey' is present in 'params' then update 'value' with the 59 | // value associated with that key. If 'mkey' is not present in 'params' then 60 | // no update is made to 'value'. 61 | TRITONSERVER_Error* ParseParameter( 62 | triton::common::TritonJson::Value& params, const std::string& mkey, 63 | bool* value); 64 | 65 | // If the key 'mkey' is present in 'params' then update 'value' with the 66 | // value associated with that key. If 'mkey' is not present in 'params' then 67 | // 'value' is set to 'default_value'. 68 | TRITONSERVER_Error* ParseParameter( 69 | triton::common::TritonJson::Value& params, const std::string& mkey, 70 | int* value); 71 | 72 | }}} // namespace triton::backend::pytorch 73 | -------------------------------------------------------------------------------- /src/libtriton_pytorch.ldscript: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | { 27 | global: 28 | TRITONBACKEND_*; 29 | local: *; 30 | }; 31 | -------------------------------------------------------------------------------- /src/model.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 4 | # 5 | # Redistribution and use in source and binary forms, with or without 6 | # modification, are permitted provided that the following conditions 7 | # are met: 8 | # * Redistributions of source code must retain the above copyright 9 | # notice, this list of conditions and the following disclaimer. 10 | # * Redistributions in binary form must reproduce the above copyright 11 | # notice, this list of conditions and the following disclaimer in the 12 | # documentation and/or other materials provided with the distribution. 13 | # * Neither the name of NVIDIA CORPORATION nor the names of its 14 | # contributors may be used to endorse or promote products derived 15 | # from this software without specific prior written permission. 16 | # 17 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 18 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 20 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 21 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 22 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 23 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 24 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 25 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
28 | 29 | import importlib 30 | import json 31 | import os 32 | 33 | try: 34 | import torch 35 | except ModuleNotFoundError as error: 36 | raise RuntimeError("Missing/Incomplete PyTorch package installation") from error 37 | 38 | import triton_python_backend_utils as pb_utils 39 | 40 | 41 | def _get_model_path(config): 42 | # FIXME: Add support for torch.export IR models (.pt2) 43 | filenames = ["model.py", "model.pt"] 44 | if config["default_model_filename"]: 45 | filenames.insert(0, config["default_model_filename"]) 46 | for filename in filenames: 47 | model_path = os.path.join(pb_utils.get_model_dir(), filename) 48 | if os.path.exists(model_path): 49 | return model_path 50 | raise pb_utils.TritonModelException( 51 | "No model found in " + pb_utils.get_model_dir() + "/" + str(filenames) 52 | ) 53 | 54 | 55 | def _get_model_data_path(model_path): 56 | data_path_extensions = [".pt"] 57 | model_path_no_extension = model_path[: -(len(model_path.split(".")[-1]) + 1)] 58 | for extension in data_path_extensions: 59 | data_path = model_path_no_extension + extension 60 | if os.path.exists(data_path): 61 | return data_path 62 | # data file not provided 63 | return "" 64 | 65 | 66 | def _is_py_class_model(model_path): 67 | return model_path[-3:] == ".py" 68 | 69 | 70 | def _import_module_from_path(module_name, file_path): 71 | spec = importlib.util.spec_from_file_location(module_name, file_path) 72 | module = importlib.util.module_from_spec(spec) 73 | spec.loader.exec_module(module) 74 | return module 75 | 76 | 77 | def _get_model_class_from_module(module): 78 | names = dir(module) 79 | for name in names: 80 | attr = getattr(module, name) 81 | try: 82 | if issubclass(attr, torch.nn.Module): 83 | return attr 84 | except TypeError: 85 | # attr may not be a class 86 | pass 87 | raise pb_utils.TritonModelException("Cannot find a subclass of torch.nn.Module") 88 | 89 | 90 | def _parse_io_config(io_config): 91 | io = [] 92 | for conf in io_config: 93 | io.append({"name": conf["name"]}) 94 | return io 95 | 96 | 97 | def _get_device_name(kind, device_id): 98 | if kind == "GPU": 99 | return "cuda:" + device_id 100 | if kind == "CPU": 101 | return "cpu" 102 | # unspecified device 103 | return "" 104 | 105 | 106 | def _get_device(kind, device_id, model): 107 | device_name = _get_device_name(kind, device_id) 108 | if device_name == "": 109 | for param in model.parameters(): 110 | return param.device 111 | raise pb_utils.TritonModelException("Cannot determine model device") 112 | return torch.device(device_name) 113 | 114 | 115 | def _set_torch_parallelism(config): 116 | log_msg = "" 117 | parallelism_settings = ["NUM_THREADS", "NUM_INTEROP_THREADS"] 118 | for setting in parallelism_settings: 119 | val = "1" 120 | if setting in config["parameters"]: 121 | val = config["parameters"][setting]["string_value"] 122 | getattr(torch, "set_" + setting.lower())(int(val)) 123 | log_msg += setting + " = " + val + "; " 124 | return log_msg 125 | 126 | 127 | def _get_torch_compile_params(config): 128 | params = {} 129 | if "TORCH_COMPILE_OPTIONAL_PARAMETERS" in config["parameters"]: 130 | val = config["parameters"]["TORCH_COMPILE_OPTIONAL_PARAMETERS"]["string_value"] 131 | params = json.loads(val) 132 | if "model" in params: 133 | raise pb_utils.TritonModelException( 134 | "'model' is not an optional parameter for 'torch.compile'" 135 | ) 136 | return params 137 | 138 | 139 | def _gather_torch_tensors(scatter_tensors): 140 | gather_tensors = [] 141 | sections = [] 142 | for i in range(len(scatter_tensors)): 143 | tensors = 
scatter_tensors[i] 144 | for j in range(len(tensors)): 145 | tensor = tensors[j] 146 | if j < len(gather_tensors): 147 | # add to existing tensor 148 | gather_tensors[j] = torch.cat((gather_tensors[j], tensor), 0) 149 | else: 150 | # start a new tensor 151 | gather_tensors.append(tensor) 152 | # record section 153 | section_length = tensors[0].size()[0] 154 | sections.append(section_length) 155 | return gather_tensors, sections 156 | 157 | 158 | def _scatter_torch_tensors(gather_tensors, sections): 159 | scatter_tensors = [] 160 | for j in range(len(gather_tensors)): 161 | scatter_tensor = torch.split(gather_tensors[j], sections) 162 | for i in range(len(scatter_tensor)): 163 | tensor = scatter_tensor[i] 164 | if i < len(scatter_tensors): 165 | # add to existing response 166 | scatter_tensors[i].append(tensor) 167 | else: 168 | # start a new response 169 | scatter_tensors.append([tensor]) 170 | return scatter_tensors 171 | 172 | 173 | class TritonPythonModel: 174 | """Your Python model must use the same class name. Every Python model 175 | that is created must have "TritonPythonModel" as the class name. 176 | """ 177 | 178 | def initialize(self, args): 179 | """`initialize` is called only once when the model is being loaded. 180 | Implementing `initialize` function is optional. This function allows 181 | the model to initialize any state associated with this model. 182 | Parameters 183 | ---------- 184 | args : dict 185 | Both keys and values are strings. The dictionary keys and values are: 186 | * model_config: A JSON string containing the model configuration 187 | * model_instance_kind: A string containing model instance kind 188 | * model_instance_device_id: A string containing model instance device ID 189 | * model_repository: Model repository path 190 | * model_version: Model version 191 | * model_name: Model name 192 | """ 193 | self._model_name = args["model_name"] 194 | for_model = "for '" + self._model_name + "'" 195 | self._logger = pb_utils.Logger 196 | self._logger.log_info("Initializing model instance " + for_model) 197 | 198 | self._model_config = json.loads(args["model_config"]) 199 | self._kind = args["model_instance_kind"] 200 | self._device_id = args["model_instance_device_id"] 201 | self._support_batching = self._model_config["max_batch_size"] > 0 202 | self._inputs = _parse_io_config(self._model_config["input"]) 203 | self._outputs = _parse_io_config(self._model_config["output"]) 204 | 205 | setting_msg = _set_torch_parallelism(self._model_config) 206 | self._logger.log_verbose( 207 | "Torch parallelism settings " + for_model + ": " + setting_msg 208 | ) 209 | 210 | self._infer_mode = torch.inference_mode(mode=True) 211 | self._infer_mode.__enter__() 212 | 213 | params = _get_torch_compile_params(self._model_config) 214 | self._logger.log_verbose( 215 | "'torch.compile' optional parameter(s) " + for_model + ": " + str(params) 216 | ) 217 | if self._support_batching: 218 | self._gather = torch.compile(_gather_torch_tensors, **params) 219 | self._scatter = torch.compile(_scatter_torch_tensors, **params) 220 | 221 | model_path = _get_model_path(self._model_config) 222 | if not _is_py_class_model(model_path): 223 | self._logger.log_info("Loading '" + self._model_name + "' as TorchScript") 224 | self._model = torch.jit.load(model_path) 225 | self._device = _get_device(self._kind, self._device_id, self._model) 226 | self._model.to(self._device) 227 | self._model.eval() 228 | return 229 | 230 | self._model_module = _import_module_from_path(self._model_name, model_path) 231 | 
232 |         self._raw_model = self._model_class()
233 |         self._device = _get_device(self._kind, self._device_id, self._raw_model)
234 |         data_path = _get_model_data_path(model_path)
235 |         if data_path != "":
236 |             self._raw_model.load_state_dict(
237 |                 torch.load(data_path, map_location=self._device)
238 |             )
239 |         else:
240 |             self._logger.log_info("Model parameter file not found " + for_model)
241 |         self._raw_model.to(self._device)
242 |         self._raw_model.eval()
243 |         self._model = torch.compile(self._raw_model, **params)
244 | 
245 |     def execute(self, requests):
246 |         """`execute` MUST be implemented in every Python model. The `execute`
247 |         function receives a list of pb_utils.InferenceRequest as the only
248 |         argument. This function is called when an inference request is made
249 |         for this model. Depending on the batching configuration (e.g. Dynamic
250 |         Batching) used, `requests` may contain multiple requests. Every
251 |         Python model must create one pb_utils.InferenceResponse for every
252 |         pb_utils.InferenceRequest in `requests`. If there is an error, you can
253 |         set the error argument when creating a pb_utils.InferenceResponse.
254 |         Parameters
255 |         ----------
256 |         requests : list
257 |           A list of pb_utils.InferenceRequest
258 |         Returns
259 |         -------
260 |         list
261 |           A list of pb_utils.InferenceResponse. The length of this list must
262 |           be the same as `requests`
263 |         """
264 | 
265 |         responses = []
266 | 
267 |         requests_tensors = []
268 |         for request in requests:
269 |             tensors = []
270 |             for io in self._inputs:
271 |                 tensor = pb_utils.get_input_tensor_by_name(
272 |                     request, io["name"]
273 |                 ).to_dlpack()
274 |                 tensor = torch.from_dlpack(tensor).to(self._device)
275 |                 tensors.append(tensor)
276 |             requests_tensors.append(tensors)
277 | 
278 |         sections = None
279 |         if self._support_batching:
280 |             requests_tensors, sections = self._gather(requests_tensors)
281 |             requests_tensors = [requests_tensors]
282 | 
283 |         responses_tensors = []
284 |         for input_tensors in requests_tensors:
285 |             output_tensors = self._model(*input_tensors)
286 |             if not isinstance(output_tensors, tuple) and not isinstance(
287 |                 output_tensors, list
288 |             ):
289 |                 output_tensors = [output_tensors]
290 |             responses_tensors.append(output_tensors)
291 | 
292 |         if self._support_batching:
293 |             responses_tensors = self._scatter(responses_tensors[0], sections)
294 | 
295 |         for response_tensors in responses_tensors:
296 |             output_tensors = []
297 |             for i in range(len(self._outputs)):
298 |                 io = self._outputs[i]
299 |                 tensor = response_tensors[i].detach()
300 |                 tensor = pb_utils.Tensor.from_dlpack(io["name"], tensor)
301 |                 output_tensors.append(tensor)
302 |             inference_response = pb_utils.InferenceResponse(
303 |                 output_tensors=output_tensors
304 |             )
305 |             responses.append(inference_response)
306 | 
307 |         return responses
308 | 
309 |     def finalize(self):
310 |         """`finalize` is called only once when the model is being unloaded.
311 |         Implementing the `finalize` function is OPTIONAL. This function allows
312 |         the model to perform any necessary cleanups before exit.
313 |         """
314 |         self._logger.log_info("Removing model instance for '" + self._model_name + "'")
315 |         self._infer_mode.__exit__(exc_type=None, exc_value=None, traceback=None)
316 | 
--------------------------------------------------------------------------------
/tools/gen_pb_exec_env.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | #
4 | # Redistribution and use in source and binary forms, with or without
5 | # modification, are permitted provided that the following conditions
6 | # are met:
7 | #  * Redistributions of source code must retain the above copyright
8 | #    notice, this list of conditions and the following disclaimer.
9 | #  * Redistributions in binary form must reproduce the above copyright
10 | #    notice, this list of conditions and the following disclaimer in the
11 | #    documentation and/or other materials provided with the distribution.
12 | #  * Neither the name of NVIDIA CORPORATION nor the names of its
13 | #    contributors may be used to endorse or promote products derived
14 | #    from this software without specific prior written permission.
15 | #
16 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
17 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
20 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
21 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
22 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
23 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
24 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 | 
28 | # install conda
29 | rm -rf ./miniconda
30 | wget https://repo.anaconda.com/miniconda/Miniconda3-py312_24.9.2-0-Linux-x86_64.sh
31 | bash Miniconda3-py312_24.9.2-0-Linux-x86_64.sh -p ./miniconda -b
32 | eval "$(./miniconda/bin/conda shell.bash hook)"
33 | 
34 | # create conda environment
35 | conda create -n pt python=3.12 -y
36 | conda activate pt
37 | conda install -c conda-forge conda-pack -y
38 | 
39 | # pre install step
40 | export PYTHONNOUSERSITE=True
41 | conda install -c conda-forge libstdcxx-ng=14 -y
42 | 
43 | # install PyTorch
44 | conda install pytorch torchvision torchaudio pytorch-cuda=12.1 -c pytorch -c nvidia -y
45 | 
46 | # pack environment
47 | rm -f pb_exec_env_model.py.tar.gz
48 | conda pack -o pb_exec_env_model.py.tar.gz
49 | 
50 | # deactivate conda
51 | conda deactivate
52 | 
--------------------------------------------------------------------------------
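
The Python-class loading path in src/model.py above (initialize → _import_module_from_path → _get_model_class_from_module) accepts a plain torch.nn.Module defined in a model.py file. The sketch below is a hypothetical such file; the class name SimpleLinear and the 4-unit layer size are illustrative only and not part of the backend. The only constraints the code above imposes are that the module contains at least one torch.nn.Module subclass (the first one found in dir() order is used) and that the class is constructible with no arguments.

# Hypothetical Python-class model file ("model.py" placed in the directory
# returned by pb_utils.get_model_dir()). _get_model_class_from_module() scans
# dir(module) for a torch.nn.Module subclass, and initialize() instantiates it
# with no constructor arguments, i.e. SimpleLinear().
import torch


class SimpleLinear(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = torch.nn.Linear(4, 4)  # illustrative shape only

    def forward(self, x):
        # One positional argument per entry in the config's "input" section;
        # return a single tensor, or a tuple/list with one tensor per "output" entry.
        return self.linear(x)

If a model.pt state-dict file sits next to this model.py, _get_model_data_path() finds it and initialize() loads it with load_state_dict(); otherwise the model runs with freshly initialized parameters and a "Model parameter file not found" message is logged.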
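
For the non-Python branch of initialize() (any model file that does not end in .py is handed to torch.jit.load()), a TorchScript artifact has to be produced offline. A minimal sketch, reusing the hypothetical SimpleLinear class above:

# Export a TorchScript version of the hypothetical module above and save it
# under one of the filenames the backend looks for by default ("model.pt").
import torch

scripted = torch.jit.script(SimpleLinear())
torch.jit.save(scripted, "model.pt")

Note that _get_model_path() tries model.py before model.pt, so a TorchScript file is only picked up when no model.py is present or when default_model_filename points at it explicitly.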
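
Several helpers above read optional entries from the parsed model configuration: _set_torch_parallelism() looks for NUM_THREADS and NUM_INTEROP_THREADS (both default to "1"), and _get_torch_compile_params() decodes TORCH_COMPILE_OPTIONAL_PARAMETERS as a JSON object of torch.compile keyword arguments. The sketch below shows what such a configuration looks like after json.loads(args["model_config"]) has run; the concrete values are illustrative, and the required input/output/default_model_filename entries are omitted.

# Illustrative slice of the parsed model configuration consumed in initialize();
# only the optional tuning keys discussed above are shown.
model_config = {
    "max_batch_size": 8,  # > 0 enables the gather/scatter batching path
    "parameters": {
        "NUM_THREADS": {"string_value": "4"},
        "NUM_INTEROP_THREADS": {"string_value": "2"},
        "TORCH_COMPILE_OPTIONAL_PARAMETERS": {
            "string_value": '{"mode": "reduce-overhead"}'
        },
    },
}
# With this config, _set_torch_parallelism() calls torch.set_num_threads(4) and
# torch.set_num_interop_threads(2), and _get_torch_compile_params() returns
# {"mode": "reduce-overhead"}, which is forwarded to torch.compile().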
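
The batched execute() path relies on _gather_torch_tensors() and _scatter_torch_tensors() to fold several requests into a single forward pass and unfold the result afterwards. The round trip below is a small sketch of that behaviour; it assumes the two helper functions from src/model.py have been copied into a scratch script, since importing model.py outside Triton fails on the triton_python_backend_utils dependency.

import torch

# Two requests with one input tensor each; batch sizes 2 and 3.
request_a = [torch.zeros(2, 4)]
request_b = [torch.ones(3, 4)]

# Gather: per-input concatenation along dim 0, plus the per-request batch sizes.
gathered, sections = _gather_torch_tensors([request_a, request_b])
assert gathered[0].shape == (5, 4)
assert sections == [2, 3]

# (A single self._model(*gathered) call runs at this point in execute().)

# Scatter: split the combined output back into one tensor list per request.
scattered = _scatter_torch_tensors(gathered, sections)
assert scattered[0][0].shape == (2, 4)
assert scattered[1][0].shape == (3, 4)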