├── README.md ├── .gitignore ├── .clang-format ├── .github └── workflows │ └── pre-commit.yml ├── qa ├── L0_server_unit_test │ ├── models │ │ ├── failing_infer │ │ │ ├── 1 │ │ │ │ └── model.py │ │ │ └── config.pbtxt │ │ ├── add_sub │ │ │ ├── 1 │ │ │ │ └── model.py │ │ │ └── config.pbtxt │ │ └── add_sub_str │ │ │ ├── 1 │ │ │ └── model.py │ │ │ └── config.pbtxt │ └── test.sh ├── L0_java_simple_cpp_example │ └── test.sh └── L0_server_example │ └── test.sh ├── server ├── cmake │ └── TritonDeveloperToolsServerConfig.cmake.in ├── test │ ├── CMakeLists.txt │ └── wrapper_test.cc ├── install_dependencies_and_build.sh ├── examples │ ├── CMakeLists.txt │ ├── square_async_infer.cc │ ├── addsub_string_async_infer.cc │ └── simple_addsub_async_infer.cc ├── src │ ├── infer_requested_output.h │ ├── tracer.h │ └── tracer.cc ├── CMakeLists.txt ├── include │ └── triton │ │ └── developer_tools │ │ ├── generic_server_wrapper.h │ │ └── server_wrapper.h └── README.md ├── pyproject.toml └── .pre-commit-config.yaml /README.md: -------------------------------------------------------------------------------- 1 | # triton_developer_tools -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /build 2 | /.vscode 3 | *.so 4 | build 5 | -------------------------------------------------------------------------------- /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | BasedOnStyle: Google 3 | 4 | IndentWidth: 2 5 | ColumnLimit: 80 6 | ContinuationIndentWidth: 4 7 | UseTab: Never 8 | MaxEmptyLinesToKeep: 2 9 | 10 | SortIncludes: true 11 | CompactNamespaces: true 12 | ReflowComments: true 13 | 14 | DerivePointerAlignment: false 15 | PointerAlignment: Left 16 | 17 | AllowShortIfStatementsOnASingleLine: false 18 | AllowShortBlocksOnASingleLine: false 19 | AllowShortFunctionsOnASingleLine: Inline 20 | 21 | AlwaysBreakAfterReturnType: TopLevelDefinitions 22 | AlignAfterOpenBracket: AlwaysBreak 23 | BreakBeforeBraces: Custom 24 | BraceWrapping: 25 | AfterClass: false 26 | AfterControlStatement: false 27 | AfterEnum: false 28 | AfterFunction: true 29 | AfterNamespace: false 30 | AfterStruct: false 31 | AfterUnion: false 32 | BeforeCatch: true 33 | 34 | BinPackArguments: true 35 | BinPackParameters: true 36 | ConstructorInitializerAllOnOneLineOrOnePerLine: false 37 | 38 | IndentCaseLabels: true 39 | -------------------------------------------------------------------------------- /.github/workflows/pre-commit.yml: -------------------------------------------------------------------------------- 1 | # Copyright 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 
14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | name: pre-commit 28 | 29 | on: 30 | pull_request: 31 | 32 | jobs: 33 | pre-commit: 34 | runs-on: ubuntu-latest 35 | steps: 36 | - uses: actions/checkout@v5.0.0 37 | - uses: actions/setup-python@v6.0.0 38 | - uses: pre-commit/action@v3.0.1 39 | -------------------------------------------------------------------------------- /qa/L0_server_unit_test/models/failing_infer/config.pbtxt: -------------------------------------------------------------------------------- 1 | # Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | name: "failing_infer" 28 | backend: "python" 29 | input [ 30 | { 31 | name: "INPUT" 32 | data_type: TYPE_INT32 33 | dims: [ 16 ] 34 | } 35 | ] 36 | output [ 37 | { 38 | name: "OUTPUT" 39 | data_type: TYPE_INT32 40 | dims: [ 16 ] 41 | } 42 | ] 43 | instance_group [{ kind: KIND_CPU }] 44 | -------------------------------------------------------------------------------- /server/cmake/TritonDeveloperToolsServerConfig.cmake.in: -------------------------------------------------------------------------------- 1 | # Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | include(CMakeFindDependencyMacro) 28 | 29 | get_filename_component( 30 | TRITONDEVELOPERTOOLSSERVER_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH 31 | ) 32 | 33 | list(APPEND CMAKE_MODULE_PATH ${TRITONDEVELOPERTOOLSSERVER_CMAKE_DIR}) 34 | 35 | if(NOT TARGET TritonDeveloperToolsServer::triton-developer_tools-server) 36 | include("${TRITONDEVELOPERTOOLSSERVER_CMAKE_DIR}/TritonDeveloperToolsServerTargets.cmake") 37 | endif() 38 | -------------------------------------------------------------------------------- /qa/L0_server_unit_test/models/add_sub/config.pbtxt: -------------------------------------------------------------------------------- 1 | # Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | backend: "python" 28 | 29 | input [ 30 | { 31 | name: "INPUT0" 32 | data_type: TYPE_INT32 33 | dims: [ 16 ] 34 | } 35 | ] 36 | input [ 37 | { 38 | name: "INPUT1" 39 | data_type: TYPE_INT32 40 | dims: [ 16 ] 41 | } 42 | ] 43 | output [ 44 | { 45 | name: "OUTPUT0" 46 | data_type: TYPE_INT32 47 | dims: [ 16 ] 48 | } 49 | ] 50 | output [ 51 | { 52 | name: "OUTPUT1" 53 | data_type: TYPE_INT32 54 | dims: [ 16 ] 55 | } 56 | ] 57 | 58 | instance_group [{ kind: KIND_CPU }] 59 | -------------------------------------------------------------------------------- /qa/L0_server_unit_test/models/add_sub_str/config.pbtxt: -------------------------------------------------------------------------------- 1 | # Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | 27 | backend: "python" 28 | 29 | input [ 30 | { 31 | name: "INPUT0" 32 | data_type: TYPE_STRING 33 | dims: [ -1 ] 34 | } 35 | ] 36 | input [ 37 | { 38 | name: "INPUT1" 39 | data_type: TYPE_STRING 40 | dims: [ -1 ] 41 | } 42 | ] 43 | output [ 44 | { 45 | name: "OUTPUT0" 46 | data_type: TYPE_STRING 47 | dims: [ -1 ] 48 | } 49 | ] 50 | output [ 51 | { 52 | name: "OUTPUT1" 53 | data_type: TYPE_STRING 54 | dims: [ -1 ] 55 | } 56 | ] 57 | 58 | instance_group [{ kind: KIND_CPU }] 59 | -------------------------------------------------------------------------------- /qa/L0_server_unit_test/models/failing_infer/1/model.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 4 | # 5 | # Redistribution and use in source and binary forms, with or without 6 | # modification, are permitted provided that the following conditions 7 | # are met: 8 | # * Redistributions of source code must retain the above copyright 9 | # notice, this list of conditions and the following disclaimer. 10 | # * Redistributions in binary form must reproduce the above copyright 11 | # notice, this list of conditions and the following disclaimer in the 12 | # documentation and/or other materials provided with the distribution. 13 | # * Neither the name of NVIDIA CORPORATION nor the names of its 14 | # contributors may be used to endorse or promote products derived 15 | # from this software without specific prior written permission. 16 | # 17 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 18 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 20 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 21 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 22 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 23 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 24 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 25 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | 29 | import triton_python_backend_utils as pb_utils 30 | 31 | 32 | class TritonPythonModel: 33 | """Test model that always returns error for all requests.""" 34 | 35 | def execute(self, requests): 36 | responses = [] 37 | 38 | for _ in requests: 39 | responses.append( 40 | pb_utils.InferenceResponse( 41 | output_tensors=[], error=pb_utils.TritonError("An Error Occurred") 42 | ) 43 | ) 44 | 45 | # You must return a list of pb_utils.InferenceResponse. Length 46 | # of this list must match the length of `requests` list. 47 | return responses 48 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | # Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 
8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | [tool.codespell] 28 | # note: pre-commit passes explicit lists of files here, which this skip file list doesn't override - 29 | # this is only to allow you to run codespell interactively 30 | skip = "./.git,./.github" 31 | # ignore short words, and typename parameters like OffsetT 32 | ignore-regex = "\\b(.{1,4}|[A-Z]\\w*T)\\b" 33 | # use the 'clear' dictionary for unambiguous spelling mistakes 34 | builtin = "clear" 35 | # disable warnings about binary files and wrong encoding 36 | quiet-level = 3 37 | 38 | [tool.isort] 39 | profile = "black" 40 | use_parentheses = true 41 | multi_line_output = 3 42 | include_trailing_comma = true 43 | force_grid_wrap = 0 44 | ensure_newline_before_comments = true 45 | line_length = 88 46 | balanced_wrapping = true 47 | indent = " " 48 | skip = ["build"] 49 | 50 | -------------------------------------------------------------------------------- /server/test/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | cmake_minimum_required(VERSION 3.31.8) 28 | 29 | # 30 | # Unit tests 31 | # 32 | 33 | # 34 | # Unit test for Triton Developer Tools Server 35 | # 36 | add_executable( 37 | wrapper_test 38 | wrapper_test.cc 39 | ) 40 | 41 | set_target_properties( 42 | wrapper_test 43 | PROPERTIES 44 | SKIP_BUILD_RPATH TRUE 45 | BUILD_WITH_INSTALL_RPATH TRUE 46 | INSTALL_RPATH_USE_LINK_PATH FALSE 47 | INSTALL_RPATH "" 48 | ) 49 | 50 | target_include_directories( 51 | wrapper_test 52 | PRIVATE 53 | ${CMAKE_CURRENT_SOURCE_DIR}/../include 54 | ${GTEST_INCLUDE_DIRS} 55 | ) 56 | 57 | target_link_libraries( 58 | wrapper_test 59 | PRIVATE 60 | triton-developer_tools-server 61 | triton-core-serverstub 62 | GTest::gtest_main 63 | ) 64 | 65 | install( 66 | TARGETS wrapper_test 67 | RUNTIME DESTINATION bin 68 | ) 69 | -------------------------------------------------------------------------------- /server/install_dependencies_and_build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions 6 | # are met: 7 | # * Redistributions of source code must retain the above copyright 8 | # notice, this list of conditions and the following disclaimer. 9 | # * Redistributions in binary form must reproduce the above copyright 10 | # notice, this list of conditions and the following disclaimer in the 11 | # documentation and/or other materials provided with the distribution. 12 | # * Neither the name of NVIDIA CORPORATION nor the names of its 13 | # contributors may be used to endorse or promote products derived 14 | # from this software without specific prior written permission. 15 | # 16 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 17 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 19 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 20 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 21 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 22 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 23 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 24 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
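# Assumed environment (not checked by this script): an Ubuntu-based Triton container with apt and root access, and TRITON_CORE_REPO_TAG / TRITON_COMMON_REPO_TAG exported by the caller; both are consumed by the cmake invocation below. # Illustrative invocation (tag values are placeholders): # TRITON_CORE_REPO_TAG=main TRITON_COMMON_REPO_TAG=main bash install_dependencies_and_build.sh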
27 | 28 | # Install dependencies 29 | apt update -q=2 \ 30 | && apt install -y \ 31 | gpg \ 32 | wget \ 33 | rapidjson-dev \ 34 | software-properties-common \ 35 | && wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null \ 36 | && . /etc/os-release \ 37 | && echo "deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ $UBUNTU_CODENAME main" | tee /etc/apt/sources.list.d/kitware.list >/dev/null \ 38 | && apt-get update -q=2 \ 39 | && apt-get install -y --no-install-recommends cmake=4.0.3* cmake-data=4.0.3* \ 40 | && cmake --version 41 | 42 | # Install developer tools 43 | mkdir -p /opt/tritonserver/developer_tools/server/build && cd /opt/tritonserver/developer_tools/server/build 44 | cmake -DCMAKE_INSTALL_PREFIX:PATH=/opt/tritonserver/developer_tools/server/build/install \ 45 | -DTRITON_CORE_REPO_TAG=${TRITON_CORE_REPO_TAG} -DTRITON_COMMON_REPO_TAG=${TRITON_COMMON_REPO_TAG} .. 46 | make install 47 | cp /opt/tritonserver/developer_tools/server/build/install/lib/libtritondevelopertoolsserver.a /opt/tritonserver/lib/ 48 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | 27 | repos: 28 | - repo: https://github.com/PyCQA/isort 29 | rev: 5.12.0 30 | hooks: 31 | - id: isort 32 | additional_dependencies: [toml] 33 | - repo: https://github.com/psf/black 34 | rev: 23.1.0 35 | hooks: 36 | - id: black 37 | types_or: [python, cython] 38 | - repo: https://github.com/PyCQA/flake8 39 | rev: 7.3.0 40 | hooks: 41 | - id: flake8 42 | args: [--max-line-length=88, --select=C,E,F,W,B,B950, --extend-ignore = E203,E501] 43 | types_or: [python, cython] 44 | - repo: https://github.com/pre-commit/mirrors-clang-format 45 | rev: v16.0.5 46 | hooks: 47 | - id: clang-format 48 | types_or: [c, c++, cuda, proto, textproto, java] 49 | args: ["-fallback-style=none", "-style=file", "-i"] 50 | - repo: https://github.com/codespell-project/codespell 51 | rev: v2.2.4 52 | hooks: 53 | - id: codespell 54 | additional_dependencies: [tomli] 55 | args: ["--toml", "pyproject.toml"] 56 | exclude: (?x)^(.*stemmer.*|.*stop_words.*|^CHANGELOG.md$) 57 | # More details about these pre-commit hooks here: 58 | # https://pre-commit.com/hooks.html 59 | - repo: https://github.com/pre-commit/pre-commit-hooks 60 | rev: v6.0.0 61 | hooks: 62 | - id: check-case-conflict 63 | - id: check-executables-have-shebangs 64 | - id: check-merge-conflict 65 | - id: check-json 66 | - id: check-toml 67 | - id: check-yaml 68 | - id: check-shebang-scripts-are-executable 69 | - id: end-of-file-fixer 70 | types_or: [c, c++, cuda, proto, textproto, java, python] 71 | - id: mixed-line-ending 72 | - id: requirements-txt-fixer 73 | - id: trailing-whitespace 74 | -------------------------------------------------------------------------------- /qa/L0_server_unit_test/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions 6 | # are met: 7 | # * Redistributions of source code must retain the above copyright 8 | # notice, this list of conditions and the following disclaimer. 9 | # * Redistributions in binary form must reproduce the above copyright 10 | # notice, this list of conditions and the following disclaimer in the 11 | # documentation and/or other materials provided with the distribution. 12 | # * Neither the name of NVIDIA CORPORATION nor the names of its 13 | # contributors may be used to endorse or promote products derived 14 | # from this software without specific prior written permission. 15 | # 16 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 17 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 19 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 20 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 21 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 22 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 23 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 24 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
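# Assumed environment for this test (taken from the commands below): NVIDIA_TRITON_SERVER_VERSION or an explicit repository-version argument, PYTHON_BACKEND_REPO_TAG for cloning the decoupled example model, and a Triton installation under /opt/tritonserver. # Illustrative invocation (values are placeholders): # PYTHON_BACKEND_REPO_TAG=main bash test.sh <repo-version>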
27 | 28 | REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} 29 | if [ "$#" -ge 1 ]; then 30 | REPO_VERSION=$1 31 | fi 32 | if [ -z "$REPO_VERSION" ]; then 33 | echo -e "Repository version must be specified" 34 | echo -e "\n***\n*** Test Failed\n***" 35 | exit 1 36 | fi 37 | if [ ! -z "$TEST_REPO_ARCH" ]; then 38 | REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} 39 | fi 40 | bash -x ../../server/install_dependencies_and_build.sh 41 | 42 | export CUDA_VISIBLE_DEVICES=0 43 | 44 | TEST_LOG=test.log 45 | 46 | # Copy over the decoupled model placed in the python_backend repository. 47 | git clone --single-branch --depth=1 -b ${PYTHON_BACKEND_REPO_TAG} https://github.com/triton-inference-server/python_backend.git 48 | mkdir -p ./models/square_int32/1 49 | cp python_backend/examples/decoupled/square_model.py ./models/square_int32/1/model.py 50 | cp python_backend/examples/decoupled/square_config.pbtxt ./models/square_int32/config.pbtxt 51 | # Copy the model repository for 'ModelRepoRegister' test case. 52 | cp -fr ./models ./models1 53 | 54 | RET=0 55 | 56 | cp /opt/tritonserver/developer_tools/server/build/install/bin/wrapper_test ./ 57 | 58 | set +e 59 | # Must explicitly set LD_LIBRARY_PATH so that the test can find 60 | # libtritonserver.so. 61 | LD_LIBRARY_PATH=/opt/tritonserver/lib:${LD_LIBRARY_PATH} ./wrapper_test >> ${TEST_LOG} 2>&1 62 | if [ $? -ne 0 ]; then 63 | cat ${TEST_LOG} 64 | RET=1 65 | fi 66 | set -e 67 | 68 | if [ $RET -eq 0 ]; then 69 | echo -e "\n***\n*** Test Passed\n***" 70 | else 71 | echo -e "\n***\n*** Test FAILED\n***" 72 | fi 73 | 74 | exit $RET 75 | -------------------------------------------------------------------------------- /qa/L0_server_unit_test/models/add_sub/1/model.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 4 | # 5 | # Redistribution and use in source and binary forms, with or without 6 | # modification, are permitted provided that the following conditions 7 | # are met: 8 | # * Redistributions of source code must retain the above copyright 9 | # notice, this list of conditions and the following disclaimer. 10 | # * Redistributions in binary form must reproduce the above copyright 11 | # notice, this list of conditions and the following disclaimer in the 12 | # documentation and/or other materials provided with the distribution. 13 | # * Neither the name of NVIDIA CORPORATION nor the names of its 14 | # contributors may be used to endorse or promote products derived 15 | # from this software without specific prior written permission. 16 | # 17 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 18 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 20 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 21 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 22 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 23 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 24 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 25 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
28 | 29 | import json 30 | 31 | import numpy as np 32 | import triton_python_backend_utils as pb_utils 33 | 34 | 35 | class TritonPythonModel: 36 | def initialize(self, args): 37 | self.model_config = model_config = json.loads(args["model_config"]) 38 | 39 | output0_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT0") 40 | output1_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT1") 41 | 42 | self.output0_dtype = pb_utils.triton_string_to_numpy( 43 | output0_config["data_type"] 44 | ) 45 | self.output1_dtype = pb_utils.triton_string_to_numpy( 46 | output1_config["data_type"] 47 | ) 48 | 49 | def execute(self, requests): 50 | """This function is called on inference request.""" 51 | 52 | output0_dtype = self.output0_dtype 53 | output1_dtype = self.output1_dtype 54 | 55 | responses = [] 56 | for request in requests: 57 | in_0 = pb_utils.get_input_tensor_by_name(request, "INPUT0") 58 | in_1 = pb_utils.get_input_tensor_by_name(request, "INPUT1") 59 | if ( 60 | in_0.as_numpy().dtype.type is np.bytes_ 61 | or in_0.as_numpy().dtype == np.object_ 62 | ): 63 | out_0, out_1 = ( 64 | in_0.as_numpy().astype(np.int32) + in_1.as_numpy().astype(np.int32), 65 | in_0.as_numpy().astype(np.int32) - in_1.as_numpy().astype(np.int32), 66 | ) 67 | else: 68 | out_0, out_1 = ( 69 | in_0.as_numpy() + in_1.as_numpy(), 70 | in_0.as_numpy() - in_1.as_numpy(), 71 | ) 72 | 73 | out_tensor_0 = pb_utils.Tensor("OUTPUT0", out_0.astype(output0_dtype)) 74 | out_tensor_1 = pb_utils.Tensor("OUTPUT1", out_1.astype(output1_dtype)) 75 | responses.append(pb_utils.InferenceResponse([out_tensor_0, out_tensor_1])) 76 | return responses 77 | -------------------------------------------------------------------------------- /qa/L0_server_unit_test/models/add_sub_str/1/model.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 4 | # 5 | # Redistribution and use in source and binary forms, with or without 6 | # modification, are permitted provided that the following conditions 7 | # are met: 8 | # * Redistributions of source code must retain the above copyright 9 | # notice, this list of conditions and the following disclaimer. 10 | # * Redistributions in binary form must reproduce the above copyright 11 | # notice, this list of conditions and the following disclaimer in the 12 | # documentation and/or other materials provided with the distribution. 13 | # * Neither the name of NVIDIA CORPORATION nor the names of its 14 | # contributors may be used to endorse or promote products derived 15 | # from this software without specific prior written permission. 16 | # 17 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 18 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 20 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 21 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 22 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 23 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 24 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 25 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
28 | 29 | import json 30 | 31 | import numpy as np 32 | import triton_python_backend_utils as pb_utils 33 | 34 | 35 | class TritonPythonModel: 36 | def initialize(self, args): 37 | self.model_config = model_config = json.loads(args["model_config"]) 38 | 39 | output0_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT0") 40 | output1_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT1") 41 | 42 | self.output0_dtype = pb_utils.triton_string_to_numpy( 43 | output0_config["data_type"] 44 | ) 45 | self.output1_dtype = pb_utils.triton_string_to_numpy( 46 | output1_config["data_type"] 47 | ) 48 | 49 | def execute(self, requests): 50 | """This function is called on inference request.""" 51 | 52 | output0_dtype = self.output0_dtype 53 | output1_dtype = self.output1_dtype 54 | 55 | responses = [] 56 | for request in requests: 57 | in_0 = pb_utils.get_input_tensor_by_name(request, "INPUT0") 58 | in_1 = pb_utils.get_input_tensor_by_name(request, "INPUT1") 59 | if ( 60 | in_0.as_numpy().dtype.type is np.bytes_ 61 | or in_0.as_numpy().dtype == np.object_ 62 | ): 63 | out_0, out_1 = ( 64 | in_0.as_numpy().astype(np.int32) + in_1.as_numpy().astype(np.int32), 65 | in_0.as_numpy().astype(np.int32) - in_1.as_numpy().astype(np.int32), 66 | ) 67 | else: 68 | out_0, out_1 = ( 69 | in_0.as_numpy() + in_1.as_numpy(), 70 | in_0.as_numpy() - in_1.as_numpy(), 71 | ) 72 | 73 | out_tensor_0 = pb_utils.Tensor("OUTPUT0", out_0.astype(output0_dtype)) 74 | out_tensor_1 = pb_utils.Tensor("OUTPUT1", out_1.astype(output1_dtype)) 75 | responses.append(pb_utils.InferenceResponse([out_tensor_0, out_tensor_1])) 76 | return responses 77 | -------------------------------------------------------------------------------- /qa/L0_java_simple_cpp_example/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions 6 | # are met: 7 | # * Redistributions of source code must retain the above copyright 8 | # notice, this list of conditions and the following disclaimer. 9 | # * Redistributions in binary form must reproduce the above copyright 10 | # notice, this list of conditions and the following disclaimer in the 11 | # documentation and/or other materials provided with the distribution. 12 | # * Neither the name of NVIDIA CORPORATION nor the names of its 13 | # contributors may be used to endorse or promote products derived 14 | # from this software without specific prior written permission. 15 | # 16 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 17 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 19 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 20 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 21 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 22 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 23 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 24 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
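# Assumed environment for this test (taken from the commands below): NVIDIA_TRITON_SERVER_VERSION or an explicit repository-version argument, JAVACPP_BRANCH and JAVACPP_BRANCH_TAG for the javacpp-presets checkout, MAVEN_PATH pointing at an mvn executable, and JAR_INSTALL_PATH containing the generated tritonserver-java-bindings.jar; TRITON_SERVER_REPO_TAG and TRITON_CLIENT_REPO_TAG default to "main".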
27 | 28 | REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} 29 | if [ "$#" -ge 1 ]; then 30 | REPO_VERSION=$1 31 | fi 32 | if [ -z "$REPO_VERSION" ]; then 33 | echo -e "Repository version must be specified" 34 | echo -e "\n***\n*** Test Failed\n***" 35 | exit 1 36 | fi 37 | 38 | # set variables 39 | CLIENT_LOG="client.log" 40 | MODEL_REPO=$PWD/models 41 | SAMPLES_REPO=$PWD/javacpp-presets/tritonserver/samples/simplecpp 42 | TRITON_SERVER_REPO_TAG=${TRITON_SERVER_REPO_TAG:="main"} 43 | TRITON_CLIENT_REPO_TAG=${TRITON_CLIENT_REPO_TAG:="main"} 44 | TEST_HOME=$PWD 45 | 46 | # generate models 47 | rm -rf ${MODEL_REPO} 48 | git clone --single-branch --depth=1 -b ${TRITON_SERVER_REPO_TAG} https://github.com/triton-inference-server/server.git 49 | mkdir -p ${MODEL_REPO} 50 | cp -r server/docs/examples/model_repository/simple ${MODEL_REPO}/simple 51 | 52 | # use build script to generate .jar 53 | git clone --single-branch --depth=1 -b ${TRITON_CLIENT_REPO_TAG} https://github.com/triton-inference-server/client.git 54 | source client/src/java-api-bindings/scripts/install_dependencies_and_build.sh --enable-developer-tools-server 55 | 56 | cd ${TEST_HOME} 57 | # build javacpp-presets/tritonserver 58 | set +e 59 | rm -r javacpp-presets 60 | git clone --single-branch --depth=1 -b ${JAVACPP_BRANCH_TAG} ${JAVACPP_BRANCH} 61 | cd javacpp-presets 62 | ${MAVEN_PATH} clean install --projects .,tritonserver 63 | ${MAVEN_PATH} clean install -f platform --projects ../tritonserver/platform -Djavacpp.platform.host 64 | cd .. 65 | set -e 66 | 67 | rm -f *.log 68 | RET=0 69 | 70 | set +e 71 | # Build SimpleCPP example 72 | BASE_COMMAND="${MAVEN_PATH} clean compile -f ${SAMPLES_REPO} exec:java -Djavacpp.platform=linux-x86_64" 73 | ${BASE_COMMAND} -Dexec.args="-r ${MODEL_REPO}" >>${CLIENT_LOG} 2>&1 74 | if [ $? -ne 0 ]; then 75 | echo -e "Failed to run: ${BASE_COMMAND} -Dexec.args=\"-r ${MODEL_REPO}\"" 76 | RET=1 77 | fi 78 | 79 | # Run SimpleCPP with generated jar 80 | java -cp ${JAR_INSTALL_PATH}/tritonserver-java-bindings.jar ${SAMPLES_REPO}/SimpleCPP.java 81 | if [ $? -ne 0 ]; then 82 | echo -e "Failed to run: java -cp ${JAR_INSTALL_PATH}/tritonserver-java-bindings.jar ${SAMPLES_REPO}/SimpleCPP.java -r ${MODEL_REPO}" 83 | RET=1 84 | fi 85 | 86 | set -e 87 | 88 | if [ $RET -eq 0 ]; then 89 | echo -e "\n***\n*** Test Passed\n***" 90 | else 91 | echo -e "\n***\n*** Test FAILED\n***" 92 | fi 93 | 94 | exit $RET 95 | -------------------------------------------------------------------------------- /server/examples/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 
14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | cmake_minimum_required(VERSION 3.31.8) 28 | 29 | # 30 | # examples 31 | # 32 | 33 | # 34 | # simple_addsub_async_infer 35 | # 36 | add_executable( 37 | simple_addsub_async_infer 38 | simple_addsub_async_infer.cc 39 | ) 40 | 41 | set_target_properties( 42 | simple_addsub_async_infer 43 | PROPERTIES 44 | SKIP_BUILD_RPATH TRUE 45 | BUILD_WITH_INSTALL_RPATH TRUE 46 | INSTALL_RPATH_USE_LINK_PATH FALSE 47 | INSTALL_RPATH "" 48 | ) 49 | 50 | target_include_directories( 51 | simple_addsub_async_infer 52 | PRIVATE 53 | ${CMAKE_CURRENT_SOURCE_DIR}/../include 54 | ) 55 | 56 | target_link_libraries( 57 | simple_addsub_async_infer 58 | PRIVATE 59 | triton-developer_tools-server 60 | triton-core-serverstub 61 | ) 62 | 63 | install( 64 | TARGETS simple_addsub_async_infer 65 | RUNTIME DESTINATION bin 66 | ) 67 | 68 | # 69 | # addsub_string_async_infer 70 | # 71 | add_executable( 72 | addsub_string_async_infer 73 | addsub_string_async_infer.cc 74 | ) 75 | 76 | set_target_properties( 77 | addsub_string_async_infer 78 | PROPERTIES 79 | SKIP_BUILD_RPATH TRUE 80 | BUILD_WITH_INSTALL_RPATH TRUE 81 | INSTALL_RPATH_USE_LINK_PATH FALSE 82 | INSTALL_RPATH "" 83 | ) 84 | 85 | target_include_directories( 86 | addsub_string_async_infer 87 | PRIVATE 88 | ${CMAKE_CURRENT_SOURCE_DIR}/../include 89 | ) 90 | 91 | target_link_libraries( 92 | addsub_string_async_infer 93 | PRIVATE 94 | triton-developer_tools-server 95 | triton-core-serverstub 96 | ) 97 | 98 | install( 99 | TARGETS addsub_string_async_infer 100 | RUNTIME DESTINATION bin 101 | ) 102 | 103 | # 104 | # square_async_infer 105 | # 106 | add_executable( 107 | square_async_infer 108 | square_async_infer.cc 109 | ) 110 | 111 | set_target_properties( 112 | square_async_infer 113 | PROPERTIES 114 | SKIP_BUILD_RPATH TRUE 115 | BUILD_WITH_INSTALL_RPATH TRUE 116 | INSTALL_RPATH_USE_LINK_PATH FALSE 117 | INSTALL_RPATH "" 118 | ) 119 | 120 | target_include_directories( 121 | square_async_infer 122 | PRIVATE 123 | ${CMAKE_CURRENT_SOURCE_DIR}/../include 124 | ) 125 | 126 | target_link_libraries( 127 | square_async_infer 128 | PRIVATE 129 | triton-developer_tools-server 130 | triton-core-serverstub 131 | ) 132 | 133 | install( 134 | TARGETS square_async_infer 135 | RUNTIME DESTINATION bin 136 | ) -------------------------------------------------------------------------------- /qa/L0_server_example/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions 6 | # are met: 7 | # * Redistributions of source code must retain the above copyright 8 | # notice, this list of conditions and the following disclaimer. 9 | # * Redistributions in binary form must reproduce the above copyright 10 | # notice, this list of conditions and the following disclaimer in the 11 | # documentation and/or other materials provided with the distribution. 12 | # * Neither the name of NVIDIA CORPORATION nor the names of its 13 | # contributors may be used to endorse or promote products derived 14 | # from this software without specific prior written permission. 15 | # 16 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 17 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 19 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 20 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 21 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 22 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 23 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 24 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | 28 | REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} 29 | if [ "$#" -ge 1 ]; then 30 | REPO_VERSION=$1 31 | fi 32 | if [ -z "$REPO_VERSION" ]; then 33 | echo -e "Repository version must be specified" 34 | echo -e "\n***\n*** Test Failed\n***" 35 | exit 1 36 | fi 37 | if [ ! -z "$TEST_REPO_ARCH" ]; then 38 | REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} 39 | fi 40 | 41 | bash -x ../../server/install_dependencies_and_build.sh 42 | export CUDA_VISIBLE_DEVICES=0 43 | 44 | CLIENT_LOG=`pwd`/client.log 45 | SIMPLE_ADDSUB_ASYNC_INFER_CLIENT=/opt/tritonserver/developer_tools/server/build/install/bin/simple_addsub_async_infer 46 | ADDSUB_STRING_ASYNC_INFER_CLIENT=/opt/tritonserver/developer_tools/server/build/install/bin/addsub_string_async_infer 47 | SQUARE_ASYNC_INFER_CLIENT=/opt/tritonserver/developer_tools/server/build/install/bin/square_async_infer 48 | 49 | RET=0 50 | 51 | # Prepare required models for the examples 52 | mkdir models 53 | cp -r ../L0_server_unit_test/models/add_sub* ./models/. 54 | git clone --single-branch --depth=1 -b ${TRITON_SERVER_BRANCH_NAME} https://github.com/triton-inference-server/server.git 55 | cp -r server/docs/examples/model_repository/simple ./models/. 56 | # Copy over the decoupled model placed in the python_backend repository. 57 | git clone --single-branch --depth=1 -b ${PYTHON_BACKEND_REPO_TAG} https://github.com/triton-inference-server/python_backend.git 58 | mkdir -p ./models/square_int32/1 59 | cp python_backend/examples/decoupled/square_model.py ./models/square_int32/1/model.py 60 | cp python_backend/examples/decoupled/square_config.pbtxt ./models/square_int32/config.pbtxt 61 | 62 | # Must explicitly set LD_LIBRARY_PATH so that the test can find 63 | # libtritonserver.so. 
64 | LD_LIBRARY_PATH=/opt/tritonserver/lib:${LD_LIBRARY_PATH} 65 | 66 | set +e 67 | 68 | for i in \ 69 | $SIMPLE_ADDSUB_ASYNC_INFER_CLIENT \ 70 | $ADDSUB_STRING_ASYNC_INFER_CLIENT \ 71 | $SQUARE_ASYNC_INFER_CLIENT \ 72 | ; do 73 | BASE=$(basename -- $i) 74 | SUFFIX="${BASE%.*}" 75 | 76 | if [ $i == $SIMPLE_ADDSUB_ASYNC_INFER_CLIENT ]; then 77 | # Enforce I/O to be in specific memory type 78 | for MEM_TYPE in system pinned gpu ; do 79 | $i -v -m $MEM_TYPE >> $CLIENT_LOG.${SUFFIX}.$MEM_TYPE 2>&1 80 | if [ $? -ne 0 ]; then 81 | cat $CLIENT_LOG.${SUFFIX}.$MEM_TYPE 82 | RET=1 83 | fi 84 | done 85 | else 86 | $i -v >> ${CLIENT_LOG}.${SUFFIX} 2>&1 87 | if [ $? -ne 0 ]; then 88 | cat ${CLIENT_LOG}.${SUFFIX} 89 | RET=1 90 | fi 91 | fi 92 | done 93 | 94 | set -e 95 | 96 | if [ $RET -eq 0 ]; then 97 | echo -e "\n***\n*** Test Passed\n***" 98 | else 99 | echo -e "\n***\n*** Test FAILED\n***" 100 | fi 101 | 102 | exit $RET 103 | -------------------------------------------------------------------------------- /server/src/infer_requested_output.h: -------------------------------------------------------------------------------- 1 | // Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | // 3 | // Redistribution and use in source and binary forms, with or without 4 | // modification, are permitted provided that the following conditions 5 | // are met: 6 | // * Redistributions of source code must retain the above copyright 7 | // notice, this list of conditions and the following disclaimer. 8 | // * Redistributions in binary form must reproduce the above copyright 9 | // notice, this list of conditions and the following disclaimer in the 10 | // documentation and/or other materials provided with the distribution. 11 | // * Neither the name of NVIDIA CORPORATION nor the names of its 12 | // contributors may be used to endorse or promote products derived 13 | // from this software without specific prior written permission. 14 | // 15 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | #pragma once 27 | 28 | #include 29 | 30 | #include "triton/core/tritonserver.h" 31 | #include "triton/developer_tools/common.h" 32 | 33 | namespace triton { namespace developer_tools { namespace server { 34 | 35 | //============================================================================== 36 | /// An InferRequestedOutput object is used to describe the requested model 37 | /// output for inference. 38 | /// 39 | class InferRequestedOutput { 40 | public: 41 | /// Create an InferRequestedOutput instance that describes a model output 42 | /// being requested. 43 | /// \param name The name of output being requested. 44 | /// \return Returns a new InferRequestedOutput object. 
45 | static std::unique_ptr<InferRequestedOutput> Create(const std::string& name) 46 | { 47 | return std::unique_ptr<InferRequestedOutput>( 48 | new InferRequestedOutput(name)); 49 | } 50 | 51 | /// Create an InferRequestedOutput instance that describes a model output being 52 | /// requested with pre-allocated output buffer. 53 | /// \param name The name of output being requested. 54 | /// \param buffer The pointer to the start of the pre-allocated buffer. 55 | /// \param byte_size The size of buffer in bytes. 56 | /// \param memory_type The memory type of the output. 57 | /// \param memory_type_id The memory type id of the output. 58 | /// \return Returns a new InferRequestedOutput object. 59 | static std::unique_ptr<InferRequestedOutput> Create( 60 | const std::string& name, const char* buffer, size_t byte_size, 61 | MemoryType memory_type, int64_t memory_type_id) 62 | { 63 | return std::unique_ptr<InferRequestedOutput>(new InferRequestedOutput( 64 | name, buffer, byte_size, memory_type, memory_type_id)); 65 | } 66 | 67 | /// Get name of the associated output tensor. 68 | /// \return The name of the tensor. 69 | const std::string& Name() const { return name_; } 70 | 71 | /// Get buffer of the associated output tensor. 72 | /// \return The buffer of the tensor. 73 | const char* Buffer() { return buffer_; } 74 | 75 | /// Get byte size of the associated output tensor. 76 | /// \return The byte size of the tensor. 77 | size_t ByteSize() { return byte_size_; } 78 | 79 | /// Get the memory type of the output tensor. 80 | /// \return The memory type of the tensor. 81 | const MemoryType& GetMemoryType() const { return memory_type_; } 82 | 83 | /// Get the memory type id of the output tensor. 84 | /// \return The memory type id of the tensor. 85 | const int64_t& MemoryTypeId() const { return memory_type_id_; } 86 | 87 | InferRequestedOutput(const std::string& name) 88 | : name_(name), buffer_(nullptr), byte_size_(0), 89 | memory_type_(MemoryType::CPU), memory_type_id_(0) 90 | { 91 | } 92 | 93 | InferRequestedOutput( 94 | const std::string& name, const char* buffer, size_t byte_size, 95 | MemoryType memory_type, int64_t memory_type_id) 96 | : name_(name), buffer_(buffer), byte_size_(byte_size), 97 | memory_type_(memory_type), memory_type_id_(memory_type_id) 98 | { 99 | } 100 | 101 | private: 102 | std::string name_; 103 | const char* buffer_; 104 | size_t byte_size_; 105 | MemoryType memory_type_; 106 | int64_t memory_type_id_; 107 | }; 108 | 109 | }}} // namespace triton::developer_tools::server 110 | -------------------------------------------------------------------------------- /server/src/tracer.h: -------------------------------------------------------------------------------- 1 | // Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | // 3 | // Redistribution and use in source and binary forms, with or without 4 | // modification, are permitted provided that the following conditions 5 | // are met: 6 | // * Redistributions of source code must retain the above copyright 7 | // notice, this list of conditions and the following disclaimer. 8 | // * Redistributions in binary form must reproduce the above copyright 9 | // notice, this list of conditions and the following disclaimer in the 10 | // documentation and/or other materials provided with the distribution. 11 | // * Neither the name of NVIDIA CORPORATION nor the names of its 12 | // contributors may be used to endorse or promote products derived 13 | // from this software without specific prior written permission.
14 | // 15 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | #pragma once 27 | 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include 37 | 38 | #include "triton/core/tritonserver.h" 39 | #include "triton/developer_tools/common.h" 40 | 41 | namespace triton { namespace developer_tools { namespace server { 42 | 43 | class TraceManager { 44 | public: 45 | class TraceSetting; 46 | 47 | class TraceFile { 48 | public: 49 | TraceFile(const std::string& file_name) 50 | : file_name_(file_name), index_(0), first_write_(true) 51 | { 52 | } 53 | ~TraceFile(); 54 | 55 | // Save the traces stored in 'trace_stream' into the file. 'to_index_file' 56 | // specifies whether the file name should be indexed, if true, the traces 57 | // will be written to 'file_name.index' where index will be incremented 58 | // every time the traces are written to a file with index. If false, the 59 | // trace will be written to 'file_name'. 60 | void SaveTraces(std::stringstream& trace_stream, const bool to_index_file); 61 | 62 | const std::string& FileName() { return file_name_; } 63 | 64 | private: 65 | const std::string file_name_; 66 | // The file index for the next index file write. 67 | std::atomic index_; 68 | 69 | // Multiple traces may be finished and write to the trace file at the same 70 | // time 71 | std::mutex mu_; 72 | std::ofstream trace_file_; 73 | bool first_write_; 74 | }; 75 | 76 | struct Trace { 77 | Trace() : trace_(nullptr), trace_id_(0) {} 78 | ~Trace(); 79 | std::shared_ptr setting_; 80 | // Group the spawned traces by trace ID for better formatting 81 | std::mutex mtx_; 82 | std::unordered_map> streams_; 83 | // Triton trace object that this trace is assosicated with, 84 | // 'Trace' object does not take ownership of 'trace_'. The caller of 85 | // SampleTrace() must call TraceManager::TraceRelease() with 'trace_userp_' 86 | // to properly release the resources if 'trace_' is not passed to a 87 | // TRITONSERVER_ServerInferAsync() call. 88 | TRITONSERVER_InferenceTrace* trace_; 89 | void* trace_userp_; 90 | 91 | uint64_t trace_id_; 92 | }; 93 | 94 | TraceManager( 95 | const TRITONSERVER_InferenceTraceLevel level, const uint32_t rate, 96 | const int32_t count, const uint32_t log_frequency, 97 | const std::string& filepath); 98 | 99 | ~TraceManager() = default; 100 | 101 | void UpdateTraceSetting( 102 | const std::string& model_name, const TraceSetting& new_setting); 103 | 104 | // Return a trace that should be used to collected trace activities 105 | // for an inference request. Return nullptr if no tracing should occur. 
106 | std::shared_ptr SampleTrace(const std::string& model_name); 107 | 108 | static void TraceRelease(TRITONSERVER_InferenceTrace* trace, void* userp); 109 | 110 | class TraceSetting { 111 | public: 112 | TraceSetting(); 113 | 114 | TraceSetting( 115 | const TRITONSERVER_InferenceTraceLevel level, const uint32_t rate, 116 | const int32_t count, const uint32_t log_frequency, 117 | const std::shared_ptr& file); 118 | 119 | ~TraceSetting(); 120 | 121 | bool Valid() { return invalid_reason_.empty() && (count_ != 0); } 122 | const std::string& Reason() { return invalid_reason_; } 123 | 124 | void WriteTrace( 125 | const std::unordered_map>& 126 | streams); 127 | 128 | std::shared_ptr SampleTrace(); 129 | 130 | TRITONSERVER_InferenceTraceLevel level_; 131 | uint32_t rate_; 132 | int32_t count_; 133 | uint32_t log_frequency_; 134 | std::shared_ptr file_; 135 | 136 | private: 137 | std::string invalid_reason_; 138 | 139 | std::mutex mu_; 140 | 141 | // use to sample a trace based on sampling rate. 142 | uint64_t sample_; 143 | 144 | // use to track the status of trace count feature 145 | uint64_t created_; 146 | uint64_t collected_; 147 | 148 | // Tracking traces that haven't been saved to file 149 | uint32_t sample_in_stream_; 150 | std::stringstream trace_stream_; 151 | }; 152 | 153 | private: 154 | static void TraceActivity( 155 | TRITONSERVER_InferenceTrace* trace, 156 | TRITONSERVER_InferenceTraceActivity activity, uint64_t timestamp_ns, 157 | void* userp); 158 | 159 | static void TraceTensorActivity( 160 | TRITONSERVER_InferenceTrace* trace, 161 | TRITONSERVER_InferenceTraceActivity activity, const char* name, 162 | TRITONSERVER_DataType datatype, const void* base, size_t byte_size, 163 | const int64_t* shape, uint64_t dim_count, 164 | TRITONSERVER_MemoryType memory_type, int64_t memory_type_id, void* userp); 165 | 166 | std::shared_ptr global_setting_; 167 | std::unordered_map> 168 | model_settings_; 169 | 170 | std::unordered_map> trace_files_; 171 | 172 | // lock for accessing trace setting. 173 | std::mutex r_mu_; 174 | }; 175 | 176 | }}} // namespace triton::developer_tools::server 177 | -------------------------------------------------------------------------------- /server/examples/square_async_infer.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | // 3 | // Redistribution and use in source and binary forms, with or without 4 | // modification, are permitted provided that the following conditions 5 | // are met: 6 | // * Redistributions of source code must retain the above copyright 7 | // notice, this list of conditions and the following disclaimer. 8 | // * Redistributions in binary form must reproduce the above copyright 9 | // notice, this list of conditions and the following disclaimer in the 10 | // documentation and/or other materials provided with the distribution. 11 | // * Neither the name of NVIDIA CORPORATION nor the names of its 12 | // contributors may be used to endorse or promote products derived 13 | // from this software without specific prior written permission. 14 | // 15 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | // PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | #include 28 | 29 | #include 30 | 31 | #include "triton/developer_tools/server_wrapper.h" 32 | 33 | namespace tds = triton::developer_tools::server; 34 | 35 | namespace { 36 | 37 | #define FAIL(MSG) \ 38 | do { \ 39 | std::cerr << "error: " << (MSG) << std::endl; \ 40 | exit(1); \ 41 | } while (false) 42 | 43 | void 44 | Usage(char** argv, const std::string& msg = std::string()) 45 | { 46 | if (!msg.empty()) { 47 | std::cerr << msg << std::endl; 48 | } 49 | 50 | std::cerr << "Usage: " << argv[0] << " [options]" << std::endl; 51 | std::cerr << "\t-v Enable verbose logging" << std::endl; 52 | 53 | exit(1); 54 | } 55 | 56 | void 57 | GetResults( 58 | std::vector>& results, 59 | std::future> future) 60 | { 61 | results.push_back(future.get()); 62 | size_t size = results.size(); 63 | for (size_t i = 0; i < size; i++) { 64 | if (results[i]) { 65 | if (results[i]->HasError()) { 66 | FAIL(results[i]->ErrorMsg()); 67 | } 68 | auto next_future = results[i]->GetNextResult(); 69 | if (next_future) { 70 | results.push_back(next_future->get()); 71 | size++; 72 | } 73 | } 74 | } 75 | } 76 | 77 | void 78 | Check( 79 | const std::vector>& results, 80 | int32_t input_value) 81 | { 82 | int count = 0; 83 | std::cout << "Outputs:\n"; 84 | for (auto& result : results) { 85 | if (result) { 86 | std::shared_ptr out = result->Output("OUT"); 87 | 88 | if ((out->shape_.size() != 1) || (out->shape_[0] != 1)) { 89 | FAIL("error: received incorrect shapes"); 90 | } 91 | 92 | if (out->memory_type_ != tds::MemoryType::CPU) { 93 | FAIL( 94 | "unexpected memory type, expected to be allocated in CPU, got " + 95 | std::string(MemoryTypeString(out->memory_type_)) + ", id " + 96 | std::to_string(out->memory_type_id_)); 97 | } 98 | 99 | if (out->data_type_ != tds::DataType::INT32) { 100 | FAIL( 101 | "unexpected datatype '" + 102 | std::string(DataTypeString(out->data_type_))); 103 | } 104 | 105 | if (input_value && (*((int32_t*)out->buffer_) != input_value)) { 106 | FAIL( 107 | "incorrect value, expected: '" + std::to_string(input_value) + 108 | ", got :" + std::to_string(*((int32_t*)out->buffer_))); 109 | } 110 | 111 | std::cout << *((int32_t*)out->buffer_) << "\n"; 112 | count++; 113 | } 114 | } 115 | 116 | if (count != input_value) { 117 | std::cerr << "error: received incorrect number of responses. Expected: " 118 | << input_value << ", got: " << count << std::endl; 119 | } 120 | } 121 | 122 | } // namespace 123 | 124 | int 125 | main(int argc, char** argv) 126 | { 127 | int verbose_level = 0; 128 | 129 | // Parse commandline... 130 | int opt; 131 | while ((opt = getopt(argc, argv, "vu:H:")) != -1) { 132 | switch (opt) { 133 | case 'v': 134 | verbose_level = 1; 135 | break; 136 | case '?': 137 | Usage(argv); 138 | break; 139 | } 140 | } 141 | try { 142 | // Use 'ServerOptions' object to initialize TritonServer. 
143 | tds::ServerOptions options({"./models"}); 144 | options.logging_.verbose_ = 145 | tds::LoggingOptions::VerboseLevel(verbose_level); 146 | options.model_control_mode_ = tds::ModelControlMode::EXPLICIT; 147 | auto server = tds::TritonServer::Create(options); 148 | 149 | // Load 'square_int32' model. 150 | server->LoadModel("square_int32"); 151 | 152 | // Please see here for more information about this decoupled model: 153 | // https://github.com/triton-inference-server/python_backend/tree/main/examples/decoupled. 154 | std::string model_name = "square_int32"; 155 | 156 | // Initialize 'InferRequest' with the name of the model that we want to run 157 | // an inference on. 158 | auto request1 = tds::InferRequest::Create(tds::InferOptions(model_name)); 159 | 160 | // Create the data for an input tensor. For square model, value '3' here 161 | // means there will be three responses for this request, and each response 162 | // contains only one output with value '3'. 163 | std::vector input_data1 = {3}; 164 | 165 | std::vector shape{1}; 166 | 167 | // Add input tensor to the inference request. 168 | request1->AddInput( 169 | "IN", input_data1.begin(), input_data1.end(), tds::DataType::INT32, 170 | shape, tds::MemoryType::CPU, 0); 171 | 172 | // Call 'AsyncInfer' function to run inference. 173 | auto result_future1 = server->AsyncInfer(*request1); 174 | 175 | // Run the second inference. 176 | auto request2 = tds::InferRequest::Create(tds::InferOptions(model_name)); 177 | 178 | // Create the data for an input tensor. For square model, value '0' here 179 | // means there won't be any responses for this request. 180 | std::vector input_data2 = {0}; 181 | request2->AddInput( 182 | "IN", input_data2.begin(), input_data2.end(), tds::DataType::INT32, 183 | shape, tds::MemoryType::CPU, 0); 184 | 185 | // Call 'AsyncInfer' function to run inference. 186 | auto result_future2 = server->AsyncInfer(*request2); 187 | 188 | // Get the infer results from both inferences and check the results. 189 | std::vector> results1; 190 | GetResults(results1, std::move(result_future1)); 191 | Check(results1, 3); 192 | 193 | std::vector> results2; 194 | GetResults(results2, std::move(result_future2)); 195 | Check(results2, 0); 196 | } 197 | catch (const tds::TritonException& ex) { 198 | std::cerr << "Error: " << ex.what(); 199 | exit(1); 200 | } 201 | 202 | return 0; 203 | } 204 | -------------------------------------------------------------------------------- /server/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 
14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | cmake_minimum_required(VERSION 3.31.8) 28 | 29 | project(tritondevelopertoolsserver LANGUAGES C CXX) 30 | 31 | # 32 | # Options 33 | # 34 | option(TRITON_ENABLE_GPU "Enable GPU support in backend utilities" ON) 35 | option(TRITON_ENABLE_STATS "Include statistics collections in backend utilities" ON) 36 | option(TRITON_BUILD_TEST "Include unit test for the Server Wrapper" ON) 37 | option(TRITON_ENABLE_EXAMPLES "Include examples in build" ON) 38 | 39 | option(TRITON_BUILD_STATIC_LIBRARY "Create multiple static libraries, otherwise create one dynamic library" ON) 40 | set(TRITON_COMMON_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/common repo") 41 | set(TRITON_CORE_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/core repo") 42 | 43 | if(NOT CMAKE_BUILD_TYPE) 44 | set(CMAKE_BUILD_TYPE Release) 45 | endif() 46 | 47 | # 48 | # Dependencies 49 | # 50 | include(FetchContent) 51 | 52 | FetchContent_Declare( 53 | repo-common 54 | GIT_REPOSITORY https://github.com/triton-inference-server/common.git 55 | GIT_TAG ${TRITON_COMMON_REPO_TAG} 56 | GIT_SHALLOW ON 57 | ) 58 | FetchContent_Declare( 59 | repo-core 60 | GIT_REPOSITORY https://github.com/triton-inference-server/core.git 61 | GIT_TAG ${TRITON_CORE_REPO_TAG} 62 | GIT_SHALLOW ON 63 | ) 64 | FetchContent_MakeAvailable(repo-common repo-core) 65 | 66 | # 67 | # CUDA 68 | # 69 | if(${TRITON_ENABLE_GPU}) 70 | find_package(CUDAToolkit REQUIRED) 71 | set(CUDA_NVCC_FLAGS -std=c++11) 72 | 73 | if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL "10.1") 74 | add_definitions(-DTRITON_ENABLE_CUDA_GRAPH=1) 75 | else() 76 | message(WARNING "CUDA ${CUDAToolkit_VERSION} does not support CUDA graphs.") 77 | endif() 78 | endif() # TRITON_ENABLE_GPU 79 | 80 | find_package(Threads REQUIRED) 81 | 82 | # 83 | # Triton Developer Tools Server 84 | # 85 | file(GLOB SRC_FILES src/*.cc src/*.h) 86 | if(${TRITON_BUILD_STATIC_LIBRARY}) 87 | add_library( 88 | triton-developer_tools-server 89 | ${SRC_FILES} 90 | ) 91 | else() 92 | add_library( 93 | triton-developer_tools-server SHARED 94 | ${SRC_FILES} 95 | ) 96 | endif() 97 | 98 | 99 | add_library( 100 | TritonDeveloperToolsServer::triton-developer_tools-server ALIAS triton-developer_tools-server 101 | ) 102 | 103 | target_include_directories( 104 | triton-developer_tools-server 105 | PUBLIC 106 | $ 107 | $ 108 | PRIVATE 109 | ${CMAKE_CURRENT_SOURCE_DIR}/src 110 | ) 111 | 112 | if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") 113 | message("Using MSVC as compiler, default target on Windows 10. 
" 114 | "If the target system is not Windows 10, please update _WIN32_WINNT " 115 | "to corresponding value.") 116 | endif() 117 | target_compile_features(triton-developer_tools-server PRIVATE cxx_std_11) 118 | target_compile_options( 119 | triton-developer_tools-server 120 | PRIVATE 121 | $<$,$,$>: 122 | -Wall -Wextra -Wno-unused-parameter> 123 | $<$:/Wall /D_WIN32_WINNT=0x0A00 /EHsc> 124 | ) 125 | 126 | # TRITON_ENABLE_GPU exposed in header so set PUBLIC 127 | if(${TRITON_ENABLE_GPU}) 128 | target_compile_definitions( 129 | triton-developer_tools-server 130 | PUBLIC TRITON_ENABLE_GPU=1 131 | ) 132 | endif() # TRITON_ENABLE_GPU 133 | 134 | # TRITON_ENABLE_STATS exposed in header so set PUBLIC 135 | if(${TRITON_ENABLE_STATS}) 136 | target_compile_definitions( 137 | triton-developer_tools-server 138 | PUBLIC TRITON_ENABLE_STATS=1 139 | ) 140 | endif() # TRITON_ENABLE_STATS 141 | 142 | set_target_properties( 143 | triton-developer_tools-server PROPERTIES 144 | WINDOWS_EXPORT_ALL_SYMBOLS TRUE 145 | POSITION_INDEPENDENT_CODE ON 146 | OUTPUT_NAME tritondevelopertoolsserver 147 | ) 148 | 149 | target_link_libraries( 150 | triton-developer_tools-server 151 | PUBLIC 152 | Threads::Threads 153 | triton-core-serverapi # from repo-core 154 | triton-core-serverstub # from repo-core 155 | triton-common-logging # from repo-common 156 | ) 157 | 158 | if(${TRITON_ENABLE_GPU}) 159 | target_link_libraries( 160 | triton-developer_tools-server 161 | PUBLIC 162 | CUDA::cudart 163 | ) 164 | endif() # TRITON_ENABLE_GPU 165 | 166 | # 167 | # Install 168 | # 169 | include(GNUInstallDirs) 170 | set(INSTALL_CONFIGDIR ${CMAKE_INSTALL_LIBDIR}/cmake/TritonDeveloperToolsServer) 171 | 172 | install( 173 | TARGETS 174 | triton-developer_tools-server 175 | EXPORT 176 | triton-developer_tools-server-targets 177 | LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} 178 | ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} 179 | ) 180 | 181 | install( 182 | DIRECTORY include/ 183 | DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} 184 | ) 185 | 186 | install( 187 | EXPORT 188 | triton-developer_tools-server-targets 189 | FILE 190 | TritonDeveloperToolsServerTargets.cmake 191 | NAMESPACE 192 | TritonDeveloperToolsServer:: 193 | DESTINATION 194 | ${INSTALL_CONFIGDIR} 195 | ) 196 | 197 | include(CMakePackageConfigHelpers) 198 | configure_package_config_file( 199 | ${CMAKE_CURRENT_LIST_DIR}/cmake/TritonDeveloperToolsServerConfig.cmake.in 200 | ${CMAKE_CURRENT_BINARY_DIR}/TritonDeveloperToolsServerConfig.cmake 201 | INSTALL_DESTINATION ${INSTALL_CONFIGDIR} 202 | ) 203 | 204 | install( 205 | FILES 206 | ${CMAKE_CURRENT_BINARY_DIR}/TritonDeveloperToolsServerConfig.cmake 207 | DESTINATION ${INSTALL_CONFIGDIR} 208 | ) 209 | 210 | # 211 | # Export from build tree 212 | # 213 | export( 214 | EXPORT triton-developer_tools-server-targets 215 | FILE ${CMAKE_CURRENT_BINARY_DIR}/TritonDeveloperToolsServerTargets.cmake 216 | NAMESPACE TritonDeveloperToolsServer:: 217 | ) 218 | 219 | export(PACKAGE TritonDeveloperToolsServer) 220 | 221 | if(${TRITON_BUILD_TEST}) 222 | FetchContent_Declare( 223 | googletest 224 | GIT_REPOSITORY https://github.com/google/googletest.git 225 | GIT_TAG release-1.12.1 226 | ) 227 | FetchContent_MakeAvailable(googletest) 228 | add_subdirectory(test test) 229 | endif() 230 | 231 | if(TRITON_ENABLE_EXAMPLES) 232 | add_subdirectory(examples) 233 | endif() # TRITON_ENABLE_EXAMPLES 234 | -------------------------------------------------------------------------------- /server/examples/addsub_string_async_infer.cc: 
-------------------------------------------------------------------------------- 1 | // Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | // 3 | // Redistribution and use in source and binary forms, with or without 4 | // modification, are permitted provided that the following conditions 5 | // are met: 6 | // * Redistributions of source code must retain the above copyright 7 | // notice, this list of conditions and the following disclaimer. 8 | // * Redistributions in binary form must reproduce the above copyright 9 | // notice, this list of conditions and the following disclaimer in the 10 | // documentation and/or other materials provided with the distribution. 11 | // * Neither the name of NVIDIA CORPORATION nor the names of its 12 | // contributors may be used to endorse or promote products derived 13 | // from this software without specific prior written permission. 14 | // 15 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | #include 28 | 29 | #include 30 | #include 31 | 32 | #include "triton/developer_tools/server_wrapper.h" 33 | 34 | namespace tds = triton::developer_tools::server; 35 | 36 | namespace { 37 | 38 | #define FAIL(MSG) \ 39 | do { \ 40 | std::cerr << "error: " << (MSG) << std::endl; \ 41 | exit(1); \ 42 | } while (false) 43 | 44 | void 45 | Usage(char** argv, const std::string& msg = std::string()) 46 | { 47 | if (!msg.empty()) { 48 | std::cerr << msg << std::endl; 49 | } 50 | 51 | std::cerr << "Usage: " << argv[0] << " [options]" << std::endl; 52 | std::cerr << "\t-v Enable verbose logging" << std::endl; 53 | 54 | exit(1); 55 | } 56 | 57 | void 58 | CompareResult( 59 | const std::vector& input0_data, 60 | const std::vector& input1_data, 61 | const std::vector& result0_data, 62 | const std::vector& result1_data, 63 | const std::vector& expected_sum, 64 | const std::vector& expected_diff) 65 | { 66 | for (size_t i = 0; i < 16; ++i) { 67 | std::cout << input0_data[i] << " + " << input0_data[i] << " = " 68 | << result0_data[i] << std::endl; 69 | std::cout << input0_data[i] << " - " << input1_data[i] << " = " 70 | << result1_data[i] << std::endl; 71 | 72 | if (expected_sum[i] != std::stoi(result0_data[i])) { 73 | std::cerr << "error: incorrect sum" << std::endl; 74 | exit(1); 75 | } 76 | if (expected_diff[i] != std::stoi(result1_data[i])) { 77 | std::cerr << "error: incorrect difference" << std::endl; 78 | exit(1); 79 | } 80 | } 81 | } 82 | 83 | void 84 | Check( 85 | std::shared_ptr& output0, 86 | std::shared_ptr& output1, 87 | const std::vector& input0_data, 88 | const std::vector& input1_data, 89 | const std::string& output0_name, const std::string& output1_name, 90 | const std::vector& result0_data, 91 | const std::vector& result1_data, 92 | const std::vector& expected_sum, 93 
| const std::vector& expected_diff) 94 | { 95 | for (auto& output : 96 | {std::make_pair(output0_name, output0), 97 | std::make_pair(output1_name, output1)}) { 98 | if ((output.second->shape_.size() != 1) || 99 | (output.second->shape_[0] != 16)) { 100 | std::cerr << "error: received incorrect shapes for " << output.first 101 | << std::endl; 102 | exit(1); 103 | } 104 | 105 | if (output.second->data_type_ != tds::DataType::BYTES) { 106 | FAIL( 107 | "unexpected datatype '" + 108 | std::string(DataTypeString(output.second->data_type_)) + "' for '" + 109 | output.first + "'"); 110 | } 111 | 112 | if (output.second->memory_type_ != tds::MemoryType::CPU) { 113 | FAIL( 114 | "unexpected memory type, expected to be allocated in CPU, got " + 115 | std::string(MemoryTypeString(output.second->memory_type_)) + ", id " + 116 | std::to_string(output.second->memory_type_id_) + " for " + 117 | output.first); 118 | } 119 | } 120 | 121 | if (result0_data.size() != 16) { 122 | std::cerr << "error: received incorrect number of strings for OUTPUT0: " 123 | << result0_data.size() << std::endl; 124 | } 125 | if (result1_data.size() != 16) { 126 | std::cerr << "error: received incorrect number of strings for OUTPUT1: " 127 | << result1_data.size() << std::endl; 128 | } 129 | 130 | CompareResult( 131 | input0_data, input1_data, result0_data, result1_data, expected_sum, 132 | expected_diff); 133 | } 134 | 135 | } // namespace 136 | 137 | int 138 | main(int argc, char** argv) 139 | { 140 | int verbose_level = 0; 141 | 142 | // Parse commandline... 143 | int opt; 144 | while ((opt = getopt(argc, argv, "vu:H:")) != -1) { 145 | switch (opt) { 146 | case 'v': 147 | verbose_level = 1; 148 | break; 149 | case '?': 150 | Usage(argv); 151 | break; 152 | } 153 | } 154 | try { 155 | // Use 'ServerOptions' object to initialize TritonServer. 156 | tds::ServerOptions options({"./models"}); 157 | options.logging_.verbose_ = 158 | tds::LoggingOptions::VerboseLevel(verbose_level); 159 | auto server = tds::TritonServer::Create(options); 160 | 161 | // We use a simple model that takes 2 input tensors of 16 strings 162 | // each and returns 2 output tensors of 16 strings each. The input 163 | // strings must represent integers. One output tensor is the 164 | // element-wise sum of the inputs and one output is the element-wise 165 | // difference. 166 | std::string model_name = "add_sub_str"; 167 | 168 | // Use 'LoadedModels' function to check if the model we need is loaded. 169 | std::set loaded_models = server->LoadedModels(); 170 | if (loaded_models.find(model_name) == loaded_models.end()) { 171 | FAIL("Model '" + model_name + "' is not found."); 172 | } 173 | 174 | // Initialize 'InferRequest' with the name of the model that we want to run 175 | // an inference on. 176 | auto request = tds::InferRequest::Create(tds::InferOptions(model_name)); 177 | 178 | // Create the data for the two input tensors. Initialize the first 179 | // to unique integers and the second to all ones. The input tensors 180 | // are the string representation of these values. 181 | std::vector input0_data(16); 182 | std::vector input1_data(16); 183 | std::vector expected_sum(16); 184 | std::vector expected_diff(16); 185 | for (size_t i = 0; i < 16; ++i) { 186 | input0_data[i] = std::to_string(i); 187 | input1_data[i] = std::to_string(1); 188 | expected_sum[i] = i + 1; 189 | expected_diff[i] = i - 1; 190 | } 191 | 192 | std::vector shape{16}; 193 | 194 | // Add two input tensors to the inference request. 
195 | request->AddInput( 196 | "INPUT0", input0_data.begin(), input0_data.end(), tds::DataType::BYTES, 197 | shape, tds::MemoryType::CPU, 0); 198 | request->AddInput( 199 | "INPUT1", input1_data.begin(), input1_data.end(), tds::DataType::BYTES, 200 | shape, tds::MemoryType::CPU, 0); 201 | 202 | // Indicate that we want both output tensors calculated and returned 203 | // for the inference request. These calls are optional, if no 204 | // output(s) are specifically requested then all outputs defined by 205 | // the model will be calculated and returned. 206 | request->AddRequestedOutput("OUTPUT0"); 207 | request->AddRequestedOutput("OUTPUT1"); 208 | 209 | // Call 'AsyncInfer' function to run inference. 210 | auto result_future = server->AsyncInfer(*request); 211 | 212 | // Get the infer result and check the result. 213 | auto result = result_future.get(); 214 | if (result->HasError()) { 215 | FAIL(result->ErrorMsg()); 216 | } 217 | std::string name = result->ModelName(); 218 | std::string version = result->ModelVersion(); 219 | std::string id = result->Id(); 220 | std::cout << "Ran inferencece on model '" << name << "', version '" 221 | << version << "', with request ID '" << id << "'\n"; 222 | 223 | // Retrieve two outputs from the 'InferResult' object. 224 | std::shared_ptr out0 = result->Output("OUTPUT0"); 225 | std::shared_ptr out1 = result->Output("OUTPUT1"); 226 | 227 | // Get the result data as a vector of string. 228 | std::vector result0_data = result->StringData("OUTPUT0"); 229 | std::vector result1_data = result->StringData("OUTPUT1"); 230 | if (result0_data.size() != 16) { 231 | std::cerr << "error: received incorrect number of strings for OUTPUT0: " 232 | << result0_data.size() << std::endl; 233 | } 234 | if (result1_data.size() != 16) { 235 | std::cerr << "error: received incorrect number of strings for OUTPUT1: " 236 | << result1_data.size() << std::endl; 237 | } 238 | 239 | Check( 240 | out0, out1, input0_data, input1_data, "OUTPUT0", "OUTPUT1", 241 | result0_data, result1_data, expected_sum, expected_diff); 242 | 243 | // Get full response. 244 | std::cout << result->DebugString() << std::endl; 245 | } 246 | catch (const tds::TritonException& ex) { 247 | std::cerr << "Error: " << ex.what(); 248 | exit(1); 249 | } 250 | 251 | return 0; 252 | } 253 | -------------------------------------------------------------------------------- /server/include/triton/developer_tools/generic_server_wrapper.h: -------------------------------------------------------------------------------- 1 | // Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | // 3 | // Redistribution and use in source and binary forms, with or without 4 | // modification, are permitted provided that the following conditions 5 | // are met: 6 | // * Redistributions of source code must retain the above copyright 7 | // notice, this list of conditions and the following disclaimer. 8 | // * Redistributions in binary form must reproduce the above copyright 9 | // notice, this list of conditions and the following disclaimer in the 10 | // documentation and/or other materials provided with the distribution. 11 | // * Neither the name of NVIDIA CORPORATION nor the names of its 12 | // contributors may be used to endorse or promote products derived 13 | // from this software without specific prior written permission. 
14 | // 15 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | #pragma once 27 | #include 28 | #include 29 | #include 30 | #include 31 | 32 | #include "../src/infer_requested_output.h" 33 | #include "../src/tracer.h" 34 | #include "common.h" 35 | 36 | namespace triton { namespace developer_tools { namespace server { 37 | 38 | class ServerOptions; 39 | class InferOptions; 40 | class RepositoryIndex; 41 | class NewModelRepo; 42 | class Tensor; 43 | class GenericInferRequest; 44 | class GenericInferResult; 45 | using TensorAllocMap = std::unordered_map< 46 | std::string, 47 | std::tuple>; 48 | 49 | //============================================================================== 50 | /// Object that encapsulates in-process C API functionalities. 51 | /// 52 | class GenericTritonServer { 53 | public: 54 | /// Create a GenericTritonServer instance. 55 | static std::unique_ptr Create( 56 | const ServerOptions& server_options); 57 | 58 | virtual ~GenericTritonServer(); 59 | 60 | /// Load the requested model or reload the model if it is already loaded. 61 | /// \param model_name The name of the model. 62 | virtual void LoadModel(const std::string& model_name) = 0; 63 | 64 | /// Unload the requested model. Unloading a model that is not loaded 65 | /// on server has no affect. 66 | /// \param model_name The name of the model. 67 | virtual void UnloadModel(const std::string& model_name) = 0; 68 | 69 | /// Get the set of names of models that are loaded and ready for inference. 70 | /// \return Returns the set of names of models that are 71 | /// loaded and ready for inference. 72 | virtual std::set LoadedModels() = 0; 73 | 74 | /// Get the index of model repository contents. 75 | /// \return Returns a vector of 'RepositoryIndex' object 76 | /// representing the repository index. 77 | virtual std::vector ModelIndex() = 0; 78 | 79 | /// Get the metrics of the server. 80 | /// \return Returns a string representing the metrics. 81 | virtual std::string ServerMetrics() = 0; 82 | 83 | /// Get the inference statistics of the specified model. 84 | /// \param model_name The name of the model. 85 | /// \param model_version the version of the model requested. 86 | /// \return Returns a json string representing the model metrics. 87 | virtual std::string ModelStatistics( 88 | const std::string& model_name, const int64_t model_version) = 0; 89 | 90 | /// Is the server live? 91 | /// \return Returns true if server is live, false otherwise. 92 | virtual bool IsServerLive() = 0; 93 | 94 | /// Is the server ready? 95 | /// \return Returns true if server is ready, false otherwise. 96 | virtual bool IsServerReady() = 0; 97 | 98 | /// Stop a server object. A server can't be restarted once it is 99 | /// stopped. 
100 | virtual void ServerStop() = 0; 101 | 102 | /// Is the model ready? 103 | /// \param model_name The name of the model to get readiness for. 104 | /// \param model_version The version of the model to get readiness 105 | /// for. If -1 then the server will choose a version based on the 106 | /// model's policy. This field is optional, default is -1. 107 | /// \return Returns true if server is ready, false otherwise. 108 | virtual bool IsModelReady( 109 | const std::string& model_name, const int64_t model_version = -1) = 0; 110 | 111 | /// Get the configuration of specified model. 112 | /// \param model_name The name of the model. 113 | /// \param model_version The version of the model to get configuration. 114 | /// The default value is -1 which means then the server will 115 | /// choose a version based on the model and internal policy. This field is 116 | /// optional. \return Returns JSON representation of model configuration as a 117 | /// string. 118 | virtual std::string ModelConfig( 119 | const std::string& model_name, const int64_t model_version = -1) = 0; 120 | 121 | /// Get the metadata of the server. 122 | /// \return Returns JSON representation of server metadata as a string. 123 | virtual std::string ServerMetadata() = 0; 124 | 125 | /// Get the metadata of specified model. 126 | /// \param model_name The name of the model. 127 | /// \param model_version The version of the model to get configuration. 128 | /// The default value is -1 which means then the server will choose a version 129 | /// based on the model and internal policy. This field is optional. 130 | /// \return Returns JSON representation of model metadata as a string. 131 | virtual std::string ModelMetadata( 132 | const std::string& model_name, const int64_t model_version = -1) = 0; 133 | 134 | /// Register a new model repository. This function is not available in polling 135 | /// mode. 136 | /// \param new_model_repo The 'NewModelRepo' object contains the info of the 137 | /// new model repo to be registered. 138 | virtual void RegisterModelRepo(const NewModelRepo& new_model_repo) = 0; 139 | 140 | /// Unregister a model repository. This function is not available in polling 141 | /// mode. 142 | /// \param repo_path The full path to the model repository. 143 | virtual void UnregisterModelRepo(const std::string& repo_path) = 0; 144 | 145 | virtual std::unique_ptr Infer( 146 | GenericInferRequest& infer_request) = 0; 147 | }; 148 | 149 | //============================================================================== 150 | /// An interface for InferResult object to interpret the response to an 151 | /// inference request. 152 | /// 153 | class GenericInferResult { 154 | public: 155 | virtual ~GenericInferResult(); 156 | 157 | /// Get the name of the model which generated this response. 158 | /// \return Returns the name of the model. 159 | virtual std::string ModelName() noexcept = 0; 160 | 161 | /// Get the version of the model which generated this response. 162 | /// \return Returns the version of the model. 163 | virtual std::string ModelVersion() noexcept = 0; 164 | 165 | /// Get the id of the request which generated this response. 166 | /// \return Returns the id of the request. 167 | virtual std::string Id() noexcept = 0; 168 | 169 | /// Get the output names from the infer result 170 | /// \return Vector of output names 171 | virtual std::vector OutputNames() = 0; 172 | 173 | /// Get the result output as a shared pointer of 'Tensor' object. 
The 'buffer' 174 | /// field of the output is owned by the returned 'Tensor' object itself. Note 175 | /// that for string data, need to use 'StringData' function for string data 176 | /// result. 177 | /// \param name The name of the output tensor to be retrieved. 178 | /// \return Returns the output result as a shared pointer of 'Tensor' object. 179 | virtual std::shared_ptr Output(const std::string& name) = 0; 180 | 181 | /// Get the result data as a vector of strings. The vector will 182 | /// receive a copy of result data. An exception will be thrown if 183 | /// the data type of output is not 'BYTES'. 184 | /// \param output_name The name of the output to get result data. 185 | /// \return Returns the result data represented as a vector of strings. The 186 | /// strings are stored in the row-major order. 187 | virtual std::vector StringData( 188 | const std::string& output_name) = 0; 189 | 190 | /// Return the complete response as a user friendly string. 191 | /// \return The string describing the complete response. 192 | virtual std::string DebugString() = 0; 193 | 194 | /// Return if there is an error within this result. 195 | /// \return True if this 'GenericInferResult' object has an error, false if no 196 | /// error. 197 | virtual bool HasError() = 0; 198 | 199 | /// Return the error message of the error. 200 | /// \return The message for the error. Empty if no error. 201 | virtual std::string ErrorMsg() = 0; 202 | }; 203 | 204 | //============================================================================== 205 | /// Object that describes an inflight inference request. 206 | /// 207 | class GenericInferRequest { 208 | public: 209 | /// Create an InferRequest instance. 210 | static std::unique_ptr Create( 211 | const InferOptions& infer_options); 212 | 213 | virtual ~GenericInferRequest(); 214 | 215 | /// Add an input tensor to be sent within an InferRequest object. The input 216 | /// data buffer within the 'Tensor' object must not be modified until 217 | /// inference is completed and result is returned. 218 | /// \param name The name of the input tensor. 219 | /// \param input A Tensor object that describes an input tensor. 220 | virtual void AddInput( 221 | const std::string& name, const Tensor& input) noexcept = 0; 222 | 223 | /// Add a requested output to be sent within an InferRequest object. 224 | /// Calling this function is optional. If no output(s) are specifically 225 | /// requested then all outputs defined by the model will be calculated and 226 | /// returned. Pre-allocated buffer for each output should be specified within 227 | /// the 'Tensor' object. 228 | /// \param name The name of the output tensor. 229 | /// \param output A Tensor object that describes an output tensor containing 230 | /// its pre-allocated buffer. 231 | virtual void AddRequestedOutput(const std::string& name, Tensor& output) = 0; 232 | 233 | /// Add a requested output to be sent within an InferRequest object. 234 | /// Calling this function is optional. If no output(s) are specifically 235 | /// requested then all outputs defined by the model will be calculated and 236 | /// returned. 237 | /// \param name The name of the output tensor. 238 | virtual void AddRequestedOutput(const std::string& name) = 0; 239 | 240 | /// Clear inputs and outputs of the request. This allows users to reuse the 241 | /// InferRequest object if needed. 
242 | virtual void Reset() = 0; 243 | }; 244 | 245 | }}} // namespace triton::developer_tools::server 246 | -------------------------------------------------------------------------------- /server/README.md: -------------------------------------------------------------------------------- 1 | 28 | 29 | [![License](https://img.shields.io/badge/License-BSD3-lightgrey.svg)](https://opensource.org/licenses/BSD-3-Clause) 30 | 31 | # Triton Server C-API Wrapper 32 | 33 | Triton Server C-API Wrapper wraps up the functionality of 34 | [Triton in-process C-API](https://github.com/triton-inference-server/server/blob/main/docs/customization_guide/inference_protocols.md#in-process-triton-server-api) 35 | , providing a simpler interface for users to use the Triton in-process C API for 36 | developing their applications without having in-depth knowledge of Triton 37 | implementation details or writing complicated code. This wrapper is also called 38 | the "Higher Level In Process C++ API" or just "Server Wrapper" for short. The 39 | header file that defines and documents the Server C-API Wrapper is 40 | [server_wrapper.h](include/triton/developer_tools/server_wrapper.h). Ask 41 | questions or report problems in the main Triton 42 | [issues page](https://github.com/triton-inference-server/server/issues). 43 | 44 | ## Build the Server C-API Wrapper library and custom application 45 | 46 | To build and install the Server Wrapper library from 47 | `developer_tools/server`, use the following commands. 48 | 49 | ``` 50 | $ mkdir build 51 | $ cd build 52 | $ cmake -DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install .. 53 | $ make install 54 | ``` 55 | 56 | The following required Triton repositories will be pulled and used in 57 | the build. By default the "main" branch/tag will be used for each repo 58 | but the listed CMake argument can be used to override. 59 | 60 | * triton-inference-server/common: -DTRITON_COMMON_REPO_TAG=[tag] 61 | * triton-inference-server/core: -DTRITON_CORE_REPO_TAG=[tag] 62 | 63 | See the [CMakeLists.txt](CMakeLists.txt) file for other build options. 64 | 65 | When the build completes, the library `libtritondevelopertoolsserver.a` and examples 66 | can be found in the install directory. 67 | 68 | For a custom application, you can refer to 69 | [CMakeLists.txt](examples/CMakeLists.txt) to see how to build your executable 70 | with the Server Wrapper library. 71 | 72 | ### API Description 73 | 74 | Triton Server C-API Wrapper is encapsulated in a shared library which is built 75 | from source contained in this repository. You can include the full 76 | capabilities by linking the shared library into your application and by using 77 | the C++ API defined in [server_wrapper.h](include/triton/developer_tools/server_wrapper.h). 78 | 79 | #### Inference APIs 80 | 81 | Three main objects are used with the Server Wrapper. 82 | 83 | ##### TritonServer 84 | 85 | The top-level abstraction used by Server Wrapper is `TritonServer`, 86 | which represents the Triton core logic that is capable of implementing 87 | some of the features and capabilities of Triton. 88 | 89 | ##### InferRequest 90 | 91 | `InferRequest` carries the information for an inference request. This object 92 | allows you to set inference options, and add inputs and requested outputs to a request. 93 | 94 | ##### InferResult 95 | 96 | `InferResult` provides an interface to interpret the inference response, making 97 | it easier to retrieve output data.
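For example, once an inference has completed (see the workflow below), output data can be read from an `InferResult` roughly as follows. This is a minimal, fragmentary sketch based on the accessors declared in [generic_server_wrapper.h](include/triton/developer_tools/generic_server_wrapper.h); `result_future`, the output names, and the `int32` output type are placeholders.

```cpp
// 'result_future' is assumed to come from an earlier 'AsyncInfer' call.
std::unique_ptr<InferResult> result = result_future.get();
if (result->HasError()) {
  std::cerr << result->ErrorMsg();
} else {
  // Non-string outputs: access the raw bytes and metadata through 'Tensor'.
  std::shared_ptr<Tensor> out = result->Output("OUTPUT0_NAME");
  const int32_t* values = reinterpret_cast<const int32_t*>(out->buffer_);
  std::cout << "first element: " << values[0]
            << ", datatype: " << DataTypeString(out->data_type_) << std::endl;

  // String (BYTES) outputs: copy the data out as a vector of strings.
  std::vector<std::string> str_values = result->StringData("OUTPUT1_NAME");
}
```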
98 | 99 | ##### General Workflow 100 | 101 | Performing an inference request requires the use of some Server C++ API 102 | functions and objects, as demonstrated in 103 | [simple_addsub_async_infer.cc](examples/simple_addsub_async_infer.cc). 104 | The general usage requires the following steps. 105 | 106 | 1. Start Server 107 | 108 | To start a Triton server, you need to create a `TritonServer` instance with 109 | a `ServerOptions` structure, which contains the server options used to 110 | initialize the server. 111 | 112 | ```cpp 113 | ServerOptions options({"path/to/your/model_repository", "path/to/another/model_repository"}); auto server = TritonServer::Create(options); 114 | ``` 115 | 116 | 2. Load model (optional) 117 | 118 | This step is optional as all the models in the model repository paths provided 119 | in the previous step will be loaded to the server by default. However, if 120 | [model control mode](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_management.md) 121 | is set to "EXPLICIT" when setting the server options in the previous step, you 122 | can load a specific model by calling 123 | 124 | ```cpp 125 | server->LoadModel("your_model_name"); 126 | ``` 127 | 128 | 3. Construct `InferRequest` with infer options 129 | 130 | Initialize the request with an `InferOptions` structure, specifying the name of 131 | the model that you want to run an inference on and other inference options. 132 | 133 | ```cpp 134 | auto request = InferRequest::Create(InferOptions("your_model_name")); 135 | ``` 136 | 137 | 4. Add inputs / requested outputs to a request 138 | 139 | You can add an input to a request by either using a `Tensor` object, which 140 | contains the information of an input tensor, or using iterators if the input 141 | data is stored in a contiguous container. Iterators can also be used if the input 142 | data is of 'string' type and is stored in a contiguous container. Note that the 143 | input data buffer within the 'Tensor' object must not be modified until 144 | inference is completed and the result is returned. 145 | 146 | For outputs, you can add the name of a requested output to a request, indicating 147 | which outputs should be calculated and returned for inference. You can also provide 148 | a pre-allocated buffer for an output in this step if you want the output data to 149 | be stored in-place in the provided buffer. See the "Use pre-allocated buffer" 150 | section in the next step for more information. 151 | 152 | ```cpp 153 | // Assume that we have input data in these two vectors. 154 | std::vector input0_data; 155 | std::vector input1_data; 156 | 157 | Tensor input0(&input0_data[0], input0_data.size(), DataType::INT32, {1, 16}, MemoryType::CPU, 0); 158 | Tensor input1(&input1_data[0], input1_data.size(), DataType::INT32, {1, 16}, MemoryType::CPU, 0); 159 | 160 | request->AddInput("INPUT0_NAME", input0); 161 | request->AddInput("INPUT1_NAME", input1); 162 | 163 | request->AddRequestedOutput("OUTPUT0_NAME"); 164 | request->AddRequestedOutput("OUTPUT1_NAME"); 165 | ``` 166 | 167 | 5. Call the inference method 168 | 169 | Server Wrapper uses a promise-future based structure for asynchronous inference. 170 | A future of a unique pointer to an `InferResult` object is returned from the 171 | `AsyncInfer` function, and the result can be retrieved whenever needed by 172 | calling `future.get()`.
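Because `AsyncInfer` returns a plain `std::future`, standard library facilities can also be used to poll for completion without blocking. The following sketch is ordinary C++ `<future>`/`<chrono>` usage, not a wrapper-specific API, and assumes `server` and `request` were created as in the previous steps.

```cpp
std::future<std::unique_ptr<InferResult>> result_future = server->AsyncInfer(*request);

// Poll until the response is ready, doing other work in between.
while (result_future.wait_for(std::chrono::milliseconds(10)) !=
       std::future_status::ready) {
  // ... do other work while the inference is in flight ...
}
auto result = result_future.get();
```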
173 | 174 | > [!Note] 175 | > For 176 | > [decoupled models](https://github.com/triton-inference-server/python_backend/tree/main/examples/decoupled#decoupled-model-examples) 177 | > with multi-part responses we recommend using the example `GetResults` 178 | > function, as demonstrated in 179 | > [square_async_infer.cc](examples/square_async_infer.cc), to ensure the entire 180 | > response from the model is collected. 181 | 182 | When running inference, Server Wrapper provides three options for the 183 | allocation and deallocation of output tensors. 184 | 185 | * Use default allocator 186 | 187 | Default output allocation/deallocation will be used. No need to specify how to 188 | allocate/deallocate the output tensors. 189 | 190 | ```cpp 191 | // Call the inference method. 192 | std::future> result_future = server->AsyncInfer(*request); 193 | 194 | // Get the infer result and check the result. 195 | auto result = result_future.get(); 196 | if (result->HasError()) { 197 | std::cerr << result->ErrorMsg(); 198 | } else { 199 | // Retrieve output data from 'InferResult' object... 200 | } 201 | ``` 202 | 203 | * Use custom allocator 204 | 205 | You can provide your custom allocator using `Allocator` object. You need to 206 | register your callback functions to the allocator when creating the 207 | `Allocator` object, and set `InferOptions` properly when initializing 208 | `InferRequest`. The signatures of the callback functions are defined in 209 | [common.h](include/triton/developer_tools/common.h). 210 | 211 | ```cpp 212 | // 'ResponseAllocator' and 'ResponseRelease' are the custom output allocation 213 | // and deallocation functions. 214 | Allocator allocator(ResponseAllocator, ResponseRelease); 215 | auto infer_options = InferOptions("your_model_name"); 216 | 217 | // Set custom allocator to 'InferOptions'. 218 | infer_options.custom_allocator_ = &allocator; 219 | auto request = InferRequest(infer_options); 220 | 221 | /** 222 | Add inputs/requested outputs to a request as shown in the previous step... 223 | */ 224 | 225 | // Call the inference method, and the custom allocator will be used. 226 | std::future> result_future = server->AsyncInfer(*request); 227 | 228 | // Get the infer result and check the result. 229 | auto result = result_future.get(); 230 | if (result->HasError()) { 231 | std::cerr << result->ErrorMsg(); 232 | } else { 233 | // Retrieve output data from 'InferResult' object... 234 | } 235 | ``` 236 | 237 | * Use pre-allocated buffer 238 | 239 | You can pre-allocate buffers for output tensors. The output data will be 240 | stored in the buffer you provided when adding requested outputs to a request in 241 | the previous step. Note that those buffers will *not* be freed when the `Tensor` 242 | object goes out of scope, and should be freed manually when they are no 243 | longer needed. 244 | 245 | ```cpp 246 | /* 247 | Add inputs to a request as shown in the previous step... 248 | */ 249 | 250 | void* buffer_ptr0 = malloc(64); 251 | void* buffer_ptr1 = malloc(64); 252 | 253 | // Provide pre-allocated buffer for each output tensor. 254 | Tensor output0(reinterpret_cast(buffer_ptr0), 64, MemoryType::CPU, 0); 255 | Tensor output1(reinterpret_cast(buffer_ptr1), 64, MemoryType::CPU, 0); 256 | 257 | request->AddRequestedOutput("OUTPUT0_NAME", output0); 258 | request->AddRequestedOutput("OUTPUT1_NAME", output1); 259 | 260 | // Call the inference method. 261 | std::future> result_future = server->AsyncInfer(*request); 262 | 263 | // Get the infer result and check the result. 
264 | auto result = result_future.get(); 265 | if (result->HasError()) { 266 | std::cerr << result->ErrorMsg(); 267 | } else { 268 | // Retrieve output data from 'InferResult' object... 269 | } 270 | 271 | // Need to free the buffer manually. 272 | free(buffer_ptr0); 273 | free(buffer_ptr1); 274 | ``` 275 | 276 | The output data is owned by each returned output `Tensor` object. 277 | For cases using the default allocator or a custom allocator, the deallocation of 278 | the buffer where the output data is stored will occur when the `Tensor` 279 | object goes out of scope. 280 | 281 | #### Non-Inference APIs 282 | 283 | Server Wrapper contains APIs for loading/unloading models, getting metrics, 284 | getting the model index, etc. The use of these functions is straightforward and these 285 | functions are documented in 286 | [server_wrapper.h](include/triton/developer_tools/server_wrapper.h). You can 287 | find some of the functions demonstrated in the [examples](examples). 288 | 289 | #### Error Handling 290 | 291 | Most Higher Level Server C++ API functions throw a `TritonException` when an 292 | error occurs. You can utilize `TritonException`, which is documented in 293 | [common.h](include/triton/developer_tools/common.h), in your application for 294 | error handling. 295 | 296 | #### Examples 297 | 298 | A simple example using the Server Wrapper can be found in 299 | [simple_addsub_async_infer.cc](examples/simple_addsub_async_infer.cc), 300 | which is heavily commented. For string type I/O, an example can be found in 301 | [addsub_string_async_infer.cc](examples/addsub_string_async_infer.cc). For 302 | decoupled models, please refer to 303 | [square_async_infer.cc](examples/square_async_infer.cc). 304 | 305 | When running the examples, make sure the model repository is placed under the 306 | same path, and `LD_LIBRARY_PATH` is set properly for `libtritonserver.so`. 307 | 308 | ``` 309 | # Prepare the models required by the examples. 310 | 311 | $ cd /path/to/developer_tools/server 312 | 313 | $ mkdir -p ./examples/models 314 | 315 | # Copy over the models placed in the qa folder. 316 | $ cp -r ../qa/L0_server_unit_test/models/add_sub* ./examples/models/. 317 | 318 | # Copy over the models placed in the server repository. 319 | $ git clone https://github.com/triton-inference-server/server.git 320 | $ cp -r server/docs/examples/model_repository/simple ./examples/models/. 321 | 322 | # Copy over the decoupled model placed in the python_backend repository. 323 | $ git clone https://github.com/triton-inference-server/python_backend.git 324 | $ mkdir -p ./examples/models/square_int32/1 325 | $ cp python_backend/examples/decoupled/square_model.py ./examples/models/square_int32/1/model.py 326 | $ cp python_backend/examples/decoupled/square_config.pbtxt ./examples/models/square_int32/config.pbtxt 327 | 328 | # Copy over the executables from the install directory.
329 | $ cp /path/to/install/bin/simple_addsub_async_infer ./examples 330 | $ cp /path/to/install/bin/addsub_string_async_infer ./examples 331 | $ cp /path/to/install/bin/square_async_infer ./examples 332 | 333 | # Assume libtritonserver.so is placed under "/opt/tritonserver/lib" 334 | $ LD_LIBRARY_PATH=/opt/tritonserver/lib:${LD_LIBRARY_PATH} 335 | 336 | $ cd ./examples 337 | 338 | # Run examples 339 | $ ./simple_addsub_async_infer 340 | $ ./addsub_string_async_infer 341 | $ ./square_async_infer 342 | ``` 343 | 344 | ## Triton Server C-API Wrapper Java Bindings 345 | Similar to the [Java bindings for In-Process Triton Server API](https://github.com/triton-inference-server/server/blob/main/docs/customization_guide/inference_protocols.md#java-bindings-for-in-process-triton-server-api) C-API Wrapper Java Bindings 346 | is created using [Java CPP](https://github.com/bytedeco/javacpp). 347 | 348 | 349 | The API is documented in [tritonserver.java](https://github.com/bytedeco/javacpp-presets/blob/master/tritonserver/src/gen/java/org/bytedeco/tritonserver/global/tritonserver.java). 350 | **Note:** Currently, `tritonserver.java` contains bindings for both 351 | `In-Process API` and `C-API Wrapper`. 352 | More information about the `In-Process API` can be found in [Inference Protocol README](https://github.com/triton-inference-server/server/blob/main/docs/customization_guide/inference_protocols.md#in-process-triton-server-api). 353 | 354 | 355 | A simple example using the Java API can be found in 356 | [Samples folder](https://github.com/bytedeco/javacpp-presets/tree/master/tritonserver/samples) 357 | which includes `SimpleCPP.java` which is similar to 358 | [`simple.cc`](https://github.com/triton-inference-server/server/blob/main/src/simple.cc), which uses the `In-Process API`. 359 | 360 | 361 | In the [QA folder](https://github.com/triton-inference-server/developer_tools/tree/main/qa), folders starting with L0_java include Java API tests. 362 | 363 | ### Java API setup instructions 364 | 365 | Please refer to [Java API setup instructions](https://github.com/triton-inference-server/server/blob/main/docs/customization_guide/inference_protocols.md#java-api-setup-instructions) for instructions on how to use C-API Wrapper Java Bindings. 366 | -------------------------------------------------------------------------------- /server/include/triton/developer_tools/server_wrapper.h: -------------------------------------------------------------------------------- 1 | // Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | // 3 | // Redistribution and use in source and binary forms, with or without 4 | // modification, are permitted provided that the following conditions 5 | // are met: 6 | // * Redistributions of source code must retain the above copyright 7 | // notice, this list of conditions and the following disclaimer. 8 | // * Redistributions in binary form must reproduce the above copyright 9 | // notice, this list of conditions and the following disclaimer in the 10 | // documentation and/or other materials provided with the distribution. 11 | // * Neither the name of NVIDIA CORPORATION nor the names of its 12 | // contributors may be used to endorse or promote products derived 13 | // from this software without specific prior written permission. 
14 | // 15 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | #pragma once 27 | 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | 36 | #include "generic_server_wrapper.h" 37 | #include "triton/core/tritonserver.h" 38 | 39 | #ifdef TRITON_ENABLE_GPU 40 | #include 41 | #endif // TRITON_ENABLE_GPU 42 | 43 | namespace triton { namespace developer_tools { namespace server { 44 | 45 | class Allocator; 46 | class InferResult; 47 | class InferRequest; 48 | struct ResponseParameters; 49 | class TraceManager; 50 | 51 | //============================================================================== 52 | /// Object that encapsulates in-process C API functionalities. 53 | /// 54 | class TritonServer : public GenericTritonServer { 55 | public: 56 | static std::unique_ptr Create( 57 | const ServerOptions& server_options); 58 | 59 | virtual ~TritonServer(); 60 | 61 | /// Load the requested model or reload the model if it is already loaded. 62 | /// \param model_name The name of the model. 63 | void LoadModel(const std::string& model_name) override; 64 | 65 | /// Unload the requested model. Unloading a model that is not loaded 66 | /// on server has no affect. 67 | /// \param model_name The name of the model. 68 | void UnloadModel(const std::string& model_name) override; 69 | 70 | /// Get the set of names of models that are loaded and ready for inference. 71 | /// \return Returns the set of names of models that are 72 | /// loaded and ready for inference. 73 | std::set LoadedModels() override; 74 | 75 | /// Get the index of model repository contents. 76 | /// \return Returns a vector of 'RepositoryIndex' object 77 | /// representing the repository index. 78 | std::vector ModelIndex() override; 79 | 80 | /// Get the metrics of the server. 81 | /// \return Returns a string representing the metrics. 82 | std::string ServerMetrics() override; 83 | 84 | /// Get the inference statistics of the specified model. 85 | /// \param model_name The name of the model. 86 | /// \param model_version the version of the model requested. 87 | /// \return Returns a json string representing the model metrics. 88 | std::string ModelStatistics( 89 | const std::string& model_name, const int64_t model_version) override; 90 | 91 | /// Run synchronous inference on server. 92 | /// \param infer_request The InferRequest object contains 93 | /// the inputs, outputs and infer options for an inference request. 94 | /// \return Returns the result of inference as a future of 95 | /// a unique pointer of InferResult object. 96 | virtual std::unique_ptr Infer(InferRequest& infer_request) = 0; 97 | 98 | /// Run asynchronous inference on server. 
99 | /// \param infer_request The InferRequest object contains 100 | /// the inputs, outputs and infer options for an inference request. 101 | /// \return Returns the result of inference as a future of 102 | /// a unique pointer of InferResult object. 103 | virtual std::future> AsyncInfer( 104 | InferRequest& infer_request) = 0; 105 | 106 | /// Is the server live? 107 | /// \return Returns true if server is live, false otherwise. 108 | bool IsServerLive() override; 109 | 110 | /// Is the server ready? 111 | /// \return Returns true if server is ready, false otherwise. 112 | bool IsServerReady() override; 113 | 114 | /// Stop a server object. A server can't be restarted once it is 115 | /// stopped. 116 | void ServerStop() override; 117 | 118 | /// Is the model ready? 119 | /// \param model_name The name of the model to get readiness for. 120 | /// \param model_version The version of the model to get readiness 121 | /// for. If -1 then the server will choose a version based on the 122 | /// model's policy. This field is optional, default is -1. 123 | /// \return Returns true if server is ready, false otherwise. 124 | bool IsModelReady( 125 | const std::string& model_name, const int64_t model_version = -1) override; 126 | 127 | /// Get the configuration of specified model. 128 | /// \param model_name The name of the model. 129 | /// \param model_version The version of the model to get configuration. 130 | /// The default value is -1 which means then the server will 131 | /// choose a version based on the model and internal policy. This field is 132 | /// optional. \return Returns JSON representation of model configuration as a 133 | /// string. 134 | std::string ModelConfig( 135 | const std::string& model_name, const int64_t model_version = -1) override; 136 | 137 | /// Get the metadata of the server. 138 | /// \return Returns JSON representation of server metadata as a string. 139 | std::string ServerMetadata() override; 140 | 141 | /// Get the metadata of specified model. 142 | /// \param model_name The name of the model. 143 | /// \param model_version The version of the model to get configuration. 144 | /// The default value is -1 which means then the server will choose a version 145 | /// based on the model and internal policy. This field is optional. 146 | /// \return Returns JSON representation of model metadata as a string. 147 | std::string ModelMetadata( 148 | const std::string& model_name, const int64_t model_version = -1) override; 149 | 150 | /// Register a new model repository. This function is not available in polling 151 | /// mode. 152 | /// \param new_model_repo The 'NewModelRepo' object contains the info of the 153 | /// new model repo to be registered. 154 | void RegisterModelRepo(const NewModelRepo& new_model_repo) override; 155 | 156 | /// Unregister a model repository. This function is not available in polling 157 | /// mode. 158 | /// \param repo_path The full path to the model repository. 
159 | void UnregisterModelRepo(const std::string& repo_path) override; 160 | 161 | protected: 162 | void PrepareInferenceRequest( 163 | TRITONSERVER_InferenceRequest** irequest, const InferRequest& request); 164 | 165 | void PrepareInferenceInput( 166 | TRITONSERVER_InferenceRequest* irequest, const InferRequest& request); 167 | 168 | void PrepareInferenceOutput( 169 | TRITONSERVER_InferenceRequest* irequest, InferRequest& request); 170 | 171 | void PreprocessIrequest( 172 | TRITONSERVER_InferenceRequest** irequest, 173 | const InferRequest& infer_request); 174 | 175 | // The server object. 176 | std::shared_ptr server_; 177 | // The allocator object allocating output tensor. 178 | TRITONSERVER_ResponseAllocator* allocator_; 179 | // The trace manager. 180 | std::shared_ptr trace_manager_; 181 | }; 182 | 183 | 184 | //============================================================================== 185 | /// An interface for InferResult object to interpret the response to an 186 | /// inference request. 187 | /// 188 | class InferResult : public GenericInferResult { 189 | public: 190 | virtual ~InferResult(); 191 | 192 | /// Get the name of the model which generated this response. 193 | /// \return Returns the name of the model. 194 | std::string ModelName() noexcept override; 195 | 196 | /// Get the version of the model which generated this response. 197 | /// \return Returns the version of the model. 198 | std::string ModelVersion() noexcept override; 199 | 200 | /// Get the id of the request which generated this response. 201 | /// \return Returns the id of the request. 202 | std::string Id() noexcept override; 203 | 204 | /// Get the output names from the infer result 205 | /// \return Vector of output names 206 | std::vector OutputNames() override; 207 | /// Get the result output as a shared pointer of 'Tensor' object. The 'buffer' 208 | /// field of the output is owned by the returned 'Tensor' object itself. Note 209 | /// that for string data, need to use 'StringData' function for string data 210 | /// result. 211 | /// \param name The name of the output tensor to be retrieved. 212 | /// \return Returns the output result as a shared pointer of 'Tensor' object. 213 | std::shared_ptr Output(const std::string& name) override; 214 | 215 | /// Get the result data as a vector of strings. The vector will 216 | /// receive a copy of result data. An exception will be thrown if 217 | /// the data type of output is not 'BYTES'. 218 | /// \param output_name The name of the output to get result data. 219 | /// \return Returns the result data represented as a vector of strings. The 220 | /// strings are stored in the row-major order. 221 | std::vector StringData(const std::string& output_name) override; 222 | 223 | /// Return the complete response as a user friendly string. 224 | /// \return The string describing the complete response. 225 | std::string DebugString() override; 226 | 227 | /// Return if there is an error within this result. 228 | /// \return True if this 'InferResult' object has an error, false if no error. 229 | bool HasError() override; 230 | 231 | /// Return the error message of the error. 232 | /// \return The message for the error. Empty if no error. 233 | std::string ErrorMsg() override; 234 | 235 | // Get the pointer to the future of the next result. This function is used for 236 | // retrieving multiple responses from decoupled model. If there is no next 237 | // result, this function will return nullptr. 
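// A minimal retrieval sketch (mirroring the decoupled tests in wrapper_test.cc): starting from the first result returned by AsyncInfer(), keep calling GetNextResult() and waiting on the returned future; a nullptr from GetNextResult(), or a null result, indicates that no further responses will arrive.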
238 | std::unique_ptr>> GetNextResult(); 239 | 240 | friend class InternalServer; 241 | 242 | protected: 243 | InferResult(); 244 | const char* model_name_; 245 | int64_t model_version_; 246 | const char* request_id_; 247 | std::vector> params_; 248 | std::unordered_map> infer_outputs_; 249 | bool has_error_; 250 | std::string error_msg_; 251 | 252 | // The pointer to the future of the next result. 253 | std::unique_ptr>> 254 | next_result_future_; 255 | 256 | TRITONSERVER_InferenceResponse* completed_response_; 257 | }; 258 | 259 | //============================================================================== 260 | /// Object that describes an inflight inference request. 261 | /// 262 | class InferRequest : public GenericInferRequest { 263 | public: 264 | /// Create an InferRequest instance. 265 | static std::unique_ptr Create( 266 | const InferOptions& infer_options); 267 | 268 | ~InferRequest(); 269 | 270 | /// Add an input tensor to be sent within an InferRequest object. The input 271 | /// data buffer within the 'Tensor' object must not be modified until 272 | /// inference is completed and result is returned. 273 | /// \param name The name of the input tensor. 274 | /// \param input A Tensor object that describes an input tensor. 275 | void AddInput(const std::string& name, const Tensor& input) noexcept override; 276 | 277 | /// Add an input tensor to be sent within an InferRequest object. This 278 | /// function is for containers holding 'non-string' data elements. Data in the 279 | /// container should be contiguous, and the the container must not be modified 280 | /// until inference is completed and result is returned. 281 | /// \param name The name of the input tensor. 282 | /// \param begin The begin iterator of the container. 283 | /// \param end The end iterator of the container. 284 | /// \param data_type The data type of the input. 285 | /// \param shape The shape of the input. 286 | /// \param memory_type The memory type of the input. 287 | /// \param memory_type_id The ID of the memory for the tensor. (e.g. '0' is 288 | /// the memory type id of 'GPU-0') 289 | template < 290 | typename Iterator, 291 | typename std::enable_if::value_type, 293 | std::string>::value>::type* = nullptr> 294 | void AddInput( 295 | const std::string& name, const Iterator begin, const Iterator end, 296 | const DataType& data_type, const std::vector& shape, 297 | const MemoryType& memory_type, const int64_t memory_type_id) noexcept; 298 | 299 | /// Add an input tensor to be sent within an InferRequest object. This 300 | /// function is for containers holding 'string' elements. Data in the 301 | /// container should be contiguous, and the the container must not be modified 302 | /// until inference is completed and the result is returned. 303 | /// \param name The name of the input tensor. 304 | /// \param begin The begin iterator of the container. 305 | /// \param end The end iterator of the container. 306 | /// \param data_type The data type of the input. For 'string' input, data type 307 | /// should be 'BYTES'. 308 | /// \param shape The shape of the input. 309 | /// \param memory_type The memory type of the input. 310 | /// \param memory_type_id The ID of the memory for the tensor. (e.g. 
'0' is 311 | /// the memory type id of 'GPU-0') 312 | template < 313 | typename Iterator, 314 | typename std::enable_if::value_type, 316 | std::string>::value>::type* = nullptr> 317 | void AddInput( 318 | const std::string& name, const Iterator begin, const Iterator end, 319 | const DataType& data_type, const std::vector& shape, 320 | const MemoryType& memory_type, const int64_t memory_type_id) noexcept; 321 | 322 | /// Add a requested output to be sent within an InferRequest object. 323 | /// Calling this function is optional. If no output(s) are specifically 324 | /// requested then all outputs defined by the model will be calculated and 325 | /// returned. Pre-allocated buffer for each output should be specified within 326 | /// the 'Tensor' object. 327 | /// \param name The name of the output tensor. 328 | /// \param output A Tensor object that describes an output tensor containing 329 | /// its pre-allocated buffer. 330 | void AddRequestedOutput(const std::string& name, Tensor& output) override; 331 | 332 | /// Add a requested output to be sent within an InferRequest object. 333 | /// Calling this function is optional. If no output(s) are specifically 334 | /// requested then all outputs defined by the model will be calculated and 335 | /// returned. 336 | /// \param name The name of the output tensor. 337 | void AddRequestedOutput(const std::string& name) override; 338 | 339 | /// Clear inputs and outputs of the request. This allows users to reuse the 340 | /// InferRequest object if needed. 341 | void Reset() override; 342 | friend class TritonServer; 343 | friend class InternalServer; 344 | 345 | protected: 346 | InferRequest(); 347 | 348 | std::unique_ptr infer_options_; 349 | std::list str_bufs_; 350 | std::unordered_map> inputs_; 351 | std::vector> outputs_; 352 | 353 | // The map for each output tensor and a tuple of it's pre-allocated buffer, 354 | // byte size, memory type and memory type id. 355 | TensorAllocMap tensor_alloc_map_; 356 | // The updated trace setting for the specified model set within 357 | // 'InferOptions'. If set, the lifetime of this 'TraceManager::Trace' object 358 | // should be long enough until the trace associated with this request is 359 | // written to file. 360 | std::shared_ptr trace_; 361 | 362 | // If the requested model is a decoupled model. If true, the lifetime of this 363 | // 'InferRequest' should be long enough until all the responses are returned 364 | // and retrieved. 365 | bool is_decoupled_; 366 | 367 | private: 368 | // The promise object used for setting value to the result future. 369 | std::unique_ptr>> prev_promise_; 370 | }; 371 | //============================================================================== 372 | /// Helper functions to convert Wrapper enum to string. 
373 | /// 374 | std::string MemoryTypeString(const MemoryType& memory_type); 375 | std::string DataTypeString(const DataType& data_type); 376 | std::string ModelReadyStateString(const ModelReadyState& state); 377 | 378 | //============================================================================== 379 | /// Implementation of template functions 380 | /// 381 | template < 382 | typename Iterator, typename std::enable_if::value_type, 384 | std::string>::value>::type*> 385 | void 386 | InferRequest::AddInput( 387 | const std::string& name, const Iterator begin, const Iterator end, 388 | const DataType& data_type, const std::vector& shape, 389 | const MemoryType& memory_type, const int64_t memory_type_id) noexcept 390 | { 391 | // Serialize the strings into a "raw" buffer. The first 4-bytes are 392 | // the length of the string length. Next are the actual string 393 | // characters. There is *not* a null-terminator on the string. 394 | str_bufs_.emplace_back(); 395 | std::string& sbuf = str_bufs_.back(); 396 | 397 | Iterator it; 398 | for (it = begin; it != end; it++) { 399 | auto len = it->size(); 400 | sbuf.append(reinterpret_cast(&len), sizeof(uint32_t)); 401 | sbuf.append(*it); 402 | } 403 | Tensor input( 404 | reinterpret_cast(&sbuf[0]), sbuf.size(), DataType::BYTES, shape, 405 | memory_type, memory_type_id); 406 | 407 | AddInput(name, input); 408 | } 409 | 410 | template < 411 | typename Iterator, typename std::enable_if::value_type, 413 | std::string>::value>::type*> 414 | void 415 | InferRequest::AddInput( 416 | const std::string& name, const Iterator begin, const Iterator end, 417 | const DataType& data_type, const std::vector& shape, 418 | const MemoryType& memory_type, const int64_t memory_type_id) noexcept 419 | { 420 | // FIXME (DLIS-4134) This function should also work for non-contiguous 421 | // container, and input data should be copied so that we don't need to worry 422 | // about the lifetime of input data. 423 | size_t bytes = sizeof(*begin) * std::distance(begin, end); 424 | Tensor input( 425 | reinterpret_cast(&(*begin)), bytes, data_type, shape, memory_type, 426 | memory_type_id); 427 | 428 | AddInput(name, input); 429 | } 430 | 431 | }}} // namespace triton::developer_tools::server 432 | -------------------------------------------------------------------------------- /server/src/tracer.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | // 3 | // Redistribution and use in source and binary forms, with or without 4 | // modification, are permitted provided that the following conditions 5 | // are met: 6 | // * Redistributions of source code must retain the above copyright 7 | // notice, this list of conditions and the following disclaimer. 8 | // * Redistributions in binary form must reproduce the above copyright 9 | // notice, this list of conditions and the following disclaimer in the 10 | // documentation and/or other materials provided with the distribution. 11 | // * Neither the name of NVIDIA CORPORATION nor the names of its 12 | // contributors may be used to endorse or promote products derived 13 | // from this software without specific prior written permission. 14 | // 15 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | // PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | #include "tracer.h" 28 | 29 | #include 30 | 31 | #include 32 | 33 | #include "triton/common/logging.h" 34 | #ifdef TRITON_ENABLE_GPU 35 | #include 36 | #endif // TRITON_ENABLE_GPU 37 | #include 38 | 39 | namespace triton { namespace developer_tools { namespace server { 40 | 41 | #define IGNORE_ERROR(X) \ 42 | do { \ 43 | TRITONSERVER_Error* ie_err__ = (X); \ 44 | if (ie_err__ != nullptr) { \ 45 | TRITONSERVER_ErrorDelete(ie_err__); \ 46 | } \ 47 | } while (false) 48 | 49 | #define LOG_IF_ERROR(X, MSG) \ 50 | do { \ 51 | TRITONSERVER_Error* lie_err__ = (X); \ 52 | if (lie_err__ != nullptr) { \ 53 | IGNORE_ERROR(TRITONSERVER_LogMessage( \ 54 | TRITONSERVER_LOG_ERROR, __FILE__, __LINE__, \ 55 | (std::string(MSG) + ": " + TRITONSERVER_ErrorCodeString(lie_err__) + \ 56 | " - " + TRITONSERVER_ErrorMessage(lie_err__)) \ 57 | .c_str())); \ 58 | TRITONSERVER_ErrorDelete(lie_err__); \ 59 | } \ 60 | } while (false) 61 | 62 | TraceManager::TraceManager( 63 | const TRITONSERVER_InferenceTraceLevel level, const uint32_t rate, 64 | const int32_t count, const uint32_t log_frequency, 65 | const std::string& filepath) 66 | { 67 | std::shared_ptr file(new TraceFile(filepath)); 68 | global_setting_.reset( 69 | new TraceSetting(level, rate, count, log_frequency, file)); 70 | trace_files_.emplace(filepath, file); 71 | } 72 | 73 | void 74 | TraceManager::UpdateTraceSetting( 75 | const std::string& model_name, const TraceSetting& new_setting) 76 | { 77 | std::shared_ptr setting(new TraceSetting( 78 | new_setting.level_, new_setting.rate_, new_setting.count_, 79 | new_setting.log_frequency_, new_setting.file_)); 80 | if ((!setting->Valid()) && 81 | (new_setting.level_ != TRITONSERVER_TRACE_LEVEL_DISABLED)) { 82 | throw TritonException( 83 | std::string("Attempting to set invalid trace setting: ") + 84 | setting->Reason()); 85 | } 86 | 87 | std::lock_guard r_lk(r_mu_); 88 | auto it = model_settings_.find(model_name); 89 | if (it != model_settings_.end()) { 90 | // Model update 91 | it->second = std::move(setting); 92 | } else { 93 | // Model init 94 | model_settings_.emplace(model_name, setting); 95 | } 96 | } 97 | 98 | std::shared_ptr 99 | TraceManager::SampleTrace(const std::string& model_name) 100 | { 101 | std::shared_ptr trace_setting; 102 | { 103 | std::lock_guard r_lk(r_mu_); 104 | auto m_it = model_settings_.find(model_name); 105 | trace_setting = 106 | (m_it == model_settings_.end()) ? 
global_setting_ : m_it->second; 107 | } 108 | std::shared_ptr ts = trace_setting->SampleTrace(); 109 | if (ts != nullptr) { 110 | ts->setting_ = trace_setting; 111 | } 112 | return ts; 113 | } 114 | 115 | void 116 | TraceManager::TraceRelease(TRITONSERVER_InferenceTrace* trace, void* userp) 117 | { 118 | uint64_t parent_id; 119 | LOG_IF_ERROR( 120 | TRITONSERVER_InferenceTraceParentId(trace, &parent_id), 121 | "getting trace parent id"); 122 | // The userp will be shared with the trace children, so only delete it 123 | // if the root trace is being released 124 | if (parent_id == 0) { 125 | delete reinterpret_cast*>(userp); 126 | } 127 | LOG_IF_ERROR(TRITONSERVER_InferenceTraceDelete(trace), "deleting trace"); 128 | } 129 | 130 | void 131 | TraceManager::TraceActivity( 132 | TRITONSERVER_InferenceTrace* trace, 133 | TRITONSERVER_InferenceTraceActivity activity, uint64_t timestamp_ns, 134 | void* userp) 135 | { 136 | uint64_t id; 137 | LOG_IF_ERROR(TRITONSERVER_InferenceTraceId(trace, &id), "getting trace id"); 138 | 139 | // The function may be called with different traces but the same 'userp', 140 | // group the activity of the same trace together for more readable output. 141 | auto ts = 142 | reinterpret_cast*>(userp)->get(); 143 | 144 | std::lock_guard lk(ts->mtx_); 145 | std::stringstream* ss = nullptr; 146 | { 147 | if (ts->streams_.find(id) == ts->streams_.end()) { 148 | std::unique_ptr stream(new std::stringstream()); 149 | ss = stream.get(); 150 | ts->streams_.emplace(id, std::move(stream)); 151 | } else { 152 | ss = ts->streams_[id].get(); 153 | // If the string stream is not newly created, add "," as there is 154 | // already content in the string stream 155 | *ss << ","; 156 | } 157 | } 158 | 159 | // If 'activity' is TRITONSERVER_TRACE_REQUEST_START then collect 160 | // and serialize trace details. 
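// (Illustrative shape of the fragment appended below, assuming a model named 'add_sub': {"id":1,"model_name":"add_sub","model_version":1,"parent_id":2} -- the "parent_id" field is emitted only for child traces, and an {"id":...,"timestamps":[...]} entry always follows.)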
161 | if (activity == TRITONSERVER_TRACE_REQUEST_START) { 162 | const char* model_name; 163 | int64_t model_version; 164 | uint64_t parent_id; 165 | 166 | LOG_IF_ERROR( 167 | TRITONSERVER_InferenceTraceModelName(trace, &model_name), 168 | "getting model name"); 169 | LOG_IF_ERROR( 170 | TRITONSERVER_InferenceTraceModelVersion(trace, &model_version), 171 | "getting model version"); 172 | LOG_IF_ERROR( 173 | TRITONSERVER_InferenceTraceParentId(trace, &parent_id), 174 | "getting trace parent id"); 175 | 176 | *ss << "{\"id\":" << id << ",\"model_name\":\"" << model_name 177 | << "\",\"model_version\":" << model_version; 178 | if (parent_id != 0) { 179 | *ss << ",\"parent_id\":" << parent_id; 180 | } 181 | *ss << "},"; 182 | } 183 | 184 | *ss << "{\"id\":" << id << ",\"timestamps\":[" 185 | << "{\"name\":\"" << TRITONSERVER_InferenceTraceActivityString(activity) 186 | << "\",\"ns\":" << timestamp_ns << "}]}"; 187 | } 188 | 189 | void 190 | TraceManager::TraceTensorActivity( 191 | TRITONSERVER_InferenceTrace* trace, 192 | TRITONSERVER_InferenceTraceActivity activity, const char* name, 193 | TRITONSERVER_DataType datatype, const void* base, size_t byte_size, 194 | const int64_t* shape, uint64_t dim_count, 195 | TRITONSERVER_MemoryType memory_type, int64_t memory_type_id, void* userp) 196 | { 197 | if ((activity != TRITONSERVER_TRACE_TENSOR_QUEUE_INPUT) && 198 | (activity != TRITONSERVER_TRACE_TENSOR_BACKEND_INPUT) && 199 | (activity != TRITONSERVER_TRACE_TENSOR_BACKEND_OUTPUT)) { 200 | LOG_ERROR << "Unsupported activity: " 201 | << TRITONSERVER_InferenceTraceActivityString(activity); 202 | return; 203 | } 204 | 205 | void* buffer_base = const_cast(base); 206 | if (memory_type == TRITONSERVER_MEMORY_GPU) { 207 | #ifdef TRITON_ENABLE_GPU 208 | buffer_base = malloc(byte_size); 209 | if (buffer_base == nullptr) { 210 | LOG_ERROR << "Failed to malloc CPU buffer"; 211 | return; 212 | } 213 | cudaError_t err = 214 | cudaMemcpy(buffer_base, base, byte_size, cudaMemcpyDeviceToHost); 215 | if (err != cudaSuccess) { 216 | throw TritonException( 217 | std::string("Error - copying buffer into CPU memory: ") + 218 | cudaGetErrorString(err)); 219 | } 220 | 221 | // FAIL_IF_CUDA_ERR( 222 | // cudaMemcpy(buffer_base, base, byte_size, cudaMemcpyDeviceToHost), 223 | // "copying buffer into CPU memory"); 224 | #else 225 | LOG_ERROR << "GPU buffer is unsupported"; 226 | return; 227 | #endif // TRITON_ENABLE_GPU 228 | } 229 | 230 | uint64_t id; 231 | LOG_IF_ERROR(TRITONSERVER_InferenceTraceId(trace, &id), "getting trace id"); 232 | 233 | // The function may be called with different traces but the same 'userp', 234 | // group the activity of the same trace together for more readable output. 235 | auto ts = 236 | reinterpret_cast*>(userp)->get(); 237 | 238 | std::lock_guard lk(ts->mtx_); 239 | std::stringstream* ss = nullptr; 240 | { 241 | if (ts->streams_.find(id) == ts->streams_.end()) { 242 | std::unique_ptr stream(new std::stringstream()); 243 | ss = stream.get(); 244 | ts->streams_.emplace(id, std::move(stream)); 245 | } else { 246 | ss = ts->streams_[id].get(); 247 | // If the string stream is not newly created, add "," as there is 248 | // already content in the string stream 249 | *ss << ","; 250 | } 251 | } 252 | 253 | // collect and serialize trace details. 
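// (Illustrative shape of the tensor record built below, e.g. for a small INT32 input: {"id":1,"activity":"TENSOR_QUEUE_INPUT","tensor":{"name":"INPUT0","data":"0,1,2","shape":"3","dtype":"INT32"}} -- numeric data is written as a comma-separated list, BYTES elements as escaped strings, while FP16/BF16 data is left out.)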
254 | *ss << "{\"id\":" << id << ",\"activity\":\"" 255 | << TRITONSERVER_InferenceTraceActivityString(activity) << "\""; 256 | // collect tensor 257 | *ss << ",\"tensor\":{"; 258 | // collect tensor name 259 | *ss << "\"name\":\"" << std::string(name) << "\""; 260 | // collect tensor data 261 | *ss << ",\"data\":\""; 262 | size_t element_count = 1; 263 | for (uint64_t i = 0; i < dim_count; i++) { 264 | element_count *= shape[i]; 265 | } 266 | switch (datatype) { 267 | case TRITONSERVER_TYPE_BOOL: { 268 | const uint8_t* bool_base = reinterpret_cast(buffer_base); 269 | for (size_t e = 0; e < element_count; ++e) { 270 | *ss << ((bool_base[e] == 0) ? false : true); 271 | if (e < (element_count - 1)) 272 | *ss << ","; 273 | } 274 | break; 275 | } 276 | case TRITONSERVER_TYPE_UINT8: { 277 | const uint8_t* cbase = reinterpret_cast(buffer_base); 278 | for (size_t e = 0; e < element_count; ++e) { 279 | *ss << cbase[e]; 280 | if (e < (element_count - 1)) 281 | *ss << ","; 282 | } 283 | break; 284 | } 285 | case TRITONSERVER_TYPE_UINT16: { 286 | const uint16_t* cbase = reinterpret_cast(buffer_base); 287 | for (size_t e = 0; e < element_count; ++e) { 288 | *ss << cbase[e]; 289 | if (e < (element_count - 1)) 290 | *ss << ","; 291 | } 292 | break; 293 | } 294 | case TRITONSERVER_TYPE_UINT32: { 295 | const uint32_t* cbase = reinterpret_cast(buffer_base); 296 | for (size_t e = 0; e < element_count; ++e) { 297 | *ss << cbase[e]; 298 | if (e < (element_count - 1)) 299 | *ss << ","; 300 | } 301 | break; 302 | } 303 | case TRITONSERVER_TYPE_UINT64: { 304 | const uint64_t* cbase = reinterpret_cast(buffer_base); 305 | for (size_t e = 0; e < element_count; ++e) { 306 | *ss << cbase[e]; 307 | if (e < (element_count - 1)) 308 | *ss << ","; 309 | } 310 | break; 311 | } 312 | case TRITONSERVER_TYPE_INT8: { 313 | const int8_t* cbase = reinterpret_cast(buffer_base); 314 | for (size_t e = 0; e < element_count; ++e) { 315 | *ss << cbase[e]; 316 | if (e < (element_count - 1)) 317 | *ss << ","; 318 | } 319 | break; 320 | } 321 | case TRITONSERVER_TYPE_INT16: { 322 | const int16_t* cbase = reinterpret_cast(buffer_base); 323 | for (size_t e = 0; e < element_count; ++e) { 324 | *ss << cbase[e]; 325 | if (e < (element_count - 1)) 326 | *ss << ","; 327 | } 328 | break; 329 | } 330 | case TRITONSERVER_TYPE_INT32: { 331 | const int32_t* cbase = reinterpret_cast(buffer_base); 332 | for (size_t e = 0; e < element_count; ++e) { 333 | *ss << cbase[e]; 334 | if (e < (element_count - 1)) 335 | *ss << ","; 336 | } 337 | break; 338 | } 339 | case TRITONSERVER_TYPE_INT64: { 340 | const int64_t* cbase = reinterpret_cast(buffer_base); 341 | for (size_t e = 0; e < element_count; ++e) { 342 | *ss << cbase[e]; 343 | if (e < (element_count - 1)) 344 | *ss << ","; 345 | } 346 | break; 347 | } 348 | 349 | // FP16 / BF16 already handled as binary blobs, no need to manipulate here 350 | case TRITONSERVER_TYPE_FP16: { 351 | break; 352 | } 353 | case TRITONSERVER_TYPE_BF16: { 354 | break; 355 | } 356 | 357 | case TRITONSERVER_TYPE_FP32: { 358 | const float* cbase = reinterpret_cast(buffer_base); 359 | for (size_t e = 0; e < element_count; ++e) { 360 | *ss << cbase[e]; 361 | if (e < (element_count - 1)) 362 | *ss << ","; 363 | } 364 | break; 365 | } 366 | case TRITONSERVER_TYPE_FP64: { 367 | const double* cbase = reinterpret_cast(buffer_base); 368 | for (size_t e = 0; e < element_count; ++e) { 369 | *ss << cbase[e]; 370 | if (e < (element_count - 1)) 371 | *ss << ","; 372 | } 373 | break; 374 | } 375 | case TRITONSERVER_TYPE_BYTES: { 376 | const 
char* cbase = reinterpret_cast(buffer_base); 377 | size_t offset = 0; 378 | for (size_t e = 0; e < element_count; ++e) { 379 | if ((offset + sizeof(uint32_t)) > byte_size) { 380 | return; 381 | } 382 | const size_t len = *(reinterpret_cast(cbase + offset)); 383 | offset += sizeof(uint32_t); 384 | if ((offset + len) > byte_size) { 385 | return; 386 | } 387 | std::string str(cbase + offset, len); 388 | *ss << "\\\"" << str << "\\\""; 389 | offset += len; 390 | 391 | if (e < (element_count - 1)) 392 | *ss << ","; 393 | } 394 | break; 395 | } 396 | case TRITONSERVER_TYPE_INVALID: { 397 | return; 398 | } 399 | } 400 | *ss << "\",\"shape\":\""; 401 | for (uint64_t i = 0; i < dim_count; i++) { 402 | *ss << shape[i]; 403 | if (i < (dim_count - 1)) { 404 | *ss << ","; 405 | } 406 | } 407 | *ss << "\",\"dtype\":\"" << TRITONSERVER_DataTypeString(datatype) << "\"}"; 408 | *ss << "}"; 409 | 410 | if (memory_type == TRITONSERVER_MEMORY_GPU) { 411 | #ifdef TRITON_ENABLE_GPU 412 | if (buffer_base != nullptr) { 413 | free(buffer_base); 414 | } 415 | #endif // TRITON_ENABLE_GPU 416 | } 417 | } 418 | 419 | TraceManager::Trace::~Trace() 420 | { 421 | // Write trace now 422 | setting_->WriteTrace(streams_); 423 | } 424 | 425 | TraceManager::TraceFile::~TraceFile() 426 | { 427 | if (!first_write_) { 428 | trace_file_ << "]"; 429 | } 430 | } 431 | 432 | void 433 | TraceManager::TraceFile::SaveTraces( 434 | std::stringstream& trace_stream, const bool to_index_file) 435 | { 436 | try { 437 | if (to_index_file) { 438 | std::string file_name = 439 | file_name_ + "." + std::to_string(index_.fetch_add(1)); 440 | std::ofstream file_stream; 441 | file_stream.open(file_name); 442 | file_stream << "["; 443 | file_stream << trace_stream.rdbuf(); 444 | file_stream << "]"; 445 | } else { 446 | std::lock_guard lock(mu_); 447 | if (first_write_) { 448 | trace_file_.open(file_name_); 449 | trace_file_ << "["; 450 | first_write_ = false; 451 | } else { 452 | trace_file_ << ","; 453 | } 454 | trace_file_ << trace_stream.rdbuf(); 455 | } 456 | } 457 | catch (const std::ofstream::failure& e) { 458 | LOG_ERROR << "failed creating trace file: " << e.what(); 459 | } 460 | catch (...) 
{ 461 | LOG_ERROR << "failed creating trace file: reason unknown"; 462 | } 463 | } 464 | 465 | std::shared_ptr 466 | TraceManager::TraceSetting::SampleTrace() 467 | { 468 | bool create_trace = false; 469 | { 470 | std::lock_guard lk(mu_); 471 | if (!Valid()) { 472 | return nullptr; 473 | } 474 | create_trace = (((++sample_) % rate_) == 0); 475 | if (create_trace && (count_ > 0)) { 476 | --count_; 477 | ++created_; 478 | } 479 | } 480 | if (create_trace) { 481 | std::shared_ptr lts(new Trace()); 482 | // Split 'Trace' management to frontend and Triton trace separately 483 | // to avoid dependency between frontend request and Triton trace's liveness 484 | auto trace_userp = new std::shared_ptr(lts); 485 | TRITONSERVER_InferenceTrace* trace; 486 | TRITONSERVER_Error* err = TRITONSERVER_InferenceTraceTensorNew( 487 | &trace, level_, 0 /* parent_id */, TraceActivity, TraceTensorActivity, 488 | TraceRelease, trace_userp); 489 | if (err != nullptr) { 490 | LOG_IF_ERROR(err, "creating inference trace object"); 491 | delete trace_userp; 492 | return nullptr; 493 | } 494 | lts->trace_ = trace; 495 | lts->trace_userp_ = trace_userp; 496 | LOG_IF_ERROR( 497 | TRITONSERVER_InferenceTraceId(trace, <s->trace_id_), 498 | "getting trace id"); 499 | return lts; 500 | } 501 | 502 | return nullptr; 503 | } 504 | 505 | void 506 | TraceManager::TraceSetting::WriteTrace( 507 | const std::unordered_map>& 508 | streams) 509 | { 510 | std::unique_lock lock(mu_); 511 | 512 | if (sample_in_stream_ != 0) { 513 | trace_stream_ << ","; 514 | } 515 | ++sample_in_stream_; 516 | ++collected_; 517 | 518 | size_t stream_count = 0; 519 | for (const auto& stream : streams) { 520 | trace_stream_ << stream.second->rdbuf(); 521 | // Need to add ',' unless it is the last trace in the group 522 | ++stream_count; 523 | if (stream_count != streams.size()) { 524 | trace_stream_ << ","; 525 | } 526 | } 527 | // Write to file with index when one of the following is true 528 | // 1. trace_count is specified and that number of traces has been collected 529 | // 2. 
log_frequency is specified and that number of traces has been collected 530 | if (((count_ == 0) && (collected_ == sample_)) || 531 | ((log_frequency_ != 0) && (sample_in_stream_ >= log_frequency_))) { 532 | // Reset variables and release lock before saving to file 533 | sample_in_stream_ = 0; 534 | std::stringstream stream; 535 | trace_stream_.swap(stream); 536 | lock.unlock(); 537 | 538 | file_->SaveTraces(stream, true /* to_index_file */); 539 | } 540 | } 541 | 542 | TraceManager::TraceSetting::TraceSetting() 543 | : level_(TRITONSERVER_TRACE_LEVEL_DISABLED), rate_(0), count_(-1), 544 | log_frequency_(0), sample_(0), created_(0), collected_(0), 545 | sample_in_stream_(0) 546 | { 547 | invalid_reason_ = "Setting hasn't been initialized"; 548 | } 549 | 550 | TraceManager::TraceSetting::TraceSetting( 551 | const TRITONSERVER_InferenceTraceLevel level, const uint32_t rate, 552 | const int32_t count, const uint32_t log_frequency, 553 | const std::shared_ptr& file) 554 | : level_(level), rate_(rate), count_(count), log_frequency_(log_frequency), 555 | file_(file), sample_(0), created_(0), collected_(0), sample_in_stream_(0) 556 | { 557 | if (level_ == TRITONSERVER_TRACE_LEVEL_DISABLED) { 558 | invalid_reason_ = "tracing is disabled"; 559 | } else if (rate_ == 0) { 560 | invalid_reason_ = "sample rate must be non-zero"; 561 | } else if (file_->FileName().empty()) { 562 | invalid_reason_ = "trace file name is not given"; 563 | } 564 | } 565 | 566 | TraceManager::TraceSetting::~TraceSetting() 567 | { 568 | // If log frequency is set, should log the remaining traces to indexed file. 569 | if (sample_in_stream_ != 0) { 570 | file_->SaveTraces(trace_stream_, (log_frequency_ != 0)); 571 | } 572 | } 573 | 574 | }}} // namespace triton::developer_tools::server 575 | -------------------------------------------------------------------------------- /server/test/wrapper_test.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | // 3 | // Redistribution and use in source and binary forms, with or without 4 | // modification, are permitted provided that the following conditions 5 | // are met: 6 | // * Redistributions of source code must retain the above copyright 7 | // notice, this list of conditions and the following disclaimer. 8 | // * Redistributions in binary form must reproduce the above copyright 9 | // notice, this list of conditions and the following disclaimer in the 10 | // documentation and/or other materials provided with the distribution. 11 | // * Neither the name of NVIDIA CORPORATION nor the names of its 12 | // contributors may be used to endorse or promote products derived 13 | // from this software without specific prior written permission. 14 | // 15 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | // PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | #include 27 | 28 | #include "gtest/gtest.h" 29 | #include "triton/core/tritonserver.h" 30 | #include "triton/developer_tools/server_wrapper.h" 31 | 32 | namespace tds = triton::developer_tools::server; 33 | 34 | namespace { 35 | 36 | TEST(TritonServer, LibraryVersionCheck) 37 | { 38 | // Check that proper 'libtritonserver.so' is used 39 | uint32_t major = 0; 40 | uint32_t minor = 0; 41 | auto err = TRITONSERVER_ApiVersion(&major, &minor); 42 | ASSERT_TRUE(err == nullptr) << "Unexpected error from API version call"; 43 | ASSERT_EQ(major, TRITONSERVER_API_VERSION_MAJOR) << "Mismatch major version"; 44 | ASSERT_GE(minor, TRITONSERVER_API_VERSION_MINOR) << "Older minor version"; 45 | } 46 | 47 | TEST(TritonServer, StartInvalidRepository) 48 | { 49 | // Run server with invalid model repository 50 | try { 51 | tds::TritonServer::Create( 52 | tds::ServerOptions({"/invalid_model_repository"})); 53 | } 54 | catch (std::exception& ex) { 55 | ASSERT_STREQ( 56 | ex.what(), "Internal-failed to stat file /invalid_model_repository\n"); 57 | } 58 | catch (...) { 59 | ASSERT_NO_THROW(throw); 60 | } 61 | } 62 | 63 | class TritonServerTest : public ::testing::Test { 64 | protected: 65 | TritonServerTest() : options_({"./models"}) 66 | { 67 | options_.logging_ = tds::LoggingOptions( 68 | tds::LoggingOptions::VerboseLevel(0), false, false, false, 69 | tds::LoggingOptions::LogFormat::DEFAULT, ""); 70 | } 71 | 72 | tds::ServerOptions options_; 73 | }; 74 | 75 | void 76 | CPUAllocator( 77 | const char* tensor_name, size_t byte_size, 78 | tds::MemoryType preferred_memory_type, int64_t preferred_memory_type_id, 79 | void** buffer, tds::MemoryType* actual_memory_type, 80 | int64_t* actual_memory_type_id) 81 | { 82 | std::cout << "Using custom allocation function" << std::endl; 83 | 84 | *actual_memory_type = tds::MemoryType::CPU; 85 | *actual_memory_type_id = preferred_memory_type_id; 86 | 87 | // If 'byte_size' is zero just return 'buffer' == nullptr, we don't 88 | // need to do any other book-keeping. 
89 | if (byte_size == 0) { 90 | *buffer = nullptr; 91 | std::cout << "allocated " << byte_size << " bytes for result tensor " 92 | << tensor_name << std::endl; 93 | } else { 94 | void* allocated_ptr = malloc(byte_size); 95 | if (allocated_ptr != nullptr) { 96 | *buffer = allocated_ptr; 97 | std::cout << "allocated " << byte_size << " bytes in " 98 | << MemoryTypeString(*actual_memory_type) 99 | << " for result tensor " << tensor_name << std::endl; 100 | } 101 | } 102 | } 103 | 104 | void 105 | ResponseRelease( 106 | void* buffer, size_t byte_size, tds::MemoryType memory_type, 107 | int64_t memory_type_id) 108 | { 109 | std::cout << "Using custom response release function" << std::endl; 110 | 111 | std::stringstream ss; 112 | ss << buffer; 113 | std::string buffer_str = ss.str(); 114 | 115 | std::cout << "Releasing buffer " << buffer_str << " of size " 116 | << std::to_string(byte_size) << " in " 117 | << tds::MemoryTypeString(memory_type); 118 | 119 | switch (memory_type) { 120 | case tds::MemoryType::CPU: 121 | free(buffer); 122 | break; 123 | 124 | default: 125 | std::cerr << "error: unexpected buffer allocated in CUDA managed memory" 126 | << std::endl; 127 | break; 128 | } 129 | } 130 | 131 | TEST_F(TritonServerTest, StartNone) 132 | { 133 | // Start server with default mode (NONE) 134 | try { 135 | auto server = tds::TritonServer::Create(options_); 136 | std::set loaded_models = server->LoadedModels(); 137 | ASSERT_EQ(loaded_models.size(), 4); 138 | ASSERT_NE(loaded_models.find("add_sub"), loaded_models.end()); 139 | ASSERT_NE(loaded_models.find("add_sub_str"), loaded_models.end()); 140 | ASSERT_NE(loaded_models.find("failing_infer"), loaded_models.end()); 141 | ASSERT_NE(loaded_models.find("square_int32"), loaded_models.end()); 142 | } 143 | catch (...) { 144 | ASSERT_NO_THROW(throw); 145 | } 146 | } 147 | 148 | TEST_F(TritonServerTest, NoneLoadUnload) 149 | { 150 | // Start server with NONE mode which explicit model control is not allowed 151 | try { 152 | auto server = tds::TritonServer::Create(options_); 153 | server->LoadModel("add_sub"); 154 | server->UnloadModel("add_sub"); 155 | } 156 | catch (std::exception& ex) { 157 | ASSERT_STREQ( 158 | ex.what(), 159 | "Error - LoadModel: Unavailable-explicit model load / unload is not " 160 | "allowed if polling is enabled\n"); 161 | } 162 | catch (...) { 163 | ASSERT_NO_THROW(throw); 164 | } 165 | } 166 | 167 | TEST_F(TritonServerTest, Explicit) 168 | { 169 | try { 170 | options_.model_control_mode_ = tds::ModelControlMode::EXPLICIT; 171 | 172 | std::set startup_models; 173 | startup_models.insert("add_sub"); 174 | options_.startup_models_ = startup_models; 175 | 176 | auto server = tds::TritonServer::Create(options_); 177 | std::set loaded_models = server->LoadedModels(); 178 | ASSERT_EQ(loaded_models.size(), 1); 179 | ASSERT_EQ(*loaded_models.begin(), "add_sub"); 180 | server->UnloadModel("add_sub"); 181 | loaded_models = server->LoadedModels(); 182 | ASSERT_EQ(loaded_models.size(), 0); 183 | 184 | server->LoadModel("add_sub_str"); 185 | loaded_models = server->LoadedModels(); 186 | ASSERT_EQ(loaded_models.size(), 1); 187 | ASSERT_EQ(*loaded_models.begin(), "add_sub_str"); 188 | } 189 | catch (...) 
{ 190 | ASSERT_NO_THROW(throw); 191 | } 192 | } 193 | 194 | TEST_F(TritonServerTest, ModelRepoRegister) 195 | { 196 | try { 197 | options_.model_control_mode_ = tds::ModelControlMode::EXPLICIT; 198 | auto server = tds::TritonServer::Create(options_); 199 | server->UnregisterModelRepo("./models"); 200 | try { 201 | server->LoadModel("add_sub"); 202 | } 203 | catch (std::exception& ex) { 204 | ASSERT_STREQ( 205 | ex.what(), 206 | "Error - LoadModel: Internal-failed to load 'add_sub', failed to " 207 | "poll from model repository\n"); 208 | } 209 | server->RegisterModelRepo( 210 | tds::NewModelRepo("./models1", "add_sub", "add_sub1")); 211 | try { 212 | server->LoadModel("add_sub"); 213 | } 214 | catch (std::exception& ex) { 215 | ASSERT_STREQ( 216 | ex.what(), 217 | "Error - LoadModel: Internal-failed to load 'add_sub', failed to " 218 | "poll from model repository\n"); 219 | } 220 | server->LoadModel("add_sub1"); 221 | std::set loaded_models = server->LoadedModels(); 222 | ASSERT_EQ(loaded_models.size(), 1); 223 | ASSERT_EQ(*loaded_models.begin(), "add_sub1"); 224 | } 225 | catch (...) { 226 | ASSERT_NO_THROW(throw); 227 | } 228 | } 229 | 230 | TEST_F(TritonServerTest, InferMinimal) 231 | { 232 | try { 233 | auto server = tds::TritonServer::Create(options_); 234 | 235 | std::vector input_data; 236 | while (input_data.size() < 16) { 237 | input_data.emplace_back(input_data.size()); 238 | } 239 | auto request = tds::InferRequest::Create(tds::InferOptions("add_sub")); 240 | for (const auto& name : std::vector{"INPUT0", "INPUT1"}) { 241 | request->AddInput( 242 | name, tds::Tensor( 243 | reinterpret_cast(input_data.data()), 244 | input_data.size() * sizeof(int32_t), tds::DataType::INT32, 245 | {16}, tds::MemoryType::CPU, 0)); 246 | } 247 | std::future> result_future = 248 | server->AsyncInfer(*request); 249 | auto result = result_future.get(); 250 | ASSERT_FALSE(result->HasError()) << result->ErrorMsg(); 251 | 252 | // Check result metadata 253 | ASSERT_EQ(result->ModelName(), "add_sub"); 254 | ASSERT_EQ(result->ModelVersion(), "1"); 255 | ASSERT_EQ(result->Id(), ""); 256 | 257 | // OUTPUT0 -> sum 258 | { 259 | std::string out_name("OUTPUT0"); 260 | std::shared_ptr out = result->Output(out_name); 261 | ASSERT_EQ(out->shape_, std::vector{16}); 262 | ASSERT_EQ(out->data_type_, tds::DataType::INT32); 263 | ASSERT_EQ(out->byte_size_, (input_data.size() * sizeof(int32_t))); 264 | for (size_t i = 0; i < input_data.size(); ++i) { 265 | EXPECT_EQ( 266 | reinterpret_cast(out->buffer_)[i], 267 | (2 * input_data[i])); 268 | } 269 | } 270 | 271 | // OUTPUT1 -> diff 272 | { 273 | std::string out_name("OUTPUT1"); 274 | std::shared_ptr out = result->Output(out_name); 275 | ASSERT_EQ(out->shape_, std::vector{16}); 276 | ASSERT_EQ(out->data_type_, tds::DataType::INT32); 277 | ASSERT_EQ(out->byte_size_, (input_data.size() * sizeof(int32_t))); 278 | for (size_t i = 0; i < input_data.size(); ++i) { 279 | EXPECT_EQ(reinterpret_cast(out->buffer_)[i], 0); 280 | } 281 | } 282 | } 283 | catch (...) 
{ 284 | ASSERT_NO_THROW(throw); 285 | } 286 | } 287 | 288 | TEST_F(TritonServerTest, InferString) 289 | { 290 | try { 291 | auto server = tds::TritonServer::Create(options_); 292 | 293 | std::vector input_data; 294 | std::vector input_data_str; 295 | while (input_data.size() < 16) { 296 | input_data.emplace_back(input_data.size()); 297 | input_data_str.emplace_back(std::to_string(input_data.back())); 298 | } 299 | 300 | auto request = tds::InferRequest::Create(tds::InferOptions("add_sub_str")); 301 | for (const auto& name : std::vector{"INPUT0", "INPUT1"}) { 302 | request->AddInput( 303 | name, input_data_str.begin(), input_data_str.end(), 304 | tds::DataType::BYTES, {16}, tds::MemoryType::CPU, 0); 305 | } 306 | 307 | std::future> result_future = 308 | server->AsyncInfer(*request); 309 | auto result = result_future.get(); 310 | ASSERT_FALSE(result->HasError()) << result->ErrorMsg(); 311 | 312 | // Check result metadata 313 | ASSERT_EQ(result->ModelName(), "add_sub_str"); 314 | ASSERT_EQ(result->ModelVersion(), "1"); 315 | ASSERT_EQ(result->Id(), ""); 316 | 317 | std::vector out_str; 318 | std::vector shape; 319 | tds::DataType datatype; 320 | // OUTPUT0 -> sum 321 | { 322 | std::string out_name("OUTPUT0"); 323 | std::shared_ptr out = result->Output(out_name); 324 | ASSERT_EQ(out->shape_, std::vector{16}); 325 | ASSERT_EQ(out->data_type_, tds::DataType::BYTES); 326 | out_str = result->StringData(out_name); 327 | for (size_t i = 0; i < input_data.size(); ++i) { 328 | EXPECT_EQ(out_str[i], std::to_string(2 * input_data[i])); 329 | } 330 | } 331 | 332 | // OUTPUT1 -> diff 333 | { 334 | std::string out_name("OUTPUT1"); 335 | std::shared_ptr out = result->Output(out_name); 336 | ASSERT_EQ(out->shape_, std::vector{16}); 337 | ASSERT_EQ(out->data_type_, tds::DataType::BYTES); 338 | out_str = result->StringData(out_name); 339 | for (size_t i = 0; i < input_data.size(); ++i) { 340 | EXPECT_EQ(out_str[i], "0"); 341 | } 342 | } 343 | } 344 | catch (...) { 345 | ASSERT_NO_THROW(throw); 346 | } 347 | } 348 | 349 | TEST_F(TritonServerTest, InferFailed) 350 | { 351 | try { 352 | auto server = tds::TritonServer::Create(options_); 353 | 354 | std::vector input_data; 355 | while (input_data.size() < 16) { 356 | input_data.emplace_back(input_data.size()); 357 | } 358 | auto request = 359 | tds::InferRequest::Create(tds::InferOptions("failing_infer")); 360 | request->AddInput( 361 | "INPUT", tds::Tensor( 362 | reinterpret_cast(input_data.data()), 363 | input_data.size() * sizeof(int32_t), tds::DataType::INT32, 364 | {16}, tds::MemoryType::CPU, 0)); 365 | std::future> result_future = 366 | server->AsyncInfer(*request); 367 | auto result = result_future.get(); 368 | ASSERT_TRUE(result->HasError()); 369 | ASSERT_STREQ(result->ErrorMsg().c_str(), "Internal-An Error Occurred\n"); 370 | } 371 | catch (...) 
{ 372 | ASSERT_NO_THROW(throw); 373 | } 374 | } 375 | 376 | TEST_F(TritonServerTest, InferCustomAllocator) 377 | { 378 | try { 379 | auto server = tds::TritonServer::Create(options_); 380 | 381 | std::shared_ptr allocator( 382 | new tds::Allocator(CPUAllocator, ResponseRelease)); 383 | auto infer_options = tds::InferOptions("add_sub"); 384 | infer_options.custom_allocator_ = allocator; 385 | auto request = tds::InferRequest::Create(infer_options); 386 | 387 | std::vector input_data; 388 | while (input_data.size() < 16) { 389 | input_data.emplace_back(input_data.size()); 390 | } 391 | for (const auto& name : std::vector{"INPUT0", "INPUT1"}) { 392 | request->AddInput( 393 | name, tds::Tensor( 394 | reinterpret_cast(input_data.data()), 395 | input_data.size() * sizeof(int32_t), tds::DataType::INT32, 396 | {16}, tds::MemoryType::CPU, 0)); 397 | } 398 | std::future> result_future = 399 | server->AsyncInfer(*request); 400 | auto result = result_future.get(); 401 | ASSERT_FALSE(result->HasError()) << result->ErrorMsg(); 402 | 403 | // Check result metadata 404 | ASSERT_EQ(result->ModelName(), "add_sub"); 405 | ASSERT_EQ(result->ModelVersion(), "1"); 406 | ASSERT_EQ(result->Id(), ""); 407 | 408 | // OUTPUT0 -> sum 409 | { 410 | std::string out_name("OUTPUT0"); 411 | std::shared_ptr out = result->Output(out_name); 412 | ASSERT_EQ(out->shape_, std::vector{16}); 413 | ASSERT_EQ(out->data_type_, tds::DataType::INT32); 414 | ASSERT_EQ(out->byte_size_, (input_data.size() * sizeof(int32_t))); 415 | for (size_t i = 0; i < input_data.size(); ++i) { 416 | EXPECT_EQ( 417 | reinterpret_cast(out->buffer_)[i], 418 | (2 * input_data[i])); 419 | } 420 | } 421 | 422 | // OUTPUT1 -> diff 423 | { 424 | std::string out_name("OUTPUT1"); 425 | std::shared_ptr out = result->Output(out_name); 426 | ASSERT_EQ(out->shape_, std::vector{16}); 427 | ASSERT_EQ(out->data_type_, tds::DataType::INT32); 428 | ASSERT_EQ(out->byte_size_, (input_data.size() * sizeof(int32_t))); 429 | for (size_t i = 0; i < input_data.size(); ++i) { 430 | EXPECT_EQ(reinterpret_cast(out->buffer_)[i], 0); 431 | } 432 | } 433 | } 434 | catch (...) 
{ 435 | ASSERT_NO_THROW(throw); 436 | } 437 | } 438 | 439 | TEST_F(TritonServerTest, InferPreAllocatedBuffer) 440 | { 441 | try { 442 | auto server = tds::TritonServer::Create(options_); 443 | 444 | std::vector input_data; 445 | while (input_data.size() < 16) { 446 | input_data.emplace_back(input_data.size()); 447 | } 448 | auto request = tds::InferRequest::Create(tds::InferOptions("add_sub")); 449 | for (const auto& name : std::vector{"INPUT0", "INPUT1"}) { 450 | request->AddInput( 451 | name, tds::Tensor( 452 | reinterpret_cast(input_data.data()), 453 | input_data.size() * sizeof(int32_t), tds::DataType::INT32, 454 | {16}, tds::MemoryType::CPU, 0)); 455 | } 456 | 457 | // Provide pre-allocated buffer for 'OUTPUT0' and use default allocator for 458 | // 'OUTPUT1' 459 | void* buffer_output0 = malloc(64); 460 | tds::Tensor output0( 461 | reinterpret_cast(buffer_output0), 64, tds::MemoryType::CPU, 0); 462 | request->AddRequestedOutput("OUTPUT0", output0); 463 | request->AddRequestedOutput("OUTPUT1"); 464 | 465 | std::future> result_future = 466 | server->AsyncInfer(*request); 467 | auto result = result_future.get(); 468 | ASSERT_FALSE(result->HasError()) << result->ErrorMsg(); 469 | 470 | // Check result metadata 471 | ASSERT_EQ(result->ModelName(), "add_sub"); 472 | ASSERT_EQ(result->ModelVersion(), "1"); 473 | ASSERT_EQ(result->Id(), ""); 474 | 475 | // OUTPUT0 -> sum 476 | { 477 | std::string out_name("OUTPUT0"); 478 | std::shared_ptr out = result->Output(out_name); 479 | ASSERT_EQ(out->shape_, std::vector{16}); 480 | ASSERT_EQ(out->data_type_, tds::DataType::INT32); 481 | ASSERT_EQ(out->byte_size_, (input_data.size() * sizeof(int32_t))); 482 | for (size_t i = 0; i < input_data.size(); ++i) { 483 | EXPECT_EQ( 484 | reinterpret_cast(buffer_output0)[i], 485 | (2 * input_data[i])); 486 | } 487 | } 488 | 489 | // OUTPUT1 -> diff 490 | { 491 | std::string out_name("OUTPUT1"); 492 | std::shared_ptr out = result->Output(out_name); 493 | ASSERT_EQ(out->shape_, std::vector{16}); 494 | ASSERT_EQ(out->data_type_, tds::DataType::INT32); 495 | ASSERT_EQ(out->byte_size_, (input_data.size() * sizeof(int32_t))); 496 | for (size_t i = 0; i < input_data.size(); ++i) { 497 | EXPECT_EQ(reinterpret_cast(out->buffer_)[i], 0); 498 | } 499 | } 500 | 501 | free(buffer_output0); 502 | } 503 | catch (...) { 504 | ASSERT_NO_THROW(throw); 505 | } 506 | } 507 | 508 | TEST_F(TritonServerTest, InferDecoupledMultipleResponses) 509 | { 510 | try { 511 | auto server = tds::TritonServer::Create(options_); 512 | 513 | std::vector input_data = {3}; 514 | auto request = tds::InferRequest::Create(tds::InferOptions("square_int32")); 515 | request->AddInput( 516 | "IN", tds::Tensor( 517 | reinterpret_cast(input_data.data()), 518 | input_data.size() * sizeof(int32_t), tds::DataType::INT32, 519 | {1}, tds::MemoryType::CPU, 0)); 520 | std::future> result_future = 521 | server->AsyncInfer(*request); 522 | 523 | // Retrieve results from multiple responses. 
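// The loop below grows 'results' while GetNextResult() keeps returning a future; a null result (or the absence of a further future) marks the end of the decoupled response stream, so an input value of 3 is expected to produce exactly 3 responses.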
524 | std::vector> results; 525 | results.push_back(result_future.get()); 526 | size_t size = results.size(); 527 | int count = 0; 528 | for (size_t i = 0; i < size; i++) { 529 | if (results[i]) { 530 | ASSERT_FALSE(results[i]->HasError()) << results[i]->ErrorMsg(); 531 | auto next_future = results[i]->GetNextResult(); 532 | if (next_future) { 533 | results.push_back(next_future->get()); 534 | size++; 535 | } 536 | ASSERT_EQ(results[i]->ModelName(), "square_int32"); 537 | ASSERT_EQ(results[i]->ModelVersion(), "1"); 538 | ASSERT_EQ(results[i]->Id(), ""); 539 | count++; 540 | } 541 | } 542 | ASSERT_EQ(count, 3); 543 | 544 | // OUTPUT1 -> 3 545 | { 546 | for (auto& result : results) { 547 | if (result) { 548 | std::string out_name("OUT"); 549 | std::shared_ptr out = result->Output(out_name); 550 | ASSERT_EQ(out->shape_, std::vector{1}); 551 | ASSERT_EQ(out->data_type_, tds::DataType::INT32); 552 | ASSERT_EQ(out->byte_size_, (input_data.size() * sizeof(int32_t))); 553 | for (size_t i = 0; i < input_data.size(); ++i) { 554 | EXPECT_EQ(reinterpret_cast(out->buffer_)[i], 3); 555 | } 556 | } 557 | } 558 | } 559 | } 560 | catch (...) { 561 | ASSERT_NO_THROW(throw); 562 | } 563 | } 564 | 565 | TEST_F(TritonServerTest, InferDecoupledZeroResponse) 566 | { 567 | try { 568 | auto server = tds::TritonServer::Create(options_); 569 | 570 | std::vector input_data = {0}; 571 | auto request = tds::InferRequest::Create(tds::InferOptions("square_int32")); 572 | request->AddInput( 573 | "IN", tds::Tensor( 574 | reinterpret_cast(input_data.data()), 575 | input_data.size() * sizeof(int32_t), tds::DataType::INT32, 576 | {1}, tds::MemoryType::CPU, 0)); 577 | std::future> result_future = 578 | server->AsyncInfer(*request); 579 | std::vector> results; 580 | results.push_back(result_future.get()); 581 | size_t size = results.size(); 582 | int count = 0; 583 | for (size_t i = 0; i < size; i++) { 584 | if (results[i]) { 585 | ASSERT_FALSE(results[i]->HasError()) << results[i]->ErrorMsg(); 586 | auto next_future = results[i]->GetNextResult(); 587 | if (next_future) { 588 | results.push_back(next_future->get()); 589 | size++; 590 | } 591 | ASSERT_EQ(results[i]->ModelName(), "square_int32"); 592 | ASSERT_EQ(results[i]->ModelVersion(), "1"); 593 | ASSERT_EQ(results[i]->Id(), ""); 594 | count++; 595 | } 596 | } 597 | ASSERT_EQ(count, 0); 598 | 599 | { 600 | for (auto& result : results) { 601 | ASSERT_FALSE(result) << "Unexpected response."; 602 | } 603 | } 604 | } 605 | catch (...) { 606 | ASSERT_NO_THROW(throw); 607 | } 608 | } 609 | 610 | } // namespace 611 | 612 | int 613 | main(int argc, char** argv) 614 | { 615 | ::testing::InitGoogleTest(&argc, argv); 616 | return RUN_ALL_TESTS(); 617 | } 618 | -------------------------------------------------------------------------------- /server/examples/simple_addsub_async_infer.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | // 3 | // Redistribution and use in source and binary forms, with or without 4 | // modification, are permitted provided that the following conditions 5 | // are met: 6 | // * Redistributions of source code must retain the above copyright 7 | // notice, this list of conditions and the following disclaimer. 8 | // * Redistributions in binary form must reproduce the above copyright 9 | // notice, this list of conditions and the following disclaimer in the 10 | // documentation and/or other materials provided with the distribution. 
11 | // * Neither the name of NVIDIA CORPORATION nor the names of its 12 | // contributors may be used to endorse or promote products derived 13 | // from this software without specific prior written permission. 14 | // 15 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | #include 28 | 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | 35 | #include "triton/developer_tools/server_wrapper.h" 36 | 37 | 38 | #ifdef TRITON_ENABLE_GPU 39 | #include 40 | #endif // TRITON_ENABLE_GPU 41 | 42 | namespace tds = triton::developer_tools::server; 43 | 44 | namespace { 45 | 46 | #define FAIL(MSG) \ 47 | do { \ 48 | std::cerr << "error: " << (MSG) << std::endl; \ 49 | exit(1); \ 50 | } while (false) 51 | #ifdef TRITON_ENABLE_GPU 52 | #define FAIL_IF_CUDA_ERR(X, MSG) \ 53 | do { \ 54 | cudaError_t err__ = (X); \ 55 | if (err__ != cudaSuccess) { \ 56 | std::cerr << "error: " << (MSG) << ": " << cudaGetErrorString(err__) \ 57 | << std::endl; \ 58 | exit(1); \ 59 | } \ 60 | } while (false) 61 | #endif // TRITON_ENABLE_GPU 62 | 63 | bool enforce_memory_type = false; 64 | tds::MemoryType requested_memory_type; 65 | 66 | #ifdef TRITON_ENABLE_GPU 67 | static auto cuda_data_deleter = [](void* data) { 68 | if (data != nullptr) { 69 | cudaPointerAttributes attr; 70 | auto cuerr = cudaPointerGetAttributes(&attr, data); 71 | if (cuerr != cudaSuccess) { 72 | std::cerr << "error: failed to get CUDA pointer attribute of " << data 73 | << ": " << cudaGetErrorString(cuerr) << std::endl; 74 | } 75 | if (attr.type == cudaMemoryTypeDevice) { 76 | cuerr = cudaFree(data); 77 | } else if (attr.type == cudaMemoryTypeHost) { 78 | cuerr = cudaFreeHost(data); 79 | } 80 | if (cuerr != cudaSuccess) { 81 | std::cerr << "error: failed to release CUDA pointer " << data << ": " 82 | << cudaGetErrorString(cuerr) << std::endl; 83 | } 84 | } 85 | }; 86 | #endif // TRITON_ENABLE_GPU 87 | 88 | void 89 | Usage(char** argv, const std::string& msg = std::string()) 90 | { 91 | if (!msg.empty()) { 92 | std::cerr << msg << std::endl; 93 | } 94 | 95 | std::cerr << "Usage: " << argv[0] << " [options]" << std::endl; 96 | std::cerr << "\t-m <\"system\"|\"pinned\"|gpu>" 97 | << " Enforce the memory type for input and output tensors." 98 | << " If not specified, inputs will be in system memory and outputs" 99 | << " will be based on the model's preferred type." 
<< std::endl; 100 | std::cerr << "\t-v Enable verbose logging" << std::endl; 101 | 102 | exit(1); 103 | } 104 | 105 | template 106 | void 107 | GenerateInputData( 108 | std::vector* input0_data, std::vector* input1_data) 109 | { 110 | input0_data->resize(16 * sizeof(T)); 111 | input1_data->resize(16 * sizeof(T)); 112 | for (size_t i = 0; i < 16; ++i) { 113 | ((T*)input0_data->data())[i] = i; 114 | ((T*)input1_data->data())[i] = 1; 115 | } 116 | } 117 | 118 | template 119 | void 120 | CompareResult( 121 | const std::string& output0_name, const std::string& output1_name, 122 | const void* input0, const void* input1, const char* output0, 123 | const char* output1) 124 | { 125 | for (size_t i = 0; i < 16; ++i) { 126 | std::cout << ((T*)input0)[i] << " + " << ((T*)input1)[i] << " = " 127 | << ((T*)output0)[i] << std::endl; 128 | std::cout << ((T*)input0)[i] << " - " << ((T*)input1)[i] << " = " 129 | << ((T*)output1)[i] << std::endl; 130 | 131 | if ((((T*)input0)[i] + ((T*)input1)[i]) != ((T*)output0)[i]) { 132 | FAIL("incorrect sum in " + output0_name); 133 | } 134 | if ((((T*)input0)[i] - ((T*)input1)[i]) != ((T*)output1)[i]) { 135 | FAIL("incorrect difference in " + output1_name); 136 | } 137 | } 138 | } 139 | 140 | void 141 | ResponseAllocator( 142 | const char* tensor_name, size_t byte_size, 143 | tds::MemoryType preferred_memory_type, int64_t preferred_memory_type_id, 144 | void** buffer, tds::MemoryType* actual_memory_type, 145 | int64_t* actual_memory_type_id) 146 | { 147 | std::cout << "Using custom allocation function" << std::endl; 148 | 149 | // Initially attempt to make the actual memory type and id that we 150 | // allocate be the same as preferred memory type 151 | *actual_memory_type = preferred_memory_type; 152 | *actual_memory_type_id = preferred_memory_type_id; 153 | 154 | // If 'byte_size' is zero just return 'buffer' == nullptr, we don't 155 | // need to do any other book-keeping. 
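// A note on the contract implemented below: the server suggests a preferred
// memory type for each output buffer, and the allocator may honor it or pick
// another one, as long as the choice is reported back through
// 'actual_memory_type' and 'actual_memory_type_id'. When '-m' was passed on
// the command line, 'requested_memory_type' overrides the preference; the
// CPU_PINNED and GPU paths are only compiled in when TRITON_ENABLE_GPU is
// defined, and everything else falls back to plain malloc'd CPU memory.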
156 | if (byte_size == 0) { 157 | *buffer = nullptr; 158 | std::cout << "allocated " << byte_size << " bytes for result tensor " 159 | << tensor_name << std::endl; 160 | } else { 161 | void* allocated_ptr = nullptr; 162 | if (enforce_memory_type) { 163 | *actual_memory_type = requested_memory_type; 164 | } 165 | 166 | switch (*actual_memory_type) { 167 | #ifdef TRITON_ENABLE_GPU 168 | case tds::MemoryType::CPU_PINNED: { 169 | auto err = cudaSetDevice(*actual_memory_type_id); 170 | if ((err != cudaSuccess) && (err != cudaErrorNoDevice) && 171 | (err != cudaErrorInsufficientDriver)) { 172 | throw tds::TritonException(std::string( 173 | "unable to recover current CUDA device: " + 174 | std::string(cudaGetErrorString(err)))); 175 | } 176 | 177 | err = cudaHostAlloc(&allocated_ptr, byte_size, cudaHostAllocPortable); 178 | if (err != cudaSuccess) { 179 | throw tds::TritonException(std::string( 180 | "cudaHostAlloc failed: " + std::string(cudaGetErrorString(err)))); 181 | } 182 | break; 183 | } 184 | 185 | case tds::MemoryType::GPU: { 186 | auto err = cudaSetDevice(*actual_memory_type_id); 187 | if ((err != cudaSuccess) && (err != cudaErrorNoDevice) && 188 | (err != cudaErrorInsufficientDriver)) { 189 | throw tds::TritonException(std::string( 190 | "unable to recover current CUDA device: " + 191 | std::string(cudaGetErrorString(err)))); 192 | } 193 | 194 | err = cudaMalloc(&allocated_ptr, byte_size); 195 | if (err != cudaSuccess) { 196 | throw tds::TritonException(std::string( 197 | "cudaMalloc failed: " + std::string(cudaGetErrorString(err)))); 198 | } 199 | break; 200 | } 201 | #endif // TRITON_ENABLE_GPU 202 | 203 | // Use CPU memory if the requested memory type is unknown 204 | // (default case). 205 | case tds::MemoryType::CPU: 206 | default: { 207 | *actual_memory_type = tds::MemoryType::CPU; 208 | allocated_ptr = malloc(byte_size); 209 | break; 210 | } 211 | } 212 | 213 | if (allocated_ptr != nullptr) { 214 | *buffer = allocated_ptr; 215 | std::cout << "allocated " << byte_size << " bytes in " 216 | << MemoryTypeString(*actual_memory_type) 217 | << " for result tensor " << tensor_name << std::endl; 218 | } 219 | } 220 | } 221 | 222 | void 223 | ResponseRelease( 224 | void* buffer, size_t byte_size, tds::MemoryType memory_type, 225 | int64_t memory_type_id) 226 | { 227 | std::cout << "Using custom response release function" << std::endl; 228 | 229 | std::stringstream ss; 230 | ss << buffer; 231 | std::string buffer_str = ss.str(); 232 | 233 | std::cout << "Releasing buffer " << buffer_str << " of size " 234 | << std::to_string(byte_size) << " in " 235 | << tds::MemoryTypeString(memory_type); 236 | 237 | switch (memory_type) { 238 | case tds::MemoryType::CPU: 239 | free(buffer); 240 | break; 241 | #ifdef TRITON_ENABLE_GPU 242 | case tds::MemoryType::CPU_PINNED: { 243 | auto err = cudaSetDevice(memory_type_id); 244 | if (err == cudaSuccess) { 245 | err = cudaFreeHost(buffer); 246 | } 247 | if (err != cudaSuccess) { 248 | std::cerr << "error: failed to cudaFree " << buffer << ": " 249 | << cudaGetErrorString(err) << std::endl; 250 | } 251 | break; 252 | } 253 | case tds::MemoryType::GPU: { 254 | auto err = cudaSetDevice(memory_type_id); 255 | if (err == cudaSuccess) { 256 | err = cudaFree(buffer); 257 | } 258 | if (err != cudaSuccess) { 259 | std::cerr << "error: failed to cudaFree " << buffer << ": " 260 | << cudaGetErrorString(err) << std::endl; 261 | } 262 | break; 263 | } 264 | #endif // TRITON_ENABLE_GPU 265 | default: 266 | std::cerr << "error: unexpected buffer allocated in CUDA managed 
memory" 267 | << std::endl; 268 | break; 269 | } 270 | } 271 | 272 | void 273 | Check( 274 | std::shared_ptr& output0, 275 | std::shared_ptr& output1, const std::vector& input0_data, 276 | const std::vector& input1_data, const std::string& output0_name, 277 | const std::string& output1_name, const size_t expected_byte_size, 278 | const tds::DataType expected_datatype, const std::string& model_name, 279 | const bool is_custom_alloc) 280 | { 281 | std::unordered_map> output_data; 282 | for (auto& output : 283 | {std::make_pair(output0_name, output0), 284 | std::make_pair(output1_name, output1)}) { 285 | if (model_name == "add_sub") { 286 | if ((output.second->shape_.size() != 1) || 287 | (output.second->shape_[0] != 16)) { 288 | FAIL("unexpected shape for '" + output.first + "'"); 289 | } 290 | } else if (model_name == "simple") { 291 | if ((output.second->shape_.size() != 2) || 292 | (output.second->shape_[0] != 1) || (output.second->shape_[1] != 16)) { 293 | FAIL("unexpected shape for '" + output.first + "'"); 294 | } 295 | } else { 296 | FAIL("unexpected model name '" + model_name + "'"); 297 | } 298 | 299 | if (output.second->data_type_ != expected_datatype) { 300 | FAIL( 301 | "unexpected datatype '" + 302 | std::string(DataTypeString(output.second->data_type_)) + "' for '" + 303 | output.first + "'"); 304 | } 305 | 306 | if (output.second->byte_size_ != expected_byte_size) { 307 | FAIL( 308 | "unexpected byte-size, expected " + 309 | std::to_string(expected_byte_size) + ", got " + 310 | std::to_string(output.second->byte_size_) + " for " + output.first); 311 | } 312 | 313 | // For this example, we use default allocator and pre-allocated buffer in 314 | // the first and second infer requests, so the memory type for both cases 315 | // should be 'CPU'. 316 | if (is_custom_alloc) { 317 | if (enforce_memory_type && 318 | (output.second->memory_type_ != requested_memory_type)) { 319 | FAIL( 320 | "unexpected memory type, expected to be allocated in " + 321 | std::string(MemoryTypeString(requested_memory_type)) + ", got " + 322 | std::string(MemoryTypeString(output.second->memory_type_)) + 323 | ", id " + std::to_string(output.second->memory_type_id_) + " for " + 324 | output.first); 325 | } 326 | } else { 327 | if (output.second->memory_type_ != tds::MemoryType::CPU) { 328 | FAIL( 329 | "unexpected memory type, expected to be allocated in CPU, got " + 330 | std::string(MemoryTypeString(output.second->memory_type_)) + 331 | ", id " + std::to_string(output.second->memory_type_id_) + " for " + 332 | output.first); 333 | } 334 | } 335 | 336 | // We make a copy of the data here... which we could avoid for 337 | // performance reasons but ok for this simple example. 
338 | std::vector& odata = output_data[output.first]; 339 | switch (output.second->memory_type_) { 340 | case tds::MemoryType::CPU: { 341 | std::cout << output.first << " is stored in system memory" << std::endl; 342 | odata.assign( 343 | output.second->buffer_, 344 | output.second->buffer_ + output.second->byte_size_); 345 | break; 346 | } 347 | 348 | case tds::MemoryType::CPU_PINNED: { 349 | std::cout << output.first << " is stored in pinned memory" << std::endl; 350 | odata.assign( 351 | output.second->buffer_, 352 | output.second->buffer_ + output.second->byte_size_); 353 | break; 354 | } 355 | 356 | #ifdef TRITON_ENABLE_GPU 357 | case tds::MemoryType::GPU: { 358 | std::cout << output.first << " is stored in GPU memory" << std::endl; 359 | odata.reserve(output.second->byte_size_); 360 | FAIL_IF_CUDA_ERR( 361 | cudaMemcpy( 362 | &odata[0], output.second->buffer_, output.second->byte_size_, 363 | cudaMemcpyDeviceToHost), 364 | "getting " + output.first + " data from GPU memory"); 365 | break; 366 | } 367 | #endif 368 | 369 | default: 370 | FAIL("unexpected memory type"); 371 | } 372 | } 373 | 374 | CompareResult( 375 | output0_name, output1_name, &input0_data[0], &input1_data[0], 376 | output_data[output0_name].data(), output_data[output1_name].data()); 377 | } 378 | 379 | } // namespace 380 | 381 | int 382 | main(int argc, char** argv) 383 | { 384 | int verbose_level = 0; 385 | 386 | // Parse commandline... 387 | int opt; 388 | while ((opt = getopt(argc, argv, "vm:r:")) != -1) { 389 | switch (opt) { 390 | case 'm': { 391 | enforce_memory_type = true; 392 | if (!strcmp(optarg, "system")) { 393 | requested_memory_type = tds::MemoryType::CPU; 394 | } else if (!strcmp(optarg, "pinned")) { 395 | requested_memory_type = tds::MemoryType::CPU_PINNED; 396 | } else if (!strcmp(optarg, "gpu")) { 397 | requested_memory_type = tds::MemoryType::GPU; 398 | } else { 399 | Usage( 400 | argv, 401 | "-m must be used to specify one of the following types:" 402 | " <\"system\"|\"pinned\"|gpu>"); 403 | } 404 | break; 405 | } 406 | case 'v': 407 | verbose_level = 1; 408 | break; 409 | case '?': 410 | Usage(argv); 411 | break; 412 | } 413 | } 414 | 415 | #ifndef TRITON_ENABLE_GPU 416 | if (enforce_memory_type && requested_memory_type != TRITONSERVER_MEMORY_CPU) { 417 | Usage(argv, "-m can only be set to \"system\" without enabling GPU"); 418 | } 419 | #endif // TRITON_ENABLE_GPU 420 | 421 | try { 422 | // Use 'ServerOptions' object to initialize TritonServer. Here we set model 423 | // control mode to 'EXPLICIT' so that we are able to load and unload models 424 | // after startup. 425 | tds::ServerOptions options({"./models"}); 426 | options.logging_.verbose_ = 427 | tds::LoggingOptions::VerboseLevel(verbose_level); 428 | options.model_control_mode_ = tds::ModelControlMode::EXPLICIT; 429 | // Enable tracing. The tracing output file 'trace_file' can be found after 430 | // this example is completed. 431 | options.trace_ = std::make_shared( 432 | "trace_file", tds::Trace::Level::TIMESTAMPS, 1, -1, 0); 433 | auto server = tds::TritonServer::Create(options); 434 | 435 | // Load 'simple' and 'add_sub' models. 436 | server->LoadModel("simple"); 437 | server->LoadModel("add_sub"); 438 | // Use 'ModelIndex' function to see model repository contents. Here we 439 | // should see both 'simple' and 'add_sub' models are ready. 
440 | std::vector<tds::RepositoryIndex> repo_index = server->ModelIndex(); 441 | std::cout << "ModelIndex:\n"; 442 | for (size_t i = 0; i < repo_index.size(); i++) { 443 | std::cout << repo_index[i].name_ << ", " << repo_index[i].version_ << ", " 444 | << ModelReadyStateString(repo_index[i].state_) << "\n"; 445 | } 446 | 447 | // Initialize 'InferRequest' with the name of the model that we want to run 448 | // an inference on. 449 | auto request1 = tds::InferRequest::Create(tds::InferOptions("add_sub")); 450 | 451 | // Add two input tensors to the inference request. 452 | std::vector<char> input0_data; 453 | std::vector<char> input1_data; 454 | GenerateInputData<int32_t>(&input0_data, &input1_data); 455 | size_t input0_size = input0_data.size(); 456 | size_t input1_size = input1_data.size(); 457 | 458 | // Use iterators over the input vectors to add input data to a request. 459 | request1->AddInput( 460 | "INPUT0", input0_data.begin(), input0_data.end(), tds::DataType::INT32, 461 | {16}, tds::MemoryType::CPU, 0); 462 | request1->AddInput( 463 | "INPUT1", input1_data.begin(), input1_data.end(), tds::DataType::INT32, 464 | {16}, tds::MemoryType::CPU, 0); 465 | 466 | // Indicate that we want both output tensors calculated and returned 467 | // for the inference request. These calls are optional; if no 468 | // output(s) are specifically requested then all outputs defined by 469 | // the model will be calculated and returned. 470 | request1->AddRequestedOutput("OUTPUT0"); 471 | request1->AddRequestedOutput("OUTPUT1"); 472 | 473 | // Call 'AsyncInfer' function to run inference. 474 | auto result_future1 = server->AsyncInfer(*request1); 475 | 476 | // Get the infer result and check the result. 477 | auto result1 = result_future1.get(); 478 | if (result1->HasError()) { 479 | FAIL(result1->ErrorMsg()); 480 | } 481 | std::cout << "Ran inference on model '" << result1->ModelName() 482 | << "', version '" << result1->ModelVersion() 483 | << "', with request ID '" << result1->Id() << "'\n"; 484 | 485 | // Retrieve two outputs from the 'InferResult' object. 486 | std::shared_ptr<tds::Tensor> result1_out0 = result1->Output("OUTPUT0"); 487 | std::shared_ptr<tds::Tensor> result1_out1 = result1->Output("OUTPUT1"); 488 | 489 | Check( 490 | result1_out0, result1_out1, input0_data, input1_data, "OUTPUT0", 491 | "OUTPUT1", input0_size, tds::DataType::INT32, result1->ModelName(), 492 | false); 493 | 494 | // Get full response. 495 | std::cout << result1->DebugString() << std::endl; 496 | 497 | 498 | // Unload 'add_sub' model as we don't need it anymore. 499 | server->UnloadModel("add_sub"); 500 | // Run a new infer request on 'simple' model. 501 | auto request2 = tds::InferRequest::Create(tds::InferOptions("simple")); 502 | 503 | // We can also use a 'Tensor' object for adding input to a request. 504 | tds::Tensor input0( 505 | &input0_data[0], input0_data.size(), tds::DataType::INT32, {1, 16}, 506 | tds::MemoryType::CPU, 0); 507 | tds::Tensor input1( 508 | &input1_data[0], input1_data.size(), tds::DataType::INT32, {1, 16}, 509 | tds::MemoryType::CPU, 0); 510 | request2->AddInput("INPUT0", input0); 511 | request2->AddInput("INPUT1", input1); 512 | 513 | // For this inference, we provide pre-allocated buffers for the outputs. The 514 | // infer results will be stored in place in those buffers.
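// Note: 64 bytes is exactly 16 elements * sizeof(int32_t), matching the
// 1x16 INT32 outputs of the 'simple' model, so each output fits its
// pre-allocated buffer exactly. Wrapping the raw malloc'd pointers in
// std::shared_ptr<void> with 'free' as the deleter keeps this example
// leak-free even if an exception is thrown before the end of the try block.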
515 | std::shared_ptr<void> allocated_output0(malloc(64), free); 516 | std::shared_ptr<void> allocated_output1(malloc(64), free); 517 | 518 | tds::Tensor alloc_output0( 519 | reinterpret_cast<char*>(allocated_output0.get()), 64, 520 | tds::MemoryType::CPU, 0); 521 | tds::Tensor alloc_output1( 522 | reinterpret_cast<char*>(allocated_output1.get()), 64, 523 | tds::MemoryType::CPU, 0); 524 | request2->AddRequestedOutput("OUTPUT0", alloc_output0); 525 | request2->AddRequestedOutput("OUTPUT1", alloc_output1); 526 | 527 | // Call 'AsyncInfer' function to run inference. 528 | auto result_future2 = server->AsyncInfer(*request2); 529 | 530 | // Get the infer result and check the result. 531 | auto result2 = result_future2.get(); 532 | if (result2->HasError()) { 533 | FAIL(result2->ErrorMsg()); 534 | } 535 | std::cout << "Ran inference on model '" << result2->ModelName() 536 | << "', version '" << result2->ModelVersion() 537 | << "', with request ID '" << result2->Id() << "'\n"; 538 | 539 | // Retrieve two outputs from the 'InferResult' object. 540 | std::shared_ptr<tds::Tensor> result2_out0 = result2->Output("OUTPUT0"); 541 | std::shared_ptr<tds::Tensor> result2_out1 = result2->Output("OUTPUT1"); 542 | 543 | Check( 544 | result2_out0, result2_out1, input0_data, input1_data, "OUTPUT0", 545 | "OUTPUT1", input0_size, tds::DataType::INT32, result2->ModelName(), 546 | false); 547 | 548 | // Get full response. 549 | std::cout << result2->DebugString() << std::endl; 550 | 551 | // Check the output data in the pre-allocated buffers. 552 | CompareResult<int32_t>( 553 | "OUTPUT0", "OUTPUT1", &input0_data[0], &input1_data[0], 554 | reinterpret_cast<const char*>(allocated_output0.get()), 555 | reinterpret_cast<const char*>(allocated_output1.get())); 556 | 557 | // For the third inference, we use a custom allocator for output allocation. 558 | // Initialize the allocator with our custom functions 'ResponseAllocator' 559 | // and 'ResponseRelease' which are implemented above.
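// The tds::Allocator object constructed below simply bundles the two
// callbacks; attaching it to InferOptions::custom_allocator_ makes the
// server use ResponseAllocator for every output buffer of this request and
// ResponseRelease when those buffers are released, instead of the default
// allocator used by the first two requests.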
560 | std::shared_ptr allocator( 561 | new tds::Allocator(ResponseAllocator, ResponseRelease)); 562 | auto infer_options = tds::InferOptions("simple"); 563 | infer_options.custom_allocator_ = allocator; 564 | auto request3 = tds::InferRequest::Create(infer_options); 565 | 566 | const void* input0_base = &input0_data[0]; 567 | const void* input1_base = &input1_data[0]; 568 | #ifdef TRITON_ENABLE_GPU 569 | std::unique_ptr input0_gpu( 570 | nullptr, cuda_data_deleter); 571 | std::unique_ptr input1_gpu( 572 | nullptr, cuda_data_deleter); 573 | bool use_cuda_memory = 574 | (enforce_memory_type && 575 | (requested_memory_type != tds::MemoryType::CPU)); 576 | if (use_cuda_memory) { 577 | FAIL_IF_CUDA_ERR(cudaSetDevice(0), "setting CUDA device to device 0"); 578 | if (requested_memory_type != tds::MemoryType::CPU_PINNED) { 579 | void* dst; 580 | FAIL_IF_CUDA_ERR( 581 | cudaMalloc(&dst, input0_size), 582 | "allocating GPU memory for INPUT0 data"); 583 | input0_gpu.reset(dst); 584 | FAIL_IF_CUDA_ERR( 585 | cudaMemcpy( 586 | dst, &input0_data[0], input0_size, cudaMemcpyHostToDevice), 587 | "setting INPUT0 data in GPU memory"); 588 | FAIL_IF_CUDA_ERR( 589 | cudaMalloc(&dst, input1_size), 590 | "allocating GPU memory for INPUT1 data"); 591 | input1_gpu.reset(dst); 592 | FAIL_IF_CUDA_ERR( 593 | cudaMemcpy( 594 | dst, &input1_data[0], input1_size, cudaMemcpyHostToDevice), 595 | "setting INPUT1 data in GPU memory"); 596 | } else { 597 | void* dst; 598 | FAIL_IF_CUDA_ERR( 599 | cudaHostAlloc(&dst, input0_size, cudaHostAllocPortable), 600 | "allocating pinned memory for INPUT0 data"); 601 | input0_gpu.reset(dst); 602 | FAIL_IF_CUDA_ERR( 603 | cudaMemcpy(dst, &input0_data[0], input0_size, cudaMemcpyHostToHost), 604 | "setting INPUT0 data in pinned memory"); 605 | FAIL_IF_CUDA_ERR( 606 | cudaHostAlloc(&dst, input1_size, cudaHostAllocPortable), 607 | "allocating pinned memory for INPUT1 data"); 608 | input1_gpu.reset(dst); 609 | FAIL_IF_CUDA_ERR( 610 | cudaMemcpy(dst, &input1_data[0], input1_size, cudaMemcpyHostToHost), 611 | "setting INPUT1 data in pinned memory"); 612 | } 613 | } 614 | 615 | input0_base = use_cuda_memory ? input0_gpu.get() : &input0_data[0]; 616 | input1_base = use_cuda_memory ? input1_gpu.get() : &input1_data[0]; 617 | #endif // TRITON_ENABLE_GPU 618 | 619 | // Reuse the two inputs and modify the buffer and memory type based on the 620 | // commandline. 621 | input0.buffer_ = reinterpret_cast(const_cast(input0_base)); 622 | input1.buffer_ = reinterpret_cast(const_cast(input1_base)); 623 | input0.memory_type_ = requested_memory_type; 624 | input1.memory_type_ = requested_memory_type; 625 | 626 | request3->AddInput("INPUT0", input0); 627 | request3->AddInput("INPUT1", input1); 628 | 629 | // Call 'AsyncInfer' function to run inference. 630 | auto result_future3 = server->AsyncInfer(*request3); 631 | 632 | // Get the infer result and check the result. 633 | auto result3 = result_future3.get(); 634 | if (result3->HasError()) { 635 | FAIL(result3->ErrorMsg()); 636 | } 637 | std::cout << "Ran inference on model '" << result3->ModelName() 638 | << "', version '" << result3->ModelVersion() 639 | << "', with request ID '" << result3->Id() << "'\n"; 640 | 641 | // Retrieve two outputs from the 'InferResult' object. 
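// Unlike the first two requests, these output buffers were produced by the
// custom allocator above, so with '-m pinned' or '-m gpu' they live in
// pinned or GPU memory. Check() is therefore called with 'is_custom_alloc'
// set to true so it validates the memory type against
// 'requested_memory_type' instead of expecting CPU memory.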
642 | std::shared_ptr<tds::Tensor> result3_out0 = result3->Output("OUTPUT0"); 643 | std::shared_ptr<tds::Tensor> result3_out1 = result3->Output("OUTPUT1"); 644 | 645 | Check( 646 | result3_out0, result3_out1, input0_data, input1_data, "OUTPUT0", 647 | "OUTPUT1", input0_size, tds::DataType::INT32, result3->ModelName(), 648 | true); 649 | 650 | // Get full response. 651 | std::cout << result3->DebugString() << std::endl; 652 | 653 | // Get the server metrics. 654 | std::string metrics_str = server->ServerMetrics(); 655 | std::cout << "\n\n\n=========Server Metrics===========\n" 656 | << metrics_str << "\n"; 657 | } 658 | catch (const tds::TritonException& ex) { 659 | std::cerr << "Error: " << ex.what() << std::endl; 660 | exit(1); 661 | } 662 | 663 | return 0; 664 | } 665 | --------------------------------------------------------------------------------