├── README.md ├── .gitignore ├── .clang-format ├── .github └── workflows │ └── pre-commit.yml ├── qa ├── L0_server_unit_test │ ├── models │ │ ├── failing_infer │ │ │ ├── 1 │ │ │ │ └── model.py │ │ │ └── config.pbtxt │ │ ├── add_sub │ │ │ ├── 1 │ │ │ │ └── model.py │ │ │ └── config.pbtxt │ │ └── add_sub_str │ │ │ ├── 1 │ │ │ └── model.py │ │ │ └── config.pbtxt │ └── test.sh ├── L0_java_simple_cpp_example │ └── test.sh └── L0_server_example │ └── test.sh ├── server ├── cmake │ └── TritonDeveloperToolsServerConfig.cmake.in ├── test │ ├── CMakeLists.txt │ └── wrapper_test.cc ├── install_dependencies_and_build.sh ├── examples │ ├── CMakeLists.txt │ ├── square_async_infer.cc │ ├── addsub_string_async_infer.cc │ └── simple_addsub_async_infer.cc ├── src │ ├── infer_requested_output.h │ ├── tracer.h │ └── tracer.cc ├── CMakeLists.txt ├── include │ └── triton │ │ └── developer_tools │ │ ├── generic_server_wrapper.h │ │ └── server_wrapper.h └── README.md ├── pyproject.toml └── .pre-commit-config.yaml /README.md: -------------------------------------------------------------------------------- 1 | # triton_developer_tools -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /build 2 | /.vscode 3 | *.so 4 | build 5 | -------------------------------------------------------------------------------- /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | BasedOnStyle: Google 3 | 4 | IndentWidth: 2 5 | ColumnLimit: 80 6 | ContinuationIndentWidth: 4 7 | UseTab: Never 8 | MaxEmptyLinesToKeep: 2 9 | 10 | SortIncludes: true 11 | CompactNamespaces: true 12 | ReflowComments: true 13 | 14 | DerivePointerAlignment: false 15 | PointerAlignment: Left 16 | 17 | AllowShortIfStatementsOnASingleLine: false 18 | AllowShortBlocksOnASingleLine: false 19 | AllowShortFunctionsOnASingleLine: Inline 20 | 21 | AlwaysBreakAfterReturnType: TopLevelDefinitions 22 | AlignAfterOpenBracket: AlwaysBreak 23 | BreakBeforeBraces: Custom 24 | BraceWrapping: 25 | AfterClass: false 26 | AfterControlStatement: false 27 | AfterEnum: false 28 | AfterFunction: true 29 | AfterNamespace: false 30 | AfterStruct: false 31 | AfterUnion: false 32 | BeforeCatch: true 33 | 34 | BinPackArguments: true 35 | BinPackParameters: true 36 | ConstructorInitializerAllOnOneLineOrOnePerLine: false 37 | 38 | IndentCaseLabels: true 39 | -------------------------------------------------------------------------------- /.github/workflows/pre-commit.yml: -------------------------------------------------------------------------------- 1 | # Copyright 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 
14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | name: pre-commit 28 | 29 | on: 30 | pull_request: 31 | 32 | jobs: 33 | pre-commit: 34 | runs-on: ubuntu-latest 35 | steps: 36 | - uses: actions/checkout@v5.0.0 37 | - uses: actions/setup-python@v6.0.0 38 | - uses: pre-commit/action@v3.0.1 39 | -------------------------------------------------------------------------------- /qa/L0_server_unit_test/models/failing_infer/config.pbtxt: -------------------------------------------------------------------------------- 1 | # Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | name: "failing_infer" 28 | backend: "python" 29 | input [ 30 | { 31 | name: "INPUT" 32 | data_type: TYPE_INT32 33 | dims: [ 16 ] 34 | } 35 | ] 36 | output [ 37 | { 38 | name: "OUTPUT" 39 | data_type: TYPE_INT32 40 | dims: [ 16 ] 41 | } 42 | ] 43 | instance_group [{ kind: KIND_CPU }] 44 | -------------------------------------------------------------------------------- /server/cmake/TritonDeveloperToolsServerConfig.cmake.in: -------------------------------------------------------------------------------- 1 | # Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | include(CMakeFindDependencyMacro) 28 | 29 | get_filename_component( 30 | TRITONDEVELOPERTOOLSSERVER_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH 31 | ) 32 | 33 | list(APPEND CMAKE_MODULE_PATH ${TRITONDEVELOPERTOOLSSERVER_CMAKE_DIR}) 34 | 35 | if(NOT TARGET TritonDeveloperToolsServer::triton-developer_tools-server) 36 | include("${TRITONDEVELOPERTOOLSSERVER_CMAKE_DIR}/TritonDeveloperToolsServerTargets.cmake") 37 | endif() 38 | -------------------------------------------------------------------------------- /qa/L0_server_unit_test/models/add_sub/config.pbtxt: -------------------------------------------------------------------------------- 1 | # Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | backend: "python" 28 | 29 | input [ 30 | { 31 | name: "INPUT0" 32 | data_type: TYPE_INT32 33 | dims: [ 16 ] 34 | } 35 | ] 36 | input [ 37 | { 38 | name: "INPUT1" 39 | data_type: TYPE_INT32 40 | dims: [ 16 ] 41 | } 42 | ] 43 | output [ 44 | { 45 | name: "OUTPUT0" 46 | data_type: TYPE_INT32 47 | dims: [ 16 ] 48 | } 49 | ] 50 | output [ 51 | { 52 | name: "OUTPUT1" 53 | data_type: TYPE_INT32 54 | dims: [ 16 ] 55 | } 56 | ] 57 | 58 | instance_group [{ kind: KIND_CPU }] 59 | -------------------------------------------------------------------------------- /qa/L0_server_unit_test/models/add_sub_str/config.pbtxt: -------------------------------------------------------------------------------- 1 | # Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | 27 | backend: "python" 28 | 29 | input [ 30 | { 31 | name: "INPUT0" 32 | data_type: TYPE_STRING 33 | dims: [ -1 ] 34 | } 35 | ] 36 | input [ 37 | { 38 | name: "INPUT1" 39 | data_type: TYPE_STRING 40 | dims: [ -1 ] 41 | } 42 | ] 43 | output [ 44 | { 45 | name: "OUTPUT0" 46 | data_type: TYPE_STRING 47 | dims: [ -1 ] 48 | } 49 | ] 50 | output [ 51 | { 52 | name: "OUTPUT1" 53 | data_type: TYPE_STRING 54 | dims: [ -1 ] 55 | } 56 | ] 57 | 58 | instance_group [{ kind: KIND_CPU }] 59 | -------------------------------------------------------------------------------- /qa/L0_server_unit_test/models/failing_infer/1/model.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 4 | # 5 | # Redistribution and use in source and binary forms, with or without 6 | # modification, are permitted provided that the following conditions 7 | # are met: 8 | # * Redistributions of source code must retain the above copyright 9 | # notice, this list of conditions and the following disclaimer. 10 | # * Redistributions in binary form must reproduce the above copyright 11 | # notice, this list of conditions and the following disclaimer in the 12 | # documentation and/or other materials provided with the distribution. 13 | # * Neither the name of NVIDIA CORPORATION nor the names of its 14 | # contributors may be used to endorse or promote products derived 15 | # from this software without specific prior written permission. 16 | # 17 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 18 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 20 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 21 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 22 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 23 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 24 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 25 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | 29 | import triton_python_backend_utils as pb_utils 30 | 31 | 32 | class TritonPythonModel: 33 | """Test model that always returns error for all requests.""" 34 | 35 | def execute(self, requests): 36 | responses = [] 37 | 38 | for _ in requests: 39 | responses.append( 40 | pb_utils.InferenceResponse( 41 | output_tensors=[], error=pb_utils.TritonError("An Error Occurred") 42 | ) 43 | ) 44 | 45 | # You must return a list of pb_utils.InferenceResponse. Length 46 | # of this list must match the length of `requests` list. 47 | return responses 48 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | # Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 
8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | [tool.codespell] 28 | # note: pre-commit passes explicit lists of files here, which this skip file list doesn't override - 29 | # this is only to allow you to run codespell interactively 30 | skip = "./.git,./.github" 31 | # ignore short words, and typename parameters like OffsetT 32 | ignore-regex = "\\b(.{1,4}|[A-Z]\\w*T)\\b" 33 | # use the 'clear' dictionary for unambiguous spelling mistakes 34 | builtin = "clear" 35 | # disable warnings about binary files and wrong encoding 36 | quiet-level = 3 37 | 38 | [tool.isort] 39 | profile = "black" 40 | use_parentheses = true 41 | multi_line_output = 3 42 | include_trailing_comma = true 43 | force_grid_wrap = 0 44 | ensure_newline_before_comments = true 45 | line_length = 88 46 | balanced_wrapping = true 47 | indent = " " 48 | skip = ["build"] 49 | 50 | -------------------------------------------------------------------------------- /server/test/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | cmake_minimum_required(VERSION 3.31.8) 28 | 29 | # 30 | # Unit tests 31 | # 32 | 33 | # 34 | # Unit test for Triton Developer Tools Server 35 | # 36 | add_executable( 37 | wrapper_test 38 | wrapper_test.cc 39 | ) 40 | 41 | set_target_properties( 42 | wrapper_test 43 | PROPERTIES 44 | SKIP_BUILD_RPATH TRUE 45 | BUILD_WITH_INSTALL_RPATH TRUE 46 | INSTALL_RPATH_USE_LINK_PATH FALSE 47 | INSTALL_RPATH "" 48 | ) 49 | 50 | target_include_directories( 51 | wrapper_test 52 | PRIVATE 53 | ${CMAKE_CURRENT_SOURCE_DIR}/../include 54 | ${GTEST_INCLUDE_DIRS} 55 | ) 56 | 57 | target_link_libraries( 58 | wrapper_test 59 | PRIVATE 60 | triton-developer_tools-server 61 | triton-core-serverstub 62 | GTest::gtest_main 63 | ) 64 | 65 | install( 66 | TARGETS wrapper_test 67 | RUNTIME DESTINATION bin 68 | ) 69 | -------------------------------------------------------------------------------- /server/install_dependencies_and_build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions 6 | # are met: 7 | # * Redistributions of source code must retain the above copyright 8 | # notice, this list of conditions and the following disclaimer. 9 | # * Redistributions in binary form must reproduce the above copyright 10 | # notice, this list of conditions and the following disclaimer in the 11 | # documentation and/or other materials provided with the distribution. 12 | # * Neither the name of NVIDIA CORPORATION nor the names of its 13 | # contributors may be used to endorse or promote products derived 14 | # from this software without specific prior written permission. 15 | # 16 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 17 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 19 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 20 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 21 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 22 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 23 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 24 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
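# Assumed environment (not checked by this script): an Ubuntu-based Triton container with apt and root access, and TRITON_CORE_REPO_TAG / TRITON_COMMON_REPO_TAG exported by the caller; both are consumed by the cmake invocation below. # Illustrative invocation (tag values are placeholders): # TRITON_CORE_REPO_TAG=main TRITON_COMMON_REPO_TAG=main bash install_dependencies_and_build.sh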
27 | 28 | # Install dependencies 29 | apt update -q=2 \ 30 | && apt install -y \ 31 | gpg \ 32 | wget \ 33 | rapidjson-dev \ 34 | software-properties-common \ 35 | && wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null \ 36 | && . /etc/os-release \ 37 | && echo "deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ $UBUNTU_CODENAME main" | tee /etc/apt/sources.list.d/kitware.list >/dev/null \ 38 | && apt-get update -q=2 \ 39 | && apt-get install -y --no-install-recommends cmake=4.0.3* cmake-data=4.0.3* \ 40 | && cmake --version 41 | 42 | # Install developer tools 43 | mkdir -p /opt/tritonserver/developer_tools/server/build && cd /opt/tritonserver/developer_tools/server/build 44 | cmake -DCMAKE_INSTALL_PREFIX:PATH=/opt/tritonserver/developer_tools/server/build/install \ 45 | -DTRITON_CORE_REPO_TAG=${TRITON_CORE_REPO_TAG} -DTRITON_COMMON_REPO_TAG=${TRITON_COMMON_REPO_TAG} .. 46 | make install 47 | cp /opt/tritonserver/developer_tools/server/build/install/lib/libtritondevelopertoolsserver.a /opt/tritonserver/lib/ 48 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | 27 | repos: 28 | - repo: https://github.com/PyCQA/isort 29 | rev: 5.12.0 30 | hooks: 31 | - id: isort 32 | additional_dependencies: [toml] 33 | - repo: https://github.com/psf/black 34 | rev: 23.1.0 35 | hooks: 36 | - id: black 37 | types_or: [python, cython] 38 | - repo: https://github.com/PyCQA/flake8 39 | rev: 7.3.0 40 | hooks: 41 | - id: flake8 42 | args: [--max-line-length=88, --select=C,E,F,W,B,B950, --extend-ignore = E203,E501] 43 | types_or: [python, cython] 44 | - repo: https://github.com/pre-commit/mirrors-clang-format 45 | rev: v16.0.5 46 | hooks: 47 | - id: clang-format 48 | types_or: [c, c++, cuda, proto, textproto, java] 49 | args: ["-fallback-style=none", "-style=file", "-i"] 50 | - repo: https://github.com/codespell-project/codespell 51 | rev: v2.2.4 52 | hooks: 53 | - id: codespell 54 | additional_dependencies: [tomli] 55 | args: ["--toml", "pyproject.toml"] 56 | exclude: (?x)^(.*stemmer.*|.*stop_words.*|^CHANGELOG.md$) 57 | # More details about these pre-commit hooks here: 58 | # https://pre-commit.com/hooks.html 59 | - repo: https://github.com/pre-commit/pre-commit-hooks 60 | rev: v6.0.0 61 | hooks: 62 | - id: check-case-conflict 63 | - id: check-executables-have-shebangs 64 | - id: check-merge-conflict 65 | - id: check-json 66 | - id: check-toml 67 | - id: check-yaml 68 | - id: check-shebang-scripts-are-executable 69 | - id: end-of-file-fixer 70 | types_or: [c, c++, cuda, proto, textproto, java, python] 71 | - id: mixed-line-ending 72 | - id: requirements-txt-fixer 73 | - id: trailing-whitespace 74 | -------------------------------------------------------------------------------- /qa/L0_server_unit_test/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions 6 | # are met: 7 | # * Redistributions of source code must retain the above copyright 8 | # notice, this list of conditions and the following disclaimer. 9 | # * Redistributions in binary form must reproduce the above copyright 10 | # notice, this list of conditions and the following disclaimer in the 11 | # documentation and/or other materials provided with the distribution. 12 | # * Neither the name of NVIDIA CORPORATION nor the names of its 13 | # contributors may be used to endorse or promote products derived 14 | # from this software without specific prior written permission. 15 | # 16 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 17 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 19 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 20 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 21 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 22 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 23 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 24 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
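# Assumed environment for this test (taken from the commands below): NVIDIA_TRITON_SERVER_VERSION or an explicit repository-version argument, PYTHON_BACKEND_REPO_TAG for cloning the decoupled example model, and a Triton installation under /opt/tritonserver. # Illustrative invocation (values are placeholders): # PYTHON_BACKEND_REPO_TAG=main bash test.sh <repo-version>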
27 | 28 | REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} 29 | if [ "$#" -ge 1 ]; then 30 | REPO_VERSION=$1 31 | fi 32 | if [ -z "$REPO_VERSION" ]; then 33 | echo -e "Repository version must be specified" 34 | echo -e "\n***\n*** Test Failed\n***" 35 | exit 1 36 | fi 37 | if [ ! -z "$TEST_REPO_ARCH" ]; then 38 | REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} 39 | fi 40 | bash -x ../../server/install_dependencies_and_build.sh 41 | 42 | export CUDA_VISIBLE_DEVICES=0 43 | 44 | TEST_LOG=test.log 45 | 46 | # Copy over the decoupled model placed in the python_backend repository. 47 | git clone --single-branch --depth=1 -b ${PYTHON_BACKEND_REPO_TAG} https://github.com/triton-inference-server/python_backend.git 48 | mkdir -p ./models/square_int32/1 49 | cp python_backend/examples/decoupled/square_model.py ./models/square_int32/1/model.py 50 | cp python_backend/examples/decoupled/square_config.pbtxt ./models/square_int32/config.pbtxt 51 | # Copy the model repository for 'ModelRepoRegister' test case. 52 | cp -fr ./models ./models1 53 | 54 | RET=0 55 | 56 | cp /opt/tritonserver/developer_tools/server/build/install/bin/wrapper_test ./ 57 | 58 | set +e 59 | # Must explicitly set LD_LIBRARY_PATH so that the test can find 60 | # libtritonserver.so. 61 | LD_LIBRARY_PATH=/opt/tritonserver/lib:${LD_LIBRARY_PATH} ./wrapper_test >> ${TEST_LOG} 2>&1 62 | if [ $? -ne 0 ]; then 63 | cat ${TEST_LOG} 64 | RET=1 65 | fi 66 | set -e 67 | 68 | if [ $RET -eq 0 ]; then 69 | echo -e "\n***\n*** Test Passed\n***" 70 | else 71 | echo -e "\n***\n*** Test FAILED\n***" 72 | fi 73 | 74 | exit $RET 75 | -------------------------------------------------------------------------------- /qa/L0_server_unit_test/models/add_sub/1/model.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 4 | # 5 | # Redistribution and use in source and binary forms, with or without 6 | # modification, are permitted provided that the following conditions 7 | # are met: 8 | # * Redistributions of source code must retain the above copyright 9 | # notice, this list of conditions and the following disclaimer. 10 | # * Redistributions in binary form must reproduce the above copyright 11 | # notice, this list of conditions and the following disclaimer in the 12 | # documentation and/or other materials provided with the distribution. 13 | # * Neither the name of NVIDIA CORPORATION nor the names of its 14 | # contributors may be used to endorse or promote products derived 15 | # from this software without specific prior written permission. 16 | # 17 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 18 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 20 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 21 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 22 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 23 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 24 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 25 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
28 | 29 | import json 30 | 31 | import numpy as np 32 | import triton_python_backend_utils as pb_utils 33 | 34 | 35 | class TritonPythonModel: 36 | def initialize(self, args): 37 | self.model_config = model_config = json.loads(args["model_config"]) 38 | 39 | output0_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT0") 40 | output1_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT1") 41 | 42 | self.output0_dtype = pb_utils.triton_string_to_numpy( 43 | output0_config["data_type"] 44 | ) 45 | self.output1_dtype = pb_utils.triton_string_to_numpy( 46 | output1_config["data_type"] 47 | ) 48 | 49 | def execute(self, requests): 50 | """This function is called on inference request.""" 51 | 52 | output0_dtype = self.output0_dtype 53 | output1_dtype = self.output1_dtype 54 | 55 | responses = [] 56 | for request in requests: 57 | in_0 = pb_utils.get_input_tensor_by_name(request, "INPUT0") 58 | in_1 = pb_utils.get_input_tensor_by_name(request, "INPUT1") 59 | if ( 60 | in_0.as_numpy().dtype.type is np.bytes_ 61 | or in_0.as_numpy().dtype == np.object_ 62 | ): 63 | out_0, out_1 = ( 64 | in_0.as_numpy().astype(np.int32) + in_1.as_numpy().astype(np.int32), 65 | in_0.as_numpy().astype(np.int32) - in_1.as_numpy().astype(np.int32), 66 | ) 67 | else: 68 | out_0, out_1 = ( 69 | in_0.as_numpy() + in_1.as_numpy(), 70 | in_0.as_numpy() - in_1.as_numpy(), 71 | ) 72 | 73 | out_tensor_0 = pb_utils.Tensor("OUTPUT0", out_0.astype(output0_dtype)) 74 | out_tensor_1 = pb_utils.Tensor("OUTPUT1", out_1.astype(output1_dtype)) 75 | responses.append(pb_utils.InferenceResponse([out_tensor_0, out_tensor_1])) 76 | return responses 77 | -------------------------------------------------------------------------------- /qa/L0_server_unit_test/models/add_sub_str/1/model.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 4 | # 5 | # Redistribution and use in source and binary forms, with or without 6 | # modification, are permitted provided that the following conditions 7 | # are met: 8 | # * Redistributions of source code must retain the above copyright 9 | # notice, this list of conditions and the following disclaimer. 10 | # * Redistributions in binary form must reproduce the above copyright 11 | # notice, this list of conditions and the following disclaimer in the 12 | # documentation and/or other materials provided with the distribution. 13 | # * Neither the name of NVIDIA CORPORATION nor the names of its 14 | # contributors may be used to endorse or promote products derived 15 | # from this software without specific prior written permission. 16 | # 17 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 18 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 20 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 21 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 22 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 23 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 24 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 25 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
28 | 29 | import json 30 | 31 | import numpy as np 32 | import triton_python_backend_utils as pb_utils 33 | 34 | 35 | class TritonPythonModel: 36 | def initialize(self, args): 37 | self.model_config = model_config = json.loads(args["model_config"]) 38 | 39 | output0_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT0") 40 | output1_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT1") 41 | 42 | self.output0_dtype = pb_utils.triton_string_to_numpy( 43 | output0_config["data_type"] 44 | ) 45 | self.output1_dtype = pb_utils.triton_string_to_numpy( 46 | output1_config["data_type"] 47 | ) 48 | 49 | def execute(self, requests): 50 | """This function is called on inference request.""" 51 | 52 | output0_dtype = self.output0_dtype 53 | output1_dtype = self.output1_dtype 54 | 55 | responses = [] 56 | for request in requests: 57 | in_0 = pb_utils.get_input_tensor_by_name(request, "INPUT0") 58 | in_1 = pb_utils.get_input_tensor_by_name(request, "INPUT1") 59 | if ( 60 | in_0.as_numpy().dtype.type is np.bytes_ 61 | or in_0.as_numpy().dtype == np.object_ 62 | ): 63 | out_0, out_1 = ( 64 | in_0.as_numpy().astype(np.int32) + in_1.as_numpy().astype(np.int32), 65 | in_0.as_numpy().astype(np.int32) - in_1.as_numpy().astype(np.int32), 66 | ) 67 | else: 68 | out_0, out_1 = ( 69 | in_0.as_numpy() + in_1.as_numpy(), 70 | in_0.as_numpy() - in_1.as_numpy(), 71 | ) 72 | 73 | out_tensor_0 = pb_utils.Tensor("OUTPUT0", out_0.astype(output0_dtype)) 74 | out_tensor_1 = pb_utils.Tensor("OUTPUT1", out_1.astype(output1_dtype)) 75 | responses.append(pb_utils.InferenceResponse([out_tensor_0, out_tensor_1])) 76 | return responses 77 | -------------------------------------------------------------------------------- /qa/L0_java_simple_cpp_example/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions 6 | # are met: 7 | # * Redistributions of source code must retain the above copyright 8 | # notice, this list of conditions and the following disclaimer. 9 | # * Redistributions in binary form must reproduce the above copyright 10 | # notice, this list of conditions and the following disclaimer in the 11 | # documentation and/or other materials provided with the distribution. 12 | # * Neither the name of NVIDIA CORPORATION nor the names of its 13 | # contributors may be used to endorse or promote products derived 14 | # from this software without specific prior written permission. 15 | # 16 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 17 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 19 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 20 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 21 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 22 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 23 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 24 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
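# Assumed environment for this test (taken from the commands below): NVIDIA_TRITON_SERVER_VERSION or an explicit repository-version argument, JAVACPP_BRANCH and JAVACPP_BRANCH_TAG for the javacpp-presets checkout, MAVEN_PATH pointing at an mvn executable, and JAR_INSTALL_PATH containing the generated tritonserver-java-bindings.jar; TRITON_SERVER_REPO_TAG and TRITON_CLIENT_REPO_TAG default to "main".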
27 | 28 | REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} 29 | if [ "$#" -ge 1 ]; then 30 | REPO_VERSION=$1 31 | fi 32 | if [ -z "$REPO_VERSION" ]; then 33 | echo -e "Repository version must be specified" 34 | echo -e "\n***\n*** Test Failed\n***" 35 | exit 1 36 | fi 37 | 38 | # set variables 39 | CLIENT_LOG="client.log" 40 | MODEL_REPO=$PWD/models 41 | SAMPLES_REPO=$PWD/javacpp-presets/tritonserver/samples/simplecpp 42 | TRITON_SERVER_REPO_TAG=${TRITON_SERVER_REPO_TAG:="main"} 43 | TRITON_CLIENT_REPO_TAG=${TRITON_CLIENT_REPO_TAG:="main"} 44 | TEST_HOME=$PWD 45 | 46 | # generate models 47 | rm -rf ${MODEL_REPO} 48 | git clone --single-branch --depth=1 -b ${TRITON_SERVER_REPO_TAG} https://github.com/triton-inference-server/server.git 49 | mkdir -p ${MODEL_REPO} 50 | cp -r server/docs/examples/model_repository/simple ${MODEL_REPO}/simple 51 | 52 | # use build script to generate .jar 53 | git clone --single-branch --depth=1 -b ${TRITON_CLIENT_REPO_TAG} https://github.com/triton-inference-server/client.git 54 | source client/src/java-api-bindings/scripts/install_dependencies_and_build.sh --enable-developer-tools-server 55 | 56 | cd ${TEST_HOME} 57 | # build javacpp-presets/tritonserver 58 | set +e 59 | rm -r javacpp-presets 60 | git clone --single-branch --depth=1 -b ${JAVACPP_BRANCH_TAG} ${JAVACPP_BRANCH} 61 | cd javacpp-presets 62 | ${MAVEN_PATH} clean install --projects .,tritonserver 63 | ${MAVEN_PATH} clean install -f platform --projects ../tritonserver/platform -Djavacpp.platform.host 64 | cd .. 65 | set -e 66 | 67 | rm -f *.log 68 | RET=0 69 | 70 | set +e 71 | # Build SimpleCPP example 72 | BASE_COMMAND="${MAVEN_PATH} clean compile -f ${SAMPLES_REPO} exec:java -Djavacpp.platform=linux-x86_64" 73 | ${BASE_COMMAND} -Dexec.args="-r ${MODEL_REPO}" >>${CLIENT_LOG} 2>&1 74 | if [ $? -ne 0 ]; then 75 | echo -e "Failed to run: ${BASE_COMMAND} -Dexec.args=\"-r ${MODEL_REPO}\"" 76 | RET=1 77 | fi 78 | 79 | # Run SimpleCPP with generated jar 80 | java -cp ${JAR_INSTALL_PATH}/tritonserver-java-bindings.jar ${SAMPLES_REPO}/SimpleCPP.java 81 | if [ $? -ne 0 ]; then 82 | echo -e "Failed to run: java -cp ${JAR_INSTALL_PATH}/tritonserver-java-bindings.jar ${SAMPLES_REPO}/SimpleCPP.java -r ${MODEL_REPO}" 83 | RET=1 84 | fi 85 | 86 | set -e 87 | 88 | if [ $RET -eq 0 ]; then 89 | echo -e "\n***\n*** Test Passed\n***" 90 | else 91 | echo -e "\n***\n*** Test FAILED\n***" 92 | fi 93 | 94 | exit $RET 95 | -------------------------------------------------------------------------------- /server/examples/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 
14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | cmake_minimum_required(VERSION 3.31.8) 28 | 29 | # 30 | # examples 31 | # 32 | 33 | # 34 | # simple_addsub_async_infer 35 | # 36 | add_executable( 37 | simple_addsub_async_infer 38 | simple_addsub_async_infer.cc 39 | ) 40 | 41 | set_target_properties( 42 | simple_addsub_async_infer 43 | PROPERTIES 44 | SKIP_BUILD_RPATH TRUE 45 | BUILD_WITH_INSTALL_RPATH TRUE 46 | INSTALL_RPATH_USE_LINK_PATH FALSE 47 | INSTALL_RPATH "" 48 | ) 49 | 50 | target_include_directories( 51 | simple_addsub_async_infer 52 | PRIVATE 53 | ${CMAKE_CURRENT_SOURCE_DIR}/../include 54 | ) 55 | 56 | target_link_libraries( 57 | simple_addsub_async_infer 58 | PRIVATE 59 | triton-developer_tools-server 60 | triton-core-serverstub 61 | ) 62 | 63 | install( 64 | TARGETS simple_addsub_async_infer 65 | RUNTIME DESTINATION bin 66 | ) 67 | 68 | # 69 | # addsub_string_async_infer 70 | # 71 | add_executable( 72 | addsub_string_async_infer 73 | addsub_string_async_infer.cc 74 | ) 75 | 76 | set_target_properties( 77 | addsub_string_async_infer 78 | PROPERTIES 79 | SKIP_BUILD_RPATH TRUE 80 | BUILD_WITH_INSTALL_RPATH TRUE 81 | INSTALL_RPATH_USE_LINK_PATH FALSE 82 | INSTALL_RPATH "" 83 | ) 84 | 85 | target_include_directories( 86 | addsub_string_async_infer 87 | PRIVATE 88 | ${CMAKE_CURRENT_SOURCE_DIR}/../include 89 | ) 90 | 91 | target_link_libraries( 92 | addsub_string_async_infer 93 | PRIVATE 94 | triton-developer_tools-server 95 | triton-core-serverstub 96 | ) 97 | 98 | install( 99 | TARGETS addsub_string_async_infer 100 | RUNTIME DESTINATION bin 101 | ) 102 | 103 | # 104 | # square_async_infer 105 | # 106 | add_executable( 107 | square_async_infer 108 | square_async_infer.cc 109 | ) 110 | 111 | set_target_properties( 112 | square_async_infer 113 | PROPERTIES 114 | SKIP_BUILD_RPATH TRUE 115 | BUILD_WITH_INSTALL_RPATH TRUE 116 | INSTALL_RPATH_USE_LINK_PATH FALSE 117 | INSTALL_RPATH "" 118 | ) 119 | 120 | target_include_directories( 121 | square_async_infer 122 | PRIVATE 123 | ${CMAKE_CURRENT_SOURCE_DIR}/../include 124 | ) 125 | 126 | target_link_libraries( 127 | square_async_infer 128 | PRIVATE 129 | triton-developer_tools-server 130 | triton-core-serverstub 131 | ) 132 | 133 | install( 134 | TARGETS square_async_infer 135 | RUNTIME DESTINATION bin 136 | ) -------------------------------------------------------------------------------- /qa/L0_server_example/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions 6 | # are met: 7 | # * Redistributions of source code must retain the above copyright 8 | # notice, this list of conditions and the following disclaimer. 9 | # * Redistributions in binary form must reproduce the above copyright 10 | # notice, this list of conditions and the following disclaimer in the 11 | # documentation and/or other materials provided with the distribution. 12 | # * Neither the name of NVIDIA CORPORATION nor the names of its 13 | # contributors may be used to endorse or promote products derived 14 | # from this software without specific prior written permission. 15 | # 16 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 17 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 19 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 20 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 21 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 22 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 23 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 24 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | 28 | REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} 29 | if [ "$#" -ge 1 ]; then 30 | REPO_VERSION=$1 31 | fi 32 | if [ -z "$REPO_VERSION" ]; then 33 | echo -e "Repository version must be specified" 34 | echo -e "\n***\n*** Test Failed\n***" 35 | exit 1 36 | fi 37 | if [ ! -z "$TEST_REPO_ARCH" ]; then 38 | REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} 39 | fi 40 | 41 | bash -x ../../server/install_dependencies_and_build.sh 42 | export CUDA_VISIBLE_DEVICES=0 43 | 44 | CLIENT_LOG=`pwd`/client.log 45 | SIMPLE_ADDSUB_ASYNC_INFER_CLIENT=/opt/tritonserver/developer_tools/server/build/install/bin/simple_addsub_async_infer 46 | ADDSUB_STRING_ASYNC_INFER_CLIENT=/opt/tritonserver/developer_tools/server/build/install/bin/addsub_string_async_infer 47 | SQUARE_ASYNC_INFER_CLIENT=/opt/tritonserver/developer_tools/server/build/install/bin/square_async_infer 48 | 49 | RET=0 50 | 51 | # Prepare required models for the examples 52 | mkdir models 53 | cp -r ../L0_server_unit_test/models/add_sub* ./models/. 54 | git clone --single-branch --depth=1 -b ${TRITON_SERVER_BRANCH_NAME} https://github.com/triton-inference-server/server.git 55 | cp -r server/docs/examples/model_repository/simple ./models/. 56 | # Copy over the decoupled model placed in the python_backend repository. 57 | git clone --single-branch --depth=1 -b ${PYTHON_BACKEND_REPO_TAG} https://github.com/triton-inference-server/python_backend.git 58 | mkdir -p ./models/square_int32/1 59 | cp python_backend/examples/decoupled/square_model.py ./models/square_int32/1/model.py 60 | cp python_backend/examples/decoupled/square_config.pbtxt ./models/square_int32/config.pbtxt 61 | 62 | # Must explicitly set LD_LIBRARY_PATH so that the test can find 63 | # libtritonserver.so. 
64 | LD_LIBRARY_PATH=/opt/tritonserver/lib:${LD_LIBRARY_PATH} 65 | 66 | set +e 67 | 68 | for i in \ 69 | $SIMPLE_ADDSUB_ASYNC_INFER_CLIENT \ 70 | $ADDSUB_STRING_ASYNC_INFER_CLIENT \ 71 | $SQUARE_ASYNC_INFER_CLIENT \ 72 | ; do 73 | BASE=$(basename -- $i) 74 | SUFFIX="${BASE%.*}" 75 | 76 | if [ $i == $SIMPLE_ADDSUB_ASYNC_INFER_CLIENT ]; then 77 | # Enforce I/O to be in specific memory type 78 | for MEM_TYPE in system pinned gpu ; do 79 | $i -v -m $MEM_TYPE >> $CLIENT_LOG.${SUFFIX}.$MEM_TYPE 2>&1 80 | if [ $? -ne 0 ]; then 81 | cat $CLIENT_LOG.${SUFFIX}.$MEM_TYPE 82 | RET=1 83 | fi 84 | done 85 | else 86 | $i -v >> ${CLIENT_LOG}.${SUFFIX} 2>&1 87 | if [ $? -ne 0 ]; then 88 | cat ${CLIENT_LOG}.${SUFFIX} 89 | RET=1 90 | fi 91 | fi 92 | done 93 | 94 | set -e 95 | 96 | if [ $RET -eq 0 ]; then 97 | echo -e "\n***\n*** Test Passed\n***" 98 | else 99 | echo -e "\n***\n*** Test FAILED\n***" 100 | fi 101 | 102 | exit $RET 103 | -------------------------------------------------------------------------------- /server/src/infer_requested_output.h: -------------------------------------------------------------------------------- 1 | // Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | // 3 | // Redistribution and use in source and binary forms, with or without 4 | // modification, are permitted provided that the following conditions 5 | // are met: 6 | // * Redistributions of source code must retain the above copyright 7 | // notice, this list of conditions and the following disclaimer. 8 | // * Redistributions in binary form must reproduce the above copyright 9 | // notice, this list of conditions and the following disclaimer in the 10 | // documentation and/or other materials provided with the distribution. 11 | // * Neither the name of NVIDIA CORPORATION nor the names of its 12 | // contributors may be used to endorse or promote products derived 13 | // from this software without specific prior written permission. 14 | // 15 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | #pragma once 27 | 28 | #include 29 | 30 | #include "triton/core/tritonserver.h" 31 | #include "triton/developer_tools/common.h" 32 | 33 | namespace triton { namespace developer_tools { namespace server { 34 | 35 | //============================================================================== 36 | /// An InferRequestedOutput object is used to describe the requested model 37 | /// output for inference. 38 | /// 39 | class InferRequestedOutput { 40 | public: 41 | /// Create an InferRequestedOutput instance that describes a model output 42 | /// being requested. 43 | /// \param name The name of output being requested. 44 | /// \return Returns a new InferRequestedOutput object. 
45 | static std::unique_ptr<InferRequestedOutput> Create(const std::string& name) 46 | { 47 | return std::unique_ptr<InferRequestedOutput>( 48 | new InferRequestedOutput(name)); 49 | } 50 | 51 | /// Create an InferRequestedOutput instance that describes a model output being 52 | /// requested with pre-allocated output buffer. 53 | /// \param name The name of output being requested. 54 | /// \param buffer The pointer to the start of the pre-allocated buffer. 55 | /// \param byte_size The size of buffer in bytes. 56 | /// \param memory_type The memory type of the output. 57 | /// \param memory_type_id The memory type id of the output. 58 | /// \return Returns a new InferRequestedOutput object. 59 | static std::unique_ptr<InferRequestedOutput> Create( 60 | const std::string& name, const char* buffer, size_t byte_size, 61 | MemoryType memory_type, int64_t memory_type_id) 62 | { 63 | return std::unique_ptr<InferRequestedOutput>(new InferRequestedOutput( 64 | name, buffer, byte_size, memory_type, memory_type_id)); 65 | } 66 | 67 | /// Get name of the associated output tensor. 68 | /// \return The name of the tensor. 69 | const std::string& Name() const { return name_; } 70 | 71 | /// Get buffer of the associated output tensor. 72 | /// \return The buffer of the tensor. 73 | const char* Buffer() { return buffer_; } 74 | 75 | /// Get byte size of the associated output tensor. 76 | /// \return The byte size of the tensor. 77 | size_t ByteSize() { return byte_size_; } 78 | 79 | /// Get the memory type of the output tensor. 80 | /// \return The memory type of the tensor. 81 | const MemoryType& GetMemoryType() const { return memory_type_; } 82 | 83 | /// Get the memory type id of the output tensor. 84 | /// \return The memory type id of the tensor. 85 | const int64_t& MemoryTypeId() const { return memory_type_id_; } 86 | 87 | InferRequestedOutput(const std::string& name) 88 | : name_(name), buffer_(nullptr), byte_size_(0), 89 | memory_type_(MemoryType::CPU), memory_type_id_(0) 90 | { 91 | } 92 | 93 | InferRequestedOutput( 94 | const std::string& name, const char* buffer, size_t byte_size, 95 | MemoryType memory_type, int64_t memory_type_id) 96 | : name_(name), buffer_(buffer), byte_size_(byte_size), 97 | memory_type_(memory_type), memory_type_id_(memory_type_id) 98 | { 99 | } 100 | 101 | private: 102 | std::string name_; 103 | const char* buffer_; 104 | size_t byte_size_; 105 | MemoryType memory_type_; 106 | int64_t memory_type_id_; 107 | }; 108 | 109 | }}} // namespace triton::developer_tools::server 110 | -------------------------------------------------------------------------------- /server/src/tracer.h: -------------------------------------------------------------------------------- 1 | // Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | // 3 | // Redistribution and use in source and binary forms, with or without 4 | // modification, are permitted provided that the following conditions 5 | // are met: 6 | // * Redistributions of source code must retain the above copyright 7 | // notice, this list of conditions and the following disclaimer. 8 | // * Redistributions in binary form must reproduce the above copyright 9 | // notice, this list of conditions and the following disclaimer in the 10 | // documentation and/or other materials provided with the distribution. 11 | // * Neither the name of NVIDIA CORPORATION nor the names of its 12 | // contributors may be used to endorse or promote products derived 13 | // from this software without specific prior written permission.
14 | // 15 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | #pragma once 27 | 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include 37 | 38 | #include "triton/core/tritonserver.h" 39 | #include "triton/developer_tools/common.h" 40 | 41 | namespace triton { namespace developer_tools { namespace server { 42 | 43 | class TraceManager { 44 | public: 45 | class TraceSetting; 46 | 47 | class TraceFile { 48 | public: 49 | TraceFile(const std::string& file_name) 50 | : file_name_(file_name), index_(0), first_write_(true) 51 | { 52 | } 53 | ~TraceFile(); 54 | 55 | // Save the traces stored in 'trace_stream' into the file. 'to_index_file' 56 | // specifies whether the file name should be indexed, if true, the traces 57 | // will be written to 'file_name.index' where index will be incremented 58 | // every time the traces are written to a file with index. If false, the 59 | // trace will be written to 'file_name'. 60 | void SaveTraces(std::stringstream& trace_stream, const bool to_index_file); 61 | 62 | const std::string& FileName() { return file_name_; } 63 | 64 | private: 65 | const std::string file_name_; 66 | // The file index for the next index file write. 67 | std::atomic index_; 68 | 69 | // Multiple traces may be finished and write to the trace file at the same 70 | // time 71 | std::mutex mu_; 72 | std::ofstream trace_file_; 73 | bool first_write_; 74 | }; 75 | 76 | struct Trace { 77 | Trace() : trace_(nullptr), trace_id_(0) {} 78 | ~Trace(); 79 | std::shared_ptr setting_; 80 | // Group the spawned traces by trace ID for better formatting 81 | std::mutex mtx_; 82 | std::unordered_map> streams_; 83 | // Triton trace object that this trace is assosicated with, 84 | // 'Trace' object does not take ownership of 'trace_'. The caller of 85 | // SampleTrace() must call TraceManager::TraceRelease() with 'trace_userp_' 86 | // to properly release the resources if 'trace_' is not passed to a 87 | // TRITONSERVER_ServerInferAsync() call. 88 | TRITONSERVER_InferenceTrace* trace_; 89 | void* trace_userp_; 90 | 91 | uint64_t trace_id_; 92 | }; 93 | 94 | TraceManager( 95 | const TRITONSERVER_InferenceTraceLevel level, const uint32_t rate, 96 | const int32_t count, const uint32_t log_frequency, 97 | const std::string& filepath); 98 | 99 | ~TraceManager() = default; 100 | 101 | void UpdateTraceSetting( 102 | const std::string& model_name, const TraceSetting& new_setting); 103 | 104 | // Return a trace that should be used to collected trace activities 105 | // for an inference request. Return nullptr if no tracing should occur. 
106 | std::shared_ptr SampleTrace(const std::string& model_name); 107 | 108 | static void TraceRelease(TRITONSERVER_InferenceTrace* trace, void* userp); 109 | 110 | class TraceSetting { 111 | public: 112 | TraceSetting(); 113 | 114 | TraceSetting( 115 | const TRITONSERVER_InferenceTraceLevel level, const uint32_t rate, 116 | const int32_t count, const uint32_t log_frequency, 117 | const std::shared_ptr& file); 118 | 119 | ~TraceSetting(); 120 | 121 | bool Valid() { return invalid_reason_.empty() && (count_ != 0); } 122 | const std::string& Reason() { return invalid_reason_; } 123 | 124 | void WriteTrace( 125 | const std::unordered_map>& 126 | streams); 127 | 128 | std::shared_ptr SampleTrace(); 129 | 130 | TRITONSERVER_InferenceTraceLevel level_; 131 | uint32_t rate_; 132 | int32_t count_; 133 | uint32_t log_frequency_; 134 | std::shared_ptr file_; 135 | 136 | private: 137 | std::string invalid_reason_; 138 | 139 | std::mutex mu_; 140 | 141 | // use to sample a trace based on sampling rate. 142 | uint64_t sample_; 143 | 144 | // use to track the status of trace count feature 145 | uint64_t created_; 146 | uint64_t collected_; 147 | 148 | // Tracking traces that haven't been saved to file 149 | uint32_t sample_in_stream_; 150 | std::stringstream trace_stream_; 151 | }; 152 | 153 | private: 154 | static void TraceActivity( 155 | TRITONSERVER_InferenceTrace* trace, 156 | TRITONSERVER_InferenceTraceActivity activity, uint64_t timestamp_ns, 157 | void* userp); 158 | 159 | static void TraceTensorActivity( 160 | TRITONSERVER_InferenceTrace* trace, 161 | TRITONSERVER_InferenceTraceActivity activity, const char* name, 162 | TRITONSERVER_DataType datatype, const void* base, size_t byte_size, 163 | const int64_t* shape, uint64_t dim_count, 164 | TRITONSERVER_MemoryType memory_type, int64_t memory_type_id, void* userp); 165 | 166 | std::shared_ptr global_setting_; 167 | std::unordered_map> 168 | model_settings_; 169 | 170 | std::unordered_map> trace_files_; 171 | 172 | // lock for accessing trace setting. 173 | std::mutex r_mu_; 174 | }; 175 | 176 | }}} // namespace triton::developer_tools::server 177 | -------------------------------------------------------------------------------- /server/examples/square_async_infer.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | // 3 | // Redistribution and use in source and binary forms, with or without 4 | // modification, are permitted provided that the following conditions 5 | // are met: 6 | // * Redistributions of source code must retain the above copyright 7 | // notice, this list of conditions and the following disclaimer. 8 | // * Redistributions in binary form must reproduce the above copyright 9 | // notice, this list of conditions and the following disclaimer in the 10 | // documentation and/or other materials provided with the distribution. 11 | // * Neither the name of NVIDIA CORPORATION nor the names of its 12 | // contributors may be used to endorse or promote products derived 13 | // from this software without specific prior written permission. 14 | // 15 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | // PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | #include 28 | 29 | #include 30 | 31 | #include "triton/developer_tools/server_wrapper.h" 32 | 33 | namespace tds = triton::developer_tools::server; 34 | 35 | namespace { 36 | 37 | #define FAIL(MSG) \ 38 | do { \ 39 | std::cerr << "error: " << (MSG) << std::endl; \ 40 | exit(1); \ 41 | } while (false) 42 | 43 | void 44 | Usage(char** argv, const std::string& msg = std::string()) 45 | { 46 | if (!msg.empty()) { 47 | std::cerr << msg << std::endl; 48 | } 49 | 50 | std::cerr << "Usage: " << argv[0] << " [options]" << std::endl; 51 | std::cerr << "\t-v Enable verbose logging" << std::endl; 52 | 53 | exit(1); 54 | } 55 | 56 | void 57 | GetResults( 58 | std::vector>& results, 59 | std::future> future) 60 | { 61 | results.push_back(future.get()); 62 | size_t size = results.size(); 63 | for (size_t i = 0; i < size; i++) { 64 | if (results[i]) { 65 | if (results[i]->HasError()) { 66 | FAIL(results[i]->ErrorMsg()); 67 | } 68 | auto next_future = results[i]->GetNextResult(); 69 | if (next_future) { 70 | results.push_back(next_future->get()); 71 | size++; 72 | } 73 | } 74 | } 75 | } 76 | 77 | void 78 | Check( 79 | const std::vector>& results, 80 | int32_t input_value) 81 | { 82 | int count = 0; 83 | std::cout << "Outputs:\n"; 84 | for (auto& result : results) { 85 | if (result) { 86 | std::shared_ptr out = result->Output("OUT"); 87 | 88 | if ((out->shape_.size() != 1) || (out->shape_[0] != 1)) { 89 | FAIL("error: received incorrect shapes"); 90 | } 91 | 92 | if (out->memory_type_ != tds::MemoryType::CPU) { 93 | FAIL( 94 | "unexpected memory type, expected to be allocated in CPU, got " + 95 | std::string(MemoryTypeString(out->memory_type_)) + ", id " + 96 | std::to_string(out->memory_type_id_)); 97 | } 98 | 99 | if (out->data_type_ != tds::DataType::INT32) { 100 | FAIL( 101 | "unexpected datatype '" + 102 | std::string(DataTypeString(out->data_type_))); 103 | } 104 | 105 | if (input_value && (*((int32_t*)out->buffer_) != input_value)) { 106 | FAIL( 107 | "incorrect value, expected: '" + std::to_string(input_value) + 108 | ", got :" + std::to_string(*((int32_t*)out->buffer_))); 109 | } 110 | 111 | std::cout << *((int32_t*)out->buffer_) << "\n"; 112 | count++; 113 | } 114 | } 115 | 116 | if (count != input_value) { 117 | std::cerr << "error: received incorrect number of responses. Expected: " 118 | << input_value << ", got: " << count << std::endl; 119 | } 120 | } 121 | 122 | } // namespace 123 | 124 | int 125 | main(int argc, char** argv) 126 | { 127 | int verbose_level = 0; 128 | 129 | // Parse commandline... 130 | int opt; 131 | while ((opt = getopt(argc, argv, "vu:H:")) != -1) { 132 | switch (opt) { 133 | case 'v': 134 | verbose_level = 1; 135 | break; 136 | case '?': 137 | Usage(argv); 138 | break; 139 | } 140 | } 141 | try { 142 | // Use 'ServerOptions' object to initialize TritonServer. 
143 | tds::ServerOptions options({"./models"}); 144 | options.logging_.verbose_ = 145 | tds::LoggingOptions::VerboseLevel(verbose_level); 146 | options.model_control_mode_ = tds::ModelControlMode::EXPLICIT; 147 | auto server = tds::TritonServer::Create(options); 148 | 149 | // Load 'square_int32' model. 150 | server->LoadModel("square_int32"); 151 | 152 | // Please see here for more information about this decoupled model: 153 | // https://github.com/triton-inference-server/python_backend/tree/main/examples/decoupled. 154 | std::string model_name = "square_int32"; 155 | 156 | // Initialize 'InferRequest' with the name of the model that we want to run 157 | // an inference on. 158 | auto request1 = tds::InferRequest::Create(tds::InferOptions(model_name)); 159 | 160 | // Create the data for an input tensor. For square model, value '3' here 161 | // means there will be three responses for this request, and each response 162 | // contains only one output with value '3'. 163 | std::vector input_data1 = {3}; 164 | 165 | std::vector shape{1}; 166 | 167 | // Add input tensor to the inference request. 168 | request1->AddInput( 169 | "IN", input_data1.begin(), input_data1.end(), tds::DataType::INT32, 170 | shape, tds::MemoryType::CPU, 0); 171 | 172 | // Call 'AsyncInfer' function to run inference. 173 | auto result_future1 = server->AsyncInfer(*request1); 174 | 175 | // Run the second inference. 176 | auto request2 = tds::InferRequest::Create(tds::InferOptions(model_name)); 177 | 178 | // Create the data for an input tensor. For square model, value '0' here 179 | // means there won't be any responses for this request. 180 | std::vector input_data2 = {0}; 181 | request2->AddInput( 182 | "IN", input_data2.begin(), input_data2.end(), tds::DataType::INT32, 183 | shape, tds::MemoryType::CPU, 0); 184 | 185 | // Call 'AsyncInfer' function to run inference. 186 | auto result_future2 = server->AsyncInfer(*request2); 187 | 188 | // Get the infer results from both inferences and check the results. 189 | std::vector> results1; 190 | GetResults(results1, std::move(result_future1)); 191 | Check(results1, 3); 192 | 193 | std::vector> results2; 194 | GetResults(results2, std::move(result_future2)); 195 | Check(results2, 0); 196 | } 197 | catch (const tds::TritonException& ex) { 198 | std::cerr << "Error: " << ex.what(); 199 | exit(1); 200 | } 201 | 202 | return 0; 203 | } 204 | -------------------------------------------------------------------------------- /server/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 
14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | cmake_minimum_required(VERSION 3.31.8) 28 | 29 | project(tritondevelopertoolsserver LANGUAGES C CXX) 30 | 31 | # 32 | # Options 33 | # 34 | option(TRITON_ENABLE_GPU "Enable GPU support in backend utilities" ON) 35 | option(TRITON_ENABLE_STATS "Include statistics collections in backend utilities" ON) 36 | option(TRITON_BUILD_TEST "Include unit test for the Server Wrapper" ON) 37 | option(TRITON_ENABLE_EXAMPLES "Include examples in build" ON) 38 | 39 | option(TRITON_BUILD_STATIC_LIBRARY "Create multiple static libraries, otherwise create one dynamic library" ON) 40 | set(TRITON_COMMON_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/common repo") 41 | set(TRITON_CORE_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/core repo") 42 | 43 | if(NOT CMAKE_BUILD_TYPE) 44 | set(CMAKE_BUILD_TYPE Release) 45 | endif() 46 | 47 | # 48 | # Dependencies 49 | # 50 | include(FetchContent) 51 | 52 | FetchContent_Declare( 53 | repo-common 54 | GIT_REPOSITORY https://github.com/triton-inference-server/common.git 55 | GIT_TAG ${TRITON_COMMON_REPO_TAG} 56 | GIT_SHALLOW ON 57 | ) 58 | FetchContent_Declare( 59 | repo-core 60 | GIT_REPOSITORY https://github.com/triton-inference-server/core.git 61 | GIT_TAG ${TRITON_CORE_REPO_TAG} 62 | GIT_SHALLOW ON 63 | ) 64 | FetchContent_MakeAvailable(repo-common repo-core) 65 | 66 | # 67 | # CUDA 68 | # 69 | if(${TRITON_ENABLE_GPU}) 70 | find_package(CUDAToolkit REQUIRED) 71 | set(CUDA_NVCC_FLAGS -std=c++11) 72 | 73 | if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL "10.1") 74 | add_definitions(-DTRITON_ENABLE_CUDA_GRAPH=1) 75 | else() 76 | message(WARNING "CUDA ${CUDAToolkit_VERSION} does not support CUDA graphs.") 77 | endif() 78 | endif() # TRITON_ENABLE_GPU 79 | 80 | find_package(Threads REQUIRED) 81 | 82 | # 83 | # Triton Developer Tools Server 84 | # 85 | file(GLOB SRC_FILES src/*.cc src/*.h) 86 | if(${TRITON_BUILD_STATIC_LIBRARY}) 87 | add_library( 88 | triton-developer_tools-server 89 | ${SRC_FILES} 90 | ) 91 | else() 92 | add_library( 93 | triton-developer_tools-server SHARED 94 | ${SRC_FILES} 95 | ) 96 | endif() 97 | 98 | 99 | add_library( 100 | TritonDeveloperToolsServer::triton-developer_tools-server ALIAS triton-developer_tools-server 101 | ) 102 | 103 | target_include_directories( 104 | triton-developer_tools-server 105 | PUBLIC 106 | $ 107 | $ 108 | PRIVATE 109 | ${CMAKE_CURRENT_SOURCE_DIR}/src 110 | ) 111 | 112 | if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") 113 | message("Using MSVC as compiler, default target on Windows 10. 
" 114 | "If the target system is not Windows 10, please update _WIN32_WINNT " 115 | "to corresponding value.") 116 | endif() 117 | target_compile_features(triton-developer_tools-server PRIVATE cxx_std_11) 118 | target_compile_options( 119 | triton-developer_tools-server 120 | PRIVATE 121 | $<$,$,$>: 122 | -Wall -Wextra -Wno-unused-parameter> 123 | $<$:/Wall /D_WIN32_WINNT=0x0A00 /EHsc> 124 | ) 125 | 126 | # TRITON_ENABLE_GPU exposed in header so set PUBLIC 127 | if(${TRITON_ENABLE_GPU}) 128 | target_compile_definitions( 129 | triton-developer_tools-server 130 | PUBLIC TRITON_ENABLE_GPU=1 131 | ) 132 | endif() # TRITON_ENABLE_GPU 133 | 134 | # TRITON_ENABLE_STATS exposed in header so set PUBLIC 135 | if(${TRITON_ENABLE_STATS}) 136 | target_compile_definitions( 137 | triton-developer_tools-server 138 | PUBLIC TRITON_ENABLE_STATS=1 139 | ) 140 | endif() # TRITON_ENABLE_STATS 141 | 142 | set_target_properties( 143 | triton-developer_tools-server PROPERTIES 144 | WINDOWS_EXPORT_ALL_SYMBOLS TRUE 145 | POSITION_INDEPENDENT_CODE ON 146 | OUTPUT_NAME tritondevelopertoolsserver 147 | ) 148 | 149 | target_link_libraries( 150 | triton-developer_tools-server 151 | PUBLIC 152 | Threads::Threads 153 | triton-core-serverapi # from repo-core 154 | triton-core-serverstub # from repo-core 155 | triton-common-logging # from repo-common 156 | ) 157 | 158 | if(${TRITON_ENABLE_GPU}) 159 | target_link_libraries( 160 | triton-developer_tools-server 161 | PUBLIC 162 | CUDA::cudart 163 | ) 164 | endif() # TRITON_ENABLE_GPU 165 | 166 | # 167 | # Install 168 | # 169 | include(GNUInstallDirs) 170 | set(INSTALL_CONFIGDIR ${CMAKE_INSTALL_LIBDIR}/cmake/TritonDeveloperToolsServer) 171 | 172 | install( 173 | TARGETS 174 | triton-developer_tools-server 175 | EXPORT 176 | triton-developer_tools-server-targets 177 | LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} 178 | ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} 179 | ) 180 | 181 | install( 182 | DIRECTORY include/ 183 | DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} 184 | ) 185 | 186 | install( 187 | EXPORT 188 | triton-developer_tools-server-targets 189 | FILE 190 | TritonDeveloperToolsServerTargets.cmake 191 | NAMESPACE 192 | TritonDeveloperToolsServer:: 193 | DESTINATION 194 | ${INSTALL_CONFIGDIR} 195 | ) 196 | 197 | include(CMakePackageConfigHelpers) 198 | configure_package_config_file( 199 | ${CMAKE_CURRENT_LIST_DIR}/cmake/TritonDeveloperToolsServerConfig.cmake.in 200 | ${CMAKE_CURRENT_BINARY_DIR}/TritonDeveloperToolsServerConfig.cmake 201 | INSTALL_DESTINATION ${INSTALL_CONFIGDIR} 202 | ) 203 | 204 | install( 205 | FILES 206 | ${CMAKE_CURRENT_BINARY_DIR}/TritonDeveloperToolsServerConfig.cmake 207 | DESTINATION ${INSTALL_CONFIGDIR} 208 | ) 209 | 210 | # 211 | # Export from build tree 212 | # 213 | export( 214 | EXPORT triton-developer_tools-server-targets 215 | FILE ${CMAKE_CURRENT_BINARY_DIR}/TritonDeveloperToolsServerTargets.cmake 216 | NAMESPACE TritonDeveloperToolsServer:: 217 | ) 218 | 219 | export(PACKAGE TritonDeveloperToolsServer) 220 | 221 | if(${TRITON_BUILD_TEST}) 222 | FetchContent_Declare( 223 | googletest 224 | GIT_REPOSITORY https://github.com/google/googletest.git 225 | GIT_TAG release-1.12.1 226 | ) 227 | FetchContent_MakeAvailable(googletest) 228 | add_subdirectory(test test) 229 | endif() 230 | 231 | if(TRITON_ENABLE_EXAMPLES) 232 | add_subdirectory(examples) 233 | endif() # TRITON_ENABLE_EXAMPLES 234 | -------------------------------------------------------------------------------- /server/examples/addsub_string_async_infer.cc: 
-------------------------------------------------------------------------------- 1 | // Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | // 3 | // Redistribution and use in source and binary forms, with or without 4 | // modification, are permitted provided that the following conditions 5 | // are met: 6 | // * Redistributions of source code must retain the above copyright 7 | // notice, this list of conditions and the following disclaimer. 8 | // * Redistributions in binary form must reproduce the above copyright 9 | // notice, this list of conditions and the following disclaimer in the 10 | // documentation and/or other materials provided with the distribution. 11 | // * Neither the name of NVIDIA CORPORATION nor the names of its 12 | // contributors may be used to endorse or promote products derived 13 | // from this software without specific prior written permission. 14 | // 15 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | #include 28 | 29 | #include 30 | #include 31 | 32 | #include "triton/developer_tools/server_wrapper.h" 33 | 34 | namespace tds = triton::developer_tools::server; 35 | 36 | namespace { 37 | 38 | #define FAIL(MSG) \ 39 | do { \ 40 | std::cerr << "error: " << (MSG) << std::endl; \ 41 | exit(1); \ 42 | } while (false) 43 | 44 | void 45 | Usage(char** argv, const std::string& msg = std::string()) 46 | { 47 | if (!msg.empty()) { 48 | std::cerr << msg << std::endl; 49 | } 50 | 51 | std::cerr << "Usage: " << argv[0] << " [options]" << std::endl; 52 | std::cerr << "\t-v Enable verbose logging" << std::endl; 53 | 54 | exit(1); 55 | } 56 | 57 | void 58 | CompareResult( 59 | const std::vector& input0_data, 60 | const std::vector& input1_data, 61 | const std::vector& result0_data, 62 | const std::vector& result1_data, 63 | const std::vector& expected_sum, 64 | const std::vector& expected_diff) 65 | { 66 | for (size_t i = 0; i < 16; ++i) { 67 | std::cout << input0_data[i] << " + " << input0_data[i] << " = " 68 | << result0_data[i] << std::endl; 69 | std::cout << input0_data[i] << " - " << input1_data[i] << " = " 70 | << result1_data[i] << std::endl; 71 | 72 | if (expected_sum[i] != std::stoi(result0_data[i])) { 73 | std::cerr << "error: incorrect sum" << std::endl; 74 | exit(1); 75 | } 76 | if (expected_diff[i] != std::stoi(result1_data[i])) { 77 | std::cerr << "error: incorrect difference" << std::endl; 78 | exit(1); 79 | } 80 | } 81 | } 82 | 83 | void 84 | Check( 85 | std::shared_ptr& output0, 86 | std::shared_ptr& output1, 87 | const std::vector& input0_data, 88 | const std::vector& input1_data, 89 | const std::string& output0_name, const std::string& output1_name, 90 | const std::vector& result0_data, 91 | const std::vector& result1_data, 92 | const std::vector& expected_sum, 93 
| const std::vector& expected_diff) 94 | { 95 | for (auto& output : 96 | {std::make_pair(output0_name, output0), 97 | std::make_pair(output1_name, output1)}) { 98 | if ((output.second->shape_.size() != 1) || 99 | (output.second->shape_[0] != 16)) { 100 | std::cerr << "error: received incorrect shapes for " << output.first 101 | << std::endl; 102 | exit(1); 103 | } 104 | 105 | if (output.second->data_type_ != tds::DataType::BYTES) { 106 | FAIL( 107 | "unexpected datatype '" + 108 | std::string(DataTypeString(output.second->data_type_)) + "' for '" + 109 | output.first + "'"); 110 | } 111 | 112 | if (output.second->memory_type_ != tds::MemoryType::CPU) { 113 | FAIL( 114 | "unexpected memory type, expected to be allocated in CPU, got " + 115 | std::string(MemoryTypeString(output.second->memory_type_)) + ", id " + 116 | std::to_string(output.second->memory_type_id_) + " for " + 117 | output.first); 118 | } 119 | } 120 | 121 | if (result0_data.size() != 16) { 122 | std::cerr << "error: received incorrect number of strings for OUTPUT0: " 123 | << result0_data.size() << std::endl; 124 | } 125 | if (result1_data.size() != 16) { 126 | std::cerr << "error: received incorrect number of strings for OUTPUT1: " 127 | << result1_data.size() << std::endl; 128 | } 129 | 130 | CompareResult( 131 | input0_data, input1_data, result0_data, result1_data, expected_sum, 132 | expected_diff); 133 | } 134 | 135 | } // namespace 136 | 137 | int 138 | main(int argc, char** argv) 139 | { 140 | int verbose_level = 0; 141 | 142 | // Parse commandline... 143 | int opt; 144 | while ((opt = getopt(argc, argv, "vu:H:")) != -1) { 145 | switch (opt) { 146 | case 'v': 147 | verbose_level = 1; 148 | break; 149 | case '?': 150 | Usage(argv); 151 | break; 152 | } 153 | } 154 | try { 155 | // Use 'ServerOptions' object to initialize TritonServer. 156 | tds::ServerOptions options({"./models"}); 157 | options.logging_.verbose_ = 158 | tds::LoggingOptions::VerboseLevel(verbose_level); 159 | auto server = tds::TritonServer::Create(options); 160 | 161 | // We use a simple model that takes 2 input tensors of 16 strings 162 | // each and returns 2 output tensors of 16 strings each. The input 163 | // strings must represent integers. One output tensor is the 164 | // element-wise sum of the inputs and one output is the element-wise 165 | // difference. 166 | std::string model_name = "add_sub_str"; 167 | 168 | // Use 'LoadedModels' function to check if the model we need is loaded. 169 | std::set loaded_models = server->LoadedModels(); 170 | if (loaded_models.find(model_name) == loaded_models.end()) { 171 | FAIL("Model '" + model_name + "' is not found."); 172 | } 173 | 174 | // Initialize 'InferRequest' with the name of the model that we want to run 175 | // an inference on. 176 | auto request = tds::InferRequest::Create(tds::InferOptions(model_name)); 177 | 178 | // Create the data for the two input tensors. Initialize the first 179 | // to unique integers and the second to all ones. The input tensors 180 | // are the string representation of these values. 181 | std::vector input0_data(16); 182 | std::vector input1_data(16); 183 | std::vector expected_sum(16); 184 | std::vector expected_diff(16); 185 | for (size_t i = 0; i < 16; ++i) { 186 | input0_data[i] = std::to_string(i); 187 | input1_data[i] = std::to_string(1); 188 | expected_sum[i] = i + 1; 189 | expected_diff[i] = i - 1; 190 | } 191 | 192 | std::vector shape{16}; 193 | 194 | // Add two input tensors to the inference request. 
195 | request->AddInput( 196 | "INPUT0", input0_data.begin(), input0_data.end(), tds::DataType::BYTES, 197 | shape, tds::MemoryType::CPU, 0); 198 | request->AddInput( 199 | "INPUT1", input1_data.begin(), input1_data.end(), tds::DataType::BYTES, 200 | shape, tds::MemoryType::CPU, 0); 201 | 202 | // Indicate that we want both output tensors calculated and returned 203 | // for the inference request. These calls are optional, if no 204 | // output(s) are specifically requested then all outputs defined by 205 | // the model will be calculated and returned. 206 | request->AddRequestedOutput("OUTPUT0"); 207 | request->AddRequestedOutput("OUTPUT1"); 208 | 209 | // Call 'AsyncInfer' function to run inference. 210 | auto result_future = server->AsyncInfer(*request); 211 | 212 | // Get the infer result and check the result. 213 | auto result = result_future.get(); 214 | if (result->HasError()) { 215 | FAIL(result->ErrorMsg()); 216 | } 217 | std::string name = result->ModelName(); 218 | std::string version = result->ModelVersion(); 219 | std::string id = result->Id(); 220 | std::cout << "Ran inferencece on model '" << name << "', version '" 221 | << version << "', with request ID '" << id << "'\n"; 222 | 223 | // Retrieve two outputs from the 'InferResult' object. 224 | std::shared_ptr out0 = result->Output("OUTPUT0"); 225 | std::shared_ptr out1 = result->Output("OUTPUT1"); 226 | 227 | // Get the result data as a vector of string. 228 | std::vector result0_data = result->StringData("OUTPUT0"); 229 | std::vector result1_data = result->StringData("OUTPUT1"); 230 | if (result0_data.size() != 16) { 231 | std::cerr << "error: received incorrect number of strings for OUTPUT0: " 232 | << result0_data.size() << std::endl; 233 | } 234 | if (result1_data.size() != 16) { 235 | std::cerr << "error: received incorrect number of strings for OUTPUT1: " 236 | << result1_data.size() << std::endl; 237 | } 238 | 239 | Check( 240 | out0, out1, input0_data, input1_data, "OUTPUT0", "OUTPUT1", 241 | result0_data, result1_data, expected_sum, expected_diff); 242 | 243 | // Get full response. 244 | std::cout << result->DebugString() << std::endl; 245 | } 246 | catch (const tds::TritonException& ex) { 247 | std::cerr << "Error: " << ex.what(); 248 | exit(1); 249 | } 250 | 251 | return 0; 252 | } 253 | -------------------------------------------------------------------------------- /server/include/triton/developer_tools/generic_server_wrapper.h: -------------------------------------------------------------------------------- 1 | // Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | // 3 | // Redistribution and use in source and binary forms, with or without 4 | // modification, are permitted provided that the following conditions 5 | // are met: 6 | // * Redistributions of source code must retain the above copyright 7 | // notice, this list of conditions and the following disclaimer. 8 | // * Redistributions in binary form must reproduce the above copyright 9 | // notice, this list of conditions and the following disclaimer in the 10 | // documentation and/or other materials provided with the distribution. 11 | // * Neither the name of NVIDIA CORPORATION nor the names of its 12 | // contributors may be used to endorse or promote products derived 13 | // from this software without specific prior written permission. 
14 | // 15 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | #pragma once 27 | #include 28 | #include 29 | #include 30 | #include 31 | 32 | #include "../src/infer_requested_output.h" 33 | #include "../src/tracer.h" 34 | #include "common.h" 35 | 36 | namespace triton { namespace developer_tools { namespace server { 37 | 38 | class ServerOptions; 39 | class InferOptions; 40 | class RepositoryIndex; 41 | class NewModelRepo; 42 | class Tensor; 43 | class GenericInferRequest; 44 | class GenericInferResult; 45 | using TensorAllocMap = std::unordered_map< 46 | std::string, 47 | std::tuple>; 48 | 49 | //============================================================================== 50 | /// Object that encapsulates in-process C API functionalities. 51 | /// 52 | class GenericTritonServer { 53 | public: 54 | /// Create a GenericTritonServer instance. 55 | static std::unique_ptr Create( 56 | const ServerOptions& server_options); 57 | 58 | virtual ~GenericTritonServer(); 59 | 60 | /// Load the requested model or reload the model if it is already loaded. 61 | /// \param model_name The name of the model. 62 | virtual void LoadModel(const std::string& model_name) = 0; 63 | 64 | /// Unload the requested model. Unloading a model that is not loaded 65 | /// on server has no affect. 66 | /// \param model_name The name of the model. 67 | virtual void UnloadModel(const std::string& model_name) = 0; 68 | 69 | /// Get the set of names of models that are loaded and ready for inference. 70 | /// \return Returns the set of names of models that are 71 | /// loaded and ready for inference. 72 | virtual std::set LoadedModels() = 0; 73 | 74 | /// Get the index of model repository contents. 75 | /// \return Returns a vector of 'RepositoryIndex' object 76 | /// representing the repository index. 77 | virtual std::vector ModelIndex() = 0; 78 | 79 | /// Get the metrics of the server. 80 | /// \return Returns a string representing the metrics. 81 | virtual std::string ServerMetrics() = 0; 82 | 83 | /// Get the inference statistics of the specified model. 84 | /// \param model_name The name of the model. 85 | /// \param model_version the version of the model requested. 86 | /// \return Returns a json string representing the model metrics. 87 | virtual std::string ModelStatistics( 88 | const std::string& model_name, const int64_t model_version) = 0; 89 | 90 | /// Is the server live? 91 | /// \return Returns true if server is live, false otherwise. 92 | virtual bool IsServerLive() = 0; 93 | 94 | /// Is the server ready? 95 | /// \return Returns true if server is ready, false otherwise. 96 | virtual bool IsServerReady() = 0; 97 | 98 | /// Stop a server object. A server can't be restarted once it is 99 | /// stopped. 
100 | virtual void ServerStop() = 0; 101 | 102 | /// Is the model ready? 103 | /// \param model_name The name of the model to get readiness for. 104 | /// \param model_version The version of the model to get readiness 105 | /// for. If -1 then the server will choose a version based on the 106 | /// model's policy. This field is optional, default is -1. 107 | /// \return Returns true if server is ready, false otherwise. 108 | virtual bool IsModelReady( 109 | const std::string& model_name, const int64_t model_version = -1) = 0; 110 | 111 | /// Get the configuration of specified model. 112 | /// \param model_name The name of the model. 113 | /// \param model_version The version of the model to get configuration. 114 | /// The default value is -1 which means then the server will 115 | /// choose a version based on the model and internal policy. This field is 116 | /// optional. \return Returns JSON representation of model configuration as a 117 | /// string. 118 | virtual std::string ModelConfig( 119 | const std::string& model_name, const int64_t model_version = -1) = 0; 120 | 121 | /// Get the metadata of the server. 122 | /// \return Returns JSON representation of server metadata as a string. 123 | virtual std::string ServerMetadata() = 0; 124 | 125 | /// Get the metadata of specified model. 126 | /// \param model_name The name of the model. 127 | /// \param model_version The version of the model to get configuration. 128 | /// The default value is -1 which means then the server will choose a version 129 | /// based on the model and internal policy. This field is optional. 130 | /// \return Returns JSON representation of model metadata as a string. 131 | virtual std::string ModelMetadata( 132 | const std::string& model_name, const int64_t model_version = -1) = 0; 133 | 134 | /// Register a new model repository. This function is not available in polling 135 | /// mode. 136 | /// \param new_model_repo The 'NewModelRepo' object contains the info of the 137 | /// new model repo to be registered. 138 | virtual void RegisterModelRepo(const NewModelRepo& new_model_repo) = 0; 139 | 140 | /// Unregister a model repository. This function is not available in polling 141 | /// mode. 142 | /// \param repo_path The full path to the model repository. 143 | virtual void UnregisterModelRepo(const std::string& repo_path) = 0; 144 | 145 | virtual std::unique_ptr Infer( 146 | GenericInferRequest& infer_request) = 0; 147 | }; 148 | 149 | //============================================================================== 150 | /// An interface for InferResult object to interpret the response to an 151 | /// inference request. 152 | /// 153 | class GenericInferResult { 154 | public: 155 | virtual ~GenericInferResult(); 156 | 157 | /// Get the name of the model which generated this response. 158 | /// \return Returns the name of the model. 159 | virtual std::string ModelName() noexcept = 0; 160 | 161 | /// Get the version of the model which generated this response. 162 | /// \return Returns the version of the model. 163 | virtual std::string ModelVersion() noexcept = 0; 164 | 165 | /// Get the id of the request which generated this response. 166 | /// \return Returns the id of the request. 167 | virtual std::string Id() noexcept = 0; 168 | 169 | /// Get the output names from the infer result 170 | /// \return Vector of output names 171 | virtual std::vector OutputNames() = 0; 172 | 173 | /// Get the result output as a shared pointer of 'Tensor' object. 
The 'buffer' 174 | /// field of the output is owned by the returned 'Tensor' object itself. Note 175 | /// that for string data, need to use 'StringData' function for string data 176 | /// result. 177 | /// \param name The name of the output tensor to be retrieved. 178 | /// \return Returns the output result as a shared pointer of 'Tensor' object. 179 | virtual std::shared_ptr Output(const std::string& name) = 0; 180 | 181 | /// Get the result data as a vector of strings. The vector will 182 | /// receive a copy of result data. An exception will be thrown if 183 | /// the data type of output is not 'BYTES'. 184 | /// \param output_name The name of the output to get result data. 185 | /// \return Returns the result data represented as a vector of strings. The 186 | /// strings are stored in the row-major order. 187 | virtual std::vector StringData( 188 | const std::string& output_name) = 0; 189 | 190 | /// Return the complete response as a user friendly string. 191 | /// \return The string describing the complete response. 192 | virtual std::string DebugString() = 0; 193 | 194 | /// Return if there is an error within this result. 195 | /// \return True if this 'GenericInferResult' object has an error, false if no 196 | /// error. 197 | virtual bool HasError() = 0; 198 | 199 | /// Return the error message of the error. 200 | /// \return The message for the error. Empty if no error. 201 | virtual std::string ErrorMsg() = 0; 202 | }; 203 | 204 | //============================================================================== 205 | /// Object that describes an inflight inference request. 206 | /// 207 | class GenericInferRequest { 208 | public: 209 | /// Create an InferRequest instance. 210 | static std::unique_ptr Create( 211 | const InferOptions& infer_options); 212 | 213 | virtual ~GenericInferRequest(); 214 | 215 | /// Add an input tensor to be sent within an InferRequest object. The input 216 | /// data buffer within the 'Tensor' object must not be modified until 217 | /// inference is completed and result is returned. 218 | /// \param name The name of the input tensor. 219 | /// \param input A Tensor object that describes an input tensor. 220 | virtual void AddInput( 221 | const std::string& name, const Tensor& input) noexcept = 0; 222 | 223 | /// Add a requested output to be sent within an InferRequest object. 224 | /// Calling this function is optional. If no output(s) are specifically 225 | /// requested then all outputs defined by the model will be calculated and 226 | /// returned. Pre-allocated buffer for each output should be specified within 227 | /// the 'Tensor' object. 228 | /// \param name The name of the output tensor. 229 | /// \param output A Tensor object that describes an output tensor containing 230 | /// its pre-allocated buffer. 231 | virtual void AddRequestedOutput(const std::string& name, Tensor& output) = 0; 232 | 233 | /// Add a requested output to be sent within an InferRequest object. 234 | /// Calling this function is optional. If no output(s) are specifically 235 | /// requested then all outputs defined by the model will be calculated and 236 | /// returned. 237 | /// \param name The name of the output tensor. 238 | virtual void AddRequestedOutput(const std::string& name) = 0; 239 | 240 | /// Clear inputs and outputs of the request. This allows users to reuse the 241 | /// InferRequest object if needed. 
242 | virtual void Reset() = 0; 243 | }; 244 | 245 | }}} // namespace triton::developer_tools::server 246 | -------------------------------------------------------------------------------- /server/README.md: -------------------------------------------------------------------------------- 1 | 28 | 29 | [![License](https://img.shields.io/badge/License-BSD3-lightgrey.svg)](https://opensource.org/licenses/BSD-3-Clause) 30 | 31 | # Triton Server C-API Wrapper 32 | 33 | Triton Server C-API Wrapper wraps up the functionality of 34 | [Triton in-process C-API](https://github.com/triton-inference-server/server/blob/main/docs/customization_guide/inference_protocols.md#in-process-triton-server-api) 35 | , providing a simpler interface for users to use the Triton in-process C API for 36 | developing their applications without having in-depth knowledge of Triton 37 | implementation details or writing complicated code. This wrapper is also called 38 | the "Higher Level In Process C++ API" or just "Server Wrapper" for short. The 39 | header file that defines and documents the Server C-API Wrapper is 40 | [server_wrapper.h](include/triton/developer_tools/server_wrapper.h). Ask 41 | questions or report problems in the main Triton 42 | [issues page](https://github.com/triton-inference-server/server/issues). 43 | 44 | ## Build the Server C-API Wrapper library and custom application 45 | 46 | To build and install the Server Wrapper library from 47 | `developer_tools/server`, use the following commands. 48 | 49 | ``` 50 | $ mkdir build 51 | $ cd build 52 | $ cmake -DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install .. 53 | $ make install 54 | ``` 55 | 56 | The following required Triton repositories will be pulled and used in 57 | the build. By default the "main" branch/tag will be used for each repo 58 | but the listed CMake argument can be used to override. 59 | 60 | * triton-inference-server/common: -DTRITON_COMMON_REPO_TAG=[tag] 61 | * triton-inference-server/core: -DTRITON_CORE_REPO_TAG=[tag] 62 | 63 | See the [CMakeLists.txt](CMakeLists.txt) file for other build options. 64 | 65 | When the build completes, the library `libtritondevelopertoolsserver.a` and examples 66 | can be found in the install directory. 67 | 68 | For a custom application, you can refer to 69 | [CMakeLists.txt](examples/CMakeLists.txt) to see how to build your executable 70 | with the Server Wrapper library. 71 | 72 | ### API Description 73 | 74 | Triton Server C-API Wrapper is encapsulated in a shared library which is built 75 | from source contained in this repository. You can include the full 76 | capabilities by linking the shared library into your application and by using 77 | the C++ API defined in [server_wrapper.h](include/triton/developer_tools/server_wrapper.h). 78 | 79 | #### Inference APIs 80 | 81 | Three main objects are used with the Server Wrapper. 82 | 83 | ##### TritonServer 84 | 85 | The top-level abstraction used by Server Wrapper is `TritonServer`, 86 | which represents the Triton core logic that is capable of implementing 87 | some of the features and capabilities of Triton. 88 | 89 | ##### InferRequest 90 | 91 | `InferRequest` carries the information for an inference request. This object 92 | allows you to set inference options, and add inputs and requested outputs to a request. 93 | 94 | ##### InferResult 95 | 96 | `InferResult` provides an interface to interpret the inference response, making 97 | it easier to retrieve output data.
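For example, once an inference has completed (see the workflow below), output data can be read from an `InferResult` roughly as follows. This is a minimal, fragmentary sketch based on the accessors declared in [generic_server_wrapper.h](include/triton/developer_tools/generic_server_wrapper.h); `result_future`, the output names, and the `int32` output type are placeholders.

```cpp
// 'result_future' is assumed to come from an earlier 'AsyncInfer' call.
std::unique_ptr<InferResult> result = result_future.get();
if (result->HasError()) {
  std::cerr << result->ErrorMsg();
} else {
  // Non-string outputs: access the raw bytes and metadata through 'Tensor'.
  std::shared_ptr<Tensor> out = result->Output("OUTPUT0_NAME");
  const int32_t* values = reinterpret_cast<const int32_t*>(out->buffer_);
  std::cout << "first element: " << values[0]
            << ", datatype: " << DataTypeString(out->data_type_) << std::endl;

  // String (BYTES) outputs: copy the data out as a vector of strings.
  std::vector<std::string> str_values = result->StringData("OUTPUT1_NAME");
}
```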
98 | 99 | ##### General Workflow 100 | 101 | Performing an inference request requires the use of some Server C++ API 102 | functions and objects, as demonstrated in 103 | [simple_addsub_async_infer.cc](examples/simple_addsub_async_infer.cc). 104 | The general usage requires the following steps. 105 | 106 | 1. Start Server 107 | 108 | To start a Triton server, you need to create a `TritonServer` instance with 109 | a `ServerOptions` structure, which contains the server options used to 110 | initialize the server. 111 | 112 | ```cpp 113 | ServerOptions options({"path/to/your/model_repository", "path/to/another/model_repository"}); auto server = TritonServer::Create(options); 114 | ``` 115 | 116 | 2. Load model (optional) 117 | 118 | This step is optional as all the models in the model repository paths provided 119 | in the previous step will be loaded to the server by default. However, if 120 | [model control mode](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_management.md) 121 | is set to "EXPLICIT" when setting the server options in the previous step, you 122 | can load a specific model by calling 123 | 124 | ```cpp 125 | server->LoadModel("your_model_name"); 126 | ``` 127 | 128 | 3. Construct `InferRequest` with infer options 129 | 130 | Initialize the request with an `InferOptions` structure, specifying the name of 131 | the model that you want to run an inference on and other inference options. 132 | 133 | ```cpp 134 | auto request = InferRequest::Create(InferOptions("your_model_name")); 135 | ``` 136 | 137 | 4. Add inputs / requested outputs to a request 138 | 139 | You can add an input to a request by either using a `Tensor` object, which 140 | contains the information of an input tensor, or using iterators if the input 141 | data is stored in a contiguous container. Iterators can also be used if the input 142 | data is of 'string' type and is stored in a contiguous container. Note that the 143 | input data buffer within the 'Tensor' object must not be modified until 144 | inference is completed and the result is returned. 145 | 146 | For outputs, you can add the name of a requested output to a request, indicating 147 | which outputs should be calculated and returned for inference. You can also provide 148 | a pre-allocated buffer for an output in this step if you want the output data to 149 | be stored in-place in the provided buffer. See the "Use pre-allocated buffer" 150 | section in the next step for more information. 151 | 152 | ```cpp 153 | // Assume that we have input data in these two vectors. 154 | std::vector input0_data; 155 | std::vector input1_data; 156 | 157 | Tensor input0(&input0_data[0], input0_data.size(), DataType::INT32, {1, 16}, MemoryType::CPU, 0); 158 | Tensor input1(&input1_data[0], input1_data.size(), DataType::INT32, {1, 16}, MemoryType::CPU, 0); 159 | 160 | request->AddInput("INPUT0_NAME", input0); 161 | request->AddInput("INPUT1_NAME", input1); 162 | 163 | request->AddRequestedOutput("OUTPUT0_NAME"); 164 | request->AddRequestedOutput("OUTPUT1_NAME"); 165 | ``` 166 | 167 | 5. Call the inference method 168 | 169 | Server Wrapper uses a promise-future based structure for asynchronous inference. 170 | A future of a unique pointer to an `InferResult` object is returned from the 171 | `AsyncInfer` function, and the result can be retrieved whenever needed by 172 | calling `future.get()`.
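Because `AsyncInfer` returns a plain `std::future`, standard library facilities can also be used to poll for completion without blocking. The following sketch is ordinary C++ `<future>`/`<chrono>` usage, not a wrapper-specific API, and assumes `server` and `request` were created as in the previous steps.

```cpp
std::future<std::unique_ptr<InferResult>> result_future = server->AsyncInfer(*request);

// Poll until the response is ready, doing other work in between.
while (result_future.wait_for(std::chrono::milliseconds(10)) !=
       std::future_status::ready) {
  // ... do other work while the inference is in flight ...
}
auto result = result_future.get();
```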
173 | 174 | > [!Note] 175 | > For 176 | > [decoupled models](https://github.com/triton-inference-server/python_backend/tree/main/examples/decoupled#decoupled-model-examples) 177 | > with multi-part responses we recommend using the example `GetResults` 178 | > function, as demonstrated in 179 | > [square_async_infer.cc](examples/square_async_infer.cc), to ensure the entire 180 | > response from the model is collected. 181 | 182 | When running inference, Server Wrapper provides three options for the 183 | allocation and deallocation of output tensors. 184 | 185 | * Use default allocator 186 | 187 | Default output allocation/deallocation will be used. No need to specify how to 188 | allocate/deallocate the output tensors. 189 | 190 | ```cpp 191 | // Call the inference method. 192 | std::future> result_future = server->AsyncInfer(*request); 193 | 194 | // Get the infer result and check the result. 195 | auto result = result_future.get(); 196 | if (result->HasError()) { 197 | std::cerr << result->ErrorMsg(); 198 | } else { 199 | // Retrieve output data from 'InferResult' object... 200 | } 201 | ``` 202 | 203 | * Use custom allocator 204 | 205 | You can provide your custom allocator using `Allocator` object. You need to 206 | register your callback functions to the allocator when creating the 207 | `Allocator` object, and set `InferOptions` properly when initializing 208 | `InferRequest`. The signatures of the callback functions are defined in 209 | [common.h](include/triton/developer_tools/common.h). 210 | 211 | ```cpp 212 | // 'ResponseAllocator' and 'ResponseRelease' are the custom output allocation 213 | // and deallocation functions. 214 | Allocator allocator(ResponseAllocator, ResponseRelease); 215 | auto infer_options = InferOptions("your_model_name"); 216 | 217 | // Set custom allocator to 'InferOptions'. 218 | infer_options.custom_allocator_ = &allocator; 219 | auto request = InferRequest(infer_options); 220 | 221 | /** 222 | Add inputs/requested outputs to a request as shown in the previous step... 223 | */ 224 | 225 | // Call the inference method, and the custom allocator will be used. 226 | std::future> result_future = server->AsyncInfer(*request); 227 | 228 | // Get the infer result and check the result. 229 | auto result = result_future.get(); 230 | if (result->HasError()) { 231 | std::cerr << result->ErrorMsg(); 232 | } else { 233 | // Retrieve output data from 'InferResult' object... 234 | } 235 | ``` 236 | 237 | * Use pre-allocated buffer 238 | 239 | You can pre-allocate buffers for output tensors. The output data will be 240 | stored in the buffer you provided when adding requested outputs to a request in 241 | the previous step. Note that those buffers will *not* be freed when the `Tensor` 242 | object goes out of scope, and should be freed manually when they are no 243 | longer needed. 244 | 245 | ```cpp 246 | /* 247 | Add inputs to a request as shown in the previous step... 248 | */ 249 | 250 | void* buffer_ptr0 = malloc(64); 251 | void* buffer_ptr1 = malloc(64); 252 | 253 | // Provide pre-allocated buffer for each output tensor. 254 | Tensor output0(reinterpret_cast(buffer_ptr0), 64, MemoryType::CPU, 0); 255 | Tensor output1(reinterpret_cast(buffer_ptr1), 64, MemoryType::CPU, 0); 256 | 257 | request->AddRequestedOutput("OUTPUT0_NAME", output0); 258 | request->AddRequestedOutput("OUTPUT1_NAME", output1); 259 | 260 | // Call the inference method. 261 | std::future> result_future = server->AsyncInfer(*request); 262 | 263 | // Get the infer result and check the result. 
264 | auto result = result_future.get(); 265 | if (result->HasError()) { 266 | std::cerr << result->ErrorMsg(); 267 | } else { 268 | // Retrieve output data from 'InferResult' object... 269 | } 270 | 271 | // Need to free the buffer manually. 272 | free(buffer_ptr0); 273 | free(buffer_ptr1); 274 | ``` 275 | 276 | The output data is owned by each returned output `Tensor` object. 277 | For cases using the default allocator or a custom allocator, the deallocation of 278 | the buffer where the output data is stored will occur when the `Tensor` 279 | object goes out of scope. 280 | 281 | #### Non-Inference APIs 282 | 283 | Server Wrapper contains APIs for loading/unloading models, getting metrics, 284 | getting the model index, etc. The use of these functions is straightforward and these 285 | functions are documented in 286 | [server_wrapper.h](include/triton/developer_tools/server_wrapper.h). You can 287 | find some of the functions demonstrated in the [examples](examples). 288 | 289 | #### Error Handling 290 | 291 | Most Higher Level Server C++ API functions throw a `TritonException` when an 292 | error occurs. You can utilize `TritonException`, which is documented in 293 | [common.h](include/triton/developer_tools/common.h), in your application for 294 | error handling. 295 | 296 | #### Examples 297 | 298 | A simple example using the Server Wrapper can be found in 299 | [simple_addsub_async_infer.cc](examples/simple_addsub_async_infer.cc), 300 | which is heavily commented. For string type I/O, an example can be found in 301 | [addsub_string_async_infer.cc](examples/addsub_string_async_infer.cc). For 302 | decoupled models, please refer to 303 | [square_async_infer.cc](examples/square_async_infer.cc). 304 | 305 | When running the examples, make sure the model repository is placed under the 306 | same path, and `LD_LIBRARY_PATH` is set properly for `libtritonserver.so`. 307 | 308 | ``` 309 | # Prepare the models required by the examples. 310 | 311 | $ cd /path/to/developer_tools/server 312 | 313 | $ mkdir -p ./examples/models 314 | 315 | # Copy over the models placed in the qa folder. 316 | $ cp -r ../qa/L0_server_unit_test/models/add_sub* ./examples/models/. 317 | 318 | # Copy over the models placed in the server repository. 319 | $ git clone https://github.com/triton-inference-server/server.git 320 | $ cp -r server/docs/examples/model_repository/simple ./examples/models/. 321 | 322 | # Copy over the decoupled model placed in the python_backend repository. 323 | $ git clone https://github.com/triton-inference-server/python_backend.git 324 | $ mkdir -p ./examples/models/square_int32/1 325 | $ cp python_backend/examples/decoupled/square_model.py ./examples/models/square_int32/1/model.py 326 | $ cp python_backend/examples/decoupled/square_config.pbtxt ./examples/models/square_int32/config.pbtxt 327 | 328 | # Copy over the executables from the install directory.
329 | $ cp /path/to/install/bin/simple_addsub_async_infer ./examples 330 | $ cp /path/to/install/bin/addsub_string_async_infer ./examples 331 | $ cp /path/to/install/bin/square_async_infer ./examples 332 | 333 | # Assume libtritonserver.so is placed under "/opt/tritonserver/lib" 334 | $ LD_LIBRARY_PATH=/opt/tritonserver/lib:${LD_LIBRARY_PATH} 335 | 336 | $ cd ./examples 337 | 338 | # Run examples 339 | $ ./simple_addsub_async_infer 340 | $ ./addsub_string_async_infer 341 | $ ./square_async_infer 342 | ``` 343 | 344 | ## Triton Server C-API Wrapper Java Bindings 345 | Similar to the [Java bindings for In-Process Triton Server API](https://github.com/triton-inference-server/server/blob/main/docs/customization_guide/inference_protocols.md#java-bindings-for-in-process-triton-server-api) C-API Wrapper Java Bindings 346 | is created using [Java CPP](https://github.com/bytedeco/javacpp). 347 | 348 | 349 | The API is documented in [tritonserver.java](https://github.com/bytedeco/javacpp-presets/blob/master/tritonserver/src/gen/java/org/bytedeco/tritonserver/global/tritonserver.java). 350 | **Note:** Currently, `tritonserver.java` contains bindings for both 351 | `In-Process API` and `C-API Wrapper`. 352 | More information about the `In-Process API` can be found in [Inference Protocol README](https://github.com/triton-inference-server/server/blob/main/docs/customization_guide/inference_protocols.md#in-process-triton-server-api). 353 | 354 | 355 | A simple example using the Java API can be found in 356 | [Samples folder](https://github.com/bytedeco/javacpp-presets/tree/master/tritonserver/samples) 357 | which includes `SimpleCPP.java` which is similar to 358 | [`simple.cc`](https://github.com/triton-inference-server/server/blob/main/src/simple.cc), which uses the `In-Process API`. 359 | 360 | 361 | In the [QA folder](https://github.com/triton-inference-server/developer_tools/tree/main/qa), folders starting with L0_java include Java API tests. 362 | 363 | ### Java API setup instructions 364 | 365 | Please refer to [Java API setup instructions](https://github.com/triton-inference-server/server/blob/main/docs/customization_guide/inference_protocols.md#java-api-setup-instructions) for instructions on how to use C-API Wrapper Java Bindings. 366 | -------------------------------------------------------------------------------- /server/include/triton/developer_tools/server_wrapper.h: -------------------------------------------------------------------------------- 1 | // Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | // 3 | // Redistribution and use in source and binary forms, with or without 4 | // modification, are permitted provided that the following conditions 5 | // are met: 6 | // * Redistributions of source code must retain the above copyright 7 | // notice, this list of conditions and the following disclaimer. 8 | // * Redistributions in binary form must reproduce the above copyright 9 | // notice, this list of conditions and the following disclaimer in the 10 | // documentation and/or other materials provided with the distribution. 11 | // * Neither the name of NVIDIA CORPORATION nor the names of its 12 | // contributors may be used to endorse or promote products derived 13 | // from this software without specific prior written permission. 
14 | // 15 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | #pragma once 27 | 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | 36 | #include "generic_server_wrapper.h" 37 | #include "triton/core/tritonserver.h" 38 | 39 | #ifdef TRITON_ENABLE_GPU 40 | #include 41 | #endif // TRITON_ENABLE_GPU 42 | 43 | namespace triton { namespace developer_tools { namespace server { 44 | 45 | class Allocator; 46 | class InferResult; 47 | class InferRequest; 48 | struct ResponseParameters; 49 | class TraceManager; 50 | 51 | //============================================================================== 52 | /// Object that encapsulates in-process C API functionalities. 53 | /// 54 | class TritonServer : public GenericTritonServer { 55 | public: 56 | static std::unique_ptr Create( 57 | const ServerOptions& server_options); 58 | 59 | virtual ~TritonServer(); 60 | 61 | /// Load the requested model or reload the model if it is already loaded. 62 | /// \param model_name The name of the model. 63 | void LoadModel(const std::string& model_name) override; 64 | 65 | /// Unload the requested model. Unloading a model that is not loaded 66 | /// on server has no affect. 67 | /// \param model_name The name of the model. 68 | void UnloadModel(const std::string& model_name) override; 69 | 70 | /// Get the set of names of models that are loaded and ready for inference. 71 | /// \return Returns the set of names of models that are 72 | /// loaded and ready for inference. 73 | std::set LoadedModels() override; 74 | 75 | /// Get the index of model repository contents. 76 | /// \return Returns a vector of 'RepositoryIndex' object 77 | /// representing the repository index. 78 | std::vector ModelIndex() override; 79 | 80 | /// Get the metrics of the server. 81 | /// \return Returns a string representing the metrics. 82 | std::string ServerMetrics() override; 83 | 84 | /// Get the inference statistics of the specified model. 85 | /// \param model_name The name of the model. 86 | /// \param model_version the version of the model requested. 87 | /// \return Returns a json string representing the model metrics. 88 | std::string ModelStatistics( 89 | const std::string& model_name, const int64_t model_version) override; 90 | 91 | /// Run synchronous inference on server. 92 | /// \param infer_request The InferRequest object contains 93 | /// the inputs, outputs and infer options for an inference request. 94 | /// \return Returns the result of inference as a future of 95 | /// a unique pointer of InferResult object. 96 | virtual std::unique_ptr Infer(InferRequest& infer_request) = 0; 97 | 98 | /// Run asynchronous inference on server. 
99 | /// \param infer_request The InferRequest object contains 100 | /// the inputs, outputs and infer options for an inference request. 101 | /// \return Returns the result of inference as a future of 102 | /// a unique pointer of InferResult object. 103 | virtual std::future> AsyncInfer( 104 | InferRequest& infer_request) = 0; 105 | 106 | /// Is the server live? 107 | /// \return Returns true if server is live, false otherwise. 108 | bool IsServerLive() override; 109 | 110 | /// Is the server ready? 111 | /// \return Returns true if server is ready, false otherwise. 112 | bool IsServerReady() override; 113 | 114 | /// Stop a server object. A server can't be restarted once it is 115 | /// stopped. 116 | void ServerStop() override; 117 | 118 | /// Is the model ready? 119 | /// \param model_name The name of the model to get readiness for. 120 | /// \param model_version The version of the model to get readiness 121 | /// for. If -1 then the server will choose a version based on the 122 | /// model's policy. This field is optional, default is -1. 123 | /// \return Returns true if server is ready, false otherwise. 124 | bool IsModelReady( 125 | const std::string& model_name, const int64_t model_version = -1) override; 126 | 127 | /// Get the configuration of specified model. 128 | /// \param model_name The name of the model. 129 | /// \param model_version The version of the model to get configuration. 130 | /// The default value is -1 which means then the server will 131 | /// choose a version based on the model and internal policy. This field is 132 | /// optional. \return Returns JSON representation of model configuration as a 133 | /// string. 134 | std::string ModelConfig( 135 | const std::string& model_name, const int64_t model_version = -1) override; 136 | 137 | /// Get the metadata of the server. 138 | /// \return Returns JSON representation of server metadata as a string. 139 | std::string ServerMetadata() override; 140 | 141 | /// Get the metadata of specified model. 142 | /// \param model_name The name of the model. 143 | /// \param model_version The version of the model to get configuration. 144 | /// The default value is -1 which means then the server will choose a version 145 | /// based on the model and internal policy. This field is optional. 146 | /// \return Returns JSON representation of model metadata as a string. 147 | std::string ModelMetadata( 148 | const std::string& model_name, const int64_t model_version = -1) override; 149 | 150 | /// Register a new model repository. This function is not available in polling 151 | /// mode. 152 | /// \param new_model_repo The 'NewModelRepo' object contains the info of the 153 | /// new model repo to be registered. 154 | void RegisterModelRepo(const NewModelRepo& new_model_repo) override; 155 | 156 | /// Unregister a model repository. This function is not available in polling 157 | /// mode. 158 | /// \param repo_path The full path to the model repository. 
159 | void UnregisterModelRepo(const std::string& repo_path) override; 160 | 161 | protected: 162 | void PrepareInferenceRequest( 163 | TRITONSERVER_InferenceRequest** irequest, const InferRequest& request); 164 | 165 | void PrepareInferenceInput( 166 | TRITONSERVER_InferenceRequest* irequest, const InferRequest& request); 167 | 168 | void PrepareInferenceOutput( 169 | TRITONSERVER_InferenceRequest* irequest, InferRequest& request); 170 | 171 | void PreprocessIrequest( 172 | TRITONSERVER_InferenceRequest** irequest, 173 | const InferRequest& infer_request); 174 | 175 | // The server object. 176 | std::shared_ptr server_; 177 | // The allocator object allocating output tensor. 178 | TRITONSERVER_ResponseAllocator* allocator_; 179 | // The trace manager. 180 | std::shared_ptr trace_manager_; 181 | }; 182 | 183 | 184 | //============================================================================== 185 | /// An interface for InferResult object to interpret the response to an 186 | /// inference request. 187 | /// 188 | class InferResult : public GenericInferResult { 189 | public: 190 | virtual ~InferResult(); 191 | 192 | /// Get the name of the model which generated this response. 193 | /// \return Returns the name of the model. 194 | std::string ModelName() noexcept override; 195 | 196 | /// Get the version of the model which generated this response. 197 | /// \return Returns the version of the model. 198 | std::string ModelVersion() noexcept override; 199 | 200 | /// Get the id of the request which generated this response. 201 | /// \return Returns the id of the request. 202 | std::string Id() noexcept override; 203 | 204 | /// Get the output names from the infer result 205 | /// \return Vector of output names 206 | std::vector OutputNames() override; 207 | /// Get the result output as a shared pointer of 'Tensor' object. The 'buffer' 208 | /// field of the output is owned by the returned 'Tensor' object itself. Note 209 | /// that for string data, need to use 'StringData' function for string data 210 | /// result. 211 | /// \param name The name of the output tensor to be retrieved. 212 | /// \return Returns the output result as a shared pointer of 'Tensor' object. 213 | std::shared_ptr Output(const std::string& name) override; 214 | 215 | /// Get the result data as a vector of strings. The vector will 216 | /// receive a copy of result data. An exception will be thrown if 217 | /// the data type of output is not 'BYTES'. 218 | /// \param output_name The name of the output to get result data. 219 | /// \return Returns the result data represented as a vector of strings. The 220 | /// strings are stored in the row-major order. 221 | std::vector StringData(const std::string& output_name) override; 222 | 223 | /// Return the complete response as a user friendly string. 224 | /// \return The string describing the complete response. 225 | std::string DebugString() override; 226 | 227 | /// Return if there is an error within this result. 228 | /// \return True if this 'InferResult' object has an error, false if no error. 229 | bool HasError() override; 230 | 231 | /// Return the error message of the error. 232 | /// \return The message for the error. Empty if no error. 233 | std::string ErrorMsg() override; 234 | 235 | // Get the pointer to the future of the next result. This function is used for 236 | // retrieving multiple responses from decoupled model. If there is no next 237 | // result, this function will return nullptr. 
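// A minimal retrieval sketch (mirroring the decoupled tests in wrapper_test.cc): starting from the first result returned by AsyncInfer(), keep calling GetNextResult() and waiting on the returned future; a nullptr from GetNextResult(), or a null result, indicates that no further responses will arrive.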
238 | std::unique_ptr>> GetNextResult(); 239 | 240 | friend class InternalServer; 241 | 242 | protected: 243 | InferResult(); 244 | const char* model_name_; 245 | int64_t model_version_; 246 | const char* request_id_; 247 | std::vector> params_; 248 | std::unordered_map> infer_outputs_; 249 | bool has_error_; 250 | std::string error_msg_; 251 | 252 | // The pointer to the future of the next result. 253 | std::unique_ptr>> 254 | next_result_future_; 255 | 256 | TRITONSERVER_InferenceResponse* completed_response_; 257 | }; 258 | 259 | //============================================================================== 260 | /// Object that describes an inflight inference request. 261 | /// 262 | class InferRequest : public GenericInferRequest { 263 | public: 264 | /// Create an InferRequest instance. 265 | static std::unique_ptr Create( 266 | const InferOptions& infer_options); 267 | 268 | ~InferRequest(); 269 | 270 | /// Add an input tensor to be sent within an InferRequest object. The input 271 | /// data buffer within the 'Tensor' object must not be modified until 272 | /// inference is completed and result is returned. 273 | /// \param name The name of the input tensor. 274 | /// \param input A Tensor object that describes an input tensor. 275 | void AddInput(const std::string& name, const Tensor& input) noexcept override; 276 | 277 | /// Add an input tensor to be sent within an InferRequest object. This 278 | /// function is for containers holding 'non-string' data elements. Data in the 279 | /// container should be contiguous, and the the container must not be modified 280 | /// until inference is completed and result is returned. 281 | /// \param name The name of the input tensor. 282 | /// \param begin The begin iterator of the container. 283 | /// \param end The end iterator of the container. 284 | /// \param data_type The data type of the input. 285 | /// \param shape The shape of the input. 286 | /// \param memory_type The memory type of the input. 287 | /// \param memory_type_id The ID of the memory for the tensor. (e.g. '0' is 288 | /// the memory type id of 'GPU-0') 289 | template < 290 | typename Iterator, 291 | typename std::enable_if::value_type, 293 | std::string>::value>::type* = nullptr> 294 | void AddInput( 295 | const std::string& name, const Iterator begin, const Iterator end, 296 | const DataType& data_type, const std::vector& shape, 297 | const MemoryType& memory_type, const int64_t memory_type_id) noexcept; 298 | 299 | /// Add an input tensor to be sent within an InferRequest object. This 300 | /// function is for containers holding 'string' elements. Data in the 301 | /// container should be contiguous, and the the container must not be modified 302 | /// until inference is completed and the result is returned. 303 | /// \param name The name of the input tensor. 304 | /// \param begin The begin iterator of the container. 305 | /// \param end The end iterator of the container. 306 | /// \param data_type The data type of the input. For 'string' input, data type 307 | /// should be 'BYTES'. 308 | /// \param shape The shape of the input. 309 | /// \param memory_type The memory type of the input. 310 | /// \param memory_type_id The ID of the memory for the tensor. (e.g. 
'0' is 311 | /// the memory type id of 'GPU-0') 312 | template < 313 | typename Iterator, 314 | typename std::enable_if::value_type, 316 | std::string>::value>::type* = nullptr> 317 | void AddInput( 318 | const std::string& name, const Iterator begin, const Iterator end, 319 | const DataType& data_type, const std::vector& shape, 320 | const MemoryType& memory_type, const int64_t memory_type_id) noexcept; 321 | 322 | /// Add a requested output to be sent within an InferRequest object. 323 | /// Calling this function is optional. If no output(s) are specifically 324 | /// requested then all outputs defined by the model will be calculated and 325 | /// returned. Pre-allocated buffer for each output should be specified within 326 | /// the 'Tensor' object. 327 | /// \param name The name of the output tensor. 328 | /// \param output A Tensor object that describes an output tensor containing 329 | /// its pre-allocated buffer. 330 | void AddRequestedOutput(const std::string& name, Tensor& output) override; 331 | 332 | /// Add a requested output to be sent within an InferRequest object. 333 | /// Calling this function is optional. If no output(s) are specifically 334 | /// requested then all outputs defined by the model will be calculated and 335 | /// returned. 336 | /// \param name The name of the output tensor. 337 | void AddRequestedOutput(const std::string& name) override; 338 | 339 | /// Clear inputs and outputs of the request. This allows users to reuse the 340 | /// InferRequest object if needed. 341 | void Reset() override; 342 | friend class TritonServer; 343 | friend class InternalServer; 344 | 345 | protected: 346 | InferRequest(); 347 | 348 | std::unique_ptr infer_options_; 349 | std::list str_bufs_; 350 | std::unordered_map> inputs_; 351 | std::vector> outputs_; 352 | 353 | // The map for each output tensor and a tuple of it's pre-allocated buffer, 354 | // byte size, memory type and memory type id. 355 | TensorAllocMap tensor_alloc_map_; 356 | // The updated trace setting for the specified model set within 357 | // 'InferOptions'. If set, the lifetime of this 'TraceManager::Trace' object 358 | // should be long enough until the trace associated with this request is 359 | // written to file. 360 | std::shared_ptr trace_; 361 | 362 | // If the requested model is a decoupled model. If true, the lifetime of this 363 | // 'InferRequest' should be long enough until all the responses are returned 364 | // and retrieved. 365 | bool is_decoupled_; 366 | 367 | private: 368 | // The promise object used for setting value to the result future. 369 | std::unique_ptr>> prev_promise_; 370 | }; 371 | //============================================================================== 372 | /// Helper functions to convert Wrapper enum to string. 
373 | /// 374 | std::string MemoryTypeString(const MemoryType& memory_type); 375 | std::string DataTypeString(const DataType& data_type); 376 | std::string ModelReadyStateString(const ModelReadyState& state); 377 | 378 | //============================================================================== 379 | /// Implementation of template functions 380 | /// 381 | template < 382 | typename Iterator, typename std::enable_if::value_type, 384 | std::string>::value>::type*> 385 | void 386 | InferRequest::AddInput( 387 | const std::string& name, const Iterator begin, const Iterator end, 388 | const DataType& data_type, const std::vector& shape, 389 | const MemoryType& memory_type, const int64_t memory_type_id) noexcept 390 | { 391 | // Serialize the strings into a "raw" buffer. The first 4-bytes are 392 | // the length of the string length. Next are the actual string 393 | // characters. There is *not* a null-terminator on the string. 394 | str_bufs_.emplace_back(); 395 | std::string& sbuf = str_bufs_.back(); 396 | 397 | Iterator it; 398 | for (it = begin; it != end; it++) { 399 | auto len = it->size(); 400 | sbuf.append(reinterpret_cast(&len), sizeof(uint32_t)); 401 | sbuf.append(*it); 402 | } 403 | Tensor input( 404 | reinterpret_cast(&sbuf[0]), sbuf.size(), DataType::BYTES, shape, 405 | memory_type, memory_type_id); 406 | 407 | AddInput(name, input); 408 | } 409 | 410 | template < 411 | typename Iterator, typename std::enable_if::value_type, 413 | std::string>::value>::type*> 414 | void 415 | InferRequest::AddInput( 416 | const std::string& name, const Iterator begin, const Iterator end, 417 | const DataType& data_type, const std::vector& shape, 418 | const MemoryType& memory_type, const int64_t memory_type_id) noexcept 419 | { 420 | // FIXME (DLIS-4134) This function should also work for non-contiguous 421 | // container, and input data should be copied so that we don't need to worry 422 | // about the lifetime of input data. 423 | size_t bytes = sizeof(*begin) * std::distance(begin, end); 424 | Tensor input( 425 | reinterpret_cast(&(*begin)), bytes, data_type, shape, memory_type, 426 | memory_type_id); 427 | 428 | AddInput(name, input); 429 | } 430 | 431 | }}} // namespace triton::developer_tools::server 432 | -------------------------------------------------------------------------------- /server/src/tracer.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | // 3 | // Redistribution and use in source and binary forms, with or without 4 | // modification, are permitted provided that the following conditions 5 | // are met: 6 | // * Redistributions of source code must retain the above copyright 7 | // notice, this list of conditions and the following disclaimer. 8 | // * Redistributions in binary form must reproduce the above copyright 9 | // notice, this list of conditions and the following disclaimer in the 10 | // documentation and/or other materials provided with the distribution. 11 | // * Neither the name of NVIDIA CORPORATION nor the names of its 12 | // contributors may be used to endorse or promote products derived 13 | // from this software without specific prior written permission. 14 | // 15 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | // PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | #include "tracer.h" 28 | 29 | #include 30 | 31 | #include 32 | 33 | #include "triton/common/logging.h" 34 | #ifdef TRITON_ENABLE_GPU 35 | #include 36 | #endif // TRITON_ENABLE_GPU 37 | #include 38 | 39 | namespace triton { namespace developer_tools { namespace server { 40 | 41 | #define IGNORE_ERROR(X) \ 42 | do { \ 43 | TRITONSERVER_Error* ie_err__ = (X); \ 44 | if (ie_err__ != nullptr) { \ 45 | TRITONSERVER_ErrorDelete(ie_err__); \ 46 | } \ 47 | } while (false) 48 | 49 | #define LOG_IF_ERROR(X, MSG) \ 50 | do { \ 51 | TRITONSERVER_Error* lie_err__ = (X); \ 52 | if (lie_err__ != nullptr) { \ 53 | IGNORE_ERROR(TRITONSERVER_LogMessage( \ 54 | TRITONSERVER_LOG_ERROR, __FILE__, __LINE__, \ 55 | (std::string(MSG) + ": " + TRITONSERVER_ErrorCodeString(lie_err__) + \ 56 | " - " + TRITONSERVER_ErrorMessage(lie_err__)) \ 57 | .c_str())); \ 58 | TRITONSERVER_ErrorDelete(lie_err__); \ 59 | } \ 60 | } while (false) 61 | 62 | TraceManager::TraceManager( 63 | const TRITONSERVER_InferenceTraceLevel level, const uint32_t rate, 64 | const int32_t count, const uint32_t log_frequency, 65 | const std::string& filepath) 66 | { 67 | std::shared_ptr file(new TraceFile(filepath)); 68 | global_setting_.reset( 69 | new TraceSetting(level, rate, count, log_frequency, file)); 70 | trace_files_.emplace(filepath, file); 71 | } 72 | 73 | void 74 | TraceManager::UpdateTraceSetting( 75 | const std::string& model_name, const TraceSetting& new_setting) 76 | { 77 | std::shared_ptr setting(new TraceSetting( 78 | new_setting.level_, new_setting.rate_, new_setting.count_, 79 | new_setting.log_frequency_, new_setting.file_)); 80 | if ((!setting->Valid()) && 81 | (new_setting.level_ != TRITONSERVER_TRACE_LEVEL_DISABLED)) { 82 | throw TritonException( 83 | std::string("Attempting to set invalid trace setting: ") + 84 | setting->Reason()); 85 | } 86 | 87 | std::lock_guard r_lk(r_mu_); 88 | auto it = model_settings_.find(model_name); 89 | if (it != model_settings_.end()) { 90 | // Model update 91 | it->second = std::move(setting); 92 | } else { 93 | // Model init 94 | model_settings_.emplace(model_name, setting); 95 | } 96 | } 97 | 98 | std::shared_ptr 99 | TraceManager::SampleTrace(const std::string& model_name) 100 | { 101 | std::shared_ptr trace_setting; 102 | { 103 | std::lock_guard r_lk(r_mu_); 104 | auto m_it = model_settings_.find(model_name); 105 | trace_setting = 106 | (m_it == model_settings_.end()) ? 
global_setting_ : m_it->second; 107 | } 108 | std::shared_ptr ts = trace_setting->SampleTrace(); 109 | if (ts != nullptr) { 110 | ts->setting_ = trace_setting; 111 | } 112 | return ts; 113 | } 114 | 115 | void 116 | TraceManager::TraceRelease(TRITONSERVER_InferenceTrace* trace, void* userp) 117 | { 118 | uint64_t parent_id; 119 | LOG_IF_ERROR( 120 | TRITONSERVER_InferenceTraceParentId(trace, &parent_id), 121 | "getting trace parent id"); 122 | // The userp will be shared with the trace children, so only delete it 123 | // if the root trace is being released 124 | if (parent_id == 0) { 125 | delete reinterpret_cast*>(userp); 126 | } 127 | LOG_IF_ERROR(TRITONSERVER_InferenceTraceDelete(trace), "deleting trace"); 128 | } 129 | 130 | void 131 | TraceManager::TraceActivity( 132 | TRITONSERVER_InferenceTrace* trace, 133 | TRITONSERVER_InferenceTraceActivity activity, uint64_t timestamp_ns, 134 | void* userp) 135 | { 136 | uint64_t id; 137 | LOG_IF_ERROR(TRITONSERVER_InferenceTraceId(trace, &id), "getting trace id"); 138 | 139 | // The function may be called with different traces but the same 'userp', 140 | // group the activity of the same trace together for more readable output. 141 | auto ts = 142 | reinterpret_cast*>(userp)->get(); 143 | 144 | std::lock_guard lk(ts->mtx_); 145 | std::stringstream* ss = nullptr; 146 | { 147 | if (ts->streams_.find(id) == ts->streams_.end()) { 148 | std::unique_ptr stream(new std::stringstream()); 149 | ss = stream.get(); 150 | ts->streams_.emplace(id, std::move(stream)); 151 | } else { 152 | ss = ts->streams_[id].get(); 153 | // If the string stream is not newly created, add "," as there is 154 | // already content in the string stream 155 | *ss << ","; 156 | } 157 | } 158 | 159 | // If 'activity' is TRITONSERVER_TRACE_REQUEST_START then collect 160 | // and serialize trace details. 
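// (Illustrative shape of the fragment appended below, assuming a model named 'add_sub': {"id":1,"model_name":"add_sub","model_version":1,"parent_id":2} -- the "parent_id" field is emitted only for child traces, and an {"id":...,"timestamps":[...]} entry always follows.)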
161 | if (activity == TRITONSERVER_TRACE_REQUEST_START) { 162 | const char* model_name; 163 | int64_t model_version; 164 | uint64_t parent_id; 165 | 166 | LOG_IF_ERROR( 167 | TRITONSERVER_InferenceTraceModelName(trace, &model_name), 168 | "getting model name"); 169 | LOG_IF_ERROR( 170 | TRITONSERVER_InferenceTraceModelVersion(trace, &model_version), 171 | "getting model version"); 172 | LOG_IF_ERROR( 173 | TRITONSERVER_InferenceTraceParentId(trace, &parent_id), 174 | "getting trace parent id"); 175 | 176 | *ss << "{\"id\":" << id << ",\"model_name\":\"" << model_name 177 | << "\",\"model_version\":" << model_version; 178 | if (parent_id != 0) { 179 | *ss << ",\"parent_id\":" << parent_id; 180 | } 181 | *ss << "},"; 182 | } 183 | 184 | *ss << "{\"id\":" << id << ",\"timestamps\":[" 185 | << "{\"name\":\"" << TRITONSERVER_InferenceTraceActivityString(activity) 186 | << "\",\"ns\":" << timestamp_ns << "}]}"; 187 | } 188 | 189 | void 190 | TraceManager::TraceTensorActivity( 191 | TRITONSERVER_InferenceTrace* trace, 192 | TRITONSERVER_InferenceTraceActivity activity, const char* name, 193 | TRITONSERVER_DataType datatype, const void* base, size_t byte_size, 194 | const int64_t* shape, uint64_t dim_count, 195 | TRITONSERVER_MemoryType memory_type, int64_t memory_type_id, void* userp) 196 | { 197 | if ((activity != TRITONSERVER_TRACE_TENSOR_QUEUE_INPUT) && 198 | (activity != TRITONSERVER_TRACE_TENSOR_BACKEND_INPUT) && 199 | (activity != TRITONSERVER_TRACE_TENSOR_BACKEND_OUTPUT)) { 200 | LOG_ERROR << "Unsupported activity: " 201 | << TRITONSERVER_InferenceTraceActivityString(activity); 202 | return; 203 | } 204 | 205 | void* buffer_base = const_cast(base); 206 | if (memory_type == TRITONSERVER_MEMORY_GPU) { 207 | #ifdef TRITON_ENABLE_GPU 208 | buffer_base = malloc(byte_size); 209 | if (buffer_base == nullptr) { 210 | LOG_ERROR << "Failed to malloc CPU buffer"; 211 | return; 212 | } 213 | cudaError_t err = 214 | cudaMemcpy(buffer_base, base, byte_size, cudaMemcpyDeviceToHost); 215 | if (err != cudaSuccess) { 216 | throw TritonException( 217 | std::string("Error - copying buffer into CPU memory: ") + 218 | cudaGetErrorString(err)); 219 | } 220 | 221 | // FAIL_IF_CUDA_ERR( 222 | // cudaMemcpy(buffer_base, base, byte_size, cudaMemcpyDeviceToHost), 223 | // "copying buffer into CPU memory"); 224 | #else 225 | LOG_ERROR << "GPU buffer is unsupported"; 226 | return; 227 | #endif // TRITON_ENABLE_GPU 228 | } 229 | 230 | uint64_t id; 231 | LOG_IF_ERROR(TRITONSERVER_InferenceTraceId(trace, &id), "getting trace id"); 232 | 233 | // The function may be called with different traces but the same 'userp', 234 | // group the activity of the same trace together for more readable output. 235 | auto ts = 236 | reinterpret_cast*>(userp)->get(); 237 | 238 | std::lock_guard lk(ts->mtx_); 239 | std::stringstream* ss = nullptr; 240 | { 241 | if (ts->streams_.find(id) == ts->streams_.end()) { 242 | std::unique_ptr stream(new std::stringstream()); 243 | ss = stream.get(); 244 | ts->streams_.emplace(id, std::move(stream)); 245 | } else { 246 | ss = ts->streams_[id].get(); 247 | // If the string stream is not newly created, add "," as there is 248 | // already content in the string stream 249 | *ss << ","; 250 | } 251 | } 252 | 253 | // collect and serialize trace details. 
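// (Illustrative shape of the tensor record built below, e.g. for a small INT32 input: {"id":1,"activity":"TENSOR_QUEUE_INPUT","tensor":{"name":"INPUT0","data":"0,1,2","shape":"3","dtype":"INT32"}} -- numeric data is written as a comma-separated list, BYTES elements as escaped strings, while FP16/BF16 data is left out.)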
254 | *ss << "{\"id\":" << id << ",\"activity\":\"" 255 | << TRITONSERVER_InferenceTraceActivityString(activity) << "\""; 256 | // collect tensor 257 | *ss << ",\"tensor\":{"; 258 | // collect tensor name 259 | *ss << "\"name\":\"" << std::string(name) << "\""; 260 | // collect tensor data 261 | *ss << ",\"data\":\""; 262 | size_t element_count = 1; 263 | for (uint64_t i = 0; i < dim_count; i++) { 264 | element_count *= shape[i]; 265 | } 266 | switch (datatype) { 267 | case TRITONSERVER_TYPE_BOOL: { 268 | const uint8_t* bool_base = reinterpret_cast(buffer_base); 269 | for (size_t e = 0; e < element_count; ++e) { 270 | *ss << ((bool_base[e] == 0) ? false : true); 271 | if (e < (element_count - 1)) 272 | *ss << ","; 273 | } 274 | break; 275 | } 276 | case TRITONSERVER_TYPE_UINT8: { 277 | const uint8_t* cbase = reinterpret_cast(buffer_base); 278 | for (size_t e = 0; e < element_count; ++e) { 279 | *ss << cbase[e]; 280 | if (e < (element_count - 1)) 281 | *ss << ","; 282 | } 283 | break; 284 | } 285 | case TRITONSERVER_TYPE_UINT16: { 286 | const uint16_t* cbase = reinterpret_cast(buffer_base); 287 | for (size_t e = 0; e < element_count; ++e) { 288 | *ss << cbase[e]; 289 | if (e < (element_count - 1)) 290 | *ss << ","; 291 | } 292 | break; 293 | } 294 | case TRITONSERVER_TYPE_UINT32: { 295 | const uint32_t* cbase = reinterpret_cast(buffer_base); 296 | for (size_t e = 0; e < element_count; ++e) { 297 | *ss << cbase[e]; 298 | if (e < (element_count - 1)) 299 | *ss << ","; 300 | } 301 | break; 302 | } 303 | case TRITONSERVER_TYPE_UINT64: { 304 | const uint64_t* cbase = reinterpret_cast(buffer_base); 305 | for (size_t e = 0; e < element_count; ++e) { 306 | *ss << cbase[e]; 307 | if (e < (element_count - 1)) 308 | *ss << ","; 309 | } 310 | break; 311 | } 312 | case TRITONSERVER_TYPE_INT8: { 313 | const int8_t* cbase = reinterpret_cast(buffer_base); 314 | for (size_t e = 0; e < element_count; ++e) { 315 | *ss << cbase[e]; 316 | if (e < (element_count - 1)) 317 | *ss << ","; 318 | } 319 | break; 320 | } 321 | case TRITONSERVER_TYPE_INT16: { 322 | const int16_t* cbase = reinterpret_cast(buffer_base); 323 | for (size_t e = 0; e < element_count; ++e) { 324 | *ss << cbase[e]; 325 | if (e < (element_count - 1)) 326 | *ss << ","; 327 | } 328 | break; 329 | } 330 | case TRITONSERVER_TYPE_INT32: { 331 | const int32_t* cbase = reinterpret_cast(buffer_base); 332 | for (size_t e = 0; e < element_count; ++e) { 333 | *ss << cbase[e]; 334 | if (e < (element_count - 1)) 335 | *ss << ","; 336 | } 337 | break; 338 | } 339 | case TRITONSERVER_TYPE_INT64: { 340 | const int64_t* cbase = reinterpret_cast(buffer_base); 341 | for (size_t e = 0; e < element_count; ++e) { 342 | *ss << cbase[e]; 343 | if (e < (element_count - 1)) 344 | *ss << ","; 345 | } 346 | break; 347 | } 348 | 349 | // FP16 / BF16 already handled as binary blobs, no need to manipulate here 350 | case TRITONSERVER_TYPE_FP16: { 351 | break; 352 | } 353 | case TRITONSERVER_TYPE_BF16: { 354 | break; 355 | } 356 | 357 | case TRITONSERVER_TYPE_FP32: { 358 | const float* cbase = reinterpret_cast(buffer_base); 359 | for (size_t e = 0; e < element_count; ++e) { 360 | *ss << cbase[e]; 361 | if (e < (element_count - 1)) 362 | *ss << ","; 363 | } 364 | break; 365 | } 366 | case TRITONSERVER_TYPE_FP64: { 367 | const double* cbase = reinterpret_cast(buffer_base); 368 | for (size_t e = 0; e < element_count; ++e) { 369 | *ss << cbase[e]; 370 | if (e < (element_count - 1)) 371 | *ss << ","; 372 | } 373 | break; 374 | } 375 | case TRITONSERVER_TYPE_BYTES: { 376 | const 
char* cbase = reinterpret_cast(buffer_base); 377 | size_t offset = 0; 378 | for (size_t e = 0; e < element_count; ++e) { 379 | if ((offset + sizeof(uint32_t)) > byte_size) { 380 | return; 381 | } 382 | const size_t len = *(reinterpret_cast(cbase + offset)); 383 | offset += sizeof(uint32_t); 384 | if ((offset + len) > byte_size) { 385 | return; 386 | } 387 | std::string str(cbase + offset, len); 388 | *ss << "\\\"" << str << "\\\""; 389 | offset += len; 390 | 391 | if (e < (element_count - 1)) 392 | *ss << ","; 393 | } 394 | break; 395 | } 396 | case TRITONSERVER_TYPE_INVALID: { 397 | return; 398 | } 399 | } 400 | *ss << "\",\"shape\":\""; 401 | for (uint64_t i = 0; i < dim_count; i++) { 402 | *ss << shape[i]; 403 | if (i < (dim_count - 1)) { 404 | *ss << ","; 405 | } 406 | } 407 | *ss << "\",\"dtype\":\"" << TRITONSERVER_DataTypeString(datatype) << "\"}"; 408 | *ss << "}"; 409 | 410 | if (memory_type == TRITONSERVER_MEMORY_GPU) { 411 | #ifdef TRITON_ENABLE_GPU 412 | if (buffer_base != nullptr) { 413 | free(buffer_base); 414 | } 415 | #endif // TRITON_ENABLE_GPU 416 | } 417 | } 418 | 419 | TraceManager::Trace::~Trace() 420 | { 421 | // Write trace now 422 | setting_->WriteTrace(streams_); 423 | } 424 | 425 | TraceManager::TraceFile::~TraceFile() 426 | { 427 | if (!first_write_) { 428 | trace_file_ << "]"; 429 | } 430 | } 431 | 432 | void 433 | TraceManager::TraceFile::SaveTraces( 434 | std::stringstream& trace_stream, const bool to_index_file) 435 | { 436 | try { 437 | if (to_index_file) { 438 | std::string file_name = 439 | file_name_ + "." + std::to_string(index_.fetch_add(1)); 440 | std::ofstream file_stream; 441 | file_stream.open(file_name); 442 | file_stream << "["; 443 | file_stream << trace_stream.rdbuf(); 444 | file_stream << "]"; 445 | } else { 446 | std::lock_guard lock(mu_); 447 | if (first_write_) { 448 | trace_file_.open(file_name_); 449 | trace_file_ << "["; 450 | first_write_ = false; 451 | } else { 452 | trace_file_ << ","; 453 | } 454 | trace_file_ << trace_stream.rdbuf(); 455 | } 456 | } 457 | catch (const std::ofstream::failure& e) { 458 | LOG_ERROR << "failed creating trace file: " << e.what(); 459 | } 460 | catch (...) 
{ 461 | LOG_ERROR << "failed creating trace file: reason unknown"; 462 | } 463 | } 464 | 465 | std::shared_ptr 466 | TraceManager::TraceSetting::SampleTrace() 467 | { 468 | bool create_trace = false; 469 | { 470 | std::lock_guard lk(mu_); 471 | if (!Valid()) { 472 | return nullptr; 473 | } 474 | create_trace = (((++sample_) % rate_) == 0); 475 | if (create_trace && (count_ > 0)) { 476 | --count_; 477 | ++created_; 478 | } 479 | } 480 | if (create_trace) { 481 | std::shared_ptr lts(new Trace()); 482 | // Split 'Trace' management to frontend and Triton trace separately 483 | // to avoid dependency between frontend request and Triton trace's liveness 484 | auto trace_userp = new std::shared_ptr(lts); 485 | TRITONSERVER_InferenceTrace* trace; 486 | TRITONSERVER_Error* err = TRITONSERVER_InferenceTraceTensorNew( 487 | &trace, level_, 0 /* parent_id */, TraceActivity, TraceTensorActivity, 488 | TraceRelease, trace_userp); 489 | if (err != nullptr) { 490 | LOG_IF_ERROR(err, "creating inference trace object"); 491 | delete trace_userp; 492 | return nullptr; 493 | } 494 | lts->trace_ = trace; 495 | lts->trace_userp_ = trace_userp; 496 | LOG_IF_ERROR( 497 | TRITONSERVER_InferenceTraceId(trace, <s->trace_id_), 498 | "getting trace id"); 499 | return lts; 500 | } 501 | 502 | return nullptr; 503 | } 504 | 505 | void 506 | TraceManager::TraceSetting::WriteTrace( 507 | const std::unordered_map>& 508 | streams) 509 | { 510 | std::unique_lock lock(mu_); 511 | 512 | if (sample_in_stream_ != 0) { 513 | trace_stream_ << ","; 514 | } 515 | ++sample_in_stream_; 516 | ++collected_; 517 | 518 | size_t stream_count = 0; 519 | for (const auto& stream : streams) { 520 | trace_stream_ << stream.second->rdbuf(); 521 | // Need to add ',' unless it is the last trace in the group 522 | ++stream_count; 523 | if (stream_count != streams.size()) { 524 | trace_stream_ << ","; 525 | } 526 | } 527 | // Write to file with index when one of the following is true 528 | // 1. trace_count is specified and that number of traces has been collected 529 | // 2. 
log_frequency is specified and that number of traces has been collected 530 | if (((count_ == 0) && (collected_ == sample_)) || 531 | ((log_frequency_ != 0) && (sample_in_stream_ >= log_frequency_))) { 532 | // Reset variables and release lock before saving to file 533 | sample_in_stream_ = 0; 534 | std::stringstream stream; 535 | trace_stream_.swap(stream); 536 | lock.unlock(); 537 | 538 | file_->SaveTraces(stream, true /* to_index_file */); 539 | } 540 | } 541 | 542 | TraceManager::TraceSetting::TraceSetting() 543 | : level_(TRITONSERVER_TRACE_LEVEL_DISABLED), rate_(0), count_(-1), 544 | log_frequency_(0), sample_(0), created_(0), collected_(0), 545 | sample_in_stream_(0) 546 | { 547 | invalid_reason_ = "Setting hasn't been initialized"; 548 | } 549 | 550 | TraceManager::TraceSetting::TraceSetting( 551 | const TRITONSERVER_InferenceTraceLevel level, const uint32_t rate, 552 | const int32_t count, const uint32_t log_frequency, 553 | const std::shared_ptr& file) 554 | : level_(level), rate_(rate), count_(count), log_frequency_(log_frequency), 555 | file_(file), sample_(0), created_(0), collected_(0), sample_in_stream_(0) 556 | { 557 | if (level_ == TRITONSERVER_TRACE_LEVEL_DISABLED) { 558 | invalid_reason_ = "tracing is disabled"; 559 | } else if (rate_ == 0) { 560 | invalid_reason_ = "sample rate must be non-zero"; 561 | } else if (file_->FileName().empty()) { 562 | invalid_reason_ = "trace file name is not given"; 563 | } 564 | } 565 | 566 | TraceManager::TraceSetting::~TraceSetting() 567 | { 568 | // If log frequency is set, should log the remaining traces to indexed file. 569 | if (sample_in_stream_ != 0) { 570 | file_->SaveTraces(trace_stream_, (log_frequency_ != 0)); 571 | } 572 | } 573 | 574 | }}} // namespace triton::developer_tools::server 575 | -------------------------------------------------------------------------------- /server/test/wrapper_test.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | // 3 | // Redistribution and use in source and binary forms, with or without 4 | // modification, are permitted provided that the following conditions 5 | // are met: 6 | // * Redistributions of source code must retain the above copyright 7 | // notice, this list of conditions and the following disclaimer. 8 | // * Redistributions in binary form must reproduce the above copyright 9 | // notice, this list of conditions and the following disclaimer in the 10 | // documentation and/or other materials provided with the distribution. 11 | // * Neither the name of NVIDIA CORPORATION nor the names of its 12 | // contributors may be used to endorse or promote products derived 13 | // from this software without specific prior written permission. 14 | // 15 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | // PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | #include 27 | 28 | #include "gtest/gtest.h" 29 | #include "triton/core/tritonserver.h" 30 | #include "triton/developer_tools/server_wrapper.h" 31 | 32 | namespace tds = triton::developer_tools::server; 33 | 34 | namespace { 35 | 36 | TEST(TritonServer, LibraryVersionCheck) 37 | { 38 | // Check that proper 'libtritonserver.so' is used 39 | uint32_t major = 0; 40 | uint32_t minor = 0; 41 | auto err = TRITONSERVER_ApiVersion(&major, &minor); 42 | ASSERT_TRUE(err == nullptr) << "Unexpected error from API version call"; 43 | ASSERT_EQ(major, TRITONSERVER_API_VERSION_MAJOR) << "Mismatch major version"; 44 | ASSERT_GE(minor, TRITONSERVER_API_VERSION_MINOR) << "Older minor version"; 45 | } 46 | 47 | TEST(TritonServer, StartInvalidRepository) 48 | { 49 | // Run server with invalid model repository 50 | try { 51 | tds::TritonServer::Create( 52 | tds::ServerOptions({"/invalid_model_repository"})); 53 | } 54 | catch (std::exception& ex) { 55 | ASSERT_STREQ( 56 | ex.what(), "Internal-failed to stat file /invalid_model_repository\n"); 57 | } 58 | catch (...) { 59 | ASSERT_NO_THROW(throw); 60 | } 61 | } 62 | 63 | class TritonServerTest : public ::testing::Test { 64 | protected: 65 | TritonServerTest() : options_({"./models"}) 66 | { 67 | options_.logging_ = tds::LoggingOptions( 68 | tds::LoggingOptions::VerboseLevel(0), false, false, false, 69 | tds::LoggingOptions::LogFormat::DEFAULT, ""); 70 | } 71 | 72 | tds::ServerOptions options_; 73 | }; 74 | 75 | void 76 | CPUAllocator( 77 | const char* tensor_name, size_t byte_size, 78 | tds::MemoryType preferred_memory_type, int64_t preferred_memory_type_id, 79 | void** buffer, tds::MemoryType* actual_memory_type, 80 | int64_t* actual_memory_type_id) 81 | { 82 | std::cout << "Using custom allocation function" << std::endl; 83 | 84 | *actual_memory_type = tds::MemoryType::CPU; 85 | *actual_memory_type_id = preferred_memory_type_id; 86 | 87 | // If 'byte_size' is zero just return 'buffer' == nullptr, we don't 88 | // need to do any other book-keeping. 
89 | if (byte_size == 0) { 90 | *buffer = nullptr; 91 | std::cout << "allocated " << byte_size << " bytes for result tensor " 92 | << tensor_name << std::endl; 93 | } else { 94 | void* allocated_ptr = malloc(byte_size); 95 | if (allocated_ptr != nullptr) { 96 | *buffer = allocated_ptr; 97 | std::cout << "allocated " << byte_size << " bytes in " 98 | << MemoryTypeString(*actual_memory_type) 99 | << " for result tensor " << tensor_name << std::endl; 100 | } 101 | } 102 | } 103 | 104 | void 105 | ResponseRelease( 106 | void* buffer, size_t byte_size, tds::MemoryType memory_type, 107 | int64_t memory_type_id) 108 | { 109 | std::cout << "Using custom response release function" << std::endl; 110 | 111 | std::stringstream ss; 112 | ss << buffer; 113 | std::string buffer_str = ss.str(); 114 | 115 | std::cout << "Releasing buffer " << buffer_str << " of size " 116 | << std::to_string(byte_size) << " in " 117 | << tds::MemoryTypeString(memory_type); 118 | 119 | switch (memory_type) { 120 | case tds::MemoryType::CPU: 121 | free(buffer); 122 | break; 123 | 124 | default: 125 | std::cerr << "error: unexpected buffer allocated in CUDA managed memory" 126 | << std::endl; 127 | break; 128 | } 129 | } 130 | 131 | TEST_F(TritonServerTest, StartNone) 132 | { 133 | // Start server with default mode (NONE) 134 | try { 135 | auto server = tds::TritonServer::Create(options_); 136 | std::set loaded_models = server->LoadedModels(); 137 | ASSERT_EQ(loaded_models.size(), 4); 138 | ASSERT_NE(loaded_models.find("add_sub"), loaded_models.end()); 139 | ASSERT_NE(loaded_models.find("add_sub_str"), loaded_models.end()); 140 | ASSERT_NE(loaded_models.find("failing_infer"), loaded_models.end()); 141 | ASSERT_NE(loaded_models.find("square_int32"), loaded_models.end()); 142 | } 143 | catch (...) { 144 | ASSERT_NO_THROW(throw); 145 | } 146 | } 147 | 148 | TEST_F(TritonServerTest, NoneLoadUnload) 149 | { 150 | // Start server with NONE mode which explicit model control is not allowed 151 | try { 152 | auto server = tds::TritonServer::Create(options_); 153 | server->LoadModel("add_sub"); 154 | server->UnloadModel("add_sub"); 155 | } 156 | catch (std::exception& ex) { 157 | ASSERT_STREQ( 158 | ex.what(), 159 | "Error - LoadModel: Unavailable-explicit model load / unload is not " 160 | "allowed if polling is enabled\n"); 161 | } 162 | catch (...) { 163 | ASSERT_NO_THROW(throw); 164 | } 165 | } 166 | 167 | TEST_F(TritonServerTest, Explicit) 168 | { 169 | try { 170 | options_.model_control_mode_ = tds::ModelControlMode::EXPLICIT; 171 | 172 | std::set startup_models; 173 | startup_models.insert("add_sub"); 174 | options_.startup_models_ = startup_models; 175 | 176 | auto server = tds::TritonServer::Create(options_); 177 | std::set loaded_models = server->LoadedModels(); 178 | ASSERT_EQ(loaded_models.size(), 1); 179 | ASSERT_EQ(*loaded_models.begin(), "add_sub"); 180 | server->UnloadModel("add_sub"); 181 | loaded_models = server->LoadedModels(); 182 | ASSERT_EQ(loaded_models.size(), 0); 183 | 184 | server->LoadModel("add_sub_str"); 185 | loaded_models = server->LoadedModels(); 186 | ASSERT_EQ(loaded_models.size(), 1); 187 | ASSERT_EQ(*loaded_models.begin(), "add_sub_str"); 188 | } 189 | catch (...) 
{ 190 | ASSERT_NO_THROW(throw); 191 | } 192 | } 193 | 194 | TEST_F(TritonServerTest, ModelRepoRegister) 195 | { 196 | try { 197 | options_.model_control_mode_ = tds::ModelControlMode::EXPLICIT; 198 | auto server = tds::TritonServer::Create(options_); 199 | server->UnregisterModelRepo("./models"); 200 | try { 201 | server->LoadModel("add_sub"); 202 | } 203 | catch (std::exception& ex) { 204 | ASSERT_STREQ( 205 | ex.what(), 206 | "Error - LoadModel: Internal-failed to load 'add_sub', failed to " 207 | "poll from model repository\n"); 208 | } 209 | server->RegisterModelRepo( 210 | tds::NewModelRepo("./models1", "add_sub", "add_sub1")); 211 | try { 212 | server->LoadModel("add_sub"); 213 | } 214 | catch (std::exception& ex) { 215 | ASSERT_STREQ( 216 | ex.what(), 217 | "Error - LoadModel: Internal-failed to load 'add_sub', failed to " 218 | "poll from model repository\n"); 219 | } 220 | server->LoadModel("add_sub1"); 221 | std::set loaded_models = server->LoadedModels(); 222 | ASSERT_EQ(loaded_models.size(), 1); 223 | ASSERT_EQ(*loaded_models.begin(), "add_sub1"); 224 | } 225 | catch (...) { 226 | ASSERT_NO_THROW(throw); 227 | } 228 | } 229 | 230 | TEST_F(TritonServerTest, InferMinimal) 231 | { 232 | try { 233 | auto server = tds::TritonServer::Create(options_); 234 | 235 | std::vector input_data; 236 | while (input_data.size() < 16) { 237 | input_data.emplace_back(input_data.size()); 238 | } 239 | auto request = tds::InferRequest::Create(tds::InferOptions("add_sub")); 240 | for (const auto& name : std::vector{"INPUT0", "INPUT1"}) { 241 | request->AddInput( 242 | name, tds::Tensor( 243 | reinterpret_cast(input_data.data()), 244 | input_data.size() * sizeof(int32_t), tds::DataType::INT32, 245 | {16}, tds::MemoryType::CPU, 0)); 246 | } 247 | std::future> result_future = 248 | server->AsyncInfer(*request); 249 | auto result = result_future.get(); 250 | ASSERT_FALSE(result->HasError()) << result->ErrorMsg(); 251 | 252 | // Check result metadata 253 | ASSERT_EQ(result->ModelName(), "add_sub"); 254 | ASSERT_EQ(result->ModelVersion(), "1"); 255 | ASSERT_EQ(result->Id(), ""); 256 | 257 | // OUTPUT0 -> sum 258 | { 259 | std::string out_name("OUTPUT0"); 260 | std::shared_ptr out = result->Output(out_name); 261 | ASSERT_EQ(out->shape_, std::vector{16}); 262 | ASSERT_EQ(out->data_type_, tds::DataType::INT32); 263 | ASSERT_EQ(out->byte_size_, (input_data.size() * sizeof(int32_t))); 264 | for (size_t i = 0; i < input_data.size(); ++i) { 265 | EXPECT_EQ( 266 | reinterpret_cast(out->buffer_)[i], 267 | (2 * input_data[i])); 268 | } 269 | } 270 | 271 | // OUTPUT1 -> diff 272 | { 273 | std::string out_name("OUTPUT1"); 274 | std::shared_ptr out = result->Output(out_name); 275 | ASSERT_EQ(out->shape_, std::vector{16}); 276 | ASSERT_EQ(out->data_type_, tds::DataType::INT32); 277 | ASSERT_EQ(out->byte_size_, (input_data.size() * sizeof(int32_t))); 278 | for (size_t i = 0; i < input_data.size(); ++i) { 279 | EXPECT_EQ(reinterpret_cast(out->buffer_)[i], 0); 280 | } 281 | } 282 | } 283 | catch (...) 
{ 284 | ASSERT_NO_THROW(throw); 285 | } 286 | } 287 | 288 | TEST_F(TritonServerTest, InferString) 289 | { 290 | try { 291 | auto server = tds::TritonServer::Create(options_); 292 | 293 | std::vector input_data; 294 | std::vector input_data_str; 295 | while (input_data.size() < 16) { 296 | input_data.emplace_back(input_data.size()); 297 | input_data_str.emplace_back(std::to_string(input_data.back())); 298 | } 299 | 300 | auto request = tds::InferRequest::Create(tds::InferOptions("add_sub_str")); 301 | for (const auto& name : std::vector{"INPUT0", "INPUT1"}) { 302 | request->AddInput( 303 | name, input_data_str.begin(), input_data_str.end(), 304 | tds::DataType::BYTES, {16}, tds::MemoryType::CPU, 0); 305 | } 306 | 307 | std::future> result_future = 308 | server->AsyncInfer(*request); 309 | auto result = result_future.get(); 310 | ASSERT_FALSE(result->HasError()) << result->ErrorMsg(); 311 | 312 | // Check result metadata 313 | ASSERT_EQ(result->ModelName(), "add_sub_str"); 314 | ASSERT_EQ(result->ModelVersion(), "1"); 315 | ASSERT_EQ(result->Id(), ""); 316 | 317 | std::vector out_str; 318 | std::vector shape; 319 | tds::DataType datatype; 320 | // OUTPUT0 -> sum 321 | { 322 | std::string out_name("OUTPUT0"); 323 | std::shared_ptr out = result->Output(out_name); 324 | ASSERT_EQ(out->shape_, std::vector{16}); 325 | ASSERT_EQ(out->data_type_, tds::DataType::BYTES); 326 | out_str = result->StringData(out_name); 327 | for (size_t i = 0; i < input_data.size(); ++i) { 328 | EXPECT_EQ(out_str[i], std::to_string(2 * input_data[i])); 329 | } 330 | } 331 | 332 | // OUTPUT1 -> diff 333 | { 334 | std::string out_name("OUTPUT1"); 335 | std::shared_ptr out = result->Output(out_name); 336 | ASSERT_EQ(out->shape_, std::vector{16}); 337 | ASSERT_EQ(out->data_type_, tds::DataType::BYTES); 338 | out_str = result->StringData(out_name); 339 | for (size_t i = 0; i < input_data.size(); ++i) { 340 | EXPECT_EQ(out_str[i], "0"); 341 | } 342 | } 343 | } 344 | catch (...) { 345 | ASSERT_NO_THROW(throw); 346 | } 347 | } 348 | 349 | TEST_F(TritonServerTest, InferFailed) 350 | { 351 | try { 352 | auto server = tds::TritonServer::Create(options_); 353 | 354 | std::vector input_data; 355 | while (input_data.size() < 16) { 356 | input_data.emplace_back(input_data.size()); 357 | } 358 | auto request = 359 | tds::InferRequest::Create(tds::InferOptions("failing_infer")); 360 | request->AddInput( 361 | "INPUT", tds::Tensor( 362 | reinterpret_cast(input_data.data()), 363 | input_data.size() * sizeof(int32_t), tds::DataType::INT32, 364 | {16}, tds::MemoryType::CPU, 0)); 365 | std::future> result_future = 366 | server->AsyncInfer(*request); 367 | auto result = result_future.get(); 368 | ASSERT_TRUE(result->HasError()); 369 | ASSERT_STREQ(result->ErrorMsg().c_str(), "Internal-An Error Occurred\n"); 370 | } 371 | catch (...) 
{ 372 | ASSERT_NO_THROW(throw); 373 | } 374 | } 375 | 376 | TEST_F(TritonServerTest, InferCustomAllocator) 377 | { 378 | try { 379 | auto server = tds::TritonServer::Create(options_); 380 | 381 | std::shared_ptr allocator( 382 | new tds::Allocator(CPUAllocator, ResponseRelease)); 383 | auto infer_options = tds::InferOptions("add_sub"); 384 | infer_options.custom_allocator_ = allocator; 385 | auto request = tds::InferRequest::Create(infer_options); 386 | 387 | std::vector input_data; 388 | while (input_data.size() < 16) { 389 | input_data.emplace_back(input_data.size()); 390 | } 391 | for (const auto& name : std::vector{"INPUT0", "INPUT1"}) { 392 | request->AddInput( 393 | name, tds::Tensor( 394 | reinterpret_cast(input_data.data()), 395 | input_data.size() * sizeof(int32_t), tds::DataType::INT32, 396 | {16}, tds::MemoryType::CPU, 0)); 397 | } 398 | std::future> result_future = 399 | server->AsyncInfer(*request); 400 | auto result = result_future.get(); 401 | ASSERT_FALSE(result->HasError()) << result->ErrorMsg(); 402 | 403 | // Check result metadata 404 | ASSERT_EQ(result->ModelName(), "add_sub"); 405 | ASSERT_EQ(result->ModelVersion(), "1"); 406 | ASSERT_EQ(result->Id(), ""); 407 | 408 | // OUTPUT0 -> sum 409 | { 410 | std::string out_name("OUTPUT0"); 411 | std::shared_ptr out = result->Output(out_name); 412 | ASSERT_EQ(out->shape_, std::vector{16}); 413 | ASSERT_EQ(out->data_type_, tds::DataType::INT32); 414 | ASSERT_EQ(out->byte_size_, (input_data.size() * sizeof(int32_t))); 415 | for (size_t i = 0; i < input_data.size(); ++i) { 416 | EXPECT_EQ( 417 | reinterpret_cast(out->buffer_)[i], 418 | (2 * input_data[i])); 419 | } 420 | } 421 | 422 | // OUTPUT1 -> diff 423 | { 424 | std::string out_name("OUTPUT1"); 425 | std::shared_ptr out = result->Output(out_name); 426 | ASSERT_EQ(out->shape_, std::vector{16}); 427 | ASSERT_EQ(out->data_type_, tds::DataType::INT32); 428 | ASSERT_EQ(out->byte_size_, (input_data.size() * sizeof(int32_t))); 429 | for (size_t i = 0; i < input_data.size(); ++i) { 430 | EXPECT_EQ(reinterpret_cast(out->buffer_)[i], 0); 431 | } 432 | } 433 | } 434 | catch (...) 
{ 435 | ASSERT_NO_THROW(throw); 436 | } 437 | } 438 | 439 | TEST_F(TritonServerTest, InferPreAllocatedBuffer) 440 | { 441 | try { 442 | auto server = tds::TritonServer::Create(options_); 443 | 444 | std::vector input_data; 445 | while (input_data.size() < 16) { 446 | input_data.emplace_back(input_data.size()); 447 | } 448 | auto request = tds::InferRequest::Create(tds::InferOptions("add_sub")); 449 | for (const auto& name : std::vector{"INPUT0", "INPUT1"}) { 450 | request->AddInput( 451 | name, tds::Tensor( 452 | reinterpret_cast(input_data.data()), 453 | input_data.size() * sizeof(int32_t), tds::DataType::INT32, 454 | {16}, tds::MemoryType::CPU, 0)); 455 | } 456 | 457 | // Provide pre-allocated buffer for 'OUTPUT0' and use default allocator for 458 | // 'OUTPUT1' 459 | void* buffer_output0 = malloc(64); 460 | tds::Tensor output0( 461 | reinterpret_cast(buffer_output0), 64, tds::MemoryType::CPU, 0); 462 | request->AddRequestedOutput("OUTPUT0", output0); 463 | request->AddRequestedOutput("OUTPUT1"); 464 | 465 | std::future> result_future = 466 | server->AsyncInfer(*request); 467 | auto result = result_future.get(); 468 | ASSERT_FALSE(result->HasError()) << result->ErrorMsg(); 469 | 470 | // Check result metadata 471 | ASSERT_EQ(result->ModelName(), "add_sub"); 472 | ASSERT_EQ(result->ModelVersion(), "1"); 473 | ASSERT_EQ(result->Id(), ""); 474 | 475 | // OUTPUT0 -> sum 476 | { 477 | std::string out_name("OUTPUT0"); 478 | std::shared_ptr out = result->Output(out_name); 479 | ASSERT_EQ(out->shape_, std::vector{16}); 480 | ASSERT_EQ(out->data_type_, tds::DataType::INT32); 481 | ASSERT_EQ(out->byte_size_, (input_data.size() * sizeof(int32_t))); 482 | for (size_t i = 0; i < input_data.size(); ++i) { 483 | EXPECT_EQ( 484 | reinterpret_cast(buffer_output0)[i], 485 | (2 * input_data[i])); 486 | } 487 | } 488 | 489 | // OUTPUT1 -> diff 490 | { 491 | std::string out_name("OUTPUT1"); 492 | std::shared_ptr out = result->Output(out_name); 493 | ASSERT_EQ(out->shape_, std::vector{16}); 494 | ASSERT_EQ(out->data_type_, tds::DataType::INT32); 495 | ASSERT_EQ(out->byte_size_, (input_data.size() * sizeof(int32_t))); 496 | for (size_t i = 0; i < input_data.size(); ++i) { 497 | EXPECT_EQ(reinterpret_cast(out->buffer_)[i], 0); 498 | } 499 | } 500 | 501 | free(buffer_output0); 502 | } 503 | catch (...) { 504 | ASSERT_NO_THROW(throw); 505 | } 506 | } 507 | 508 | TEST_F(TritonServerTest, InferDecoupledMultipleResponses) 509 | { 510 | try { 511 | auto server = tds::TritonServer::Create(options_); 512 | 513 | std::vector input_data = {3}; 514 | auto request = tds::InferRequest::Create(tds::InferOptions("square_int32")); 515 | request->AddInput( 516 | "IN", tds::Tensor( 517 | reinterpret_cast(input_data.data()), 518 | input_data.size() * sizeof(int32_t), tds::DataType::INT32, 519 | {1}, tds::MemoryType::CPU, 0)); 520 | std::future> result_future = 521 | server->AsyncInfer(*request); 522 | 523 | // Retrieve results from multiple responses. 
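// The loop below grows 'results' while GetNextResult() keeps returning a future; a null result (or the absence of a further future) marks the end of the decoupled response stream, so an input value of 3 is expected to produce exactly 3 responses.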
524 | std::vector> results; 525 | results.push_back(result_future.get()); 526 | size_t size = results.size(); 527 | int count = 0; 528 | for (size_t i = 0; i < size; i++) { 529 | if (results[i]) { 530 | ASSERT_FALSE(results[i]->HasError()) << results[i]->ErrorMsg(); 531 | auto next_future = results[i]->GetNextResult(); 532 | if (next_future) { 533 | results.push_back(next_future->get()); 534 | size++; 535 | } 536 | ASSERT_EQ(results[i]->ModelName(), "square_int32"); 537 | ASSERT_EQ(results[i]->ModelVersion(), "1"); 538 | ASSERT_EQ(results[i]->Id(), ""); 539 | count++; 540 | } 541 | } 542 | ASSERT_EQ(count, 3); 543 | 544 | // OUTPUT1 -> 3 545 | { 546 | for (auto& result : results) { 547 | if (result) { 548 | std::string out_name("OUT"); 549 | std::shared_ptr out = result->Output(out_name); 550 | ASSERT_EQ(out->shape_, std::vector{1}); 551 | ASSERT_EQ(out->data_type_, tds::DataType::INT32); 552 | ASSERT_EQ(out->byte_size_, (input_data.size() * sizeof(int32_t))); 553 | for (size_t i = 0; i < input_data.size(); ++i) { 554 | EXPECT_EQ(reinterpret_cast(out->buffer_)[i], 3); 555 | } 556 | } 557 | } 558 | } 559 | } 560 | catch (...) { 561 | ASSERT_NO_THROW(throw); 562 | } 563 | } 564 | 565 | TEST_F(TritonServerTest, InferDecoupledZeroResponse) 566 | { 567 | try { 568 | auto server = tds::TritonServer::Create(options_); 569 | 570 | std::vector input_data = {0}; 571 | auto request = tds::InferRequest::Create(tds::InferOptions("square_int32")); 572 | request->AddInput( 573 | "IN", tds::Tensor( 574 | reinterpret_cast(input_data.data()), 575 | input_data.size() * sizeof(int32_t), tds::DataType::INT32, 576 | {1}, tds::MemoryType::CPU, 0)); 577 | std::future> result_future = 578 | server->AsyncInfer(*request); 579 | std::vector> results; 580 | results.push_back(result_future.get()); 581 | size_t size = results.size(); 582 | int count = 0; 583 | for (size_t i = 0; i < size; i++) { 584 | if (results[i]) { 585 | ASSERT_FALSE(results[i]->HasError()) << results[i]->ErrorMsg(); 586 | auto next_future = results[i]->GetNextResult(); 587 | if (next_future) { 588 | results.push_back(next_future->get()); 589 | size++; 590 | } 591 | ASSERT_EQ(results[i]->ModelName(), "square_int32"); 592 | ASSERT_EQ(results[i]->ModelVersion(), "1"); 593 | ASSERT_EQ(results[i]->Id(), ""); 594 | count++; 595 | } 596 | } 597 | ASSERT_EQ(count, 0); 598 | 599 | { 600 | for (auto& result : results) { 601 | ASSERT_FALSE(result) << "Unexpected response."; 602 | } 603 | } 604 | } 605 | catch (...) { 606 | ASSERT_NO_THROW(throw); 607 | } 608 | } 609 | 610 | } // namespace 611 | 612 | int 613 | main(int argc, char** argv) 614 | { 615 | ::testing::InitGoogleTest(&argc, argv); 616 | return RUN_ALL_TESTS(); 617 | } 618 | -------------------------------------------------------------------------------- /server/examples/simple_addsub_async_infer.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | // 3 | // Redistribution and use in source and binary forms, with or without 4 | // modification, are permitted provided that the following conditions 5 | // are met: 6 | // * Redistributions of source code must retain the above copyright 7 | // notice, this list of conditions and the following disclaimer. 8 | // * Redistributions in binary form must reproduce the above copyright 9 | // notice, this list of conditions and the following disclaimer in the 10 | // documentation and/or other materials provided with the distribution. 
11 | // * Neither the name of NVIDIA CORPORATION nor the names of its 12 | // contributors may be used to endorse or promote products derived 13 | // from this software without specific prior written permission. 14 | // 15 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | #include 28 | 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | 35 | #include "triton/developer_tools/server_wrapper.h" 36 | 37 | 38 | #ifdef TRITON_ENABLE_GPU 39 | #include 40 | #endif // TRITON_ENABLE_GPU 41 | 42 | namespace tds = triton::developer_tools::server; 43 | 44 | namespace { 45 | 46 | #define FAIL(MSG) \ 47 | do { \ 48 | std::cerr << "error: " << (MSG) << std::endl; \ 49 | exit(1); \ 50 | } while (false) 51 | #ifdef TRITON_ENABLE_GPU 52 | #define FAIL_IF_CUDA_ERR(X, MSG) \ 53 | do { \ 54 | cudaError_t err__ = (X); \ 55 | if (err__ != cudaSuccess) { \ 56 | std::cerr << "error: " << (MSG) << ": " << cudaGetErrorString(err__) \ 57 | << std::endl; \ 58 | exit(1); \ 59 | } \ 60 | } while (false) 61 | #endif // TRITON_ENABLE_GPU 62 | 63 | bool enforce_memory_type = false; 64 | tds::MemoryType requested_memory_type; 65 | 66 | #ifdef TRITON_ENABLE_GPU 67 | static auto cuda_data_deleter = [](void* data) { 68 | if (data != nullptr) { 69 | cudaPointerAttributes attr; 70 | auto cuerr = cudaPointerGetAttributes(&attr, data); 71 | if (cuerr != cudaSuccess) { 72 | std::cerr << "error: failed to get CUDA pointer attribute of " << data 73 | << ": " << cudaGetErrorString(cuerr) << std::endl; 74 | } 75 | if (attr.type == cudaMemoryTypeDevice) { 76 | cuerr = cudaFree(data); 77 | } else if (attr.type == cudaMemoryTypeHost) { 78 | cuerr = cudaFreeHost(data); 79 | } 80 | if (cuerr != cudaSuccess) { 81 | std::cerr << "error: failed to release CUDA pointer " << data << ": " 82 | << cudaGetErrorString(cuerr) << std::endl; 83 | } 84 | } 85 | }; 86 | #endif // TRITON_ENABLE_GPU 87 | 88 | void 89 | Usage(char** argv, const std::string& msg = std::string()) 90 | { 91 | if (!msg.empty()) { 92 | std::cerr << msg << std::endl; 93 | } 94 | 95 | std::cerr << "Usage: " << argv[0] << " [options]" << std::endl; 96 | std::cerr << "\t-m <\"system\"|\"pinned\"|gpu>" 97 | << " Enforce the memory type for input and output tensors." 98 | << " If not specified, inputs will be in system memory and outputs" 99 | << " will be based on the model's preferred type." 
<< std::endl; 100 | std::cerr << "\t-v Enable verbose logging" << std::endl; 101 | 102 | exit(1); 103 | } 104 | 105 | template 106 | void 107 | GenerateInputData( 108 | std::vector* input0_data, std::vector* input1_data) 109 | { 110 | input0_data->resize(16 * sizeof(T)); 111 | input1_data->resize(16 * sizeof(T)); 112 | for (size_t i = 0; i < 16; ++i) { 113 | ((T*)input0_data->data())[i] = i; 114 | ((T*)input1_data->data())[i] = 1; 115 | } 116 | } 117 | 118 | template 119 | void 120 | CompareResult( 121 | const std::string& output0_name, const std::string& output1_name, 122 | const void* input0, const void* input1, const char* output0, 123 | const char* output1) 124 | { 125 | for (size_t i = 0; i < 16; ++i) { 126 | std::cout << ((T*)input0)[i] << " + " << ((T*)input1)[i] << " = " 127 | << ((T*)output0)[i] << std::endl; 128 | std::cout << ((T*)input0)[i] << " - " << ((T*)input1)[i] << " = " 129 | << ((T*)output1)[i] << std::endl; 130 | 131 | if ((((T*)input0)[i] + ((T*)input1)[i]) != ((T*)output0)[i]) { 132 | FAIL("incorrect sum in " + output0_name); 133 | } 134 | if ((((T*)input0)[i] - ((T*)input1)[i]) != ((T*)output1)[i]) { 135 | FAIL("incorrect difference in " + output1_name); 136 | } 137 | } 138 | } 139 | 140 | void 141 | ResponseAllocator( 142 | const char* tensor_name, size_t byte_size, 143 | tds::MemoryType preferred_memory_type, int64_t preferred_memory_type_id, 144 | void** buffer, tds::MemoryType* actual_memory_type, 145 | int64_t* actual_memory_type_id) 146 | { 147 | std::cout << "Using custom allocation function" << std::endl; 148 | 149 | // Initially attempt to make the actual memory type and id that we 150 | // allocate be the same as preferred memory type 151 | *actual_memory_type = preferred_memory_type; 152 | *actual_memory_type_id = preferred_memory_type_id; 153 | 154 | // If 'byte_size' is zero just return 'buffer' == nullptr, we don't 155 | // need to do any other book-keeping. 
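// A note on the contract implemented below: the server suggests a preferred
// memory type for each output buffer, and the allocator may honor it or pick
// another one, as long as the choice is reported back through
// 'actual_memory_type' and 'actual_memory_type_id'. When '-m' was passed on
// the command line, 'requested_memory_type' overrides the preference; the
// CPU_PINNED and GPU paths are only compiled in when TRITON_ENABLE_GPU is
// defined, and everything else falls back to plain malloc'd CPU memory.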
156 | if (byte_size == 0) { 157 | *buffer = nullptr; 158 | std::cout << "allocated " << byte_size << " bytes for result tensor " 159 | << tensor_name << std::endl; 160 | } else { 161 | void* allocated_ptr = nullptr; 162 | if (enforce_memory_type) { 163 | *actual_memory_type = requested_memory_type; 164 | } 165 | 166 | switch (*actual_memory_type) { 167 | #ifdef TRITON_ENABLE_GPU 168 | case tds::MemoryType::CPU_PINNED: { 169 | auto err = cudaSetDevice(*actual_memory_type_id); 170 | if ((err != cudaSuccess) && (err != cudaErrorNoDevice) && 171 | (err != cudaErrorInsufficientDriver)) { 172 | throw tds::TritonException(std::string( 173 | "unable to recover current CUDA device: " + 174 | std::string(cudaGetErrorString(err)))); 175 | } 176 | 177 | err = cudaHostAlloc(&allocated_ptr, byte_size, cudaHostAllocPortable); 178 | if (err != cudaSuccess) { 179 | throw tds::TritonException(std::string( 180 | "cudaHostAlloc failed: " + std::string(cudaGetErrorString(err)))); 181 | } 182 | break; 183 | } 184 | 185 | case tds::MemoryType::GPU: { 186 | auto err = cudaSetDevice(*actual_memory_type_id); 187 | if ((err != cudaSuccess) && (err != cudaErrorNoDevice) && 188 | (err != cudaErrorInsufficientDriver)) { 189 | throw tds::TritonException(std::string( 190 | "unable to recover current CUDA device: " + 191 | std::string(cudaGetErrorString(err)))); 192 | } 193 | 194 | err = cudaMalloc(&allocated_ptr, byte_size); 195 | if (err != cudaSuccess) { 196 | throw tds::TritonException(std::string( 197 | "cudaMalloc failed: " + std::string(cudaGetErrorString(err)))); 198 | } 199 | break; 200 | } 201 | #endif // TRITON_ENABLE_GPU 202 | 203 | // Use CPU memory if the requested memory type is unknown 204 | // (default case). 205 | case tds::MemoryType::CPU: 206 | default: { 207 | *actual_memory_type = tds::MemoryType::CPU; 208 | allocated_ptr = malloc(byte_size); 209 | break; 210 | } 211 | } 212 | 213 | if (allocated_ptr != nullptr) { 214 | *buffer = allocated_ptr; 215 | std::cout << "allocated " << byte_size << " bytes in " 216 | << MemoryTypeString(*actual_memory_type) 217 | << " for result tensor " << tensor_name << std::endl; 218 | } 219 | } 220 | } 221 | 222 | void 223 | ResponseRelease( 224 | void* buffer, size_t byte_size, tds::MemoryType memory_type, 225 | int64_t memory_type_id) 226 | { 227 | std::cout << "Using custom response release function" << std::endl; 228 | 229 | std::stringstream ss; 230 | ss << buffer; 231 | std::string buffer_str = ss.str(); 232 | 233 | std::cout << "Releasing buffer " << buffer_str << " of size " 234 | << std::to_string(byte_size) << " in " 235 | << tds::MemoryTypeString(memory_type); 236 | 237 | switch (memory_type) { 238 | case tds::MemoryType::CPU: 239 | free(buffer); 240 | break; 241 | #ifdef TRITON_ENABLE_GPU 242 | case tds::MemoryType::CPU_PINNED: { 243 | auto err = cudaSetDevice(memory_type_id); 244 | if (err == cudaSuccess) { 245 | err = cudaFreeHost(buffer); 246 | } 247 | if (err != cudaSuccess) { 248 | std::cerr << "error: failed to cudaFree " << buffer << ": " 249 | << cudaGetErrorString(err) << std::endl; 250 | } 251 | break; 252 | } 253 | case tds::MemoryType::GPU: { 254 | auto err = cudaSetDevice(memory_type_id); 255 | if (err == cudaSuccess) { 256 | err = cudaFree(buffer); 257 | } 258 | if (err != cudaSuccess) { 259 | std::cerr << "error: failed to cudaFree " << buffer << ": " 260 | << cudaGetErrorString(err) << std::endl; 261 | } 262 | break; 263 | } 264 | #endif // TRITON_ENABLE_GPU 265 | default: 266 | std::cerr << "error: unexpected buffer allocated in CUDA managed 
memory" 267 | << std::endl; 268 | break; 269 | } 270 | } 271 | 272 | void 273 | Check( 274 | std::shared_ptr& output0, 275 | std::shared_ptr& output1, const std::vector& input0_data, 276 | const std::vector& input1_data, const std::string& output0_name, 277 | const std::string& output1_name, const size_t expected_byte_size, 278 | const tds::DataType expected_datatype, const std::string& model_name, 279 | const bool is_custom_alloc) 280 | { 281 | std::unordered_map> output_data; 282 | for (auto& output : 283 | {std::make_pair(output0_name, output0), 284 | std::make_pair(output1_name, output1)}) { 285 | if (model_name == "add_sub") { 286 | if ((output.second->shape_.size() != 1) || 287 | (output.second->shape_[0] != 16)) { 288 | FAIL("unexpected shape for '" + output.first + "'"); 289 | } 290 | } else if (model_name == "simple") { 291 | if ((output.second->shape_.size() != 2) || 292 | (output.second->shape_[0] != 1) || (output.second->shape_[1] != 16)) { 293 | FAIL("unexpected shape for '" + output.first + "'"); 294 | } 295 | } else { 296 | FAIL("unexpected model name '" + model_name + "'"); 297 | } 298 | 299 | if (output.second->data_type_ != expected_datatype) { 300 | FAIL( 301 | "unexpected datatype '" + 302 | std::string(DataTypeString(output.second->data_type_)) + "' for '" + 303 | output.first + "'"); 304 | } 305 | 306 | if (output.second->byte_size_ != expected_byte_size) { 307 | FAIL( 308 | "unexpected byte-size, expected " + 309 | std::to_string(expected_byte_size) + ", got " + 310 | std::to_string(output.second->byte_size_) + " for " + output.first); 311 | } 312 | 313 | // For this example, we use default allocator and pre-allocated buffer in 314 | // the first and second infer requests, so the memory type for both cases 315 | // should be 'CPU'. 316 | if (is_custom_alloc) { 317 | if (enforce_memory_type && 318 | (output.second->memory_type_ != requested_memory_type)) { 319 | FAIL( 320 | "unexpected memory type, expected to be allocated in " + 321 | std::string(MemoryTypeString(requested_memory_type)) + ", got " + 322 | std::string(MemoryTypeString(output.second->memory_type_)) + 323 | ", id " + std::to_string(output.second->memory_type_id_) + " for " + 324 | output.first); 325 | } 326 | } else { 327 | if (output.second->memory_type_ != tds::MemoryType::CPU) { 328 | FAIL( 329 | "unexpected memory type, expected to be allocated in CPU, got " + 330 | std::string(MemoryTypeString(output.second->memory_type_)) + 331 | ", id " + std::to_string(output.second->memory_type_id_) + " for " + 332 | output.first); 333 | } 334 | } 335 | 336 | // We make a copy of the data here... which we could avoid for 337 | // performance reasons but ok for this simple example. 
338 | std::vector& odata = output_data[output.first]; 339 | switch (output.second->memory_type_) { 340 | case tds::MemoryType::CPU: { 341 | std::cout << output.first << " is stored in system memory" << std::endl; 342 | odata.assign( 343 | output.second->buffer_, 344 | output.second->buffer_ + output.second->byte_size_); 345 | break; 346 | } 347 | 348 | case tds::MemoryType::CPU_PINNED: { 349 | std::cout << output.first << " is stored in pinned memory" << std::endl; 350 | odata.assign( 351 | output.second->buffer_, 352 | output.second->buffer_ + output.second->byte_size_); 353 | break; 354 | } 355 | 356 | #ifdef TRITON_ENABLE_GPU 357 | case tds::MemoryType::GPU: { 358 | std::cout << output.first << " is stored in GPU memory" << std::endl; 359 | odata.reserve(output.second->byte_size_); 360 | FAIL_IF_CUDA_ERR( 361 | cudaMemcpy( 362 | &odata[0], output.second->buffer_, output.second->byte_size_, 363 | cudaMemcpyDeviceToHost), 364 | "getting " + output.first + " data from GPU memory"); 365 | break; 366 | } 367 | #endif 368 | 369 | default: 370 | FAIL("unexpected memory type"); 371 | } 372 | } 373 | 374 | CompareResult( 375 | output0_name, output1_name, &input0_data[0], &input1_data[0], 376 | output_data[output0_name].data(), output_data[output1_name].data()); 377 | } 378 | 379 | } // namespace 380 | 381 | int 382 | main(int argc, char** argv) 383 | { 384 | int verbose_level = 0; 385 | 386 | // Parse commandline... 387 | int opt; 388 | while ((opt = getopt(argc, argv, "vm:r:")) != -1) { 389 | switch (opt) { 390 | case 'm': { 391 | enforce_memory_type = true; 392 | if (!strcmp(optarg, "system")) { 393 | requested_memory_type = tds::MemoryType::CPU; 394 | } else if (!strcmp(optarg, "pinned")) { 395 | requested_memory_type = tds::MemoryType::CPU_PINNED; 396 | } else if (!strcmp(optarg, "gpu")) { 397 | requested_memory_type = tds::MemoryType::GPU; 398 | } else { 399 | Usage( 400 | argv, 401 | "-m must be used to specify one of the following types:" 402 | " <\"system\"|\"pinned\"|gpu>"); 403 | } 404 | break; 405 | } 406 | case 'v': 407 | verbose_level = 1; 408 | break; 409 | case '?': 410 | Usage(argv); 411 | break; 412 | } 413 | } 414 | 415 | #ifndef TRITON_ENABLE_GPU 416 | if (enforce_memory_type && requested_memory_type != TRITONSERVER_MEMORY_CPU) { 417 | Usage(argv, "-m can only be set to \"system\" without enabling GPU"); 418 | } 419 | #endif // TRITON_ENABLE_GPU 420 | 421 | try { 422 | // Use 'ServerOptions' object to initialize TritonServer. Here we set model 423 | // control mode to 'EXPLICIT' so that we are able to load and unload models 424 | // after startup. 425 | tds::ServerOptions options({"./models"}); 426 | options.logging_.verbose_ = 427 | tds::LoggingOptions::VerboseLevel(verbose_level); 428 | options.model_control_mode_ = tds::ModelControlMode::EXPLICIT; 429 | // Enable tracing. The tracing output file 'trace_file' can be found after 430 | // this example is completed. 431 | options.trace_ = std::make_shared( 432 | "trace_file", tds::Trace::Level::TIMESTAMPS, 1, -1, 0); 433 | auto server = tds::TritonServer::Create(options); 434 | 435 | // Load 'simple' and 'add_sub' models. 436 | server->LoadModel("simple"); 437 | server->LoadModel("add_sub"); 438 | // Use 'ModelIndex' function to see model repository contents. Here we 439 | // should see both 'simple' and 'add_sub' models are ready. 
440 | std::vector<tds::RepositoryIndex> repo_index = server->ModelIndex(); 441 | std::cout << "ModelIndex:\n"; 442 | for (size_t i = 0; i < repo_index.size(); i++) { 443 | std::cout << repo_index[i].name_ << ", " << repo_index[i].version_ << ", " 444 | << ModelReadyStateString(repo_index[i].state_) << "\n"; 445 | } 446 | 447 | // Initialize 'InferRequest' with the name of the model that we want to run 448 | // an inference on. 449 | auto request1 = tds::InferRequest::Create(tds::InferOptions("add_sub")); 450 | 451 | // Add two input tensors to the inference request. 452 | std::vector<char> input0_data; 453 | std::vector<char> input1_data; 454 | GenerateInputData<int32_t>(&input0_data, &input1_data); 455 | size_t input0_size = input0_data.size(); 456 | size_t input1_size = input1_data.size(); 457 | 458 | // Use iterators over the input vectors to add input data to a request. 459 | request1->AddInput( 460 | "INPUT0", input0_data.begin(), input0_data.end(), tds::DataType::INT32, 461 | {16}, tds::MemoryType::CPU, 0); 462 | request1->AddInput( 463 | "INPUT1", input1_data.begin(), input1_data.end(), tds::DataType::INT32, 464 | {16}, tds::MemoryType::CPU, 0); 465 | 466 | // Indicate that we want both output tensors calculated and returned 467 | // for the inference request. These calls are optional; if no 468 | // output(s) are specifically requested then all outputs defined by 469 | // the model will be calculated and returned. 470 | request1->AddRequestedOutput("OUTPUT0"); 471 | request1->AddRequestedOutput("OUTPUT1"); 472 | 473 | // Call 'AsyncInfer' function to run inference. 474 | auto result_future1 = server->AsyncInfer(*request1); 475 | 476 | // Get the infer result and check the result. 477 | auto result1 = result_future1.get(); 478 | if (result1->HasError()) { 479 | FAIL(result1->ErrorMsg()); 480 | } 481 | std::cout << "Ran inference on model '" << result1->ModelName() 482 | << "', version '" << result1->ModelVersion() 483 | << "', with request ID '" << result1->Id() << "'\n"; 484 | 485 | // Retrieve two outputs from the 'InferResult' object. 486 | std::shared_ptr<tds::Tensor> result1_out0 = result1->Output("OUTPUT0"); 487 | std::shared_ptr<tds::Tensor> result1_out1 = result1->Output("OUTPUT1"); 488 | 489 | Check( 490 | result1_out0, result1_out1, input0_data, input1_data, "OUTPUT0", 491 | "OUTPUT1", input0_size, tds::DataType::INT32, result1->ModelName(), 492 | false); 493 | 494 | // Get full response. 495 | std::cout << result1->DebugString() << std::endl; 496 | 497 | 498 | // Unload 'add_sub' model as we don't need it anymore. 499 | server->UnloadModel("add_sub"); 500 | // Run a new infer request on 'simple' model. 501 | auto request2 = tds::InferRequest::Create(tds::InferOptions("simple")); 502 | 503 | // We can also use a 'Tensor' object for adding input to a request. 504 | tds::Tensor input0( 505 | &input0_data[0], input0_data.size(), tds::DataType::INT32, {1, 16}, 506 | tds::MemoryType::CPU, 0); 507 | tds::Tensor input1( 508 | &input1_data[0], input1_data.size(), tds::DataType::INT32, {1, 16}, 509 | tds::MemoryType::CPU, 0); 510 | request2->AddInput("INPUT0", input0); 511 | request2->AddInput("INPUT1", input1); 512 | 513 | // For this inference, we provide pre-allocated buffers for the outputs. The 514 | // infer results will be stored in place in those buffers.
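// Note: 64 bytes is exactly 16 elements * sizeof(int32_t), matching the
// 1x16 INT32 outputs of the 'simple' model, so each output fits its
// pre-allocated buffer exactly. Wrapping the raw malloc'd pointers in
// std::shared_ptr<void> with 'free' as the deleter keeps this example
// leak-free even if an exception is thrown before the end of the try block.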
515 | std::shared_ptr<void> allocated_output0(malloc(64), free); 516 | std::shared_ptr<void> allocated_output1(malloc(64), free); 517 | 518 | tds::Tensor alloc_output0( 519 | reinterpret_cast<char*>(allocated_output0.get()), 64, 520 | tds::MemoryType::CPU, 0); 521 | tds::Tensor alloc_output1( 522 | reinterpret_cast<char*>(allocated_output1.get()), 64, 523 | tds::MemoryType::CPU, 0); 524 | request2->AddRequestedOutput("OUTPUT0", alloc_output0); 525 | request2->AddRequestedOutput("OUTPUT1", alloc_output1); 526 | 527 | // Call 'AsyncInfer' function to run inference. 528 | auto result_future2 = server->AsyncInfer(*request2); 529 | 530 | // Get the infer result and check the result. 531 | auto result2 = result_future2.get(); 532 | if (result2->HasError()) { 533 | FAIL(result2->ErrorMsg()); 534 | } 535 | std::cout << "Ran inference on model '" << result2->ModelName() 536 | << "', version '" << result2->ModelVersion() 537 | << "', with request ID '" << result2->Id() << "'\n"; 538 | 539 | // Retrieve two outputs from the 'InferResult' object. 540 | std::shared_ptr<tds::Tensor> result2_out0 = result2->Output("OUTPUT0"); 541 | std::shared_ptr<tds::Tensor> result2_out1 = result2->Output("OUTPUT1"); 542 | 543 | Check( 544 | result2_out0, result2_out1, input0_data, input1_data, "OUTPUT0", 545 | "OUTPUT1", input0_size, tds::DataType::INT32, result2->ModelName(), 546 | false); 547 | 548 | // Get full response. 549 | std::cout << result2->DebugString() << std::endl; 550 | 551 | // Check the output data in the pre-allocated buffers. 552 | CompareResult<int32_t>( 553 | "OUTPUT0", "OUTPUT1", &input0_data[0], &input1_data[0], 554 | reinterpret_cast<const char*>(allocated_output0.get()), 555 | reinterpret_cast<const char*>(allocated_output1.get())); 556 | 557 | // For the third inference, we use a custom allocator for output allocation. 558 | // Initialize the allocator with our custom functions 'ResponseAllocator' 559 | // and 'ResponseRelease' which are implemented above.
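// The tds::Allocator object constructed below simply bundles the two
// callbacks; attaching it to InferOptions::custom_allocator_ makes the
// server use ResponseAllocator for every output buffer of this request and
// ResponseRelease when those buffers are released, instead of the default
// allocator used by the first two requests.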
560 | std::shared_ptr allocator( 561 | new tds::Allocator(ResponseAllocator, ResponseRelease)); 562 | auto infer_options = tds::InferOptions("simple"); 563 | infer_options.custom_allocator_ = allocator; 564 | auto request3 = tds::InferRequest::Create(infer_options); 565 | 566 | const void* input0_base = &input0_data[0]; 567 | const void* input1_base = &input1_data[0]; 568 | #ifdef TRITON_ENABLE_GPU 569 | std::unique_ptr input0_gpu( 570 | nullptr, cuda_data_deleter); 571 | std::unique_ptr input1_gpu( 572 | nullptr, cuda_data_deleter); 573 | bool use_cuda_memory = 574 | (enforce_memory_type && 575 | (requested_memory_type != tds::MemoryType::CPU)); 576 | if (use_cuda_memory) { 577 | FAIL_IF_CUDA_ERR(cudaSetDevice(0), "setting CUDA device to device 0"); 578 | if (requested_memory_type != tds::MemoryType::CPU_PINNED) { 579 | void* dst; 580 | FAIL_IF_CUDA_ERR( 581 | cudaMalloc(&dst, input0_size), 582 | "allocating GPU memory for INPUT0 data"); 583 | input0_gpu.reset(dst); 584 | FAIL_IF_CUDA_ERR( 585 | cudaMemcpy( 586 | dst, &input0_data[0], input0_size, cudaMemcpyHostToDevice), 587 | "setting INPUT0 data in GPU memory"); 588 | FAIL_IF_CUDA_ERR( 589 | cudaMalloc(&dst, input1_size), 590 | "allocating GPU memory for INPUT1 data"); 591 | input1_gpu.reset(dst); 592 | FAIL_IF_CUDA_ERR( 593 | cudaMemcpy( 594 | dst, &input1_data[0], input1_size, cudaMemcpyHostToDevice), 595 | "setting INPUT1 data in GPU memory"); 596 | } else { 597 | void* dst; 598 | FAIL_IF_CUDA_ERR( 599 | cudaHostAlloc(&dst, input0_size, cudaHostAllocPortable), 600 | "allocating pinned memory for INPUT0 data"); 601 | input0_gpu.reset(dst); 602 | FAIL_IF_CUDA_ERR( 603 | cudaMemcpy(dst, &input0_data[0], input0_size, cudaMemcpyHostToHost), 604 | "setting INPUT0 data in pinned memory"); 605 | FAIL_IF_CUDA_ERR( 606 | cudaHostAlloc(&dst, input1_size, cudaHostAllocPortable), 607 | "allocating pinned memory for INPUT1 data"); 608 | input1_gpu.reset(dst); 609 | FAIL_IF_CUDA_ERR( 610 | cudaMemcpy(dst, &input1_data[0], input1_size, cudaMemcpyHostToHost), 611 | "setting INPUT1 data in pinned memory"); 612 | } 613 | } 614 | 615 | input0_base = use_cuda_memory ? input0_gpu.get() : &input0_data[0]; 616 | input1_base = use_cuda_memory ? input1_gpu.get() : &input1_data[0]; 617 | #endif // TRITON_ENABLE_GPU 618 | 619 | // Reuse the two inputs and modify the buffer and memory type based on the 620 | // commandline. 621 | input0.buffer_ = reinterpret_cast(const_cast(input0_base)); 622 | input1.buffer_ = reinterpret_cast(const_cast(input1_base)); 623 | input0.memory_type_ = requested_memory_type; 624 | input1.memory_type_ = requested_memory_type; 625 | 626 | request3->AddInput("INPUT0", input0); 627 | request3->AddInput("INPUT1", input1); 628 | 629 | // Call 'AsyncInfer' function to run inference. 630 | auto result_future3 = server->AsyncInfer(*request3); 631 | 632 | // Get the infer result and check the result. 633 | auto result3 = result_future3.get(); 634 | if (result3->HasError()) { 635 | FAIL(result3->ErrorMsg()); 636 | } 637 | std::cout << "Ran inference on model '" << result3->ModelName() 638 | << "', version '" << result3->ModelVersion() 639 | << "', with request ID '" << result3->Id() << "'\n"; 640 | 641 | // Retrieve two outputs from the 'InferResult' object. 
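// Unlike the first two requests, these output buffers were produced by the
// custom allocator above, so with '-m pinned' or '-m gpu' they live in
// pinned or GPU memory. Check() is therefore called with 'is_custom_alloc'
// set to true so it validates the memory type against
// 'requested_memory_type' instead of expecting CPU memory.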
642 | std::shared_ptr<tds::Tensor> result3_out0 = result3->Output("OUTPUT0"); 643 | std::shared_ptr<tds::Tensor> result3_out1 = result3->Output("OUTPUT1"); 644 | 645 | Check( 646 | result3_out0, result3_out1, input0_data, input1_data, "OUTPUT0", 647 | "OUTPUT1", input0_size, tds::DataType::INT32, result3->ModelName(), 648 | true); 649 | 650 | // Get full response. 651 | std::cout << result3->DebugString() << std::endl; 652 | 653 | // Get the server metrics. 654 | std::string metrics_str = server->ServerMetrics(); 655 | std::cout << "\n\n\n=========Server Metrics===========\n" 656 | << metrics_str << "\n"; 657 | } 658 | catch (const tds::TritonException& ex) { 659 | std::cerr << "Error: " << ex.what() << std::endl; 660 | exit(1); 661 | } 662 | 663 | return 0; 664 | } 665 | --------------------------------------------------------------------------------